Investigating the best way to get Deaths numbers for Italy (Part 2)
AIM: for each AGE and YEAR_OF_OBSERVATION, we want the most reliable estimate of the number of deaths in Italy.
import warnings
from istatapi import discovery, retrieval
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Select the combined "vscode" + "notebook" plotly renderers for figure output.
pio.renderers.default = "vscode+notebook"

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host
# 'sdmx.istat.it'. Adding certificate verification is strongly advised".
requests.urllib3.disable_warnings()
def get_colors(n, cmap_name="rainbow"):
    """Return ``n`` evenly spaced colormap colors as ``rgba(...)`` strings.

    The result is meant for the ``color_discrete_sequence`` argument of
    plotly express.

    Args:
        n: Number of colors needed.
        cmap_name: Name of the matplotlib colormap to sample.

    Returns:
        A list of ``n`` strings of the form ``"rgba(R, G, B, 1.0)"`` with
        integer channels in the 0-255 range. The alpha is always ``1.0``.
    """
    cmap = matplotlib.colormaps[cmap_name]
    # The colormap yields channels as floats in [0, 1], while rgba() channels
    # span 0-255. Scaling by 255 (not 250) lets a full-intensity channel
    # actually reach its maximum.
    return [
        f"rgba({int(red * 255)}, {int(green * 255)}, {int(blue * 255)}, 1.0)"
        for red, green, blue, _alpha in (cmap(pos) for pos in np.linspace(0, 1, n))
    ]
# Map the ISTAT age-class codes to zero-padded labels, so that a plain
# alphabetical sort of the labels follows the age order.
age_group_labels = dict(zip(
    [
        'Y_UN4', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
        'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y_GE95',
    ],
    [
        'Y00-04', 'Y05-09', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
        'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y95+',
    ],
))
# Tables exported by Notebook #14. The first CSV column becomes the index and
# the remaining column labels are cast to int (presumably the years of
# observation - confirm against Notebook #14).
dfp = pd.read_csv("../data/pop_by_age_year.csv", index_col=0)
dfp = dfp.rename(columns=int)

dfpc = pd.read_csv("../data/cc_by_age_year.csv", index_col=0)
dfpc = dfpc.rename(columns=int)
Now that I know from the previous Notebook that DCIS_DECESSI is the most reliable source of data for Deaths in Italy,
I try to find a way to convert age groups into single ages.
# Download the DCIS_DECESSI dataflow with every dimension pinned except the age.
ds = discovery.DataSet(dataflow_identifier="DCIS_DECESSI")
death_filters = dict(
    freq="A",
    # eta="TOTAL" is intentionally NOT set, so that all age classes are returned
    paese_cittad="TOTAL",
    itter107="IT",
    paese_nasc="X1033",  # world
    sesso="9",
    stato_civ="99",
    tipo_dato="DEATH",
    titolo_studio="99",
)
ds.set_filters(**death_filters)
df6 = retrieval.get_data(ds)

# Preview only the columns that hold at least one non-missing value.
df6.loc[:, df6.notna().any(axis=0)]
# Build the deaths table: one row per age group, one column per year.
deaths_long = (
    df6[df6["ETA"] != "TOTAL"]  # drop the all-ages aggregate rows
    .assign(
        # relabel the ISTAT codes with the sortable age-group labels
        age_group=lambda frame: frame["ETA"].map(age_group_labels),
        year=lambda frame: frame["TIME_PERIOD"].dt.year,
    )
    .groupby(["age_group", "year"], as_index=False)["OBS_VALUE"]
    .sum()
    .rename(columns={"OBS_VALUE": "deaths"})
)
dfdg = deaths_long.pivot(index="age_group", columns="year", values="deaths")
dfdg
# Naive method: spread the deaths of each age group evenly over five single ages.
# The k-th age group (in index order) is assigned the ages 5k .. 5k+4.
ageg2age = {label: range(5 * k, 5 * k + 5) for k, label in enumerate(dfdg.index)}
ageg2age_df = pd.DataFrame(
    list(ageg2age.items()),
    columns=["age_group", "age"],
).explode("age")  # one row per (age_group, age) pair

deaths_by_age = (
    dfdg.merge(ageg2age_df, on="age_group")
    .drop(columns="age_group")
    .set_index("age")
    .sort_index(ascending=True)
)
# Each single age receives one fifth of its group's deaths, truncated to an integer.
dfd1 = (deaths_by_age / 5).astype(int)
dfd1
# One line per year of observation, deaths (equal split) against age.
fig_equal_split = px.line(
    dfd1,
    x=dfd1.index,
    y=dfd1.columns,
    color_discrete_sequence=get_colors(dfd1.columns.size),
    title="Method: equally divide deaths from DCIS_DECESSI for all ages of each age group",
)
fig_equal_split.update_layout(
    xaxis_title="Age",
    yaxis_title="Deaths",
    legend_title_text="Year of observation",
    width=1000,
)
fig_equal_split.show()
Clearly too simplistic: there are evident steps that will overestimate deaths at the beginning of the age range and underestimate them at the end of the age range.
Compare with the population change (delta):
# Plot the population change with its sign flipped, one line per year of observation.
fig_pop_change = px.line(
    -dfpc,
    x=dfpc.index,
    y=dfpc.columns,
    color_discrete_sequence=get_colors(dfpc.columns.size),
    title="Population Change by age, on each year of observation",
)
fig_pop_change.update_layout(
    xaxis_title="Age",
    yaxis_title="- Population Change",
    legend_title_text="Year of observation",
    width=1000,
)
fig_pop_change.show()
# Split population and population change by age group

# Right-inclusive bin edges: (-1, 4], (4, 9], ..., (89, 94], (94, 100].
_AGE_BIN_EDGES = [-1, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 100]
# Use the conventional labels from ISTAT, one per bin.
_ISTAT_AGE_CODES = [
    'Y_UN4', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
    'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y_GE95',
]


def _sum_by_age_group(df_by_age):
    """Aggregate a table indexed by age into one row per age group.

    Args:
        df_by_age: DataFrame whose index holds the ages (assumed numeric -
            confirm against Notebook #14). It is not modified.

    Returns:
        A new DataFrame with the same columns, summed within each age bin and
        indexed by the labels from ``age_group_labels``.
    """
    grouped = df_by_age.copy()
    grouped['age_group'] = pd.cut(grouped.index, bins=_AGE_BIN_EDGES, labels=_ISTAT_AGE_CODES)
    grouped = grouped.groupby('age_group').sum()
    # Switch from the ISTAT codes to the sortable labels used by the deaths table.
    grouped.index = grouped.index.map(age_group_labels)
    return grouped


# Population by age group, without the first 8 columns
# (presumably years not needed for the comparison with the deaths data - confirm).
dfpg = _sum_by_age_group(dfp).iloc[:, 8:]
display(dfpg)

# Population change by age group, all columns kept.
dfpcg = _sum_by_age_group(dfpc)
display(dfpcg)
# Death rate by age group = deaths / population
dfdg_perc = dfdg / dfpg
# Keep only the columns (years) that have at least one non-missing ratio.
dfdg_perc = dfdg_perc.loc[:, dfdg_perc.notna().any(axis=0)]
# Optional tabular view:
# with pd.option_context('display.float_format', '{:.2f}%'.format):
# display(dfdg_perc*100)

fig_death_rate = px.line(
    dfdg_perc,
    x=dfdg_perc.index,
    y=dfdg_perc.columns,
    color_discrete_sequence=get_colors(dfdg_perc.columns.size),
    title="Percentage of Deaths over Population by Age Group",
)
# Overlay the mean across all years as a dashed black line.
average_death_rate = go.Scatter(
    x=dfdg_perc.index,
    y=dfdg_perc.mean(axis=1),
    mode='lines',
    name='Average',
    line=dict(dash='dash', color='black'),
)
fig_death_rate.add_trace(average_death_rate)
fig_death_rate.update_layout(
    xaxis_title="Age Group",
    yaxis_title="Deaths / Population",
    yaxis_tickformat=',.2%',
    legend_title_text="Year of observation",
    width=1000,
    yaxis_type="log",
)
fig_death_rate.show()
# Migration balance by age group = (population change + deaths) / population
dfmg = dfpcg + dfdg
dfmg_perc = dfmg / dfpg
# Keep only the columns (years) that have at least one non-missing ratio.
dfmg_perc = dfmg_perc.loc[:, dfmg_perc.notna().any(axis=0)]
# Optional tabular view:
# with pd.option_context('display.float_format', '{:.2f}%'.format):
# display(dfmg_perc*100)

fig_migration = px.line(
    dfmg_perc,
    x=dfmg_perc.index,
    y=dfmg_perc.columns,
    color_discrete_sequence=get_colors(dfmg_perc.columns.size),
    title="Percentage of Migration Balance related to the previous year (compared to Avg Deaths)",
)
# Overlay the average death rate (dashed black) as a reference for the comparison.
average_death_rate_ref = go.Scatter(
    x=dfdg_perc.index,
    y=dfdg_perc.mean(axis=1),
    mode='lines',
    name='Average Deaths/Pop',
    line=dict(dash='dash', color='black'),
)
fig_migration.add_trace(average_death_rate_ref)
fig_migration.update_layout(
    xaxis_title="Age Group",
    yaxis_title="Migration Balance / Population",
    yaxis_tickformat=',.2%',
    legend_title_text="Year of observation",
    width=1000,
    yaxis={"range": [-0.02, 0.02]},
)
fig_migration.show()
Conclusions
- This last plot proves that the trade-off age of 50 I will use in the next Notebook (and that I guessed) is indeed a good choice: UNDER that age the change-of-population is mainly due to migrations and OVER that age it is mainly due to deaths.
- Still, in the range 50-60 the two reasons for the change-of-population are comparable, which is a source of error for the extrapolations of improving health care and migration.
- It does not make much sense to use the death data directly (I would miss the 2004-2010 data); it is better to use them to infer some qualitative estimates that help treat the change-of-population data better.
- The oldest groups show some artifacts: Y90-94 is indeed a small problem (ca 1-2% vs 20% mortality rate, i.e., 5-10% error) and Y95+ is expected to be problematic due to the counting of >100 years old people.
NOTE: In 2020 the change-of-population is weird, but this is an artifact of a change of methodology in the data collection, which can better account for (1) those residents that are not registered and (2) those Italians who are living abroad.
Basically from this 2020 change it emerged that:
- more children (<15y) were resident in Italy
- more adults (15-70) were practically abroad (i.e., not resident in Italy)
- more elderly (>70) were resident in Italy
Follow-up
- I could maybe use these death data to estimate future deaths among younger population (<50 years old) but I'm not sure this will make much difference in the final results.
- As for migrants, I could extrapolate some kind of bell-shaped distribution of migrants, by age, as it visually appears in the last graph, centered around 10-50 years old.