Investigating the best way to get Deaths numbers for Italy (Part 2)¶

NO NEED TO UPDATE THIS NOTEBOOK, as it is just for investigation purposes

AIM: for each AGE and YEAR_OF_OBSERVATION, we want the most reliable estimate of the number of deaths in Italy.

import warnings 
from istatapi import discovery, retrieval
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = 'vscode+notebook'
warnings.filterwarnings('ignore')
requests.urllib3.disable_warnings() # avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host 'sdmx.istat.it'. Adding certificate verification is strongly advised"

def get_colors(n, cmap_name="rainbow"):
    """Get colors for px colors_discrete argument, given the number of colors needed, n."""
    cmap = matplotlib.colormaps[cmap_name]
    colors = [cmap(i) for i in np.linspace(0, 1, n)]  # Generate colors
    colors_str = [f"rgba({int(color[0]*250)}, {int(color[1]*250)}, {int(color[2]*250)}, 1.0)" for color in colors]
    return colors_str

age_group_labels = {k: v for k, v in zip( # Rename to more conventional labels for sorting
    ['Y_UN4', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
     'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y_GE95'],
    ['Y00-04', 'Y05-09', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
     'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y95+']
)}

dfp=pd.read_csv("../data/pop_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfpc=pd.read_csv("../data/cc_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14

Now that I know from previous Notebook that DCIS_DECESSI is the most reliable source of data for Deaths in Italy,

try to find a way to convert age_groups to ages.

ds = discovery.DataSet(dataflow_identifier="DCIS_DECESSI") 
ds.set_filters(
    freq="A", 
    #eta="TOTAL", 
    paese_cittad="TOTAL", itter107="IT", 
    paese_nasc="X1033", # mondo
    sesso="9", stato_civ="99", tipo_dato="DEATH", titolo_studio="99"
)
df6 = retrieval.get_data(ds)
df6.loc[:, lambda dfx: (~dfx.isna()).any(axis=0)]

# convert to table: year x age_group
dfdg = (
    df6
    .query("ETA!='TOTAL'")
    .assign(age_group= lambda x: x["ETA"].map(age_group_labels)) # rename to my convenient age group labels (better for sorting)
    .assign(year= lambda x: x["TIME_PERIOD"].dt.year)
    .groupby(["age_group", "year"], as_index=False)["OBS_VALUE"].sum()
    .rename(columns={"OBS_VALUE": "deaths"})
    .pivot(index="age_group", columns="year", values="deaths")
)
dfdg

# equally divide the age
ageg2age = { x: range(5*i, 5*(i+1)) for i, x in enumerate(dfdg.index) }
ageg2age_df = pd.DataFrame(ageg2age.items(), columns=["age_group", "age"]).explode("age")
dfd1 = (
    pd.merge(dfdg, ageg2age_df, on="age_group")
    .drop(columns="age_group")
    .set_index("age")
    .sort_index(ascending=True)
    .divide(5)
    .astype(int)
)
dfd1

px.line(
    data_frame=dfd1,
    x=dfd1.index,
    y=dfd1.columns,
    color_discrete_sequence=get_colors(n=dfd1.columns.size),
    title="Method: equally divide deaths from DCIS_DECESSI for all ages of each age group",
).update_layout(
    xaxis_title="Age",
    yaxis_title="Deaths",
    legend_title_text="Year of observation",
    width=1000, 
).show()

Clearly too simplistic: there are evident steps that will overestimate deaths at the beginning of the age range and underestimate them at the end of the age range.

Compare with change (delta) population:

px.line(
    data_frame=-dfpc,
    x=dfpc.index,
    y=dfpc.columns,
    color_discrete_sequence=get_colors(n=dfpc.columns.size),
    title="Population Change by age, on each year of observation",
).update_layout(
    xaxis_title="Age",
    yaxis_title="- Population Change",
    legend_title_text="Year of observation",
    width=1000, 
).show()

# Split population an population change by age group
dfpg = dfp.copy()
dfpg['age_group'] = pd.cut(
    dfpg.index,
    bins=[-1, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 100],
    labels=[ # Use the conventional labels from ISTAT
        'Y_UN4', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
        'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y_GE95'
    ]
)
dfpg = dfpg.groupby('age_group').sum()
dfpg.index = dfpg.index.map(age_group_labels)
dfpg = dfpg.iloc[:, 8:]
display(dfpg)

dfpcg = dfpc.copy()
dfpcg['age_group'] = pd.cut(
    dfpcg.index,
    bins=[-1, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 100],
    labels=[ # Use the conventional labels from ISTAT
        'Y_UN4', 'Y5-9', 'Y10-14', 'Y15-19', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-44', 'Y45-49',
        'Y50-54', 'Y55-59', 'Y60-64', 'Y65-69', 'Y70-74', 'Y75-79', 'Y80-84', 'Y85-89', 'Y90-94', 'Y_GE95'
    ]
)
dfpcg = dfpcg.groupby('age_group').sum()
dfpcg.index = dfpcg.index.map(age_group_labels)
display(dfpcg)

# percentage of deaths by age group =deaths / population
dfdg_perc = dfdg / dfpg
dfdg_perc = dfdg_perc.loc[:, lambda dfx: (~dfx.isna()).any(axis=0)]
# with pd.option_context('display.float_format', '{:.2f}%'.format):
#     display(dfdg_perc*100)

px.line(
    data_frame=dfdg_perc,
    x=dfdg_perc.index,
    y=dfdg_perc.columns,
    color_discrete_sequence=get_colors(n=dfdg_perc.columns.size),
    title="Percentage of Deaths over Population by Age Group",
).add_trace(
    go.Scatter(
        x=dfdg_perc.index,
        y=dfdg_perc.mean(axis=1),
        mode='lines',
        name='Average',
        line=dict(dash='dash', color='black')
    )
).update_layout(
    xaxis_title="Age Group",
    yaxis_title="Deaths / Population",
    yaxis_tickformat=',.2%',
    legend_title_text="Year of observation",
    width=1000,
    yaxis_type="log",
).show()

# percentage of migrants by age group = change - deaths / population
dfmg = (dfpcg + dfdg) 
dfmg_perc =  dfmg / dfpg
dfmg_perc = dfmg_perc.loc[:, lambda dfx: (~dfx.isna()).any(axis=0)]
# with pd.option_context('display.float_format', '{:.2f}%'.format):
#     display(dfmg_perc*100)
    
px.line(
    data_frame=dfmg_perc,
    x=dfmg_perc.index,
    y=dfmg_perc.columns,
    color_discrete_sequence=get_colors(n=dfmg_perc.columns.size),
    title="Percentage of Migration Balance related to the previous year (compared to Avg Deaths)",
).add_trace(
    go.Scatter(
        x=dfdg_perc.index,
        y=dfdg_perc.mean(axis=1),
        mode='lines',
        name='Average Deaths/Pop',
        line=dict(dash='dash', color='black')
    )
).update_layout(
    xaxis_title="Age Group",
    yaxis_title="Migration Balance / Population",
    yaxis_tickformat = ',.2%',
    legend_title_text="Year of observation",
    width=1000, 
    yaxis=dict(range=[-0.02, 0.02]),
).show()

Conclusions¶

This last plot proves that the trade-off age of 50 I will use in next Notebook (and that I guessed) is indeed a good choice: UNDER that age the change-of-population in mainly due to migrations and OVER that age it is mainly due to deaths.
Still, in the range 50-60 the two reason of change-of-population are comparable, which is some source of the error for extrapolations of improving health care and migration.
It does not make much sense to use death data directly (I will miss 2004-2010 data) but to infer some qualitative estimates to treat the change-of-population data better.
Oldest grops show some artifacts: Y90-94 is indeed a small problem (ca 1-2% vs 20% mortality rate, i.e., 5-10% error) and the Y95+ is expectet to be problematic due to the counting of >100 years old people.

NOTE: In 2020 the change-of-population is weird but this is an artifact of a change of methodology in the data collection which can better account (1) those residents that are not registered and (2) those Italians who are living abroad.

Basically from this 2020 change it emerged that:

more children (<15y) were resident in Italy

more adults (15-70) were practically abroad (i.e., not resident in Italy)

more elderly (>70) were resident in Italy

Follow-up¶

I could maybe use these death data to estimate future deaths among younger population (<50 years old) but I'm not sure this will make much difference in the final results.
As for migrants, I could extrapolate some kind of bell-shape distribution of migrants, by age, as it visually appers in the last graph, centered around 10-50 years old.

year	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022	2023
age_group
Y00-04	2004	1911	1815	1717	1678	1563	1483	1487	1239	1135	1101	1079	1159
Y05-09	242	202	207	206	181	219	192	198	149	149	150	165	194
Y10-14	282	250	241	241	232	262	261	247	210	218	227	211	256
Y15-19	730	648	610	565	648	569	561	596	550	489	573	589	574
Y20-24	1154	1019	959	877	901	874	872	840	814	737	769	821	837
Y25-29	1270	1181	1149	1069	1047	1024	1001	1062	995	896	1012	959	985
Y30-34	1764	1619	1560	1410	1452	1365	1324	1342	1231	1227	1316	1311	1291
Y35-39	2864	2877	2659	2538	2350	2237	2055	2121	1915	1957	2017	1893	1918
Y40-44	4765	4725	4555	4304	4308	4153	4084	3952	3743	3661	3642	3501	3302
Y45-49	7983	7984	7767	7387	7662	7103	7167	6911	6675	6962	6931	6592	6296
Y50-54	11239	11315	11343	11379	11926	11646	11480	11598	11105	12039	11930	11174	10883
Y55-59	16046	16096	15924	15781	16525	16173	16282	16435	16499	18372	18683	18189	17786
Y60-64	25861	25313	24067	23198	23995	22398	23211	23140	22919	26380	26299	25264	24618
Y65-69	33930	34985	35060	35631	37615	36488	35704	34076	32725	37688	37415	35754	34577
Y70-74	53831	54122	52089	49874	49951	47379	49737	49587	50459	61973	59726	56056	52107
Y75-79	78610	78818	76831	75696	80580	76350	79492	75479	72840	83204	78321	79310	76556
Y80-84	112457	115208	110882	109415	116284	107560	112299	108840	109518	131356	122810	123404	114422
Y85-89	126291	131093	127394	125989	139013	132045	140863	135177	136600	158488	146985	151705	142287
Y90-94	70261	82190	89084	97406	109970	103969	112295	109421	110500	130669	121974	131257	120585
Y95+	41843	41327	36548	33681	41253	41884	48698	50624	53731	62717	59465	65843	60432

	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022	2023
age
0	400	382	363	343	335	312	296	297	247	227	220	215	231
1	400	382	363	343	335	312	296	297	247	227	220	215	231
2	400	382	363	343	335	312	296	297	247	227	220	215	231
3	400	382	363	343	335	312	296	297	247	227	220	215	231
4	400	382	363	343	335	312	296	297	247	227	220	215	231
...	...	...	...	...	...	...	...	...	...	...	...	...	...
95	8368	8265	7309	6736	8250	8376	9739	10124	10746	12543	11893	13168	12086
96	8368	8265	7309	6736	8250	8376	9739	10124	10746	12543	11893	13168	12086
97	8368	8265	7309	6736	8250	8376	9739	10124	10746	12543	11893	13168	12086
98	8368	8265	7309	6736	8250	8376	9739	10124	10746	12543	11893	13168	12086
99	8368	8265	7309	6736	8250	8376	9739	10124	10746	12543	11893	13168	12086

	2010	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021	2022	2023	2024	2025
age_group
Y00-04	2822037	2814758	2783769	2761737	2707604	2639830	2563779	2491284	2414284	2338752	2264538	2216510	2142834	2085244	2031476	1982728
Y05-09	2807843	2809995	2819562	2836532	2846970	2839650	2826949	2793205	2752385	2691273	2627956	2578340	2523317	2464447	2409672	2342876
Y10-14	2785425	2815128	2830915	2836934	2840653	2845693	2832275	2830996	2836003	2841862	2835060	2841695	2823644	2794408	2745119	2693561
Y15-19	2941153	2917378	2896250	2865512	2850419	2847440	2863240	2870747	2879427	2873676	2871056	2857013	2866117	2895108	2921261	2922469
Y20-24	3051364	3075031	3099677	3125714	3099555	3060268	3011433	2982184	2961062	2962307	2955888	2957340	2923341	2938344	2950359	2988760
Y25-29	3396597	3357372	3314664	3278455	3266140	3235616	3218786	3215851	3216640	3175599	3128494	3045579	3001345	2995670	3012211	3051798
Y30-34	4114496	3954616	3809575	3721266	3628726	3548205	3480793	3412512	3346272	3320500	3282441	3226146	3211649	3234123	3213337	3203692
Y35-39	4718939	4689885	4601481	4499274	4374615	4189436	4019133	3872779	3758255	3654733	3572191	3483252	3412872	3366663	3357101	3342755
Y40-44	4891294	4870390	4846233	4833220	4809514	4773914	4725830	4628648	4504958	4372031	4187464	4008570	3868084	3773564	3680853	3615932
Y45-49	4606933	4738991	4838039	4912951	4949356	4922880	4883298	4850613	4818286	4785280	4749765	4692501	4609870	4503977	4380844	4207289
Y50-54	4020325	4115911	4219694	4328297	4457493	4606435	4720066	4808923	4870982	4900974	4876704	4817675	4807237	4791465	4770527	4740799
Y55-59	3690217	3710147	3771147	3848124	3920002	3991998	4071859	4166061	4263608	4387417	4537491	4623965	4735135	4810601	4848988	4826781
Y60-64	3689905	3824379	3754974	3687838	3615804	3612371	3625089	3680986	3750459	3819054	3893350	3946096	4053284	4162290	4292746	4441564
Y65-69	3163420	3051944	3179331	3304792	3441090	3551096	3670374	3599731	3534776	3468709	3471014	3474333	3527442	3601512	3672071	3741720
Y70-74	3037941	3106471	3127001	3104623	3041127	2958073	2848705	2965634	3083893	3215420	3324360	3441171	3368919	3312180	3254150	3254996
Y75-79	2525832	2528778	2537412	2577066	2644574	2716379	2775042	2790572	2767716	2712798	2644013	2547210	2649517	2760935	2887393	2984910
Y80-84	1875153	1931624	1963059	1981859	2010325	2043916	2050515	2063678	2098937	2162715	2231536	2277963	2283923	2263224	2223335	2169220
Y85-89	1129987	1156291	1177107	1201229	1227348	1257686	1299278	1330825	1343948	1367800	1396624	1396102	1401252	1424476	1481572	1543258
Y90-94	302254	352534	408707	458180	510073	547392	554313	571057	583270	599445	616360	629658	640339	639782	654100	681422
Y95+	119201	126874	126588	113706	104529	107219	122955	140448	152608	166328	175183	175094	180012	179188	184115	197647

	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	...	2015	2016	2017	2018	2019	2020	2021	2022	2023	2024
age_group
Y00-04	13873	14428	15437	10952	7358	13019	10107	4070	402	-2773	...	-3381	-3277	-1382	813	9052	30785	6173	13243	12976	10211
Y05-09	18203	20401	21722	23939	19379	27446	21547	13945	8445	6056	...	-2741	-4082	-3770	409	1489	20761	5997	11636	12960	12091
Y10-14	12149	14353	20220	22763	19564	28703	27974	21921	18866	15127	...	-518	-2038	-721	2920	1320	18360	13634	10193	13573	11700
Y15-19	7199	13234	19152	22997	21083	31240	33001	29564	25754	21343	...	9360	14044	24212	15600	11747	10830	12206	21593	25337	21153
Y20-24	5232	43614	33824	16805	18065	67109	63158	46638	45927	37277	...	14324	22543	30960	21663	6009	2340	-18493	34632	31042	42259
Y25-29	26770	91607	75156	35502	25786	78198	65427	43584	49060	42733	...	13275	16450	15864	10890	1464	-26754	-8257	37266	31844	49315
Y30-34	26131	73157	59026	25533	16335	60374	43186	20046	23066	14727	...	7040	8906	7104	10009	303	-31547	4749	37328	31499	36601
Y35-39	23289	55370	46503	22129	16492	51501	35164	19042	19862	9000	...	2718	1871	1268	8117	-368	-17750	5719	28079	26135	23915
Y40-44	8469	35686	29623	12290	10107	36021	28605	16054	17405	6788	...	-3281	-4151	-4199	2526	-3327	-12193	3235	18520	16362	14366
Y45-49	3713	24955	19254	6745	4326	24009	17479	8727	10916	2624	...	-7630	-7808	-7793	-3204	-5978	-16630	6069	10135	9513	3646
Y50-54	-4409	7871	4642	-2248	-2097	11865	10391	4640	5675	-984	...	-12817	-12198	-12328	-9376	-9752	-32039	7007	3051	1758	-5780
Y55-59	-7875	-2904	-3808	-8593	-9567	-5590	-4634	-7042	-5624	-8405	...	-17154	-16847	-16091	-14766	-13361	-43934	5136	-5209	-7238	-15437
Y60-64	-18303	-17931	-17242	-17858	-16515	-15003	-13262	-15254	-16271	-18596	...	-24574	-22910	-23747	-21773	-20008	-51643	-10590	-13825	-17362	-24636
Y65-69	-31040	-31221	-30295	-31644	-30745	-30652	-28232	-28663	-27897	-27634	...	-39136	-36542	-34968	-32631	-30007	-45455	-29016	-26328	-28738	-34341
Y70-74	-55227	-57663	-53848	-53695	-50562	-48880	-47120	-48999	-48842	-47910	...	-49698	-49010	-51372	-51316	-48566	-51928	-62999	-49324	-47503	-50278
Y75-79	-77898	-81497	-76367	-79642	-78009	-79101	-78999	-79715	-76373	-72965	...	-81135	-76849	-79767	-75140	-68269	-72704	-80321	-75141	-73156	-77617
Y80-84	-89449	-107059	-105685	-110489	-107950	-109549	-113571	-113417	-114998	-112910	...	-114105	-106172	-112202	-109451	-106765	-123840	-121742	-118170	-108402	-104702
Y85-89	-97032	-93409	-75360	-82795	-87875	-101769	-116087	-124030	-123076	-128072	...	-140247	-132363	-141602	-136712	-136136	-155438	-147596	-151078	-141196	-139879
Y90-94	-78333	-86798	-77577	-82812	-74247	-69001	-64004	-61972	-66032	-75457	...	-115657	-107600	-117570	-113757	-116365	-137912	-129322	-137124	-126338	-118377
Y95+	-24116	-28310	-26423	-31410	-30985	-34104	-36038	-35407	-34564	-33750	...	-37955	-39503	-47208	-49383	-54328	-62492	-57197	-63318	-57030	-52302

	DATAFLOW	FREQ	ETA	PAESE_CITTAD	ITTER107	PAESE_NASC	SESSO	STATO_CIV	TIPO_DATO	TITOLO_STUDIO	TIME_PERIOD	OBS_VALUE
0	IT1:26_29(1.1)	A	TOTAL	TOTAL	IT	X1033	9	99	DEATH	99	2011-01-01	593427
221	IT1:26_29(1.1)	A	Y75-79	TOTAL	IT	X1033	9	99	DEATH	99	2011-01-01	78610
208	IT1:26_29(1.1)	A	Y70-74	TOTAL	IT	X1033	9	99	DEATH	99	2011-01-01	53831
65	IT1:26_29(1.1)	A	Y20-24	TOTAL	IT	X1033	9	99	DEATH	99	2011-01-01	1154
195	IT1:26_29(1.1)	A	Y65-69	TOTAL	IT	X1033	9	99	DEATH	99	2011-01-01	33930
...	...	...	...	...	...	...	...	...	...	...	...	...
181	IT1:26_29(1.1)	A	Y55-59	TOTAL	IT	X1033	9	99	DEATH	99	2023-01-01	17786
25	IT1:26_29(1.1)	A	Y_GE95	TOTAL	IT	X1033	9	99	DEATH	99	2023-01-01	60432
246	IT1:26_29(1.1)	A	Y80-84	TOTAL	IT	X1033	9	99	DEATH	99	2023-01-01	114422
259	IT1:26_29(1.1)	A	Y85-89	TOTAL	IT	X1033	9	99	DEATH	99	2023-01-01	142287
272	IT1:26_29(1.1)	A	Y90-94	TOTAL	IT	X1033	9	99	DEATH	99	2023-01-01	120585