Investigating Italian Population¶

AIM: make the best of data available from ISTAT.

In particular, I want to have:

the number of residents each year, by age (2D table: AGE x YEAR_OF_OBSERVATION)
compute the delta of residents each year, by age (2D table: AGE x YEAR_OF_OBSERVATION)
compute the Cohort Change Ratio (CCR) each year, by age (2D table: AGE x YEAR_OF_OBSERVATION)

import time
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import sdmx
import warnings 

# SDMX client bound to the "ISTAT" data source.
client = sdmx.Client("ISTAT")
# Default plotly renderer used when a figure is shown.
pio.renderers.default = 'vscode+notebook'
# NOTE(review): this suppresses every Python warning for the whole session,
# not only the noisy ones; consider narrowing the filter.
warnings.filterwarnings('ignore')
requests.urllib3.disable_warnings() # avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host 'sdmx.istat.it'. Adding certificate verification is strongly advised"

def get_colors(n, cmap_name="rainbow"):
    """Return ``n`` evenly spaced colormap colors as CSS ``rgba(...)`` strings.

    The strings are meant for plotly express discrete-color arguments.

    Args:
        n: Number of colors to sample across the whole colormap.
        cmap_name: Name of a registered matplotlib colormap.

    Returns:
        A list of ``n`` strings of the form ``"rgba(R, G, B, 1.0)"`` with
        integer channels in the 0-255 range. Alpha is always fully opaque.
    """
    cmap = matplotlib.colormaps[cmap_name]
    colors_str = []
    for position in np.linspace(0, 1, n):
        red, green, blue = cmap(position)[:3]
        # The colormap yields floats in [0, 1]; CSS channels span 0-255, so
        # scale by 255 (not 250) and round instead of truncating.
        channels = [int(round(channel * 255)) for channel in (red, green, blue)]
        colors_str.append(f"rgba({channels[0]}, {channels[1]}, {channels[2]}, 1.0)")
    return colors_str

I need to put together data from the RICPOPRES datasets (reconstructed resident population) and the updated POPRES dataset (resident population since 2020).

For further information about these datasets check Notebook#00

# Download the population on the first of January from every ISTAT dataset
# listed below; the resulting dataframes are collected in `dfs`.
all_popres_ids = [
    "164_346_DF_DCIS_RICPOPRES1971_1", # 1952-1971
    "164_347_DF_DCIS_RICPOPRES1981_1", # 1972-1981
    "164_279_DF_DCIS_RICPOPRES1991_1", # 1982-1991
    "164_305_DF_DCIS_RICPOPRES2001_1", # 1992-2001
    "164_164_DF_DCIS_RICPOPRES2011_1", # 2001-2019
    "22_289_DF_DCIS_POPRES1_1", # 2019-latest
]
# Dimensions that exist in a single dataset only, with the value to select.
dataset_specific_keys = {
    "164_164_DF_DCIS_RICPOPRES2011_1": {"CITIZENSHIP": "TOTAL"},
    "22_289_DF_DCIS_POPRES1_1": {"MARITAL_STATUS": "99"},
}
dfs = []
last_position = len(all_popres_ids) - 1
for position, dataset_id in enumerate(all_popres_ids):
    query = {
        "FREQ": "A",
        "REF_AREA": "IT",
        "DATA_TYPE": "JAN",
        "AGE": [],  # no restriction: all ages
        "SEX": [],  # no restriction: all sexes (9 is total)
        **dataset_specific_keys.get(dataset_id, {}),
    }
    # Each request takes about 30 sec.
    frame = sdmx.to_pandas(client.data(resource_id=dataset_id, key=query)).reset_index()
    dfs.append(frame)
    print(f"Dataset {dataset_id} has {frame.shape[0]} rows and {frame.shape[1]} columns.")
    if position != last_position:  # no pause needed after the final dataset
        time.sleep(120)

Dataset 164_346_DF_DCIS_RICPOPRES1971_1 has 6120 rows and 7 columns.
Dataset 164_347_DF_DCIS_RICPOPRES1981_1 has 3060 rows and 7 columns.
Dataset 164_279_DF_DCIS_RICPOPRES1991_1 has 3060 rows and 7 columns.
Dataset 164_305_DF_DCIS_RICPOPRES2001_1 has 3060 rows and 7 columns.
Dataset 164_164_DF_DCIS_RICPOPRES2011_1 has 5511 rows and 8 columns.
Dataset 22_289_DF_DCIS_POPRES1_1 has 2448 rows and 8 columns.

Population is split by single year of age, where "0" means newborns (less than 1 year old), "1" means 1 year old, and so on. The last group is "100", which means 100 or more.

# Stack every downloaded dataset and keep only the columns used downstream.
raw = pd.concat(dfs, ignore_index=True)[["TIME_PERIOD", "AGE", "SEX", "value"]]

# Drop the rows whose AGE is the 'TOTAL' aggregate, then rewrite the open-ended
# "100 or more" code so that every age code ends with a plain number.
per_age = raw.query("AGE!='TOTAL'").replace("Y_GE100", "Y100")
per_age = per_age.assign(
    AGE=per_age["AGE"].str.split("Y").str[-1].astype(int),
    YEAR=per_age["TIME_PERIOD"],  # "at the beginning (January 1st) of the year"
)

# Cast everything to integers before translating the numeric sex codes.
sex_labels = {9: "T", 1: "M", 2: "F"}
as_int = per_age[["YEAR", "AGE", "SEX", "value"]].astype(int)
as_int["SEX"] = as_int["SEX"].map(sex_labels)

dfp_long = (
    as_int
    .drop_duplicates()  # 2019 appears in two datasets
    .sort_values(["AGE", "YEAR", "SEX"])
    .reset_index(drop=True)
)
display(dfp_long)

# Animated line chart: population by age, one line per sex, one frame per year.
fig = px.line(
    dfp_long, # already in long format (YEAR, AGE, SEX, value columns)
    x="AGE", 
    y="value", 
    color="SEX",
    color_discrete_map={"T": "black", "M": "blue", "F": "red"},
    title="Total population in Italy by age (TOTAL)", # overridden by title=None below
    animation_frame="YEAR",
    markers=False,
)
fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Population by age group",
    yaxis_range=[0, 1e6], # fixed range, so the axis does not change between frames
    title=None,
    legend_title="Sex",
    margin=dict(l=10, r=10, t=10, b=10),
    width=780,
    height=420,
)

# Initialize the animation at the last frame (current year)
last = fig.frames[-1]
# Add any trace that the last frame has beyond the figure's base traces.
for extra in last.data[len(fig.data):]:
    fig.add_trace(extra)
# Overwrite each base trace with the matching trace of the last frame.
for i, tr in enumerate(last.data):
    payload = tr.to_plotly_json() if hasattr(tr, "to_plotly_json") else dict(tr)
    fig.data[i].update(**payload)
# Move the slider handle to the last frame as well, when a slider exists.
if "sliders" in fig.layout and fig.layout.sliders:
    fig.layout.sliders[0].active = len(fig.frames) - 1

fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.write_html("../images_output/pop_by_age.html", auto_play=False)
print("Total population in Italy by age")
fig.show()

Total population in Italy by age

# Pivot the table by year (of observation) vs age, and visualize it as a heatmap
dfp = (
    dfp_long
    .query("SEX=='T'") # only total population
    .pivot(index='AGE', columns='YEAR', values='value') # make an age x year table
)
dfp.to_csv("../data/pop_by_age_year.csv", index=True)
# Build the figure first and show it separately, so that `fig` holds the
# Figure rather than the None returned by `.show()`.
fig = px.imshow(
    dfp, labels=dict(x="YEAR", y="AGE", color="Population"), aspect="auto",
).update_layout(width=1000)
fig.show()

By sliding from 2002 to 2022 we can see that the population age is quite rigidly shifting: there is some sort of equilibrium between the different age groups, with the exception of the newborns, which are decreasing in number.

We can observe some characteristics of the population age distribution. Let's refer to 2002:

World War I was fought by Italian soldiers between 1915 and 1918. The well we see between the ages of 82 and 85 (born in 1917-1920) may reflect the precarious conditions of the Italian population during and right after the war, leading to fewer births (or more child deaths).
Same story with WWII: the well is between 56 and 60, reflecting the drop of births in 1942-1946
We could also consider adult soldiers who died in the wars: data is too scarce to capture the effect of WWI, but for WWII we can consider the estimated ~0.5M deaths among soldiers and civilians to be spread over the ages 75 to 87, i.e., considering people who were between 18 and 30 years old at the time of the conflict. We cannot see a clear effect of this in the data, at least not as evident as the two wells highlighted in the previous points.
Baby boomers show a clear peak for births in 1965-1970, leading to an abundance of people who were between 32 and 37 years old in 2002.

Now I'll split the population into 5-year-wide age groups. This is necessary for me to (1) use DCIS_DECESSI data and (2) have larger groups / less noise.

Let's now see how the population changes as it ages, year by year:

e.g., compare the amount of people that were 80 in a certain year and that are 81 in the next year
this is called the "Cohort Change Ratio" $\mathrm{ccr} = f(\mathrm{age}, \mathrm{year})$
we can expect that the older the cohort, the more negative the ratio, since fewer people live another year

NOTE: ccr(year) refers to the difference between January 1st of year, and January 1st of year+1.

# Both tables share the same layout: one column per observation year (the last
# year is left out because it has no following year) and ages 1..99 as rows.
cohort_years = dfp.columns.tolist()[:-1]
cohort_ages = pd.Index(range(1, 100), name="age")
dfpc = pd.DataFrame(columns=cohort_years, index=cohort_ages)
dfpcr = pd.DataFrame(columns=cohort_years, index=cohort_ages)

for year in cohort_years:
    # Population that was one year younger at the start of `year`.
    one_year_younger = dfp[year].shift(1).to_numpy()
    # Same cohort one year later, minus its size at the start of `year`.
    delta = dfp[year + 1] - one_year_younger
    dfpc[year] = delta.dropna().astype(int)
    dfpcr[year] = delta / one_year_younger

dfpc.to_csv("../data/cc_by_age_year.csv", index=True)
dfpcr.to_csv("../data/ccr_by_age_year.csv", index=True)

for caption, table in (("Cohort Change", dfpc), ("Cohort Change Ratio", dfpcr)):
    print(caption)
    display(table)

Cohort Change

Cohort Change Ratio

# One line per observation year: age on the x axis, cohort change on the y axis.
cc_layout = {
    "xaxis_title": "Age",
    "yaxis_title": "Cohort Change",
    "legend_title": "Year",
    "title": None,
    "margin": {"l": 10, "r": 10, "t": 20, "b": 10},
    "width": 780,
    "height": 320,
}
fig = px.line(dfpc, x=dfpc.index, y=dfpc.columns)
fig.update_layout(**cc_layout)
print("Cohort Change, on each year of observation")
fig.write_html("../images_output/cohort_change.html")
fig.show()

Cohort Change, on each year of observation

# Inspect the cohort change ratio for the columns labelled 2000 and later (all rows)
dfpcr.loc[:,2000:]

# Plot from 1992 on: before this year, data is noisy due to the datasets' intersection
dfpcr_plot = dfpcr.loc[:, 1992:]
ccr_layout = {
    "xaxis_title": "Age",
    "yaxis_title": "Cohort Change Ratio",
    "yaxis_tickformat": ",.0%",  # show the ratio as a whole-number percentage
    "legend_title": "Year",
    "title": None,
    "margin": {"l": 10, "r": 10, "t": 20, "b": 10},
    "width": 780,
    "height": 320,
}
fig = px.line(dfpcr_plot, x=dfpcr_plot.index, y=dfpcr_plot.columns)
fig.update_layout(**ccr_layout)
print("Cohort Change Ratio, on each year of observation")
fig.write_html("../images_output/cohort_change_ratio.html")
fig.show()

Cohort Change Ratio, on each year of observation

As expected we see that the decrease of people is very modest (<2%) till the age of 70.

Then, visually, an 80-year-old has a 3-5% chance of not surviving the year, which increases to 13-17% at 90 years old.

Each year is shown separately, but we cannot see a clear trend in the data with respect to the year of measurement, except for 2021, which carries the death toll of COVID-19:
here we can see that CCR dropped visually for 20-70 years old, but the drop is not that significant for 70+ elder people, where the noise due to the year is more significant than the drop due to the COVID-19.

The drop from Jan 2021 to Jan 2022 is also localized to 20-30 years old, becoming visibly indistinguishable for older people.

# Heatmap of the cohort change ratio table: age on the rows, year of observation on the columns
heatmap_labels = {"x": "Year of Observation", "y": "Age", "color": "%Pop.Growth"}
fig = px.imshow(dfpcr_plot, labels=heatmap_labels, aspect="auto")
fig.update_layout(width=1000)
fig.show()

Conclusions¶

Identified the trends of (de-)growth of the Italian population from 2002 to 2022, including the role of immigrants
Identified the distribution of population by age, and the ageing of the baby boomers
Identified the Cohort Change Ratio (ccr)

Follow-up¶

Check if the reason of the ccr drop for elder people in 2003, 2005, 2012, 2015 is related to heatwaves the year before
Make a model of the ccr of population by age, to extrapolate the trends of future years
Check the evolution of the ratio between people in working and retirement age in the next years, testing different scenarios

	YEAR	AGE	SEX	value
0	1952	0	F	398977
1	1952	0	M	418500
2	1952	0	T	817477
3	1953	0	F	399947
4	1953	0	M	418434
...	...	...	...	...
22720	2025	100	M	3976
22721	2025	100	T	22652
22722	2026	100	F	20378
22723	2026	100	M	4332
22724	2026	100	T	24710

	1952	1953	1954	1955	1956	1957	1958	1959	1960	1961	...	2016	2017	2018	2019	2020	2021	2022	2023	2024	2025
age
1	-19264	-16899	-14349	-15753	-13204	-17036	-14296	-13892	-14831	-21494	...	-2	456	680	5778	8295	2678	3626	3273	3676	2697
2	-5631	-3137	-5152	-6189	-5753	-5739	-4469	-3533	-5281	-6564	...	-607	-165	401	1568	6786	1914	3613	3320	3266	2877
3	-4713	-1469	-2240	-4481	-4455	-5673	-2995	-4522	-3290	-4443	...	-1548	-456	136	883	6925	1067	3268	3334	2798	3238
4	-3678	-1708	-298	-1541	-3246	-5324	-3224	-1766	-3113	-3615	...	-1120	-1217	-404	823	8779	514	2736	3049	2741	3259
5	-3704	-1559	-2190	-810	-2741	6830	-3256	-2982	-2207	-4282	...	-859	-968	-234	904	9549	-1791	2387	3066	2945	3085
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
95	-188	-1599	-3424	-216	-1337	-262	-2357	-1074	-2107	-2316	...	-14840	-15861	-15761	-15847	-18591	-17158	-19053	-17151	-17243	-17293
96	-50	-1303	-2697	-176	-986	-95	-1835	-784	-1595	-1760	...	-11717	-12847	-12390	-13436	-15356	-14058	-15423	-14415	-14126	-13403
97	9	-1006	-2078	-157	-669	-62	-1393	-585	-1161	-1292	...	-6310	-10317	-9865	-10534	-12431	-11122	-12382	-11261	-11248	-10236
98	16	-761	-1577	-118	-444	-76	-1006	-430	-825	-890	...	-3639	-5265	-7657	-8282	-9189	-8488	-9335	-8206	-8161	-7871
99	6	-537	-1086	-75	-277	-49	-659	-270	-551	-563	...	-2997	-2918	-3710	-6229	-6925	-6371	-7125	-5997	-6220	-5909

	1952	1953	1954	1955	1956	1957	1958	1959	1960	1961	...	2016	2017	2018	2019	2020	2021	2022	2023	2024	2025
age
1	-0.023565	-0.020649	-0.017518	-0.018701	-0.015675	-0.020137	-0.016855	-0.016448	-0.016947	-0.024426	...	-0.000004	0.000970	0.001494	0.013406	0.019989	0.006613	0.009051	0.008309	0.009658	0.007280
2	-0.006716	-0.003930	-0.006428	-0.007691	-0.006960	-0.006921	-0.005391	-0.004237	-0.006357	-0.007630	...	-0.001215	-0.000342	0.000852	0.003441	0.015536	0.004522	0.008863	0.008213	0.008223	0.007486
3	-0.005533	-0.001764	-0.002817	-0.005627	-0.005579	-0.006911	-0.003637	-0.005485	-0.003962	-0.005383	...	-0.003035	-0.000914	0.000282	0.001875	0.015143	0.002405	0.007686	0.008107	0.006865	0.008086
4	-0.004052	-0.002016	-0.000358	-0.001944	-0.004099	-0.006704	-0.003955	-0.002153	-0.003797	-0.004371	...	-0.002106	-0.002393	-0.000810	0.001704	0.018612	0.001107	0.006153	0.007116	0.006611	0.007942
5	-0.004110	-0.001724	-0.002590	-0.000975	-0.003464	0.008661	-0.004128	-0.003673	-0.002696	-0.005242	...	-0.001593	-0.001824	-0.000461	0.001814	0.019739	-0.003728	0.005136	0.006853	0.006825	0.007392
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
95	-0.043519	-0.286971	-0.626418	-0.069231	-0.312675	-0.059845	-0.406590	-0.211918	-0.366881	-0.435011	...	-0.223514	-0.232952	-0.223105	-0.221649	-0.253108	-0.234357	-0.252341	-0.229925	-0.222962	-0.201386
96	-0.016480	-0.315344	-0.678832	-0.086190	-0.339532	-0.032324	-0.445821	-0.227907	-0.399349	-0.484048	...	-0.237470	-0.249195	-0.237238	-0.244812	-0.275944	-0.256252	-0.275140	-0.255350	-0.245913	-0.223038
97	0.004423	-0.337131	-0.734535	-0.123041	-0.358521	-0.032325	-0.489803	-0.256466	-0.437123	-0.538558	...	-0.259992	-0.274213	-0.254863	-0.264434	-0.299925	-0.276028	-0.303466	-0.277146	-0.267574	-0.236304
98	0.012261	-0.372309	-0.797270	-0.157124	-0.396783	-0.063492	-0.542026	-0.296347	-0.486439	-0.595318	...	-0.274724	-0.293151	-0.280404	-0.287151	-0.313596	-0.292528	-0.320010	-0.288740	-0.277859	-0.255643
99	0.007538	-0.406510	-0.846454	-0.187032	-0.437599	-0.072593	-0.587868	-0.317647	-0.539667	-0.646383	...	-0.295504	-0.303737	-0.292241	-0.316997	-0.336819	-0.316760	-0.347087	-0.302329	-0.307708	-0.278595

	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	...	2016	2017	2018	2019	2020	2021	2022	2023	2024	2025
age
1	-0.003411	-0.008132	0.006826	0.004413	0.006035	0.001015	0.003163	0.005646	0.004139	-0.001451	...	-0.000004	0.000970	0.001494	0.013406	0.019989	0.006613	0.009051	0.008309	0.009658	0.007280
2	-0.004874	-0.002936	0.007356	0.008481	0.005279	0.005299	0.002290	0.005925	0.004664	0.002302	...	-0.001215	-0.000342	0.000852	0.003441	0.015536	0.004522	0.008863	0.008213	0.008223	0.007486
3	-0.006728	-0.004784	0.005143	0.008218	0.008478	0.005377	0.004232	0.004735	0.005479	0.003370	...	-0.003035	-0.000914	0.000282	0.001875	0.015143	0.002405	0.007686	0.008107	0.006865	0.008086
4	-0.002946	-0.006053	0.007120	0.006060	0.008971	0.008563	0.003704	0.007164	0.003778	0.003051	...	-0.002106	-0.002393	-0.000810	0.001704	0.018612	0.001107	0.006153	0.007116	0.006611	0.007942
5	-0.000301	-0.002531	0.007592	0.008182	0.006784	0.009140	0.006976	0.007593	0.005785	0.002272	...	-0.001593	-0.001824	-0.000461	0.001814	0.019739	-0.003728	0.005136	0.006853	0.006825	0.007392
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
95	-0.228856	-0.233824	-0.245960	-0.263653	-0.233332	-0.247680	-0.231303	-0.232070	-0.231649	-0.215476	...	-0.223514	-0.232952	-0.223105	-0.221649	-0.253108	-0.234357	-0.252341	-0.229925	-0.222962	-0.201386
96	-0.241189	-0.221014	-0.261482	-0.280408	-0.255955	-0.271404	-0.251054	-0.253212	-0.256484	-0.239686	...	-0.237470	-0.249195	-0.237238	-0.244812	-0.275944	-0.256252	-0.275140	-0.255350	-0.245913	-0.223038
97	-0.267253	-0.237589	-0.280671	-0.305668	-0.270721	-0.298379	-0.272986	-0.278773	-0.274407	-0.259692	...	-0.259992	-0.274213	-0.254863	-0.264434	-0.299925	-0.276028	-0.303466	-0.277146	-0.267574	-0.236304
98	-0.265471	-0.267331	-0.303980	-0.330253	-0.290067	-0.316115	-0.297278	-0.292068	-0.296714	-0.286154	...	-0.274724	-0.293151	-0.280404	-0.287151	-0.313596	-0.292528	-0.320010	-0.288740	-0.277859	-0.255643
99	-0.264127	-0.296821	-0.312372	-0.349601	-0.315150	-0.331922	-0.318625	-0.328176	-0.312943	-0.307879	...	-0.295504	-0.303737	-0.292241	-0.316997	-0.336819	-0.316760	-0.347087	-0.302329	-0.307708	-0.278595