Compute italian migration¶

AIM: get a migration balance by subracting deaths from delta-residents. I need it to have the migration trend till today, and use it for my model to predict the future.

import warnings 
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import sdmx

client = sdmx.Client("ISTAT")
pio.renderers.default = 'vscode+notebook'
warnings.filterwarnings('ignore')
requests.urllib3.disable_warnings() # avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host 'sdmx.istat.it'. Adding certificate verification is strongly advised"

def get_colors(n, cmap_name="rainbow"):
    """Get colors for px colors_discrete argument, given the number of colors needed, n."""
    cmap = matplotlib.colormaps[cmap_name]
    colors = [cmap(i) for i in np.linspace(0, 1, n)]  # Generate colors
    colors_str = [f"rgba({int(color[0]*250)}, {int(color[1]*250)}, {int(color[2]*250)}, 1.0)" for color in colors]
    return colors_str

dfp=pd.read_csv("../data/pop_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfpc=pd.read_csv("../data/cc_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfdp=pd.read_csv("../data/deathprob_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#30

px.line(
    data_frame=dfpc,
    x=dfpc.index,
    y=dfpc.columns,
    color_discrete_sequence=get_colors(n=dfpc.columns.size),
    title="Population Change by age, on each year of observation",
).update_layout(
    xaxis_title="Age",
    yaxis_title="Population Change",
    legend_title_text="Year of observation",
    width=1000, 
).show()

# Compute the deaths from DEATHPROB*POPULATION and subract these from the change in population
# keep only 2002-today, and ages under 90
MAX_AGE = 80 # Assume there are no migrants above this age (conveniently, because there are numberical problems with the oldest ages)
last_year = dfp.columns[-1]
dfd = (dfp*dfdp).loc[1:MAX_AGE, 2002:]
dfpm = dfd + dfpc.loc[1:MAX_AGE, 2002:]

# get some average values, making a couple of adjustements: remove 2020-2021 (adjsutment in statistics), fix high ages
sspm = dfpm.clip(lower=0).drop(columns=[2020, 2021]).mean(axis=1).astype(int)
sspm.to_csv("../data/average_migration_balance.csv")

fig = go.Figure(   
).add_trace(go.Scatter(
    x=sspm.index,
    y=sspm,
    mode='lines',
    line=dict(color="black", width=5),
    name="Average",
    zorder=1000
)).add_traces([go.Scatter(
    x=dfpm.index, 
    y=dfpm[year],
    mode='lines',
    line=dict(color=color),
    name=str(year)
    ) for year, color in zip(dfpm.columns, get_colors(n=dfpm.columns.size))]
).update_layout(
    xaxis_title="Age",
    yaxis_title="Computed Migrants",
    legend_title_text="Year",
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Computed Migrants by age and year of observations by excluding deaths from population change.")
fig.write_html("../images_output/migrants_by_age.html")
fig.show()
display(dfpm)

Computed Migrants by age and year of observations by excluding deaths from population change.

Get raw data for migration¶

I can not get the same resolution per age, but I can compare with the total migration balance with the sum of the values I computed

# Dataset `28_185` has the following dimensions: 
# [<Dimension FREQ>, <Dimension REF_AREA>, <Dimension DATA_TYPE>, <Dimension CHANGE_OF_RESIDENCE>, 
# <Dimension CITIZENSHIP>, <Dimension SEX>, <Dimension AGE>, <Dimension TERRITORY_NEXT_RESID>, 
# <Dimension COUNTRY_PREV_RESID>, <Dimension COUNTRY_NEXT_RESID>, <TimeDimension TIME_PERIOD>]
df6 = sdmx.to_pandas(
    client.data(
        resource_id="28_185", 
        key={
            "FREQ": "A",
            "REF_AREA": ["IT", "ITTOT"], # PREV_RESID: IT is Italy, ITTOT out of Italy
            #"DATA_TYPE":      
            "CHANGE_OF_RESIDENCE": "FREIGN", # only from/to abroad, not internal migration
            "CITIZENSHIP": ["IT","FOR", "TOTAL"],
            "AGE": [], # I want them all
            "SEX": "9",
            "TERRITORY_NEXT_RESID": ["IT", "ITTOT"], # IT is for immigrants, ITOT is for emigrants
            "COUNTRY_PREV_RESID": "X1033", # all
            "COUNTRY_NEXT_RESID": "X1033", # all
            #"CITIZENSHIP": "TOTAL", # not present in this dataset   
        })
    ).reset_index()
df6

# Create a table with data the way I like it
replace_eta_num = {"Y_UN17": "Y00-17", "Y18-39": "Y18-39", "Y40-64": "Y40-64", "Y_GE65": "Y65+"} # more convenient for sorting
replace_paese_cittad = {"IT": "ITALIANS", "FOR": "FOREIGNERS"}
df_mig = pd.concat([
    ( # Immigrations: TOTAL, and by age
        df6
        .query("(TERRITORY_NEXT_RESID=='IT') & (REF_AREA=='ITTOT') & (CITIZENSHIP=='TOTAL')")
        .replace({"AGE": replace_eta_num})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="AGE", columns="year", values="value")
        .rename(index=lambda x: f"in_{x}")
    ),
    ( # Immigration: by nationality
        df6
        .query("(TERRITORY_NEXT_RESID=='IT') & (REF_AREA=='ITTOT') & (AGE=='TOTAL') &(CITIZENSHIP.isin(['FOR','IT']))")
        .replace({"CITIZENSHIP": replace_paese_cittad})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="CITIZENSHIP", columns="year", values="value")
        .rename(index=lambda x: f"in_{x}")
    ),
    ( # Emigrations: TOTAL, and by age
        df6
        .query("(TERRITORY_NEXT_RESID=='ITTOT') & (REF_AREA=='IT') & (CITIZENSHIP=='TOTAL')")
        .replace({"AGE": replace_eta_num})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="AGE", columns="year", values="value")
        .rename(index=lambda x: f"out_{x}")
    ),   
    ( # Emigrations: by nationality
        df6
        .query("(TERRITORY_NEXT_RESID=='ITTOT') & (REF_AREA=='IT') & (AGE=='TOTAL') &(CITIZENSHIP.isin(['FOR','IT']))")
        .replace({"CITIZENSHIP": replace_paese_cittad})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="CITIZENSHIP", columns="year", values="value")
        .rename(index=lambda x: f"out_{x}")
    ),
    

])
# add a row named "diff_TOTAL" with the difference between immigrations and emigrations
df_mig = pd.concat([
    df_mig, 
    df_mig.loc[["out_TOTAL", "in_TOTAL"], :].diff(axis=0).loc[["in_TOTAL"]].rename(index={"in_TOTAL":"diff_TOTAL"}),
    df_mig.loc[["out_ITALIANS", "in_ITALIANS"], :].diff(axis=0).loc[["in_ITALIANS"]].rename(index={"in_ITALIANS":"diff_ITALIANS"}),
    df_mig.loc[["out_FOREIGNERS", "in_FOREIGNERS"], :].diff(axis=0).loc[["in_FOREIGNERS"]].rename(index={"in_FOREIGNERS": "diff_FOREIGNERS"}),
])
assert all(df_mig.loc[[ "in_" + v for v in replace_eta_num.values()], :].sum(axis=0) == df_mig.loc["in_TOTAL", :]), "For immigrations, the sum of age groups should be equal to the total."
assert all(df_mig.loc[[ "in_" + v for v in replace_paese_cittad.values()], :].sum(axis=0) == df_mig.loc["in_TOTAL", :]), "For immigrations, the sum of nationality groups should be equal to the total."
assert all(df_mig.loc[[ "out_" + v for v in replace_eta_num.values()], :].sum(axis=0) == df_mig.loc["out_TOTAL", :]), "For emigrations, the sum of age groups should be equal to the total."
assert all(df_mig.loc[[ "out_" + v for v in replace_paese_cittad.values()], :].sum(axis=0) == df_mig.loc["out_TOTAL", :]), "For emigrations, the sum of nationality groups should be equal to the total."
print("All checks comparing totals,  passed.")

display(df_mig)

All checks comparing totals,  passed.

colors_age = ["#E377C2", "#2CA02C", "#BCBD22", "#7F7F7F"]
colors_cittad = ["#1F77B4", "#FF7F0F"]
max_yrange = 550000

fig = make_subplots(rows=5, cols=1, vertical_spacing=0.03)
for i, group in enumerate(replace_eta_num.values()):
    fig.add_trace(
        go.Bar(name=f"in_{group}", x=df_mig.columns, y=df_mig.loc[f"in_{group}"], marker_color=colors_age[i], marker_line_width=0), 
        row=1, col=1)
for i, group in enumerate(replace_paese_cittad.values()):
    fig.add_trace(
        go.Bar(name=f"in_{group}", x=df_mig.columns, y=df_mig.loc[f"in_{group}"], marker_color=colors_cittad[i], marker_line_width=0), 
        row=3, col=1)
for i, group in enumerate(replace_eta_num.values()):
    fig.add_trace(
        go.Bar(name=f"out_{group}", x=df_mig.columns, y=-df_mig.loc[f"out_{group}"], marker_color=colors_age[i], marker_line_width=0), 
        row=2, col=1)
for i, group in enumerate(replace_paese_cittad.values()):
    fig.add_trace(
        go.Bar(name=f"out_{group}", x=df_mig.columns, y=-df_mig.loc[f"out_{group}"], marker_color=colors_cittad[i], marker_line_width=0), 
        row=4, col=1)
fig.add_trace(go.Bar(name="diff_TOTAL", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], marker_color="black", marker_line_width=0), 
    row=5, col=1)
fig.update_layout(
    barmode="stack", 
    #title="Migration Balance by age groups and citizenship", 
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=570,
    legend_traceorder="normal",
    )

fig.update_yaxes(title="Incoming<br>(by age)", range=[0, max_yrange], row=1, col=1)
fig.update_yaxes(title="Incoming<br>(by citznshp)", range=[0, max_yrange], row=3, col=1)
fig.update_yaxes(title="Outcoming<br>(by age)", range=[-max_yrange, 0], row=2, col=1)
fig.update_yaxes(title="Outcoming<br>(by citznshp)", range=[-max_yrange, 0], row=4, col=1)
fig.update_yaxes(title="Net Balance", range=[0, max_yrange], row=5, col=1)

fig.write_html("../images_output/migrants_by_agegroups_citizenship.html")
fig.show()

ratio_italians_out_in = df_mig.loc["out_ITALIANS", :]/df_mig.loc["in_ITALIANS", :]
fig = go.Figure(
).add_trace(go.Scatter(x=df_mig.columns, y=ratio_italians_out_in, mode='lines+markers', marker_color="#1F77B4"), 
).add_hline(y=1, line_dash="dot", line_color="black", line_width=1
).update_layout(
    yaxis_title="Ratio Italians OUT/IN",
    margin=dict(l=10, r=10, t=20, b=10),
    width=620,
    height=170,
)
fig.write_html("../images_output/migrants_italian_citizenship.html")

print("Italians OUT over Italians IN by citizenship (parity=1.0)")
fig.show()

Italians OUT over Italians IN by citizenship (parity=1.0)

Finally compare raw and computed data¶

Calculations (blue bars): obtained by subtracting estimated deaths from the change or residents
Raw Data (red line): obtained directly from DCIS_MIGRAZIONI (immigrations minus emigrations)
Average: obtained by my calculations (2002-latest) excluding 2020, 2021, and excluding artifacts (negative values for high age groups)

# are these total number reasonable?
fig = go.Figure()
fig.add_trace(go.Bar(name="From calculations", x=dfpm.columns, y=dfpm.sum(axis=0)))
fig.add_trace(go.Scatter(name="ISTAT data", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], mode='lines+markers'))
fig.add_trace(go.Scatter(
    name="Average (calculations)", 
    x=dfpm.columns, 
    y=[sspm.sum()]*len(dfpm.columns),
    mode='lines',
    line=dict(color='black', dash='dot')
))
fig.update_layout(
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Migration balance: Calculated vs. Observed")
fig.write_html("../images_output/migrants_total.html")
fig.show()

Migration balance: Calculated vs. Observed

Conclusions¶

According to total migration balance: raw data collected by ISTAT and my calculations from the change in residents has some significan discrepanciesm, expecially in 2015-2022.
In 2015 the number of italians emigrating was 3x those incoming
The net balance of migrants is oscillating between 130-480 thousands per year
Saved the average 2001-latest migration balance as average_migration_balance.csv: this is for ages 1-80, I can assume the other are 0 (newborns are counted as births, >80 y.o. are only less than 1000)

Follow-up¶

Understand why there is a discrepancy between calculated and raw data expecially in 2015-2021
Use the average number of migrants I compute to build scenarios for the future

	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	...	2017	2018	2019	2020	2021	2022	2023	2024	2025	2026
AGE
1	3716.941542	2497.940126	3405.557368	689.950059	1871.792265	3264.846325	2442.285264	-708.104678	-506.478189	-1375.674281	...	553.836246	775.996977	5873.013926	8378.290532	2754.844487	3697.205507	3353.591679	3756.499105	2771.843593	NaN
2	4001.118978	4570.887209	2940.919412	2986.613231	1377.021865	3369.583552	2704.097778	1388.267334	-805.938484	-660.257055	...	-92.490752	470.992966	1639.365835	6851.185538	1973.686914	3666.347711	3376.854898	3323.569545	2933.648930	NaN
3	2798.529113	4464.512850	4581.771414	2983.924215	2397.399404	2725.585478	3119.936650	1966.167538	973.910596	-860.427631	...	-401.496660	190.556640	937.853007	6976.726183	1113.897424	3309.951595	3377.526337	2841.875107	3282.730330	NaN
4	3762.018611	3285.664847	4886.157108	4649.832123	2076.844242	4005.555732	2185.626403	1763.343437	1083.561055	460.249849	...	-1172.295180	-358.622437	867.537674	8821.402353	551.774944	2771.274753	3085.545982	2778.721730	3296.809373	NaN
5	3979.980840	4339.927064	3680.127014	5013.354974	3826.012316	4198.634655	3261.110988	1335.600418	1215.710777	924.770767	...	-925.473115	-189.752823	944.113034	9586.400860	-1757.326113	2421.776393	3102.376192	2983.485594	3122.062503	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
76	2494.980250	2285.752473	1520.311731	1112.997467	-131.597803	1432.988726	1551.394943	965.132836	723.458237	986.103110	...	1851.352168	610.757983	1092.193477	1899.226185	-46.442311	-2413.742064	1016.249144	548.747651	921.476957	NaN
77	1872.497402	2123.170109	2482.928713	1358.382087	1024.737135	-168.809303	1326.349347	1114.074697	923.537319	940.949140	...	80.037018	1817.299665	1257.453006	2060.127576	257.375699	1371.562901	-2942.719700	448.566586	221.875882	NaN
78	3086.570577	1511.953866	1752.902930	1838.197783	1407.166681	1329.255660	-6.194408	1307.829851	887.349643	1209.543578	...	-565.764084	396.281628	2904.105579	2363.452779	497.870423	1240.166551	1383.290383	-3401.330528	557.349108	NaN
79	1236.572001	2697.990848	1667.539045	1778.697250	1851.643801	1304.906609	969.334211	-685.584018	1468.399408	1399.551161	...	-1527.058098	-599.975669	892.733052	4467.894214	549.914073	1174.933435	1279.504030	1167.148679	-4257.957211	NaN
80	1602.247058	1058.370325	1318.395557	724.570177	1725.275434	1414.046067	1043.294364	793.016264	-1183.793340	1861.195535	...	-1003.360844	-1546.103356	234.989594	1537.934422	2826.683665	1630.876053	1467.355049	667.606410	848.850440	NaN

year	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	...	2016	2017	2018	2019	2020	2021	2022	2023	2024	2025
in_TOTAL	213202.0	440301.0	414880.0	304960.0	279714.0	527123.0	494394.0	421859.0	447744.0	385793.0	...	300823.0	343440.0	332324.0	332778.0	247526.0	318366.0	410985.0	439658.0	451583.0	439916.0
in_Y00-17	48654.0	52481.0	61224.0	72540.0	61726.0	86285.0	84546.0	74882.0	69928.0	65272.0	...	48806.0	54787.0	57329.0	63315.0	47166.0	57273.0	79184.0	83982.0	83699.0	79858.0
in_Y18-39	120233.0	285813.0	259676.0	170008.0	157596.0	325505.0	290318.0	242082.0	261735.0	224695.0	...	180306.0	211033.0	195265.0	169142.0	128725.0	168227.0	208131.0	234378.0	248764.0	242164.0
in_Y40-64	36564.0	93231.0	84461.0	53684.0	51873.0	106145.0	106120.0	92977.0	106038.0	85769.0	...	59633.0	64341.0	65817.0	82647.0	60216.0	77851.0	104327.0	100615.0	98752.0	97499.0
in_Y65+	7751.0	8776.0	9519.0	8728.0	8519.0	9188.0	13410.0	11918.0	10043.0	10057.0	...	12078.0	13279.0	13913.0	17674.0	11419.0	15015.0	19343.0	20683.0	20368.0	20395.0
in_FOREIGNERS	168726.0	392771.0	373086.0	267634.0	242048.0	490430.0	462276.0	392529.0	419552.0	354327.0	...	262929.0	301071.0	285500.0	264571.0	191766.0	243607.0	336495.0	378372.0	393115.0	383465.0
in_ITALIANS	44476.0	47530.0	41794.0	37326.0	37666.0	36693.0	32118.0	29330.0	28192.0	31466.0	...	37894.0	42369.0	46824.0	68207.0	55760.0	74759.0	74490.0	61286.0	58468.0	56451.0
out_TOTAL	41756.0	48706.0	49910.0	53931.0	58407.0	51113.0	61671.0	64921.0	67501.0	82461.0	...	157065.0	155110.0	156960.0	179505.0	159884.0	158312.0	150189.0	158438.0	188903.0	144157.0
out_Y00-17	6778.0	8020.0	7149.0	8826.0	9597.0	8473.0	10033.0	10959.0	12564.0	14957.0	...	29345.0	27826.0	27265.0	27633.0	28195.0	26540.0	22125.0	20937.0	23926.0	16840.0
out_Y18-39	21786.0	23692.0	25170.0	25920.0	27127.0	26158.0	32096.0	33503.0	33126.0	40074.0	...	79036.0	78609.0	80273.0	94457.0	81168.0	78550.0	76996.0	84011.0	103463.0	75518.0
out_Y40-64	10358.0	12922.0	13412.0	14486.0	15948.0	13059.0	15888.0	16938.0	18125.0	22544.0	...	40851.0	40310.0	41088.0	48295.0	41134.0	42263.0	39947.0	40133.0	47785.0	37348.0
out_Y65+	2834.0	4072.0	4179.0	4699.0	5735.0	3423.0	3654.0	3521.0	3686.0	4886.0	...	7833.0	8365.0	8334.0	9120.0	9387.0	10959.0	11121.0	13357.0	13729.0	14451.0
out_FOREIGNERS	7700.0	8840.0	10755.0	11940.0	12099.0	14814.0	22135.0	25897.0	27956.0	32404.0	...	42553.0	40551.0	40228.0	57485.0	38934.0	64093.0	50679.0	44381.0	47847.0	35153.0
out_ITALIANS	34056.0	39866.0	39155.0	41991.0	46308.0	36299.0	39536.0	39024.0	39545.0	50057.0	...	114512.0	114559.0	116732.0	122020.0	120950.0	94219.0	99510.0	114057.0	141056.0	109004.0
diff_TOTAL	171446.0	391595.0	364970.0	251029.0	221307.0	476010.0	432723.0	356938.0	380243.0	303332.0	...	143758.0	188330.0	175364.0	153273.0	87642.0	160054.0	260796.0	281220.0	262680.0	295759.0
diff_ITALIANS	10420.0	7664.0	2639.0	-4665.0	-8642.0	394.0	-7418.0	-9694.0	-11353.0	-18591.0	...	-76618.0	-72190.0	-69908.0	-53813.0	-65190.0	-19460.0	-25020.0	-52771.0	-82588.0	-52553.0
diff_FOREIGNERS	161026.0	383931.0	362331.0	255694.0	229949.0	475616.0	440141.0	366632.0	391596.0	321923.0	...	220376.0	260520.0	245272.0	207086.0	152832.0	179514.0	285816.0	333991.0	345268.0	348312.0

	FREQ	REF_AREA	DATA_TYPE	CHANGE_OF_RESIDENCE	CITIZENSHIP	SEX	AGE	TERRITORY_NEXT_RESID	COUNTRY_PREV_RESID	COUNTRY_NEXT_RESID	TIME_PERIOD	value
0	A	IT	TDEREG	FREIGN	FOR	9	TOTAL	ITTOT	X1033	X1033	2002	7700.0
1	A	IT	TDEREG	FREIGN	FOR	9	TOTAL	ITTOT	X1033	X1033	2003	8840.0
2	A	IT	TDEREG	FREIGN	FOR	9	TOTAL	ITTOT	X1033	X1033	2004	10755.0
3	A	IT	TDEREG	FREIGN	FOR	9	TOTAL	ITTOT	X1033	X1033	2005	11940.0
4	A	IT	TDEREG	FREIGN	FOR	9	TOTAL	ITTOT	X1033	X1033	2006	12099.0
...	...	...	...	...	...	...	...	...	...	...	...	...
1075	A	ITTOT	TREG	FREIGN	TOTAL	9	Y40-64	ITTOT	X1033	X1033	2021	77851.0
1076	A	ITTOT	TREG	FREIGN	TOTAL	9	Y40-64	ITTOT	X1033	X1033	2022	104327.0
1077	A	ITTOT	TREG	FREIGN	TOTAL	9	Y40-64	ITTOT	X1033	X1033	2023	100615.0
1078	A	ITTOT	TREG	FREIGN	TOTAL	9	Y40-64	ITTOT	X1033	X1033	2024	98752.0
1079	A	ITTOT	TREG	FREIGN	TOTAL	9	Y40-64	ITTOT	X1033	X1033	2025	97499.0