← Home

Compute italian migration

AIM: get a migration balance by subracting deaths from delta-residents. I need it to have the migration trend till today, and use it for my model to predict the future.

In [1]:
import warnings 
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import sdmx

client = sdmx.Client("ISTAT")
pio.renderers.default = 'vscode+notebook'
warnings.filterwarnings('ignore')
requests.urllib3.disable_warnings() # avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host 'sdmx.istat.it'. Adding certificate verification is strongly advised"

def get_colors(n, cmap_name="rainbow"):
    """Get colors for px colors_discrete argument, given the number of colors needed, n."""
    cmap = matplotlib.colormaps[cmap_name]
    colors = [cmap(i) for i in np.linspace(0, 1, n)]  # Generate colors
    colors_str = [f"rgba({int(color[0]*250)}, {int(color[1]*250)}, {int(color[2]*250)}, 1.0)" for color in colors]
    return colors_str
In [2]:
dfp=pd.read_csv("../data/pop_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfpc=pd.read_csv("../data/cc_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfdp=pd.read_csv("../data/deathprob_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#30
In [3]:
px.line(
    data_frame=dfpc,
    x=dfpc.index,
    y=dfpc.columns,
    color_discrete_sequence=get_colors(n=dfpc.columns.size),
    title="Population Change by age, on each year of observation",
).update_layout(
    xaxis_title="Age",
    yaxis_title="Population Change",
    legend_title_text="Year of observation",
    width=1000, 
).show()
In [4]:
# Compute the deaths from DEATHPROB*POPULATION and subract these from the change in population
# keep only 2002-today, and ages under 90
MAX_AGE = 80 # Assume there are no migrants above this age (conveniently, because there are numberical problems with the oldest ages)
last_year = dfp.columns[-1]
dfd = (dfp*dfdp).loc[1:MAX_AGE, 2002:]
dfpm = dfd + dfpc.loc[1:MAX_AGE, 2002:]

# get some average values, making a couple of adjustements: remove 2020-2021 (adjsutment in statistics), fix high ages
sspm = dfpm.clip(lower=0).drop(columns=[2020, 2021]).mean(axis=1).astype(int)
sspm.to_csv("../data/average_migration_balance.csv")

fig = go.Figure(   
).add_trace(go.Scatter(
    x=sspm.index,
    y=sspm,
    mode='lines',
    line=dict(color="black", width=5),
    name="Average",
    zorder=1000
)).add_traces([go.Scatter(
    x=dfpm.index, 
    y=dfpm[year],
    mode='lines',
    line=dict(color=color),
    name=str(year)
    ) for year, color in zip(dfpm.columns, get_colors(n=dfpm.columns.size))]
).update_layout(
    xaxis_title="Age",
    yaxis_title="Computed Migrants",
    legend_title_text="Year",
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Computed Migrants by age and year of observations by excluding deaths from population change.")
fig.write_html("../images_output/migrants_by_age.html")
fig.show()
display(dfpm)
Computed Migrants by age and year of observations by excluding deaths from population change.
2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ... 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026
AGE
1 3716.941542 2497.940126 3405.557368 689.950059 1871.792265 3264.846325 2442.285264 -708.104678 -506.478189 -1375.674281 ... 553.836246 775.996977 5873.013926 8378.290532 2754.844487 3697.205507 3353.591679 3756.499105 2771.843593 NaN
2 4001.118978 4570.887209 2940.919412 2986.613231 1377.021865 3369.583552 2704.097778 1388.267334 -805.938484 -660.257055 ... -92.490752 470.992966 1639.365835 6851.185538 1973.686914 3666.347711 3376.854898 3323.569545 2933.648930 NaN
3 2798.529113 4464.512850 4581.771414 2983.924215 2397.399404 2725.585478 3119.936650 1966.167538 973.910596 -860.427631 ... -401.496660 190.556640 937.853007 6976.726183 1113.897424 3309.951595 3377.526337 2841.875107 3282.730330 NaN
4 3762.018611 3285.664847 4886.157108 4649.832123 2076.844242 4005.555732 2185.626403 1763.343437 1083.561055 460.249849 ... -1172.295180 -358.622437 867.537674 8821.402353 551.774944 2771.274753 3085.545982 2778.721730 3296.809373 NaN
5 3979.980840 4339.927064 3680.127014 5013.354974 3826.012316 4198.634655 3261.110988 1335.600418 1215.710777 924.770767 ... -925.473115 -189.752823 944.113034 9586.400860 -1757.326113 2421.776393 3102.376192 2983.485594 3122.062503 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
76 2494.980250 2285.752473 1520.311731 1112.997467 -131.597803 1432.988726 1551.394943 965.132836 723.458237 986.103110 ... 1851.352168 610.757983 1092.193477 1899.226185 -46.442311 -2413.742064 1016.249144 548.747651 921.476957 NaN
77 1872.497402 2123.170109 2482.928713 1358.382087 1024.737135 -168.809303 1326.349347 1114.074697 923.537319 940.949140 ... 80.037018 1817.299665 1257.453006 2060.127576 257.375699 1371.562901 -2942.719700 448.566586 221.875882 NaN
78 3086.570577 1511.953866 1752.902930 1838.197783 1407.166681 1329.255660 -6.194408 1307.829851 887.349643 1209.543578 ... -565.764084 396.281628 2904.105579 2363.452779 497.870423 1240.166551 1383.290383 -3401.330528 557.349108 NaN
79 1236.572001 2697.990848 1667.539045 1778.697250 1851.643801 1304.906609 969.334211 -685.584018 1468.399408 1399.551161 ... -1527.058098 -599.975669 892.733052 4467.894214 549.914073 1174.933435 1279.504030 1167.148679 -4257.957211 NaN
80 1602.247058 1058.370325 1318.395557 724.570177 1725.275434 1414.046067 1043.294364 793.016264 -1183.793340 1861.195535 ... -1003.360844 -1546.103356 234.989594 1537.934422 2826.683665 1630.876053 1467.355049 667.606410 848.850440 NaN

80 rows × 25 columns

Get raw data for migration

I can not get the same resolution per age, but I can compare with the total migration balance with the sum of the values I computed

In [5]:
# Dataset `28_185` has the following dimensions: 
# [<Dimension FREQ>, <Dimension REF_AREA>, <Dimension DATA_TYPE>, <Dimension CHANGE_OF_RESIDENCE>, 
# <Dimension CITIZENSHIP>, <Dimension SEX>, <Dimension AGE>, <Dimension TERRITORY_NEXT_RESID>, 
# <Dimension COUNTRY_PREV_RESID>, <Dimension COUNTRY_NEXT_RESID>, <TimeDimension TIME_PERIOD>]
df6 = sdmx.to_pandas(
    client.data(
        resource_id="28_185", 
        key={
            "FREQ": "A",
            "REF_AREA": ["IT", "ITTOT"], # PREV_RESID: IT is Italy, ITTOT out of Italy
            #"DATA_TYPE":      
            "CHANGE_OF_RESIDENCE": "FREIGN", # only from/to abroad, not internal migration
            "CITIZENSHIP": ["IT","FOR", "TOTAL"],
            "AGE": [], # I want them all
            "SEX": "9",
            "TERRITORY_NEXT_RESID": ["IT", "ITTOT"], # IT is for immigrants, ITOT is for emigrants
            "COUNTRY_PREV_RESID": "X1033", # all
            "COUNTRY_NEXT_RESID": "X1033", # all
            #"CITIZENSHIP": "TOTAL", # not present in this dataset   
        })
    ).reset_index()
df6
Out[5]:
FREQ REF_AREA DATA_TYPE CHANGE_OF_RESIDENCE CITIZENSHIP SEX AGE TERRITORY_NEXT_RESID COUNTRY_PREV_RESID COUNTRY_NEXT_RESID TIME_PERIOD value
0 A IT TDEREG FREIGN FOR 9 TOTAL ITTOT X1033 X1033 2002 7700.0
1 A IT TDEREG FREIGN FOR 9 TOTAL ITTOT X1033 X1033 2003 8840.0
2 A IT TDEREG FREIGN FOR 9 TOTAL ITTOT X1033 X1033 2004 10755.0
3 A IT TDEREG FREIGN FOR 9 TOTAL ITTOT X1033 X1033 2005 11940.0
4 A IT TDEREG FREIGN FOR 9 TOTAL ITTOT X1033 X1033 2006 12099.0
... ... ... ... ... ... ... ... ... ... ... ... ...
1075 A ITTOT TREG FREIGN TOTAL 9 Y40-64 ITTOT X1033 X1033 2021 77851.0
1076 A ITTOT TREG FREIGN TOTAL 9 Y40-64 ITTOT X1033 X1033 2022 104327.0
1077 A ITTOT TREG FREIGN TOTAL 9 Y40-64 ITTOT X1033 X1033 2023 100615.0
1078 A ITTOT TREG FREIGN TOTAL 9 Y40-64 ITTOT X1033 X1033 2024 98752.0
1079 A ITTOT TREG FREIGN TOTAL 9 Y40-64 ITTOT X1033 X1033 2025 97499.0

1080 rows × 12 columns

In [6]:
# Create a table with data the way I like it
replace_eta_num = {"Y_UN17": "Y00-17", "Y18-39": "Y18-39", "Y40-64": "Y40-64", "Y_GE65": "Y65+"} # more convenient for sorting
replace_paese_cittad = {"IT": "ITALIANS", "FOR": "FOREIGNERS"}
df_mig = pd.concat([
    ( # Immigrations: TOTAL, and by age
        df6
        .query("(TERRITORY_NEXT_RESID=='IT') & (REF_AREA=='ITTOT') & (CITIZENSHIP=='TOTAL')")
        .replace({"AGE": replace_eta_num})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="AGE", columns="year", values="value")
        .rename(index=lambda x: f"in_{x}")
    ),
    ( # Immigration: by nationality
        df6
        .query("(TERRITORY_NEXT_RESID=='IT') & (REF_AREA=='ITTOT') & (AGE=='TOTAL') &(CITIZENSHIP.isin(['FOR','IT']))")
        .replace({"CITIZENSHIP": replace_paese_cittad})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="CITIZENSHIP", columns="year", values="value")
        .rename(index=lambda x: f"in_{x}")
    ),
    ( # Emigrations: TOTAL, and by age
        df6
        .query("(TERRITORY_NEXT_RESID=='ITTOT') & (REF_AREA=='IT') & (CITIZENSHIP=='TOTAL')")
        .replace({"AGE": replace_eta_num})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="AGE", columns="year", values="value")
        .rename(index=lambda x: f"out_{x}")
    ),   
    ( # Emigrations: by nationality
        df6
        .query("(TERRITORY_NEXT_RESID=='ITTOT') & (REF_AREA=='IT') & (AGE=='TOTAL') &(CITIZENSHIP.isin(['FOR','IT']))")
        .replace({"CITIZENSHIP": replace_paese_cittad})
        .assign(year= lambda x: x["TIME_PERIOD"].astype(int))
        .pivot(index="CITIZENSHIP", columns="year", values="value")
        .rename(index=lambda x: f"out_{x}")
    ),
    

])
# add a row named "diff_TOTAL" with the difference between immigrations and emigrations
df_mig = pd.concat([
    df_mig, 
    df_mig.loc[["out_TOTAL", "in_TOTAL"], :].diff(axis=0).loc[["in_TOTAL"]].rename(index={"in_TOTAL":"diff_TOTAL"}),
    df_mig.loc[["out_ITALIANS", "in_ITALIANS"], :].diff(axis=0).loc[["in_ITALIANS"]].rename(index={"in_ITALIANS":"diff_ITALIANS"}),
    df_mig.loc[["out_FOREIGNERS", "in_FOREIGNERS"], :].diff(axis=0).loc[["in_FOREIGNERS"]].rename(index={"in_FOREIGNERS": "diff_FOREIGNERS"}),
])
assert all(df_mig.loc[[ "in_" + v for v in replace_eta_num.values()], :].sum(axis=0) == df_mig.loc["in_TOTAL", :]), "For immigrations, the sum of age groups should be equal to the total."
assert all(df_mig.loc[[ "in_" + v for v in replace_paese_cittad.values()], :].sum(axis=0) == df_mig.loc["in_TOTAL", :]), "For immigrations, the sum of nationality groups should be equal to the total."
assert all(df_mig.loc[[ "out_" + v for v in replace_eta_num.values()], :].sum(axis=0) == df_mig.loc["out_TOTAL", :]), "For emigrations, the sum of age groups should be equal to the total."
assert all(df_mig.loc[[ "out_" + v for v in replace_paese_cittad.values()], :].sum(axis=0) == df_mig.loc["out_TOTAL", :]), "For emigrations, the sum of nationality groups should be equal to the total."
print("All checks comparing totals,  passed.")

display(df_mig)
All checks comparing totals,  passed.
year 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ... 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
in_TOTAL 213202.0 440301.0 414880.0 304960.0 279714.0 527123.0 494394.0 421859.0 447744.0 385793.0 ... 300823.0 343440.0 332324.0 332778.0 247526.0 318366.0 410985.0 439658.0 451583.0 439916.0
in_Y00-17 48654.0 52481.0 61224.0 72540.0 61726.0 86285.0 84546.0 74882.0 69928.0 65272.0 ... 48806.0 54787.0 57329.0 63315.0 47166.0 57273.0 79184.0 83982.0 83699.0 79858.0
in_Y18-39 120233.0 285813.0 259676.0 170008.0 157596.0 325505.0 290318.0 242082.0 261735.0 224695.0 ... 180306.0 211033.0 195265.0 169142.0 128725.0 168227.0 208131.0 234378.0 248764.0 242164.0
in_Y40-64 36564.0 93231.0 84461.0 53684.0 51873.0 106145.0 106120.0 92977.0 106038.0 85769.0 ... 59633.0 64341.0 65817.0 82647.0 60216.0 77851.0 104327.0 100615.0 98752.0 97499.0
in_Y65+ 7751.0 8776.0 9519.0 8728.0 8519.0 9188.0 13410.0 11918.0 10043.0 10057.0 ... 12078.0 13279.0 13913.0 17674.0 11419.0 15015.0 19343.0 20683.0 20368.0 20395.0
in_FOREIGNERS 168726.0 392771.0 373086.0 267634.0 242048.0 490430.0 462276.0 392529.0 419552.0 354327.0 ... 262929.0 301071.0 285500.0 264571.0 191766.0 243607.0 336495.0 378372.0 393115.0 383465.0
in_ITALIANS 44476.0 47530.0 41794.0 37326.0 37666.0 36693.0 32118.0 29330.0 28192.0 31466.0 ... 37894.0 42369.0 46824.0 68207.0 55760.0 74759.0 74490.0 61286.0 58468.0 56451.0
out_TOTAL 41756.0 48706.0 49910.0 53931.0 58407.0 51113.0 61671.0 64921.0 67501.0 82461.0 ... 157065.0 155110.0 156960.0 179505.0 159884.0 158312.0 150189.0 158438.0 188903.0 144157.0
out_Y00-17 6778.0 8020.0 7149.0 8826.0 9597.0 8473.0 10033.0 10959.0 12564.0 14957.0 ... 29345.0 27826.0 27265.0 27633.0 28195.0 26540.0 22125.0 20937.0 23926.0 16840.0
out_Y18-39 21786.0 23692.0 25170.0 25920.0 27127.0 26158.0 32096.0 33503.0 33126.0 40074.0 ... 79036.0 78609.0 80273.0 94457.0 81168.0 78550.0 76996.0 84011.0 103463.0 75518.0
out_Y40-64 10358.0 12922.0 13412.0 14486.0 15948.0 13059.0 15888.0 16938.0 18125.0 22544.0 ... 40851.0 40310.0 41088.0 48295.0 41134.0 42263.0 39947.0 40133.0 47785.0 37348.0
out_Y65+ 2834.0 4072.0 4179.0 4699.0 5735.0 3423.0 3654.0 3521.0 3686.0 4886.0 ... 7833.0 8365.0 8334.0 9120.0 9387.0 10959.0 11121.0 13357.0 13729.0 14451.0
out_FOREIGNERS 7700.0 8840.0 10755.0 11940.0 12099.0 14814.0 22135.0 25897.0 27956.0 32404.0 ... 42553.0 40551.0 40228.0 57485.0 38934.0 64093.0 50679.0 44381.0 47847.0 35153.0
out_ITALIANS 34056.0 39866.0 39155.0 41991.0 46308.0 36299.0 39536.0 39024.0 39545.0 50057.0 ... 114512.0 114559.0 116732.0 122020.0 120950.0 94219.0 99510.0 114057.0 141056.0 109004.0
diff_TOTAL 171446.0 391595.0 364970.0 251029.0 221307.0 476010.0 432723.0 356938.0 380243.0 303332.0 ... 143758.0 188330.0 175364.0 153273.0 87642.0 160054.0 260796.0 281220.0 262680.0 295759.0
diff_ITALIANS 10420.0 7664.0 2639.0 -4665.0 -8642.0 394.0 -7418.0 -9694.0 -11353.0 -18591.0 ... -76618.0 -72190.0 -69908.0 -53813.0 -65190.0 -19460.0 -25020.0 -52771.0 -82588.0 -52553.0
diff_FOREIGNERS 161026.0 383931.0 362331.0 255694.0 229949.0 475616.0 440141.0 366632.0 391596.0 321923.0 ... 220376.0 260520.0 245272.0 207086.0 152832.0 179514.0 285816.0 333991.0 345268.0 348312.0

17 rows × 24 columns

In [26]:
colors_age = ["#E377C2", "#2CA02C", "#BCBD22", "#7F7F7F"]
colors_cittad = ["#1F77B4", "#FF7F0F"]
max_yrange = 550000

fig = make_subplots(rows=5, cols=1, vertical_spacing=0.03)
for i, group in enumerate(replace_eta_num.values()):
    fig.add_trace(
        go.Bar(name=f"in_{group}", x=df_mig.columns, y=df_mig.loc[f"in_{group}"], marker_color=colors_age[i], marker_line_width=0), 
        row=1, col=1)
for i, group in enumerate(replace_paese_cittad.values()):
    fig.add_trace(
        go.Bar(name=f"in_{group}", x=df_mig.columns, y=df_mig.loc[f"in_{group}"], marker_color=colors_cittad[i], marker_line_width=0), 
        row=3, col=1)
for i, group in enumerate(replace_eta_num.values()):
    fig.add_trace(
        go.Bar(name=f"out_{group}", x=df_mig.columns, y=-df_mig.loc[f"out_{group}"], marker_color=colors_age[i], marker_line_width=0), 
        row=2, col=1)
for i, group in enumerate(replace_paese_cittad.values()):
    fig.add_trace(
        go.Bar(name=f"out_{group}", x=df_mig.columns, y=-df_mig.loc[f"out_{group}"], marker_color=colors_cittad[i], marker_line_width=0), 
        row=4, col=1)
fig.add_trace(go.Bar(name="diff_TOTAL", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], marker_color="black", marker_line_width=0), 
    row=5, col=1)
fig.update_layout(
    barmode="stack", 
    #title="Migration Balance by age groups and citizenship", 
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=570,
    legend_traceorder="normal",
    )

fig.update_yaxes(title="Incoming<br>(by age)", range=[0, max_yrange], row=1, col=1)
fig.update_yaxes(title="Incoming<br>(by citznshp)", range=[0, max_yrange], row=3, col=1)
fig.update_yaxes(title="Outcoming<br>(by age)", range=[-max_yrange, 0], row=2, col=1)
fig.update_yaxes(title="Outcoming<br>(by citznshp)", range=[-max_yrange, 0], row=4, col=1)
fig.update_yaxes(title="Net Balance", range=[0, max_yrange], row=5, col=1)

fig.write_html("../images_output/migrants_by_agegroups_citizenship.html")
fig.show()
In [27]:
ratio_italians_out_in = df_mig.loc["out_ITALIANS", :]/df_mig.loc["in_ITALIANS", :]
fig = go.Figure(
).add_trace(go.Scatter(x=df_mig.columns, y=ratio_italians_out_in, mode='lines+markers', marker_color="#1F77B4"), 
).add_hline(y=1, line_dash="dot", line_color="black", line_width=1
).update_layout(
    yaxis_title="Ratio Italians OUT/IN",
    margin=dict(l=10, r=10, t=20, b=10),
    width=620,
    height=170,
)
fig.write_html("../images_output/migrants_italian_citizenship.html")

print("Italians OUT over Italians IN by citizenship (parity=1.0)")
fig.show()
Italians OUT over Italians IN by citizenship (parity=1.0)

Finally compare raw and computed data

  • Calculations (blue bars): obtained by subtracting estimated deaths from the change or residents
  • Raw Data (red line): obtained directly from DCIS_MIGRAZIONI (immigrations minus emigrations)
  • Average: obtained by my calculations (2002-latest) excluding 2020, 2021, and excluding artifacts (negative values for high age groups)
In [9]:
# are these total number reasonable?
fig = go.Figure()
fig.add_trace(go.Bar(name="From calculations", x=dfpm.columns, y=dfpm.sum(axis=0)))
fig.add_trace(go.Scatter(name="ISTAT data", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], mode='lines+markers'))
fig.add_trace(go.Scatter(
    name="Average (calculations)", 
    x=dfpm.columns, 
    y=[sspm.sum()]*len(dfpm.columns),
    mode='lines',
    line=dict(color='black', dash='dot')
))
fig.update_layout(
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Migration balance: Calculated vs. Observed")
fig.write_html("../images_output/migrants_total.html")
fig.show()
Migration balance: Calculated vs. Observed

Conclusions

  • According to total migration balance: raw data collected by ISTAT and my calculations from the change in residents has some significan discrepanciesm, expecially in 2015-2022.
  • In 2015 the number of italians emigrating was 3x those incoming
  • The net balance of migrants is oscillating between 130-480 thousands per year
  • Saved the average 2001-latest migration balance as average_migration_balance.csv: this is for ages 1-80, I can assume the other are 0 (newborns are counted as births, >80 y.o. are only less than 1000)

Follow-up

  • Understand why there is a discrepancy between calculated and raw data expecially in 2015-2021
  • Use the average number of migrants I compute to build scenarios for the future
← Home