← Home

Compute italian migration

AIM: get a migration balance by subracting deaths from delta-residents. I need it to have the migration trend till today, and use it for my model to predict the future.

In [1]:
import warnings 
from istatapi import discovery, retrieval
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

pio.renderers.default = 'vscode+notebook'
warnings.filterwarnings('ignore')
requests.urllib3.disable_warnings() # avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host 'sdmx.istat.it'. Adding certificate verification is strongly advised"

def get_colors(n, cmap_name="rainbow"):
    """Get colors for px colors_discrete argument, given the number of colors needed, n."""
    cmap = matplotlib.colormaps[cmap_name]
    colors = [cmap(i) for i in np.linspace(0, 1, n)]  # Generate colors
    colors_str = [f"rgba({int(color[0]*250)}, {int(color[1]*250)}, {int(color[2]*250)}, 1.0)" for color in colors]
    return colors_str
In [2]:
dfp=pd.read_csv("../data/pop_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfpc=pd.read_csv("../data/cc_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#14
dfdp=pd.read_csv("../data/deathprob_by_age_year.csv", index_col=0).rename(columns=int) # From Notebook#30
In [3]:
px.line(
    data_frame=dfpc,
    x=dfpc.index,
    y=dfpc.columns,
    color_discrete_sequence=get_colors(n=dfpc.columns.size),
    title="Population Change by age, on each year of observation",
).update_layout(
    xaxis_title="Age",
    yaxis_title="Population Change",
    legend_title_text="Year of observation",
    width=1000, 
).show()
In [4]:
# Compute the deaths from DEATHPROB*POPULATION and subract these from the change in population
# keep only 2002-today, and ages under 90
MAX_AGE = 80 # Assume there are no migrants above this age (conveniently, because there are numberical problems with the oldest ages)
last_year = dfp.columns[-1]
dfd = (dfp*dfdp).loc[1:MAX_AGE, 2002:]
dfpm = dfd + dfpc.loc[1:MAX_AGE, 2002:]

# get some average values, making a couple of adjustements: remove 2020-2021 (adjsutment in statistics), fix high ages
sspm = dfpm.clip(lower=0).drop(columns=[2020, 2021]).mean(axis=1).astype(int)
sspm.to_csv("../data/average_migration_balance.csv")

fig = go.Figure(   
).add_trace(go.Scatter(
    x=sspm.index,
    y=sspm,
    mode='lines',
    line=dict(color="black", width=5),
    name="Average",
    zorder=1000
)).add_traces([go.Scatter(
    x=dfpm.index, 
    y=dfpm[year],
    mode='lines',
    line=dict(color=color),
    name=str(year)
    ) for year, color in zip(dfpm.columns, get_colors(n=dfpm.columns.size))]
).update_layout(
    xaxis_title="Age",
    yaxis_title="Computed Migrants",
    legend_title_text="Year",
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Computed Migrants by age and year of observations by excluding deaths from population change.")
fig.write_html("../images_output/migrants_by_age.html")
fig.show()
display(dfpm)
Computed Migrants by age and year of observations by excluding deaths from population change.
2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ... 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
age
1 3716.941542 2497.940126 3405.557368 689.950059 1871.792265 3264.846325 2442.285264 -708.104678 -506.478189 -1375.674281 ... 92.814406 553.836246 775.996977 5873.013926 8378.290532 2754.844487 3697.205507 3353.591679 2313.136467 NaN
2 4001.118978 4570.887209 2940.919412 2986.613231 1377.021865 3369.583552 2704.097778 1388.267334 -805.938484 -660.257055 ... -531.706731 -92.490752 470.992966 1639.365835 6851.185538 1973.686914 3666.347711 3376.854898 2646.877851 NaN
3 2798.529113 4464.512850 4581.771414 2983.924215 2397.399404 2725.585478 3119.936650 1966.167538 973.910596 -860.427631 ... -1486.927629 -401.496660 190.556640 937.853007 6976.726183 1113.897424 3309.951595 3377.526337 2594.467958 NaN
4 3762.018611 3285.664847 4886.157108 4649.832123 2076.844242 4005.555732 2185.626403 1763.343437 1083.561055 460.249849 ... -1070.590095 -1172.295180 -358.622437 867.537674 8821.402353 551.774944 2771.274753 3085.545982 2881.019465 NaN
5 3979.980840 4339.927064 3680.127014 5013.354974 3826.012316 4198.634655 3261.110988 1335.600418 1215.710777 924.770767 ... -814.027216 -925.473115 -189.752823 944.113034 9586.400860 -1757.326113 2421.776393 3102.376192 2692.895494 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
76 2494.980250 2285.752473 1520.311731 1112.997467 -131.597803 1432.988726 1551.394943 965.132836 723.458237 986.103110 ... 262.157804 1851.352168 610.757983 1092.193477 1899.226185 -46.442311 -2413.742064 1016.249144 -594.724162 NaN
77 1872.497402 2123.170109 2482.928713 1358.382087 1024.737135 -168.809303 1326.349347 1114.074697 923.537319 940.949140 ... -432.992964 80.037018 1817.299665 1257.453006 2060.127576 257.375699 1371.562901 -2942.719700 -118.088264 NaN
78 3086.570577 1511.953866 1752.902930 1838.197783 1407.166681 1329.255660 -6.194408 1307.829851 887.349643 1209.543578 ... -1123.720595 -565.764084 396.281628 2904.105579 2363.452779 497.870423 1240.166551 1383.290383 -4084.637585 NaN
79 1236.572001 2697.990848 1667.539045 1778.697250 1851.643801 1304.906609 969.334211 -685.584018 1468.399408 1399.551161 ... -875.352686 -1527.058098 -599.975669 892.733052 4467.894214 549.914073 1174.933435 1279.504030 815.297939 NaN
80 1602.247058 1058.370325 1318.395557 724.570177 1725.275434 1414.046067 1043.294364 793.016264 -1183.793340 1861.195535 ... 881.597545 -1003.360844 -1546.103356 234.989594 1537.934422 2826.683665 1630.876053 1467.355049 328.582759 NaN

80 rows × 24 columns

Get raw data for migration

I can not get the same resolution per age, but I can compare with the total migration balance with the sum of the values I computed

In [5]:
# used this report to understand how to query the correct values: https://www.istat.it/it/files/2024/05/Migrazioni-interne-e-internazionali-della-popolazione-residente.pdf
# I can find data for: immigration/emigration; age: 0-17, 18-39, 40-64, 65+; nationality
ds = discovery.DataSet(dataflow_identifier="DCIS_MIGRAZIONI") 
ds.set_filters(
    freq="A", 
    #eta_num="TOTAL", 
    #paese_cittad="TOTAL",
    #terr_dest="IT",
    sesso="9",
    stato_est_dest="X1033", #all countries
    stato_est_prov="X1033", #all countries
    tipo_trasf="FREIGN", # from/to abroad
)
df6 = retrieval.get_data(ds)
df6.loc[:, lambda dfx: (~dfx.isna()).any(axis=0)]
Out[5]:
DATAFLOW FREQ ETA_NUM PAESE_CITTAD TERR_DEST REF_AREA_O STATO_EST_DEST STATO_EST_PROV TIPO_TRASF SESSO TIPO_INDDEM TIME_PERIOD OBS_VALUE OBS_STATUS
77476 IT1:28_185(1.1) A Y18-39 BY ITTOT IT X1033 X1033 FREIGN 9 TDEREG 2002-01-01 1 NaN
103932 IT1:28_185(1.1) A Y40-64 BJ ITTOT ITTOT X1033 X1033 FREIGN 9 TREG 2002-01-01 5 NaN
4199 IT1:28_185(1.1) A TOTAL FOR ITE13 ITTOT X1033 X1033 FREIGN 9 TREG 2002-01-01 1111 NaN
74259 IT1:28_185(1.1) A Y_UN17 TOTAL ITTOT ITE1A X1033 X1033 FREIGN 9 TDEREG 2002-01-01 14 NaN
33769 IT1:28_185(1.1) A Y_GE65 FOR ITTOT ITE16 X1033 X1033 FREIGN 9 TDEREG 2002-01-01 2 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
56707 IT1:28_185(1.1) A Y_UN17 FOR ITTOT ITC16 X1033 X1033 FREIGN 9 TDEREG 2024-01-01 54 p
56730 IT1:28_185(1.1) A Y_UN17 FOR ITTOT ITC17 X1033 X1033 FREIGN 9 TDEREG 2024-01-01 23 p
56753 IT1:28_185(1.1) A Y_UN17 FOR ITTOT ITC18 X1033 X1033 FREIGN 9 TDEREG 2024-01-01 39 p
120847 IT1:28_185(1.1) A Y40-64 PH ITTOT ITTOT X1033 X1033 FREIGN 9 TREG 2024-01-01 655 p
129108 IT1:28_185(1.1) A Y40-64 ZW ITTOT ITTOT X1033 X1033 FREIGN 9 TREG 2024-01-01 3 p

129109 rows × 14 columns

In [6]:
# Create a table with data the way I like it
replace_eta_num = {"Y_UN17": "Y00-17", "Y18-39": "Y18-39", "Y40-64": "Y40-64", "Y_GE65": "Y65+"} # more convenient for sorting
replace_paese_cittad = {"IT": "ITALIANS", "FOR": "FOREIGNERS"}
df_mig = pd.concat([
    ( # Immigrations: TOTAL, and by age
        df6
        .query("(TERR_DEST=='IT') & (REF_AREA_O=='ITTOT') & (PAESE_CITTAD=='TOTAL')")
        .replace({"ETA_NUM": replace_eta_num})
        .assign(year= lambda x: x["TIME_PERIOD"].dt.year)
        .pivot(index="ETA_NUM", columns="year", values="OBS_VALUE")
        .rename(index=lambda x: f"in_{x}")
    ),
    ( # Immigration: by nationality
        df6
        .query("(TERR_DEST=='IT') & (REF_AREA_O=='ITTOT') & (ETA_NUM=='TOTAL') &(PAESE_CITTAD.isin(['FOR','IT']))")
        .replace({"PAESE_CITTAD": replace_paese_cittad})
        .assign(year= lambda x: x["TIME_PERIOD"].dt.year)
        .pivot(index="PAESE_CITTAD", columns="year", values="OBS_VALUE")
        .rename(index=lambda x: f"in_{x}")
    ),
    ( # Emigrations: TOTAL, and by age
        df6
        .query("(TERR_DEST=='ITTOT') & (REF_AREA_O=='IT') & (PAESE_CITTAD=='TOTAL')")
        .replace({"ETA_NUM": replace_eta_num})
        .assign(year= lambda x: x["TIME_PERIOD"].dt.year)
        .pivot(index="ETA_NUM", columns="year", values="OBS_VALUE")
        .rename(index=lambda x: f"out_{x}")
    ),   
    ( # Emigrations: by nationality
        df6
        .query("(TERR_DEST=='ITTOT') & (REF_AREA_O=='IT') & (ETA_NUM=='TOTAL') &(PAESE_CITTAD.isin(['FOR','IT']))")
        .replace({"PAESE_CITTAD": replace_paese_cittad})
        .assign(year= lambda x: x["TIME_PERIOD"].dt.year)
        .pivot(index="PAESE_CITTAD", columns="year", values="OBS_VALUE")
        .rename(index=lambda x: f"out_{x}")
    ),
    

])
# add a row named "diff_TOTAL" with the difference between immigrations and emigrations
df_mig = pd.concat([
    df_mig, 
    df_mig.loc[["out_TOTAL", "in_TOTAL"], :].diff(axis=0).loc[["in_TOTAL"]].rename(index={"in_TOTAL":"diff_TOTAL"}),
    df_mig.loc[["out_ITALIANS", "in_ITALIANS"], :].diff(axis=0).loc[["in_ITALIANS"]].rename(index={"in_ITALIANS":"diff_ITALIANS"}),
    df_mig.loc[["out_FOREIGNERS", "in_FOREIGNERS"], :].diff(axis=0).loc[["in_FOREIGNERS"]].rename(index={"in_FOREIGNERS": "diff_FOREIGNERS"}),
])
assert all(df_mig.loc[[ "in_" + v for v in replace_eta_num.values()], :].sum(axis=0) == df_mig.loc["in_TOTAL", :]), "For immigrations, the sum of age groups should be equal to the total."
assert all(df_mig.loc[[ "in_" + v for v in replace_paese_cittad.values()], :].sum(axis=0) == df_mig.loc["in_TOTAL", :]), "For immigrations, the sum of nationality groups should be equal to the total."
assert all(df_mig.loc[[ "out_" + v for v in replace_eta_num.values()], :].sum(axis=0) == df_mig.loc["out_TOTAL", :]), "For emigrations, the sum of age groups should be equal to the total."
assert all(df_mig.loc[[ "out_" + v for v in replace_paese_cittad.values()], :].sum(axis=0) == df_mig.loc["out_TOTAL", :]), "For emigrations, the sum of nationality groups should be equal to the total."
print("All checks comparing totals,  passed.")

display(df_mig)
All checks comparing totals,  passed.
year 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ... 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
in_TOTAL 213202.0 440301.0 414880.0 304960.0 279714.0 527123.0 494394.0 421859.0 447744.0 385793.0 ... 280078.0 300823.0 343440.0 332324.0 332778.0 247526.0 318366.0 410985.0 439658.0 434579.0
in_Y00-17 48654.0 52481.0 61224.0 72540.0 61726.0 86285.0 84546.0 74882.0 69928.0 65272.0 ... 46557.0 48806.0 54787.0 57329.0 63315.0 47166.0 57273.0 79184.0 83982.0 79584.0
in_Y18-39 120233.0 285813.0 259676.0 170008.0 157596.0 325505.0 290318.0 242082.0 261735.0 224695.0 ... 165038.0 180306.0 211033.0 195265.0 169142.0 128725.0 168227.0 208131.0 234378.0 242559.0
in_Y40-64 36564.0 93231.0 84461.0 53684.0 51873.0 106145.0 106120.0 92977.0 106038.0 85769.0 ... 57738.0 59633.0 64341.0 65817.0 82647.0 60216.0 77851.0 104327.0 100615.0 93261.0
in_Y65+ 7751.0 8776.0 9519.0 8728.0 8519.0 9188.0 13410.0 11918.0 10043.0 10057.0 ... 10745.0 12078.0 13279.0 13913.0 17674.0 11419.0 15015.0 19343.0 20683.0 19175.0
in_FOREIGNERS 168726.0 392771.0 373086.0 267634.0 242048.0 490430.0 462276.0 392529.0 419552.0 354327.0 ... 250026.0 262929.0 301071.0 285500.0 264571.0 191766.0 243607.0 336495.0 378372.0 382071.0
in_ITALIANS 44476.0 47530.0 41794.0 37326.0 37666.0 36693.0 32118.0 29330.0 28192.0 31466.0 ... 30052.0 37894.0 42369.0 46824.0 68207.0 55760.0 74759.0 74490.0 61286.0 52508.0
out_TOTAL 41756.0 48706.0 49910.0 53931.0 58407.0 51113.0 61671.0 64921.0 67501.0 82461.0 ... 146955.0 157065.0 155110.0 156960.0 179505.0 159884.0 158312.0 150189.0 158438.0 190967.0
out_Y00-17 6778.0 8020.0 7149.0 8826.0 9597.0 8473.0 10033.0 10959.0 12564.0 14957.0 ... 28117.0 29345.0 27826.0 27265.0 27633.0 28195.0 26540.0 22125.0 20937.0 23237.0
out_Y18-39 21786.0 23692.0 25170.0 25920.0 27127.0 26158.0 32096.0 33503.0 33126.0 40074.0 ... 70195.0 79036.0 78609.0 80273.0 94457.0 81168.0 78550.0 76996.0 84011.0 104522.0
out_Y40-64 10358.0 12922.0 13412.0 14486.0 15948.0 13059.0 15888.0 16938.0 18125.0 22544.0 ... 40127.0 40851.0 40310.0 41088.0 48295.0 41134.0 42263.0 39947.0 40133.0 48370.0
out_Y65+ 2834.0 4072.0 4179.0 4699.0 5735.0 3423.0 3654.0 3521.0 3686.0 4886.0 ... 8516.0 7833.0 8365.0 8334.0 9120.0 9387.0 10959.0 11121.0 13357.0 14838.0
out_FOREIGNERS 7700.0 8840.0 10755.0 11940.0 12099.0 14814.0 22135.0 25897.0 27956.0 32404.0 ... 44696.0 42553.0 40551.0 40228.0 57485.0 38934.0 64093.0 50679.0 44381.0 35235.0
out_ITALIANS 34056.0 39866.0 39155.0 41991.0 46308.0 36299.0 39536.0 39024.0 39545.0 50057.0 ... 102259.0 114512.0 114559.0 116732.0 122020.0 120950.0 94219.0 99510.0 114057.0 155732.0
diff_TOTAL 171446.0 391595.0 364970.0 251029.0 221307.0 476010.0 432723.0 356938.0 380243.0 303332.0 ... 133123.0 143758.0 188330.0 175364.0 153273.0 87642.0 160054.0 260796.0 281220.0 243612.0
diff_ITALIANS 10420.0 7664.0 2639.0 -4665.0 -8642.0 394.0 -7418.0 -9694.0 -11353.0 -18591.0 ... -72207.0 -76618.0 -72190.0 -69908.0 -53813.0 -65190.0 -19460.0 -25020.0 -52771.0 -103224.0
diff_FOREIGNERS 161026.0 383931.0 362331.0 255694.0 229949.0 475616.0 440141.0 366632.0 391596.0 321923.0 ... 205330.0 220376.0 260520.0 245272.0 207086.0 152832.0 179514.0 285816.0 333991.0 346836.0

17 rows × 23 columns

In [7]:
colors_age = ["#E377C2", "#2CA02C", "#BCBD22", "#7F7F7F"]
colors_cittad = ["#1F77B4", "#FF7F0F"]
max_yrange = 550000

fig = make_subplots(rows=5, cols=1, vertical_spacing=0.03)
for i, group in enumerate(replace_eta_num.values()):
    fig.add_trace(
        go.Bar(name=f"in_{group}", x=df_mig.columns, y=df_mig.loc[f"in_{group}"], marker_color=colors_age[i], marker_line_width=0), 
        row=1, col=1)
for i, group in enumerate(replace_paese_cittad.values()):
    fig.add_trace(
        go.Bar(name=f"in_{group}", x=df_mig.columns, y=df_mig.loc[f"in_{group}"], marker_color=colors_cittad[i], marker_line_width=0), 
        row=3, col=1)
for i, group in enumerate(replace_eta_num.values()):
    fig.add_trace(
        go.Bar(name=f"out_{group}", x=df_mig.columns, y=-df_mig.loc[f"out_{group}"], marker_color=colors_age[i], marker_line_width=0), 
        row=2, col=1)
for i, group in enumerate(replace_paese_cittad.values()):
    fig.add_trace(
        go.Bar(name=f"out_{group}", x=df_mig.columns, y=-df_mig.loc[f"out_{group}"], marker_color=colors_cittad[i], marker_line_width=0), 
        row=4, col=1)
fig.add_trace(go.Bar(name="diff_TOTAL", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], marker_color="black", marker_line_width=0), 
    row=5, col=1)
fig.update_layout(
    barmode="stack", title="Migration Balance by age groups and nationality", width=1000, height=800, legend_traceorder="normal",
    margin=dict(l=100, r=10, t=50, b=30),
    )

fig.update_yaxes(title="Incoming<br>(by age)", range=[0, max_yrange], row=1, col=1)
fig.update_yaxes(title="Incoming<br>(by nationality)", range=[0, max_yrange], row=3, col=1)
fig.update_yaxes(title="Outcoming<br>(by age)", range=[-max_yrange, 0], row=2, col=1)
fig.update_yaxes(title="Outcoming<br>(by nationality)", range=[-max_yrange, 0], row=4, col=1)
fig.update_yaxes(title="Net Balance", range=[0, max_yrange], row=5, col=1)

fig.show()
In [8]:
ratio_italians_out_in = df_mig.loc["out_ITALIANS", :]/df_mig.loc["in_ITALIANS", :]
fig = go.Figure(
).add_trace(go.Scatter(x=df_mig.columns, y=ratio_italians_out_in, mode='lines+markers', marker_color="#1F77B4"), 
).add_hline(y=1, line_dash="dot", line_color="black", line_width=1
).update_layout(title="Italians OUT over Italians IN (parity=1.0)", width=1000, height=300,
).show()

Finally compare raw and computed data

  • Calculations (blue bars): obtained by subtracting estimated deaths from the change or residents
  • Raw Data (red line): obtained directly from DCIS_MIGRAZIONI (immigrations minus emigrations)
  • Average: obtained by my calculations (2002-latest) excluding 2020, 2021, and excluding artifacts (negative values for high age groups)
In [9]:
# are these total number reasonable?
fig = go.Figure()
fig.add_trace(go.Bar(name="From calculations", x=dfpm.columns, y=dfpm.sum(axis=0)))
fig.add_trace(go.Scatter(name="ISTAT data", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], mode='lines+markers'))
fig.add_trace(go.Scatter(
    name="Average (calculations)", 
    x=dfpm.columns, 
    y=[sspm.sum()]*len(dfpm.columns),
    mode='lines',
    line=dict(color='black', dash='dot')
))
fig.update_layout(
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Migration balance: Calculated vs. Observed")
fig.write_html("../images_output/migrants_total.html")
fig.show()
Migration balance: Calculated vs. Observed

Conclusions

  • According to total migration balance: raw data collected by ISTAT and my calculations from the change in residents has some significan discrepanciesm, expecially in 2015-2022.
  • In 2015 the number of italians emigrating was 3x those incoming
  • The net balance of migrants is oscillating between 130-480 thousands per year
  • Saved the average 2001-latest migration balance as average_migration_balance.csv: this is for ages 1-80, I can assume the other are 0 (newborns are counted as births, >80 y.o. are only less than 1000)

Follow-up

  • Understand why there is a discrepancy between calculated and raw data expecially in 2015-2021
  • Use the average number of migrants I compute to build scenarios for the future
← Home