Compute italian migration¶
AIM: estimate the migration balance by removing the effect of deaths from the yearly change in residents (migration balance ≈ change in residents + deaths). I need the migration trend up to today, so that I can use it in my model to predict the future.
In [1]:
import warnings
from istatapi import discovery, retrieval
import numpy as np
import pandas as pd
import requests
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# Select the 'vscode' and 'notebook' plotly renderers as the default for every fig.show() below.
pio.renderers.default = 'vscode+notebook'
# NOTE(review): this hides *all* Python warnings for the rest of the session (pandas/plotly
# deprecations included), not only the HTTPS one mentioned below - confirm that is intended.
warnings.filterwarnings('ignore')
requests.urllib3.disable_warnings() # avoid "InsecureRequestWarning: Unverified HTTPS request is being made to host 'sdmx.istat.it'. Adding certificate verification is strongly advised"
def get_colors(n, cmap_name="rainbow"):
    """Return ``n`` evenly spaced colors of a matplotlib colormap as CSS ``rgba()`` strings.

    Intended for the ``color_discrete_sequence`` argument of plotly express
    (or any plotly color argument that accepts CSS color strings).

    Parameters
    ----------
    n : int
        Number of colors needed. ``0`` yields an empty list.
    cmap_name : str, default "rainbow"
        Key looked up in ``matplotlib.colormaps``.

    Returns
    -------
    list[str]
        ``n`` strings of the form ``"rgba(R, G, B, 1.0)"`` with integer channels
        in 0-255. The alpha of the colormap is ignored; colors are always opaque.

    Raises
    ------
    KeyError
        If ``cmap_name`` is not a registered colormap.
    """
    cmap = matplotlib.colormaps[cmap_name]
    # Sample the colormap at n evenly spaced positions over its full [0, 1] range.
    samples = [cmap(position) for position in np.linspace(0, 1, n)]
    colors_str = []
    for color in samples:
        red, green, blue = color[:3]
        # The colormap yields float channels in [0, 1]; CSS rgba() expects 0-255.
        # (The previous scale factor of 250 never reached full intensity.)
        colors_str.append(
            f"rgba({int(red * 255)}, {int(green * 255)}, {int(blue * 255)}, 1.0)"
        )
    return colors_str
In [2]:
def _load_by_age_year(csv_path):
    """Read a CSV using its first column as the index and cast the column labels to int."""
    table = pd.read_csv(csv_path, index_col=0)
    return table.rename(columns=int)


# Tables exported by earlier notebooks; after loading, the column labels are ints.
dfp = _load_by_age_year("../data/pop_by_age_year.csv")          # From Notebook#14
dfpc = _load_by_age_year("../data/cc_by_age_year.csv")          # From Notebook#14
dfdp = _load_by_age_year("../data/deathprob_by_age_year.csv")   # From Notebook#30
In [3]:
# Wide-form line chart of dfpc: one colored line per column, plotted against the index.
year_columns = dfpc.columns
change_fig = px.line(
    data_frame=dfpc,
    x=dfpc.index,
    y=year_columns,
    color_discrete_sequence=get_colors(n=year_columns.size),
    title="Population Change by age, on each year of observation",
)
change_fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Population Change",
    legend_title_text="Year of observation",
    width=1000,
)
change_fig.show()
In [4]:
# Estimate deaths as DEATHPROB*POPULATION and add them back to the change in population:
# what is left of the change, once deaths are accounted for, is taken as the migration balance.
# Only the years from 2002 onward and the ages 1..MAX_AGE are kept.
MAX_AGE = 80  # Assume there are no migrants above this age (conveniently, because there are numerical problems with the oldest ages)
last_year = dfp.columns[-1]

age_span = slice(1, MAX_AGE)
year_span = slice(2002, None)
dfd = (dfp * dfdp).loc[age_span, year_span]
dfpm = dfd + dfpc.loc[age_span, year_span]

# Average profile by age, with two adjustments: negative values are clipped to zero
# (fix for the high ages) and 2020-2021 are left out (adjustment in the statistics).
non_negative = dfpm.clip(lower=0)
regular_years = non_negative.drop(columns=[2020, 2021])
sspm = regular_years.mean(axis=1).astype(int)
sspm.to_csv("../data/average_migration_balance.csv")

# The thick black average line is added first; one thin colored line per year follows.
average_trace = go.Scatter(
    x=sspm.index,
    y=sspm,
    mode='lines',
    line={"color": "black", "width": 5},
    name="Average",
    zorder=1000,
)
yearly_traces = []
for year, color in zip(dfpm.columns, get_colors(n=dfpm.columns.size)):
    yearly_traces.append(
        go.Scatter(
            x=dfpm.index,
            y=dfpm[year],
            mode='lines',
            line={"color": color},
            name=str(year),
        )
    )

fig = go.Figure()
fig.add_trace(average_trace)
fig.add_traces(yearly_traces)
fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Computed Migrants",
    legend_title_text="Year",
    margin={"l": 10, "r": 10, "t": 20, "b": 10},
    width=780,
    height=320,
)

print("Computed Migrants by age and year of observations by excluding deaths from population change.")
fig.write_html("../images_output/migrants_by_age.html")
fig.show()
display(dfpm)
Get raw data for migration¶
I cannot get the same per-age resolution from the raw data, but I can compare the total migration balance with the sum of the values I computed.
In [5]:
# The query was designed following this report: https://www.istat.it/it/files/2024/05/Migrazioni-interne-e-internazionali-della-popolazione-residente.pdf
# Available breakdowns: immigration/emigration; age: 0-17, 18-39, 40-64, 65+; nationality
migration_filters = {
    "freq": "A",
    # Left unfiltered on purpose (kept here for reference):
    #   eta_num="TOTAL", paese_cittad="TOTAL", terr_dest="IT"
    "sesso": "9",
    "stato_est_dest": "X1033",  # all countries
    "stato_est_prov": "X1033",  # all countries
    "tipo_trasf": "FREIGN",  # from/to abroad
}

ds = discovery.DataSet(dataflow_identifier="DCIS_MIGRAZIONI")
ds.set_filters(**migration_filters)
df6 = retrieval.get_data(ds)

# Preview: show only the columns holding at least one non-missing value (df6 itself is unchanged).
has_values = df6.notna().any(axis=0)
df6.loc[:, has_values]
Out[5]:
In [6]:
# Reshape the raw rows into a single table: one row per flow category, one column per year.
replace_eta_num = {"Y_UN17": "Y00-17", "Y18-39": "Y18-39", "Y40-64": "Y40-64", "Y_GE65": "Y65+"}  # more convenient for sorting
replace_paese_cittad = {"IT": "ITALIANS", "FOR": "FOREIGNERS"}


def _flows_by_year(raw, selection, dimension, relabel, direction):
    """Pivot the rows matching ``selection`` into a ``dimension`` x year table.

    Codes of ``dimension`` are renamed through ``relabel`` and every row label
    gets the ``direction`` prefix ("in" or "out").
    """
    return (
        raw
        .query(selection)
        .replace({dimension: relabel})
        .assign(year=lambda rows: rows["TIME_PERIOD"].dt.year)
        .pivot(index=dimension, columns="year", values="OBS_VALUE")
        .rename(index=lambda label: f"{direction}_{label}")
    )


def _net_row(table, group):
    """Return a one-row frame ``diff_<group>`` holding ``in_<group>`` minus ``out_<group>``."""
    inbound, outbound = f"in_{group}", f"out_{group}"
    return (
        table.loc[[outbound, inbound], :]
        .diff(axis=0)
        .loc[[inbound]]
        .rename(index={inbound: f"diff_{group}"})
    )


# (row selection, dimension to pivot on, code relabelling, row-label prefix)
flow_specs = [
    # Immigrations: TOTAL, and by age
    ("(TERR_DEST=='IT') & (REF_AREA_O=='ITTOT') & (PAESE_CITTAD=='TOTAL')", "ETA_NUM", replace_eta_num, "in"),
    # Immigrations: by nationality
    ("(TERR_DEST=='IT') & (REF_AREA_O=='ITTOT') & (ETA_NUM=='TOTAL') &(PAESE_CITTAD.isin(['FOR','IT']))", "PAESE_CITTAD", replace_paese_cittad, "in"),
    # Emigrations: TOTAL, and by age
    ("(TERR_DEST=='ITTOT') & (REF_AREA_O=='IT') & (PAESE_CITTAD=='TOTAL')", "ETA_NUM", replace_eta_num, "out"),
    # Emigrations: by nationality
    ("(TERR_DEST=='ITTOT') & (REF_AREA_O=='IT') & (ETA_NUM=='TOTAL') &(PAESE_CITTAD.isin(['FOR','IT']))", "PAESE_CITTAD", replace_paese_cittad, "out"),
]
df_mig = pd.concat([_flows_by_year(df6, *spec) for spec in flow_specs])

# Append the net balance rows (immigrations minus emigrations): diff_TOTAL, diff_ITALIANS, diff_FOREIGNERS
net_rows = [_net_row(df_mig, group) for group in ("TOTAL", "ITALIANS", "FOREIGNERS")]
df_mig = pd.concat([df_mig, *net_rows])

# Sanity checks: each breakdown must add up to the corresponding total, year by year.
total_checks = [
    ("in", replace_eta_num, "For immigrations, the sum of age groups should be equal to the total."),
    ("in", replace_paese_cittad, "For immigrations, the sum of nationality groups should be equal to the total."),
    ("out", replace_eta_num, "For emigrations, the sum of age groups should be equal to the total."),
    ("out", replace_paese_cittad, "For emigrations, the sum of nationality groups should be equal to the total."),
]
for prefix, groups, message in total_checks:
    part_labels = [f"{prefix}_{name}" for name in groups.values()]
    parts_sum = df_mig.loc[part_labels, :].sum(axis=0)
    assert all(parts_sum == df_mig.loc[f"{prefix}_TOTAL", :]), message
print("All checks comparing totals, passed.")
display(df_mig)
In [7]:
colors_age = ["#E377C2", "#2CA02C", "#BCBD22", "#7F7F7F"]
colors_cittad = ["#1F77B4", "#FF7F0F"]
max_yrange = 550000
fig = make_subplots(rows=5, cols=1, vertical_spacing=0.03)

# One stacked-bar panel per entry: (row prefix, groups, palette, subplot row, plot as negative).
# Entries are listed in the order the traces must be inserted (it drives the legend order).
stacked_panels = [
    ("in", replace_eta_num, colors_age, 1, False),
    ("in", replace_paese_cittad, colors_cittad, 3, False),
    ("out", replace_eta_num, colors_age, 2, True),
    ("out", replace_paese_cittad, colors_cittad, 4, True),
]
for prefix, groups, palette, panel_row, negate in stacked_panels:
    for group, shade in zip(groups.values(), palette):
        label = f"{prefix}_{group}"
        # Outgoing flows are drawn below zero so they mirror the incoming panels.
        heights = -df_mig.loc[label] if negate else df_mig.loc[label]
        fig.add_trace(
            go.Bar(name=label, x=df_mig.columns, y=heights, marker_color=shade, marker_line_width=0),
            row=panel_row, col=1,
        )

# Last panel: net balance (immigrations minus emigrations).
fig.add_trace(
    go.Bar(name="diff_TOTAL", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], marker_color="black", marker_line_width=0),
    row=5, col=1,
)

fig.update_layout(
    barmode="stack",
    title="Migration Balance by age groups and nationality",
    width=1000,
    height=800,
    legend_traceorder="normal",
    margin={"l": 100, "r": 10, "t": 50, "b": 30},
)

# All panels share the same magnitude so that bar heights are comparable across them.
axis_settings = [
    (1, "Incoming<br>(by age)", [0, max_yrange]),
    (3, "Incoming<br>(by nationality)", [0, max_yrange]),
    (2, "Outcoming<br>(by age)", [-max_yrange, 0]),
    (4, "Outcoming<br>(by nationality)", [-max_yrange, 0]),
    (5, "Net Balance", [0, max_yrange]),
]
for panel_row, axis_title, axis_range in axis_settings:
    fig.update_yaxes(title=axis_title, range=axis_range, row=panel_row, col=1)
fig.show()
In [8]:
# Ratio of the out_ITALIANS row to the in_ITALIANS row, per year.
# Values above 1.0 mean more Italians left than came in that year.
ratio_italians_out_in = df_mig.loc["out_ITALIANS", :] / df_mig.loc["in_ITALIANS", :]
fig = (
    go.Figure()
    .add_trace(go.Scatter(x=df_mig.columns, y=ratio_italians_out_in, mode='lines+markers', marker_color="#1F77B4"))
    # Dotted reference line at parity (as many leaving as arriving).
    .add_hline(y=1, line_dash="dot", line_color="black", line_width=1)
    .update_layout(title="Italians OUT over Italians IN (parity=1.0)", width=1000, height=300)
)
# Show in a separate statement: .show() returns None, so keeping it in the assigned
# chain left `fig` bound to None instead of the figure.
fig.show()
Finally compare raw and computed data¶
- Calculations (blue bars): obtained by removing the effect of the estimated deaths from the change of residents (change of residents + deaths)
- Raw Data (red line): obtained directly from DCIS_MIGRAZIONI (immigrations minus emigrations)
- Average: obtained from my calculations (2002-latest), excluding 2020 and 2021, and excluding artifacts (negative values for high age groups)
In [9]:
# Are these total numbers reasonable? Compare, year by year, the balance computed in this
# notebook (sum of dfpm over ages) with the balance reported by ISTAT (diff_TOTAL row).
fig = go.Figure()
fig.add_trace(go.Bar(name="From calculations", x=dfpm.columns, y=dfpm.sum(axis=0)))
fig.add_trace(go.Scatter(name="ISTAT data", x=df_mig.columns, y=df_mig.loc["diff_TOTAL"], mode='lines+markers'))
# Flat reference line: the total of the average age profile (sspm), repeated for every year.
fig.add_trace(go.Scatter(
    name="Average (calculations)",
    x=dfpm.columns,
    y=[sspm.sum()]*len(dfpm.columns),
    mode='lines',
    line=dict(color='black', dash='dot')
))
fig.update_layout(
    # The figure is exported as a standalone HTML file, so label the axes like the
    # other exported figure (migrants_by_age.html) does.
    xaxis_title="Year",
    yaxis_title="Migration balance",
    margin=dict(l=10, r=10, t=20, b=10),
    width=780,
    height=320,
)
print("Migration balance: Calculated vs. Observed")
fig.write_html("../images_output/migrants_total.html")
fig.show()
Conclusions¶
- According to the total migration balance, the raw data collected by ISTAT and my calculations from the change in residents have some significant discrepancies, especially in 2015-2022.
- In 2015 the number of Italians emigrating was 3x the number of Italians coming in
- The net balance of migrants oscillates between 130 and 480 thousand per year
- Saved the average 2002-latest migration balance as average_migration_balance.csv: this is for ages 1-80, and I can assume the others are 0 (newborns are counted as births, and migrants over 80 years old are fewer than 1000)
Follow-up¶
- Understand why there is a discrepancy between calculated and raw data, especially in 2015-2021
- Use the average number of migrants I compute to build scenarios for the future