Modelling the future resident population¶
AIM: extrapolate from the past data, the future resident population in Italy by age distribution, till year 2100.
I'm assuming a model like:
$ pop(a=0,y) = birthprop( pop(a=18:50,y-1), fr_{avg}(y-1) ) $
$ pop(a,y) = p(a-1,y-1) \cdot [1 - dp(a, y-1)] + mb(a, y-1) $
$ pop(a=100^{+}, y) = p(a=99,y-1) \cdot [1 - dp(99, y-1)] + p(a=100,y-1) \cdot dp_{100^+}(a, y-1) $
Where:
- $pop(a, y)$ is the resident population of age $a$ in year $y$
- $birthprop$ is a function returning the number of births given a population in the fertile age range (18-50 yo) and an average fertility rate $fr_{avg}$
- $dp(a, y)$ is the death probability for age $a$ in year $y$
- $mb(a, y)$ is the Net Migration Balance, positive for more people immigrating than emigrating
- $dp_{100^+}(a, y)$ is an effective death probability for people older than 100 years old combined
Moreover:
- I will make 3 scenarios for the fertility rate (growing, constant, decreasing)
- I will make 3 scenarios for the Net Migration Balance: no more migration from next year, constant migration, 2x migration from next year. The reference migration balance is the average of the last 20 years.
- I will combine these 3x3=9 scenarios and see the results
In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as pc
import plotly.io as pio
from pandas_datareader import wb # https://pandas-datareader.readthedocs.io/en/latest/remote_data.html#world-bank Italian data are probably copied from ISTAT anyway
from copy import deepcopy
import warnings
pio.renderers.default = 'vscode+notebook'
pd.options.plotting.backend = "plotly"
warnings.filterwarnings("ignore", category=FutureWarning)
def get_colors(n, cmap_name="rainbow"):
"""Get colors for px colors_discrete argument, given the number of colors needed, n."""
cmap = matplotlib.colormaps[cmap_name]
colors = [cmap(i) for i in np.linspace(0, 1, n)] # Generate colors
colors_str = [f"rgba({int(color[0]*250)}, {int(color[1]*250)}, {int(color[2]*250)}, 1.0)" for color in colors]
return colors_str
# convert default plotly colors to matplotlib
plotly_colors = pc.qualitative.Plotly
matplotlib_colors = []
for color in plotly_colors:
r, g, b = pc.hex_to_rgb(color)
matplotlib_colors.append((r/255, g/255, b/255))
In [2]:
# Recover data from previous notebooks
dfp=pd.read_csv("../data/pop_by_age_year.csv", index_col=0).rename(columns=int)
dfdpj=pd.read_csv("../data/deathprob_by_age_year_proj.csv", index_col=0).rename(columns=int)
dfmb = pd.read_csv("../data/average_migration_balance.csv", index_col=0).squeeze()
In [3]:
# Reference years I will be using for the analysis
start_year = dfp.columns[0] # 2002
last_year = dfp.columns[-1] # 2023 (or later if updated)
end_year = 2100
print(f"First-last observation: Jan/{start_year} - Jan/{last_year}")
print(f"First-last prediction: Jan/{last_year+1} - Jan/{end_year}")
In [4]:
# Create coefficients for the migration balance
migration_coeff = {}
for year in range(last_year+1, end_year+1):
migration_coeff[year] = {
"double ": 2,
"average": 1,
"zero ": 0,
#f"↑ 5x in {end_year}": 1 + (5-1) / (end_year - last_year) * (year - last_year), # old scenario
}
In [5]:
# Values from previous analysis on fertility (see Notebook #40)
fdata = json.load(open("../data/fertility_fitted.json"))
print("Fertility coefficients:")
display(fdata)
def get_newborns_next_year(dfp, year, fertility_rate, fdata):
"""Fom population and fertility rate for a given year,
get the number of newborns for the NEXT year (fist January of year+1).
"""
def constrained_sigmoid(x0, k):
"""Get sigmoid function, which is a cumsum of a distribution."""
x = np.arange(17, 50+1)
y = 1 / (1 + np.exp(-k * (x - x0)))
y = y - y[0]
y = y / y[-1]
return y
pop = dfp[year]
mothers_potential = pop.loc[18:50].to_numpy() * fdata["assumed_women_ratio"]
x0 = np.polyval(fdata["sigmoid_x0"], year)
distrib = np.diff(constrained_sigmoid(x0, fdata["sigmoid_k"]))
newborns = sum(mothers_potential * distrib) * fertility_rate
return int(newborns)
fertility_rate = {}
m_coeff_scenario_1 = (2 - fdata["last_observed_value"]) / (end_year - fdata["last_observed_year"])
m_coeff_scenario_3 = (0 - fdata["last_observed_value"]) / (end_year - fdata["last_observed_year"])
for year in range(fdata["last_observed_year"], end_year+1):
fertility_rate[year] = {
f"↑ 2 in {end_year}": fdata["last_observed_value"] + m_coeff_scenario_1 * (year - fdata["last_observed_year"]), # going to 2
"eq. to last": fdata["last_observed_value"], # constant
f"↓ 0 in {end_year}": fdata["last_observed_value"] + m_coeff_scenario_3 * (year - fdata["last_observed_year"]), # going to 0
}
Projection of the resident population till 2100¶
In [6]:
dfpj_long_all = None
scenarios = [] # trying to sort them by descending end_year population
for migration_scenario in migration_coeff[end_year]:
for fertility_scenario in fertility_rate[end_year]:
scenario = f"migr {migration_scenario} & fert {fertility_scenario}"
scenarios.append(scenario)
print("Simulating:", scenario)
dfpj = dfp.copy()
for year in range(last_year+1, end_year+1):
dfpj.loc[0, year] = get_newborns_next_year(dfpj, year-1, fertility_rate[year-1][fertility_scenario], fdata)
dfpj.loc[1:, year] = dfpj.loc[0:99, year-1].values * ( 1 - dfdpj.loc[0:99, year-1]).values
dfpj.loc[100, year] += dfpj.loc[100, year-1] * (1-0.4)
# add migration balance
dfpj.loc[1:80, year] += dfmb.values * migration_coeff[year][migration_scenario]
dfpj_long = dfpj.reset_index().melt(id_vars="age").rename(columns={"variable": "year", "value": "population"})
if dfpj_long_all is None:
dfpj_long_all = deepcopy(dfpj_long).rename(columns={"population": scenario})
else:
dfpj_long_all[scenario] = dfpj_long["population"]
dfpj_long_all = dfpj_long_all.astype(int) # Integers are ok for years, ages, and population (no need for decimal precision!)
dfpj_long_all.to_csv("../data/pop_by_age_year_proj.csv", index=False)
display(dfpj_long_all)
In [7]:
fig = px.line(
dfpj_long_all,
x="age",
y=dfpj_long_all.columns,
animation_frame="year",
markers=False,
)
fig.update_layout(
xaxis_title="Age",
yaxis_title="Population by age group",
yaxis_range=[0, 1e6],
legend_title="Scenarios",
margin=dict(l=10, r=10, t=10, b=10),
width=780,
height=420,
)
fig.add_vrect(x0=17, x1=50, fillcolor="LightSalmon", opacity=0.5, layer="below", line_width=0)
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.write_html("../images_output/pop_by_age_proj.html", auto_play=False)
fig.show()
In [8]:
fig = dfpj_long_all.groupby("year").sum().drop(columns="age").plot()
fig.add_vrect(x0=start_year, x1=last_year, fillcolor="Green", opacity=0.1, layer="below", line_width=0)
fig.update_layout(
legend_title="Scenarios",
margin=dict(l=10, r=10, t=20, b=10),
xaxis_title=None,
yaxis_title="Total Population",
width=780,
height=320,
)
fig
In [9]:
# Get historical data back to 1960 from the World Bank
df_wb = wb.download(indicator='SP.POP.TOTL', country=['IT'], start=1900, end=2100) # https://data.worldbank.org/indicator/SP.POP.TOTL
ss_wb = (
df_wb
.reset_index()
.astype({"year": int})
.set_index("year")
.sort_index()
.rename(columns={"SP.POP.TOTL": "population"})
["population"]
)
ss_wb
Out[9]:
In [10]:
# Get forecast from ISTAT
from istatapi import discovery, retrieval
ds = discovery.DataSet(dataflow_identifier="DCIS_PREVDEM1")
ds.set_filters(
freq="A",
itter107="IT",
eta="TOTAL",
sesso="9",
tipo_inddem="JAN"
)
df3 = retrieval.get_data(ds)
#df3.loc[:, lambda dfx: (~dfx.isna()).any(axis=0)]
df3p = df3.pivot(index="TIME_PERIOD", columns="INT_PREV", values="OBS_VALUE")
display(df3p)
In [11]:
fig1 = deepcopy(fig)
fig1.add_trace(
go.Scatter(
x=ss_wb.index,
y=ss_wb.values,
mode="lines",
name="(historical data from World Bank)",
line=dict(color="gray", width=2, dash="dot")
)
)
fig1.add_trace(go.Scatter(
x=df3p.index.year,
y=df3p["PROJMED"],
mode="lines",
name="(ISTAT Projection ±80 percentile)",
line=dict(color="rgba(0,0,0,0.5)")
)).add_trace(go.Scatter(
x=df3p.index.year,
y=df3p["PROJLOW80"],
mode="lines",
showlegend=False,
fill=None,
line=dict(color="rgba(128,128,128,0.2)")
)).add_trace(go.Scatter(
x=df3p.index.year,
y=df3p["PROJUPP80"],
mode="lines",
showlegend=False,
fill="tonexty",
line=dict(color="rgba(128,128,128,0.2)"),
fillcolor="rgba(128,128,128,0.2)"
))
fig1.write_html("../images_output/pop_tot_proj.html")
fig1