China: From rural to urban

economy
python
Examining Social and Economic Inequalities Amid China’s Urbanization
Published

Jun 12, 2026

Keywords

urbanization

Summary

The charts illustrate the evolution of urbanization in China, the income inequality between rural and urban populations, and the scale of income distribution across cities.

Code
# Libraries
# ========================================================
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm

# Parameters
# ========================================================
code_iso = 'CHN'
name_iso = 'China'
max_urban = 0.82
year_pred = 1980 

# Data Extraction (urban and rural)
# ========================================================
# OWD Urban and Rural Population Data
df = pd.read_csv("https://ourworldindata.org/grapher/share-of-population-urban.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
df = df[df['Code'] == code_iso]
df = df[(df['Year'] >= 1960) & (df['Year'] <= 2023)]
df['Urban'] = df['sp_urb_totl_in_zs'] / 100
df = df[['Code', 'Year', 'Urban']]

# Variables
dfx = df[df['Year'] >= year_pred]
X = dfx[['Year']]
y = dfx['Urban']

# Transformar to polynomial
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)

# Training model
model = LinearRegression()
model.fit(X_poly, y)

# Future year range
future_years = pd.DataFrame({'Year': range(2024, 2051)})
future_X_poly = poly.transform(future_years)

# Prediction
future_preds = model.predict(future_X_poly)

# Limit prediction to max_urban
future_preds = np.clip(future_preds, None, max_urban)

# Dataframe prediction
future_df = pd.DataFrame({
    'Code': code_iso,
    'Year': future_years['Year'],
    'Urban': future_preds
})

# Concatenate future predictions with original dataframe
df = pd.concat([df, future_df], ignore_index=True)

# Add rural percentage
df['Rural'] = 1 - df['Urban']

# Data Extraction (population)
# ========================================================
# OWD Population Data
dfp = pd.read_csv("https://ourworldindata.org/grapher/population-long-run-with-projections.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
dfp = dfp[dfp['Code'] == code_iso]
dfp = dfp[dfp['Year'] >= 1960]
dfp['Total'] = dfp['population_projection'].fillna(0) + dfp['population_historical'].fillna(0)
dfp = dfp[['Code', 'Year', 'Total']]

# Merge urban and rural data with population data
df = df.merge(dfp, on=['Code', 'Year'], how='left')
df['Urban_Per'] = df['Urban']
df['Urban'] = df['Urban'] * df['Total']
df['Rural'] = df['Rural'] * df['Total']
df = df[['Code', 'Year', 'Urban', 'Rural', 'Urban_Per']]

print(df)

# Data Visualization
# ========================================================
# Seaborn figure style
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 8))

# Create a palette
palette1 = ["#D32F2F", "#FBC02D"]  # 1960–2023

# Separar periods (actual and forecast)
df1 = df[df['Year'] <= 2023].copy()
df2 = df[df['Year'] >= 2023].copy()

# Create stacked area plot
df1.set_index('Year')[['Urban', 'Rural']].plot(
    kind="area", stacked=True, color=palette1, ax=ax, linewidth=0
)
df2.set_index('Year')[['Urban', 'Rural']].plot(
    kind="area", stacked=True, color=palette1, ax=ax, linewidth=0, alpha=0.7, legend=False
)

# Title
fig.add_artist(plt.Line2D([0.073, 0.073], [0.90, 0.99], linewidth=6, color='#203764', solid_capstyle='butt'))
ax.text(0.02, 1.09, f'The Urbanization of {name_iso}', fontsize=16, fontweight='bold', ha='left', transform=plt.gca().transAxes)
ax.text(0.02, 1.06, f'A demographic shift from rural to urban centers', fontsize=11, color='#262626', ha='left', transform=plt.gca().transAxes)
ax.text(0.02, 1.03, f'(Includes projections through 2050)', fontsize=9, color='#262626', ha='left', transform=plt.gca().transAxes)

# Configuration
ax.set_xlim(1960, 2050)
ax.set_xlabel('')
ax.set_ylabel('Population', fontsize=10, fontweight='bold')
ax.grid(axis='x')
ax.grid(axis='y', linestyle='--', linewidth=0.5, color='lightgray')
ax.tick_params(axis='x', labelsize=9)
ax.tick_params(axis='y', labelsize=9) 
ax.yaxis.set_major_formatter(plt.FuncFormatter(
    lambda x, _: f'{x/1e9:.1f}B' if x >= 1e9 else f'{x/1e6:.0f}M'
))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

max_total = (df["Urban"] + df["Rural"]).max()

# Establecer el límite del eje y
ax.set_ylim(top=max_total)

# White vertical line at 2023
ax.axvline(x=2023, color='white', linestyle='--', linewidth=1)

# Add text labels for urban percentages
y_offset = df['Urban'].max() * 0.04
for i, year in enumerate(df['Year']):
    if year % 10 == 0 and year not in [1960, 2050] or year in [1962, 2048]:
        urban_value = df.loc[i, 'Urban']
        urban_per = df.loc[i, 'Urban_Per']
        ax.text(
            year,
            urban_value + y_offset,
            f'{urban_per:.0%}',
            ha='center',
            va='bottom',
            fontsize=8,
            color='black',
            weight='bold',
            bbox=dict(facecolor='white', alpha=0.4, edgecolor='none', boxstyle='round,pad=0.3')
        )

# Legend configuration
handles, labels = ax.get_legend_handles_labels()
title_font = fm.FontProperties(weight='bold', size=11)
ax.legend(
    handles[:2], labels[:2],
    title="Zone",
    title_fontproperties=title_font,
    fontsize=10,
    loc='upper left'
)

# Actual text
y_max = ax.get_ylim()[1]
plt.text(
    x=(2023-1960)/2 + 1960,
    y=y_max*0.035,
    s="1960-2023",
    fontsize=9, 
    color='white',
    ha='center',
    va='bottom'
)
plt.text(
    x=(2023-1960)/2 + 1960,
    y=y_max*0.01,
    s="Actual",
    fontsize=9, 
    fontweight='bold',
    color='white',
    ha='center',
    va='bottom'
)

# Forecast text
plt.text(
    x=(2050-2023)/2 + 2023,
    y=y_max*0.035,
    s="2024-2050",
    fontsize=9, 
    color='white',
    ha='center',
    va='bottom'
)
plt.text(
    x=(2050-2023)/2 + 2023,
    y=y_max*0.01,
    s="Forecast",
    fontsize=9, 
    fontweight='bold',
    color='white',
    ha='center',
    va='bottom'
)

# Add Data Source
plt.text(0, -0.1, 'Data Source:', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    fontweight='bold',
    color='gray')
space = " " * 23
plt.text(0, -0.1, space + 'World Bank based on data from the UN Population Division (2025)', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    color='gray')

# Add Notes
plt.text(0, -0.12, 'Forecast:', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    fontweight='bold',
    color='gray')
space = " " * 17
plt.text(0, -0.12, space + 'Urban and rural population percentages were estimated using a polynomial linear regression model.', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    color='gray')

# Adjust layout
plt.tight_layout()

# Save it...
download_folder = os.path.join(os.path.expanduser("~"), "Downloads")
filename = os.path.join(download_folder, f"FIG_OWD_Population_Rural_Urban_{code_iso}.png")
plt.savefig(filename, dpi=300, bbox_inches='tight')

# Show :)
plt.show()

Code
# Libraries
# =====================================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os

# Data (China) 
# =====================================================================
# Read wikipedia data
url = "https://en.wikipedia.org/wiki/List_of_prefecture-level_divisions_of_China_by_GDP"
tables = pd.read_html(url)
df = tables[0]
df.columns = ['region', '1', '2', '3', 'gdp', '4', 'gdpc', '5']
df['population'] = df['gdp'] / df['gdpc'] * 1000
df = df[['region', 'gdpc', 'population']]
df['region'] = df['region'].str.replace('*', '', regex=False)

data = pd.DataFrame({
    'region': ['Beijing', 'Shangai', 'Chongqing', 'Tianjin', 'Hong Kong', 'Macao'],
    'gdpc': [28294, 26747, 12350, 17727, 48800, 36909],
    'population': [21.8, 24.7, 32.0, 13.9, 7.5, 0.68],
})

df = pd.concat([df, data], ignore_index=True)

# Data Manipulation
# =====================================================================
# Order dataframe
df = df.sort_values(by=['gdpc'])

# Calculate 'left accrual widths'
df['population_cum'] = df['population'].cumsum()
df['left'] = df['population'].cumsum() - df['population']

# Pondered Gini Function
def gini(x, weights=None):
    if weights is None:
        weights = np.ones_like(x)
    count = np.multiply.outer(weights, weights)
    mad = np.abs(np.subtract.outer(x, x) * count).sum() / count.sum()
    rmad = mad / np.average(x, weights=weights)
    return 0.5 * rmad

# Calculate gini and median
gini_index = gini(df['gdpc'].values, df['population'].values)

# Calculate weighted median
df.sort_values('gdpc', inplace=True)
cumsum = df['population'].cumsum()
cutoff = df['population'].sum() / 2.0
median = df.loc[cumsum >= cutoff, 'gdpc'].iloc[0]

# Show dataframe, gini and median
print(df)
print(gini_index)
print(median)

# Data Visualization
# =====================================================================
# Seaborn figure style
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 8))

# Create a palette
norm = plt.Normalize(df["gdpc"].min(), 25000)
colors = plt.cm.coolwarm_r(norm(df["gdpc"]))

# Create a Matplotlib plot
bars = plt.bar(df['left'], df['gdpc'], width=df['population'], 
        color=colors, alpha=1, align='edge', edgecolor='grey', linewidth=0.1)

# Title
fig.add_artist(plt.Line2D([0.08, 0.08], [0.90, 0.99], linewidth=6, color='#203764', solid_capstyle='butt'))
ax.text(0.02, 1.09, f'Regional GDP Distribution of China', fontsize=16, fontweight='bold', ha='left', transform=plt.gca().transAxes)
ax.text(0.02, 1.06, f'From rural to urban, the role of location in income inequality', fontsize=11, color='#262626', ha='left', transform=plt.gca().transAxes)
ax.text(0.02, 1.03, f'(GDP per capita in $US)', fontsize=9, color='#262626', ha='left', transform=plt.gca().transAxes)

# Configuration grid and labels
ax.set_xlim(0, df['population_cum'].max()) 
ax.set_ylim(0, df['gdpc'].max() * 1.093)
ax.set_xlabel('Cumulative Population (M)', fontsize=10, fontweight='bold')
ax.set_ylabel('GDP per capita ($USD)', fontsize=10, fontweight='bold')
ax.grid(axis='x')
ax.grid(axis='y', linestyle='--', linewidth=0.5, color='lightgray')
ax.tick_params(axis='x', labelsize=9)
ax.tick_params(axis='y', labelsize=9) 
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.axhline(y=median, color='red', linestyle='--', linewidth=0.5, zorder=0, alpha=0.4)
ax.text(75, median + 100, f"Median: {median:,.0f}$", color='darkred', fontweight='bold', fontsize=9, ha='center', va='bottom', zorder=2)

# Add text each region except Ávila and Segovia
for i, bar in enumerate(bars):
    region_name = df['region'].iloc[i]
    
    top_cities = [
        'Beijing', 'Shangai', 'Chongqing', 'Tianjin', 'Hong Kong',
        'Ordos', 'Suzhou, Jiangsu', 'Zhenjiang', 'Jieyang', 'Kashgar', 'Shangrao', 'Qujing',
        'Shenzhen', 'Guangzhou', 'Suzhou', 'Chengdu', 'Wuhan', 'Hangzhou', 'Nanjing',
        'Ningbo', 'Qingdao', 'Wuxi', 'Changsha', 'Zhengzhou', 'Fuzhou', 'Quanzhou',
        'Jinan', 'Dongguan', 'Foshan', "Xi'an", 'Dalian', 'Wenzhou', 'Shenyang',
        'Kunming', 'Baoding', 'Shijiazhuang', 'Linyi', 'Harbin', 'Nanyang',
        'Weifang', 'Handan', 'Changchun', 'Xuzhou', 'Ganzhou', 'Zhoukou', 'Nanning',
        'Heze', 'Fujian', 'Jining', 'Shaoyang', 'Hefei', 'Nantong', 'Shangqiu',
        'Tangshan', 'Hengyang', 'Cangzhou', 'Jinhua', 'Luoyang', 'Xingtai',
        'Zhanjiang', 'Zhumadian', 'Bijie'
    ]
    
    # Add labels
    if region_name in top_cities:
        x = bar.get_x() + bar.get_width() / 2
        y = bar.get_height()
        
        # Special position
        if region_name in ["Ordos", "Jinan", "Foshan", "Qingdao"]:
            x -= 5
            y += 1000
        else:
            y += 1000

        ax.text(
            x, y,
            region_name,
            ha='center', va='bottom', color='#363636', fontsize=7, rotation=90,
        )

# Add Year label 
ax.text(1, 1.12, f'2022',
             transform=plt.gca().transAxes,
             fontsize=22, ha='right', va='top',
             fontweight='bold', color='#D3D3D3')
    
# Add Data Source
ax.text(0, -0.1, 'Data Source: National Bureau of Statistics of China', 
            transform=plt.gca().transAxes, 
            fontsize=8, 
            color='gray')

# Show GINI Index
ax.text(
    0.09, 0.97, f"Gini Index: {gini_index:.2f}", 
    transform=ax.transAxes,
    fontsize=8.5,
    color='black',
    ha='right',
    va='top', 
    bbox=dict(boxstyle="round,pad=0.3", edgecolor='gray', facecolor='white')
)

# Add Gini Index
ax.text(0, -0.12, 'Notes: The Gini coefficient has been calculated using population weights for each region.', 
            transform=plt.gca().transAxes, 
            fontsize=8, 
            color='gray')

# Add label "poorest" and "richest"
ax.text(0, -0.065, 'Low Income',
             transform=ax.transAxes,
             fontsize=11, fontweight='bold', color='darkred', ha='left', va='center')
ax.text(0.915, -0.065, 'High Income',
             transform=ax.transAxes,
             fontsize=11, fontweight='bold', color='darkblue', va='center')

# Adjust layout
plt.tight_layout()

# Save it...
download_folder = os.path.join(os.path.expanduser("~"), "Downloads")
filename = os.path.join(download_folder, f"FIG_BUREAU_Region_Distribution_China_Prefecture.png")
plt.savefig(filename, dpi=300, bbox_inches='tight')

# Show :)
plt.show()

Code
# Libraries
# =====================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Get Data (NBS and WID)
# =====================================================
data = {
    "year": list(range(1980, 2024)),
    "urban_population": [
        19140, 20171, 21480, 22274, 24017, 25094, 26366, 27674, 28661, 29540, 30195, 31203, 32175,
        33173, 34169, 35174, 37304, 39449, 41608, 43748, 45906, 48064, 50212, 52376, 54283, 56212,
        58288, 60633, 62403, 64512, 66978, 69927, 72175, 74502, 76738, 79302, 81924, 84343, 86433,
        88426, 90220, 91425, 92071, 93267
    ],
    "rural_population": [
        79565, 79901, 80174, 80734, 80340, 80757, 81141, 81626, 82365, 83164, 84138, 84620, 84996,
        85344, 85681, 85947, 85085, 84177, 83153, 82038, 80837, 79563, 78241, 76851, 75705, 74544,
        73160, 71496, 70399, 68938, 67113, 64989, 63747, 62224, 60908, 59024, 57308, 55668, 54108,
        52582, 50992, 49835, 49104, 47700
    ],
    "urban_consumption": [
        490, 517, 504, 547, 621, 750, 847, 953, 1200, 1345, 1404, 1623, 2017, 2676, 3671, 4810,
        5437, 5705, 5977, 6429, 7083, 7409, 7826, 8166, 8942, 9900, 10820, 12582, 14147, 15161,
        17119, 19853, 21563, 23386, 25264, 27039, 29324, 31454, 33700, 35841, 34823, 39205, 40066,
        43797
    ],
    "rural_consumption": [
        178, 202, 227, 252, 280, 346, 385, 427, 506, 588, 627, 661, 701, 822, 1073, 1344, 1655, 1768,
        1778, 1793, 1917, 2032, 2157, 2292, 2521, 2784, 3066, 3538, 3981, 4295, 4782, 5880, 6573,
        7397, 8365, 9409, 10609, 12145, 13985, 15460, 16209, 18720, 19929, 21953
    ],
    "gini_pre": [
        0.38, 0.39, 0.39, 0.39, 0.4, 0.4, 0.42, 0.42, 0.43, 0.44, 0.43, 0.45, 0.46, 0.48, 0.48,
        0.48, 0.47, 0.47, 0.47, 0.48, 0.5, 0.51, 0.53, 0.54, 0.55, 0.56, 0.56, 0.56, 0.56, 0.56,
        0.57, 0.56, 0.55, 0.56, 0.55, 0.56, 0.55, 0.56, 0.56, 0.56, 0.56, 0.57, 0.57, 0.57
    ],
    "gini_pos": [
        0.37678, 0.38164, 0.38878, 0.38653, 0.39165, 0.39926, 0.41379, 0.41783, 0.42156, 0.43046,
        0.4264, 0.44225, 0.45711, 0.47219, 0.47823, 0.47116, 0.46628, 0.46659, 0.46798, 0.4751,
        0.49029, 0.49467, 0.52002, 0.52932, 0.53136, 0.54142, 0.53733, 0.53991, 0.53665, 0.53439,
        0.53852, 0.53215, 0.51823, 0.5256, 0.51737, 0.51879, 0.51648, 0.52329, 0.51837, 0.5174,
        0.52289, 0.52394, 0.52394, 0.52394
    ],
   "gini_urb": [
        0.2415, 0.243, 0.2424, 0.2463, 0.2606, 0.2886, 0.2711, 0.2576, 0.267, 0.2738,
        0.2709, 0.2643, 0.2848, 0.3021, 0.3103, 0.3074, 0.3151, 0.3239, 0.3316, 0.3372,
        0.3456, 0.3589, 0.4127, 0.4245, 0.4383, 0.4464, 0.4492, 0.452, 0.456, 0.4516,
        0.4569, 0.4802, 0.4412, 0.474, 0.4466, 0.4474,
        None, None, None, None, None, None, None, None
    ],
    "gini_rur": [
        0.3329, 0.3422, 0.349, 0.3542, 0.3584, 0.3619, 0.3707, 0.3774, 0.3827, 0.387,
        0.3907, 0.4043, 0.4128, 0.4201, 0.4262, 0.4275, 0.43, 0.4315, 0.4267, 0.4311,
        0.4531, 0.4608, 0.4665, 0.4677, 0.4601, 0.4886, 0.483, 0.4921, 0.4903, 0.4963,
        0.5239, 0.5292, 0.5259, 0.524, 0.5232, 0.524, 
        None, None, None, None, None, None, None, None
    ]
}

df = pd.DataFrame(data)
df['var_consumption'] = df['urban_consumption'] / df['rural_consumption']
df['var_population'] = df['urban_population'] / df['rural_population']
df['gini'] = df['gini_pos']
df = df[['year', 'gini', 'var_consumption', 'var_population', 'gini_urb', 'gini_rur']]

# Data Visualization
# =====================================================
# Font and style
plt.rcParams.update({'font.family': 'sans-serif', 'font.sans-serif': ['Franklin Gothic'], 'font.size': 9})
sns.set(style="white", palette="muted")

# Create figure and axis
fig, ax1 = plt.subplots(figsize=(8, 6))

# Axis 1 DISPARITY
ax1.set_ylabel('Gini coefficient', fontsize=10)
line1, = ax1.plot(df['year'], df['gini'], color='#C00000', linewidth=2)
ax1.tick_params(axis='y')
ax1.set_xlim(1980, 2024)
ax1.set_ylim(0, 0.6)
ax1.tick_params(axis='x', labelsize=9)
ax1.tick_params(axis='y', labelsize=8)

# Axis 2 GINI
ax2 = ax1.twinx()
ax2.set_ylabel('Urban-rural ratio', fontsize=10)
line2, = ax2.plot(df['year'], df['var_consumption'], color='#215C98', linewidth=2)
line3, = ax2.plot(df['year'], df['var_population'], color='#282828', linewidth=1, linestyle=":")
ax2.tick_params(axis='y')
ax2.set_ylim(0, 4)
ax2.tick_params(axis='y', labelsize=8)

# Title and grid
plt.text(0.02, 1.13, f'Inequality Trends in China', fontsize=16, fontweight='bold', ha='left', transform=plt.gca().transAxes)
plt.text(0.02, 1.08, f'Urban-Rural Consumption Ratio and Gini Coefficient since 1980', fontsize=11, color="#3A3A3A", ha='left', transform=plt.gca().transAxes)
ax1.grid(axis='y', linestyle='-', alpha=0.5)

# Remove spines
for ax in (ax1, ax2):
    for spine_name, spine in ax.spines.items():
        if spine_name == 'bottom':
            spine.set_visible(True)
            spine.set_linewidth(0.5)
        else:
            spine.set_visible(False)

# Legend at bottom center
plt.plot([], [], color='#C00000', label='Gini coefficient')
plt.plot([], [], color='#215C98', label='Consumption ratio')
plt.plot([], [], color='#282828', label='Population ratio', linestyle=':')
plt.legend(
    loc='lower center',
    bbox_to_anchor=(0.5, -0.15),
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1,
    handleheight=1,
    borderpad=0.2,
    columnspacing=0.5
)

# Add Data Source
plt.text(0, -0.18, 'Data Source:', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    fontweight='bold',
    color='gray')
space = " " * 23
plt.text(0, -0.18, space + 'National Bureau of Statistics of China (NBS), World Inequality Database (WID)', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    color='gray')

# Add Notes
plt.text(0, -0.21, 'Ratio:', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    fontweight='bold',
    color='gray')
space = " " * 11
plt.text(0, -0.21, space + 'Urban-Rural Ratio measures the relative size between urban and rural for population and consumption', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    color='gray')

# Add Notes
plt.text(0, -0.24, 'Gini:', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    fontweight='bold',
    color='gray')
space = " " * 9
plt.text(0, -0.24, space + 'Gini coefficient is calculated using post-tax national income to measure income inequality', 
    transform=plt.gca().transAxes, 
    fontsize=8,
    color='gray')

# Adjust
plt.tight_layout()

# Save it...
download_folder = os.path.join(os.path.expanduser("~"), "Downloads")
filename = os.path.join(download_folder, f"FIG_NBS_Inequality_China.png")
plt.savefig(filename, dpi=300, bbox_inches='tight')

# Show it :)
plt.show()

Back to top