import pandas as pd # Pandas Library used to Crate Dataframes
import numpy as np # Numpy Library used for arrays and other math operations
import matplotlib.pyplot as plt # Matplotlib Library used for plotting
import plotly.express as px # Plotly library used to create interactive plots

# Apply the ggplot theme to every matplotlib figure created below
plt.style.use('ggplot')


# Load the World Bank poverty CSV; header=2 takes the column names from the third line
poverty_df = pd.read_csv("data/Poverty/Poverty_Data.csv", sep=',', header=2)
# Replace spaces with underscores so columns can be read as attributes
poverty_df.columns = poverty_df.columns.map(lambda name: name.replace(' ', '_'))


# Preview the first five rows of the poverty frame
poverty_df.head()


# Poverty indicators kept for this analysis
poverty_indicators = [
    "Multidimensional poverty headcount ratio (% of total population)",
    "Poverty gap at $3.20 a day (2011 PPP) (%)",
    "Population living in slums (% of urban population)",
    "Proportion of people living below 50 percent of median income (%)",
]
boolean_series = poverty_df['Indicator_Name'].isin(poverty_indicators)

# Keep only the wanted indicators and the 2015-2020 columns, renumbering the rows
filtered_df = poverty_df.loc[
    boolean_series,
    ['Country_Name', 'Indicator_Name', '2015', '2016', '2017', '2018', '2019', '2020'],
].reset_index(drop=True)
# Drop rows with no value in any of the six years.
# NOTE: this reset_index() keeps the previous row numbers as an `index` column.
filtered_df = filtered_df.dropna(
    how='all',
    subset=['2015', '2016', '2017', '2018', '2019', '2020'],
).reset_index()
filtered_df.head()


# Load the World Bank education CSV; header=2 takes the column names from the third line
edu_df = pd.read_csv("data/Education/Education_Data.csv", sep=',', header=2)
# Replace spaces with underscores so columns can be read as attributes
edu_df.columns = edu_df.columns.map(lambda name: name.replace(' ', '_'))


# Preview the first five rows of the education frame
edu_df.head()


# Education indicators kept for this analysis
edu_indicators = [
    "Literacy rate, adult total (% of people ages 15 and above)",
    "Educational attainment, at least completed short-cycle tertiary, population 25+, total (%) (cumulative)",
    "Unemployment, total (% of total labor force) (modeled ILO estimate)",
    "Compulsory education, duration (years)",
]
boolean_series = edu_df['Indicator_Name'].isin(edu_indicators)

# Keep only the wanted indicators and the 2015-2020 columns, renumbering the rows
edu_filter_df = edu_df.loc[
    boolean_series,
    ['Country_Name', 'Indicator_Name', '2015', '2016', '2017', '2018', '2019', '2020'],
].reset_index(drop=True)
# Drop rows with no value in any of the six years.
# NOTE: this reset_index() keeps the previous row numbers as an `index` column.
edu_filter_df = edu_filter_df.dropna(
    how='all',
    subset=['2015', '2016', '2017', '2018', '2019', '2020'],
).reset_index()


# Literacy-rate rows only, taken from the filtered education frame
edu_litearcy = edu_filter_df.loc[
    edu_filter_df['Indicator_Name'].eq("Literacy rate, adult total (% of people ages 15 and above)")
]
edu_litearcy.head()


# Multidimensional-poverty rows only, taken from the filtered poverty frame
poverty_headcount = filtered_df.loc[
    filtered_df['Indicator_Name'].eq("Multidimensional poverty headcount ratio (% of total population)")
]
poverty_headcount.head()


def count_plot(x, years=('2015', '2016', '2017', '2018', '2019', '2020'), min_count=3):
    """Return True when a row has enough yearly observations to be worth plotting.

    Args:
        x: A row indexable by year label, e.g. the Series that
            ``DataFrame.apply(count_plot, axis=1)`` passes in, or a dict.
        years: Year labels to inspect. Defaults to 2015-2020.
        min_count: Minimum number of non-missing yearly values required.
            Defaults to 3.

    Returns:
        bool: True if at least ``min_count`` of ``years`` hold a non-missing
        value in ``x``, False otherwise.
    """
    # pd.isna treats NaN and None alike as missing and does not raise on
    # non-float cells, which np.isnan would
    observed = sum(1 for year in years if not pd.isna(x[year]))
    return observed >= min_count
    
    
# edu_litearcy was produced by boolean filtering of another frame, so adding a
# column to it directly can trigger pandas' SettingWithCopyWarning.
# Work on an explicit, independent copy instead.
edu_litearcy = edu_litearcy.copy()

# Flag each row according to count_plot (True when the country has enough yearly data)
edu_litearcy["plot?"] = edu_litearcy.apply(count_plot, axis=1)

# Keep only the flagged countries; the flag column is already boolean,
# so it is used as the mask directly
edu_litearcy = edu_litearcy[edu_litearcy["plot?"]]


edu_litearcy.head()


# Draw one literacy-rate trend panel per country, two panels per row.
# The grid is sized from the data: a fixed 32x2 grid would silently drop any
# country past the 64th and leave blank panels when there are fewer.
year_cols = ['2015', '2016', '2017', '2018', '2019', '2020']
n_cols = 2
n_countries = len(edu_litearcy)
n_rows = max(1, -(-n_countries // n_cols))  # ceiling division, at least one row

# 12.5 inches of height per row keeps the proportions of the former 32-row, 400-inch figure.
# squeeze=False guarantees a 2-D axes array even when there is a single row.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(30, 12.5 * n_rows), squeeze=False)

# One line plot per country showing its literacy rate over the six years
for ax, (idx, row) in zip(axs.flat, edu_litearcy.iterrows()):
    # iterrows yields object-dtype rows (text and numbers mixed); cast the year cells to float for plotting
    row[year_cols].astype(float).plot(ax=ax, color='C0', marker='o', linewidth=4)
    ax.set_title(row['Country_Name'])
    ax.set_xlabel("Year")
    ax.set_ylabel("Literacy Rate (%)")

# Hide panels that received no country (odd country count, or no countries at all)
for unused_ax in axs.ravel()[n_countries:]:
    unused_ax.set_visible(False)


# Reshape the wide literacy table into long form for an interactive chart:
# one [Year, Literacy Rate, Country] record per country and year, countries in
# frame order and years ascending within each country
data = [
    [year, country_row[str(year)], country_row.Country_Name]
    for _, country_row in edu_litearcy.iterrows()
    for year in range(2015, 2021)
]

# Long-format literacy frame
my_lit = pd.DataFrame(data, columns=['Year', 'Literacy Rate', 'Country'])

# Interactive line chart with one trace per country
fig = px.line(my_lit, x="Year", y="Literacy Rate", color='Country')
fig.show()


# Reshape the wide poverty-headcount table into long form for an interactive chart:
# one [Year, Poverty Headcount, Country] record per country and year, countries in
# frame order and years ascending within each country
data = [
    [year, country_row[str(year)], country_row.Country_Name]
    for _, country_row in poverty_headcount.iterrows()
    for year in range(2015, 2021)
]

# Long-format poverty frame.
# NOTE: the variable is still called my_lit although it now holds poverty data.
my_lit = pd.DataFrame(data, columns=['Year', 'Poverty Headcount', 'Country'])

# Interactive line chart with one trace per country
fig = px.line(my_lit, x="Year", y="Poverty Headcount", color='Country')
fig.show()


# Average literacy rate across the plotted countries for each year, 2015-2020.
# A single column-wise mean replaces six copy-pasted per-year calls;
# missing values are skipped by default, as they are by Series.mean.
year_cols = ['2015', '2016', '2017', '2018', '2019', '2020']
yearly_means = edu_litearcy[year_cols].mean()

# Year / average pairs in chronological order (year labels stay strings)
data = [[year, yearly_means[year]] for year in year_cols]

# Frame of the average literacy rate per year
avg_lit = pd.DataFrame(data, columns=['Year', 'Literacy Rate'])

# Plot the average literacy rate over time
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(avg_lit['Year'], avg_lit['Literacy Rate'], linewidth=2)
ax.set_title("Average Literacy Rate over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Literacy Rate (%)")
plt.show()


# Mean multidimensional poverty headcount for each year, 2015-2020
mean_2015, mean_2016, mean_2017, mean_2018, mean_2019, mean_2020 = (
    poverty_headcount[year].mean()
    for year in ('2015', '2016', '2017', '2018', '2019', '2020')
)

# Year / average pairs in chronological order
data2 = [
    ['2015', mean_2015],
    ['2016', mean_2016],
    ['2017', mean_2017],
    ['2018', mean_2018],
    ['2019', mean_2019],
    ['2020', mean_2020],
]

# Frame of the average poverty headcount per year
avg_poverty = pd.DataFrame(data2, columns=['Year', 'Population in Poverty Headcount'])

# Plot the average population in poverty over time
fix, ax = plt.subplots(figsize=(10, 5))
ax.plot(avg_poverty['Year'], avg_poverty['Population in Poverty Headcount'])
ax.set(
    title="Average Population in Poverty over Time",
    xlabel="Year",
    ylabel="Population in Poverty (%)",
)
plt.show()


# Join literacy and multidimensional-poverty rows on country name.
# The default inner join applies, and clashing column names receive _x / _y suffixes.
lit_headcount = pd.merge(edu_litearcy, poverty_headcount, on='Country_Name')
lit_headcount.head()


# Average poverty (left panel) next to average literacy (right panel)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(40, 15))
poverty_axis, literacy_axis = ax

# Left panel: average population in poverty
avg_poverty.plot(x='Year', y='Population in Poverty Headcount', ax=poverty_axis, color='blue')
poverty_axis.set_title("Average Population in Poverty over Time")
poverty_axis.set_ylabel("Population in Poverty (%)")

# Right panel: average literacy rate
avg_lit.plot(x='Year', y='Literacy Rate', ax=literacy_axis)
literacy_axis.set_title("Average Literacy Rate over Time")
literacy_axis.set_ylabel("Literacy Rate (%)")

plt.show()


# El Salvador rows of the poverty-headcount frame
el_sal1 = poverty_headcount.loc[poverty_headcount['Country_Name'].eq("El Salvador")]
# One [year, value, country] record per year; the value is the first distinct entry of that year's column
data1 = [
    [year, el_sal1[year].unique()[0], "El Salvador"]
    for year in ('2015', '2016', '2017', '2018', '2019', '2020')
]

# El Salvador population-in-poverty frame
el_sal_pov = pd.DataFrame(data1, columns=['Year', 'Population in Poverty Headcount', 'Country'])

# El Salvador rows of the literacy frame
el_sal = edu_litearcy.loc[edu_litearcy['Country_Name'].eq("El Salvador")]
# Same record layout for the literacy values
data2 = [
    [year, el_sal[year].unique()[0], "El Salvador"]
    for year in ('2015', '2016', '2017', '2018', '2019', '2020')
]


# El Salvador literacy-rate frame
el_sal_lit = pd.DataFrame(data2, columns=['Year', 'Literacy Rate', 'Country'])


# El Salvador poverty (left panel) next to literacy (right panel)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(40, 15))
poverty_axis, literacy_axis = ax

el_sal_pov.plot(x='Year', y='Population in Poverty Headcount', ax=poverty_axis, color='blue')
poverty_axis.set_title("El Salvador Population in Poverty over Time")
poverty_axis.set_ylabel("Population in Poverty (%)")

el_sal_lit.plot(x='Year', y='Literacy Rate', ax=literacy_axis)
literacy_axis.set_title("El Salvador Literacy Rate over Time")
literacy_axis.set_ylabel("Literacy Rate (%)")

plt.show()


# Spain rows of the poverty-headcount frame
Spain1 = poverty_headcount[poverty_headcount['Country_Name'] == "Spain"]

# Spain rows of the literacy frame
Spain2 = edu_litearcy[edu_litearcy['Country_Name'] == "Spain"]

# Year labels are strings in BOTH Spain frames. The literacy frame previously
# used integer years, unlike the poverty frame and the El Salvador frames, so
# concatenating it with string-year data produced a mixed-type Year column.
year_cols = ['2015', '2016', '2017', '2018', '2019', '2020']

# One [year, value, country] record per year; the value is the first distinct entry of that year's column
data1 = [[year, Spain1[year].unique()[0], "Spain"] for year in year_cols]

# Spain population-in-poverty frame
Spain_pov = pd.DataFrame(data1, columns=['Year', 'Population in Poverty Headcount', 'Country'])


# Same record layout for the literacy values
data2 = [[year, Spain2[year].unique()[0], "Spain"] for year in year_cols]


# Spain literacy-rate frame
Spain_lit = pd.DataFrame(data2, columns=['Year', 'Literacy Rate', 'Country'])


# Spain poverty (left panel) next to literacy (right panel)
fig, ax = plt.subplots(1, 2, figsize=(40, 15))
Spain_pov.plot(x='Year', y='Population in Poverty Headcount', ax=ax[0], marker='x', color='blue')
Spain_lit.plot(x='Year', y='Literacy Rate', ax=ax[1], marker='o')
ax[0].set_title("Spain Population in Poverty over Time")
ax[1].set_title("Spain Literacy Rate over Time")
ax[0].set_ylabel("Population in Poverty (%)")
ax[1].set_ylabel("Literacy Rate (%)")
plt.show()


# Stack the Spain and El Salvador literacy frames (Spain first) under a fresh 0..n-1 index
literacy_frames = [Spain_lit, el_sal_lit]
df = pd.concat(literacy_frames, ignore_index=True)

# Interactive line chart with one trace per country
fig = px.line(data_frame=df, x="Year", y="Literacy Rate", color='Country')
fig.show()


# Stack the Spain and El Salvador poverty-headcount frames (Spain first) under a fresh 0..n-1 index
poverty_frames = [Spain_pov, el_sal_pov]
df = pd.concat(poverty_frames, ignore_index=True)

# Interactive line chart with one trace per country
fig = px.line(data_frame=df, x="Year", y="Population in Poverty Headcount", color='Country')
fig.show()

	Country_Name	Country_Code	Indicator_Name	Indicator_Code	1960	1961	1962	1963	1964	1965	...	2012	2013	2014	2015	2016	2017	2018	2019	2020	Unnamed:_65
0	Aruba	ABW	Annualized average growth rate in per capita r...	SI.SPR.PCAP.ZG	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Aruba	ABW	Survey mean consumption or income per capita, ...	SI.SPR.PCAP	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	Aruba	ABW	Annualized average growth rate in per capita r...	SI.SPR.PC40.ZG	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Aruba	ABW	Survey mean consumption or income per capita, ...	SI.SPR.PC40	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Aruba	ABW	Poverty gap at $5.50 a day (2011 PPP) (%)	SI.POV.UMIC.GP	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	index	Country_Name	Indicator_Name	2015	2016	2017	2018	2019	2020
0	7	Africa Eastern and Southern	Population living in slums (% of urban populat...	NaN	57.196225	NaN	55.770973	NaN	NaN
1	8	Afghanistan	Multidimensional poverty headcount ratio (% of...	NaN	51.700000	NaN	NaN	NaN	49.4
2	11	Afghanistan	Population living in slums (% of urban populat...	NaN	71.300003	NaN	70.699997	NaN	NaN
3	15	Africa Western and Central	Population living in slums (% of urban populat...	NaN	51.434008	NaN	51.168132	NaN	NaN
4	16	Angola	Multidimensional poverty headcount ratio (% of...	54.0	NaN	NaN	NaN	NaN	NaN

	Country_Name	Country_Code	Indicator_Name	Indicator_Code	1960	1961	1962	1963	1964	1965	...	2012	2013	2014	2015	2016	2017	2018	2019	2020	Unnamed:_65
0	Aruba	ABW	Population ages 15-64 (% of total population)	SP.POP.1564.TO.ZS	53.669919	54.056784	54.383281	54.710292	55.119933	55.631102	...	68.986934	69.108851	69.181105	69.159774	69.137615	68.946339	68.646606	68.321199	68.012289	NaN
1	Aruba	ABW	Population ages 0-14 (% of total population)	SP.POP.0014.TO.ZS	43.847190	43.358346	42.925745	42.488756	41.950133	41.290098	...	20.092629	19.593055	19.111724	18.703098	18.257495	17.980183	17.802447	17.620445	17.381938	NaN
2	Aruba	ABW	Unemployment, total (% of total labor force) (...	SL.UEM.TOTL.ZS	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Aruba	ABW	Unemployment, male (% of male labor force) (mo...	SL.UEM.TOTL.MA.ZS	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Aruba	ABW	Unemployment, female (% of female labor force)...	SL.UEM.TOTL.FE.ZS	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	index	Country_Name	Indicator_Name	2015	2016	2017	2018	2019	2020
1	3	Aruba	Literacy rate, adult total (% of people ages 1...	NaN	NaN	NaN	97.807419	NaN	NaN
12	23	Albania	Literacy rate, adult total (% of people ages 1...	NaN	NaN	NaN	98.141151	NaN	NaN
17	31	Arab World	Literacy rate, adult total (% of people ages 1...	73.499390	74.840797	75.766823	72.869087	73.114250	73.367767
21	35	United Arab Emirates	Literacy rate, adult total (% of people ages 1...	NaN	NaN	NaN	NaN	97.556923	NaN
24	39	Argentina	Literacy rate, adult total (% of people ages 1...	99.179962	99.125008	NaN	99.003868	NaN	NaN

	index	Country_Name	Indicator_Name	2015	2016	2017	2018	2019	2020
1	8	Afghanistan	Multidimensional poverty headcount ratio (% of...	NaN	51.7	NaN	NaN	NaN	49.4
4	16	Angola	Multidimensional poverty headcount ratio (% of...	54.0	NaN	NaN	NaN	NaN	NaN
16	40	Armenia	Multidimensional poverty headcount ratio (% of...	29.1	27.8	26.0	23.6	NaN	NaN
20	56	Austria	Multidimensional poverty headcount ratio (% of...	18.3	18.0	18.1	17.5	16.9	NaN
24	68	Belgium	Multidimensional poverty headcount ratio (% of...	21.1	20.9	20.6	20.0	19.5	NaN

World Bank Global Poverty and Education Tutorial

Nandhini Krishnan

1. Introduction

2. Get the Data

3. Visualizations

4. Conclusion

5. Resources

	index_x	Country_Name	Indicator_Name_x	2015_x	2016_x	2017_x	2018_x	2019_x	2020_x	plot?	index_y	Indicator_Name_y	2015_y	2016_y	2017_y	2018_y	2019_y	2020_y
0	183	Colombia	Literacy rate, adult total (% of people ages 1...	94.245049	94.653847	NaN	95.092506	95.249268	95.636330	True	180	Multidimensional poverty headcount ratio (% of...	20.20	17.80	NaN	19.60	NaN	NaN
1	267	Ecuador	Literacy rate, adult total (% of people ages 1...	94.455658	94.350227	92.829788	NaN	NaN	93.626099	True	264	Multidimensional poverty headcount ratio (% of...	34.95	35.10	34.59	37.86	38.13	NaN
2	283	Spain	Literacy rate, adult total (% of people ages 1...	98.143257	98.250511	NaN	98.436501	NaN	98.594460	True	280	Multidimensional poverty headcount ratio (% of...	28.60	27.90	26.60	26.10	25.30	NaN
3	555	Sri Lanka	Literacy rate, adult total (% of people ages 1...	NaN	92.386902	91.895752	91.709824	92.252960	NaN	True	552	Multidimensional poverty headcount ratio (% of...	NaN	2.37	NaN	NaN	NaN	NaN
4	619	Mexico	Literacy rate, adult total (% of people ages 1...	94.472282	94.859619	94.972778	95.379913	NaN	95.247849	True	616	Multidimensional poverty headcount ratio (% of...	NaN	43.60	NaN	41.90	NaN	NaN