import numpy as np
import matplotlib.pyplot as plt
import pandas
from pandas.plotting import bootstrap_plot, scatter_matrix  # also autocorrelation_plot, andrews_curves, lag_plot
import json
from time import sleep
import dateutil
%matplotlib inline

#beautiful function from http://stackoverflow.com/questions/20618804/how-to-smooth-a-curve-in-th
def smooth(y, box_pts=0):
  if box_pts==0:
    box_pts = np.ceil(y.__len__()*.01)
  box = np.ones(box_pts)/box_pts
  y_smooth = np.convolve(y, box, mode='same')
  return y_smooth

def invert_colors(ax, grid=True):
    ax.patch.set_facecolor('black')
    if grid:
        ax.grid(color='w')
    return ax

def calc_norm(x):
    #x is a pandas series object
    x_no_offset = x-x.copy().dropna().min()
    return x_no_offset/float(max(x_no_offset))

def stringify(a):
        return [str(c).replace('.','') for c in a]

gdp = pandas.read_csv("https://raw.githubusercontent.com/bgrayburn/gdp/master/data/gdp.csv")
country_list = list(pandas.read_csv("https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv")['ISO3166-1-Alpha-3'])
pops = pandas.read_csv("https://raw.githubusercontent.com/datasets/population/master/data/population.csv")
#next create weights for countries based on the percentage of the global population contained
pops_weights = pops.pivot(
    index='Year',
    columns="Country Code",
    values="Value"
).apply(
    lambda x: x / x.WLD,
    axis = 1
)
gdp.rename(columns={"Value":"GDP"}, inplace=True)
gdp['Country Name'] = gdp.index

#save regional gdps seperately and remove from main variable
regional_gdp = gdp.copy()[[(c not in country_list) & (c != 'WLD') for c in gdp['Country Code']]]
world_gdp = gdp.copy()[gdp['Country Code']=="WLD"].pivot(index="Year", columns="Country Code", values= "GDP")
world_gdp.rename(columns={'WLD':'GDP'})
gdp = gdp[[c in country_list for c in gdp['Country Code']]]

#add population column to gdp data
gdp = gdp.merge(pops, on=['Country Code', 'Year'])
gdp = gdp.rename(columns={'Value':'Population'})

#world population
wld_pops = pops[pops['Country Code']=='WLD']
wld_pops.index = wld_pops.Year
wld_pops = pandas.DataFrame(wld_pops['Value']).rename(columns={'WLD':'population'})
world_gdp = world_gdp.merge(wld_pops, left_index=True, right_index=True)
world_gdp = world_gdp.rename(columns={'Value':'Population'})

#calc gdp per capita
gdp['GDP per capita'] = gdp['GDP']/gdp['Population']
#world_gdp['GDP per capita'] = world_gdp['GDP']/world_gdp['Population']

gdp.head()

print('Data Summary')
gdp.describe()

Data Summary

startYear = int(gdp.describe().loc['min','Year'])
endYear = int(gdp.describe().loc['max','Year'])

print('Top 10 GDP '+str(endYear))
gdp[gdp['Year']==endYear].sort_values('GDP', ascending=False).head(10)

Top 10 GDP 2021

print('Top 10 GDP per capita '+str(endYear))
gdp[gdp['Year']==endYear].sort_values('GDP per capita', ascending=False).head(10)

Top 10 GDP per capita 2021

print('Top 10 GDP All Time')
gdp.sort_values('GDP', ascending=False).head(10)

Top 10 GDP All Time

print('Top 10 GDP per capita All Time')
gdp.sort_values('GDP per capita', ascending=False).head(10)

Top 10 GDP per capita All Time

#TODO fix figure size and title
fs = (15,15)
f = bootstrap_plot(gdp['GDP'], size=50, samples=500, color='grey');
f.suptitle("Distributions for GDP", y=1.02)
f = bootstrap_plot(gdp['GDP per capita'], size=50, samples=500, color='grey');
f.suptitle("Distributions for GDP per Capita", y=1.02);

fig = plt.figure()
scatter_matrix(gdp, figsize=(10,10), alpha=0.078, diagonal="kde");
_ = fig;

<Figure size 640x480 with 0 Axes>

world_gdp.head()

ax = world_gdp.plot(legend=False, title="World GDP");
ax.patch.set_facecolor('black')

fs = (12,8) #figure size
lgy = True
gdp_by_country = gdp.pivot(index="Year", columns="Country Name_y", values = "GDP")
ax = gdp_by_country.plot(logy=lgy, legend=False, figsize=fs, alpha=.4, title='All the GDPs')
ax = invert_colors(ax, grid=False)
gdp_pc_by_country = gdp.pivot(index="Year", columns="Country Name_y", values = "GDP per capita")
ax = gdp_pc_by_country.plot(logy=lgy, legend=False, figsize=fs, alpha=.4, title="All the GDP per capita's")
ax = invert_colors(ax, grid=False)

fs = (15,15) #figure size
counter = 0
leg = False #legend
lgy = True
al = .3 #alpha
gdp_by_country_norm = gdp_by_country.apply(calc_norm)
gdp_pc_by_country_norm = gdp_pc_by_country.apply(calc_norm)
#ax = gdp_by_country_norm.plot(legend=leg, logy=lgy, style=st, figsize=fs, alpha=al, title="Normalized GDP by Country")
ax = invert_colors(ax, grid=False)
ax = gdp_pc_by_country_norm.plot(
    legend=leg,
    #logy=lgy,
    figsize=fs, alpha=al, title="Normalized GDP per capita by Country");
ax = invert_colors(ax, grid=False)
counter+=1

diff_gdp = gdp_by_country.apply(lambda x: x.diff()/x, axis=0)
diff_gdp_pc = gdp_by_country.apply(lambda x: x.diff()/x, axis=0)
diff_gdp_pc.dropna()
#diff_gdp.plot(kind="kde")
diff_gdp_pc.tail()

wld_gdp_pc = world_gdp.WLD/world_gdp.Population
diff_wrd_gdp = wld_gdp_pc.diff()/wld_gdp_pc
ax = diff_wrd_gdp.loc[range(1960,2014)].plot(legend=False, style='.-', title="Year over year global change in GDP")
#ax = diff_gdp_pc.ix[range(1960,2014)].sum(axis=1).plot(legend=False, style='.', title="Year over year global change in GDP")
ax.set_ylabel('Percent');
#ax = invert_colors(ax)

diff_gdp_perc_of_wld = gdp_pc_by_country.bfill().apply(
    (lambda x: x.diff()), axis=0
)
diff_gdp_perc_of_wld['WLD'] = wld_gdp_pc
diff_gdp_perc_of_wld.apply(lambda x: x/diff_gdp_perc_of_wld['WLD'])

diff_gdp_perc_of_wld['WLD'].head(100);

ax = diff_gdp_pc.loc[range(startYear,endYear)].plot(kind='line', style='-', alpha=0.1, figsize=(15,20), ylim=(-4, 1), legend=False, title="Year over year change in GDP per capita (second derivative)")
ax = invert_colors(ax, grid=False)
ax.set_ylabel("Percent");
#savefig('test.png', bbox_inches='tight')

#same graph as last cell but different ylims
ax = diff_gdp_pc.loc[range(startYear,endYear)].plot(kind='line', style='-', alpha=0.1, figsize=(15,10), ylim=(-1, 1), legend=False, title="Year over year change in GDP per capita")
ax = invert_colors(ax, grid=False)
ax.set_ylabel("Percent");
#savefig('test.png', bbox_inches='tight')

#same graph as last cell but different xlims
ax = diff_gdp_pc.loc[range(2000,endYear)].plot(kind='line', style='-', alpha=0.1, figsize=(15,10), ylim=(-1, 1), legend=False, title=f"Year over year change in GDP per capita, 2000-{endYear}")
ax = invert_colors(ax, grid=False)
ax.set_ylabel("Percent");
#savefig('test.png', bbox_inches='tight')

#should we normalize based upon a population weighted mean? probably worth checking out
#same graph as last cell but with mean line drawn
ax = diff_gdp_pc.loc[range(startYear,endYear)].plot(kind='line', style='--b', alpha=0.1, figsize=(15,10), ylim=(-.3,.3), legend=False, title="Year over year change in GDP per capita")
ax = world_gdp.loc[range(startYear,endYear)]['WLD'].plot(kind='line', ax = ax, style='-', linewidth=5, alpha=1, legend=False, title="Year over year change in GDP per capita (mean line in BOLD)")
#ax = diff_gdp_pc.loc[range(startYear,endYear)].apply(lambda x: smooth(x, box_pts=3), axis=0).mean(weights=pops_weights, axis=1).plot(kind='line', ax = ax, style='-g', alpha=.8, linewidth=5, ylim=(-.3,.3), legend=False, title="Year over year change in GDP per capita (mean line in BOLD)")
ax = diff_gdp_pc.loc[range(startYear,endYear)].apply(lambda x: smooth(x, box_pts=3), axis=0).mean(axis=1).plot(kind='line', ax = ax, style='-g', alpha=.8, linewidth=5, ylim=(-.3,.3), legend=False, title="Year over year change in GDP per capita (mean line in BOLD)")
# ax = diff_gdp_pc.loc[range(startYear,endYear)].mean(weights=pops_weights, axis=1).plot(kind='line', ax = ax, style='-r', alpha =.8, linewidth=5, ylim=(-.3,.3), legend=False, title="Year over year change in GDP per capita (mean line in black, smoothed mean in green)")
ax = diff_gdp_pc.loc[range(startYear,endYear)].mean(axis=1).plot(kind='line', ax = ax, style='-r', alpha =.8, linewidth=5, ylim=(-.3,.3), legend=False, title="Year over year change in GDP per capita (mean line in black, smoothed mean in green)")
ax = invert_colors(ax, grid=True)
ax.set_ylabel("Percent");
#savefig('test.png', bbox_inches='tight')

#Remove trend line
#TODO: calc mean based on a windowed outlier reduction
timeRange = range(startYear,endYear)
# means = diff_gdp_pc.copy().mean(weights=pops_weights, axis=1)
means = diff_gdp_pc.copy().mean(axis=1)
stripped_diff_gdp = diff_gdp_pc.loc[timeRange].copy().apply(lambda x: x - means)

ax = stripped_diff_gdp.loc[range(startYear,endYear)].plot(kind='line', style='-', alpha=0.1, figsize=(20,15), ylim=(-.25,.25), legend=False, title="Mean normalized year over year change in GDP per capita")
stripped_diff_gdp.loc[range(startYear,endYear)].plot(kind='line', style='.', alpha=.4, figsize=(20,15), fontsize=20, ylim=(-.25,.25), legend=False, title="Smoothed Mean normalized year over year change in GDP per capita", ax=ax)
#ax = stripped_diff_gdp.ix[range(startYear,endYear)].mean(axis=1).plot(kind='line', ax = ax, style='-', linewidth=5, alpha=1, ylim=(-.5,.5), legend=False)
ax = invert_colors(ax, grid=False)
ax.set_ylabel("Percent Deviation from Mean");
#savefig('test.png', bbox_inches='tight')

# smooth_means = diff_gdp_pc.apply(lambda x: smooth(x, box_pts=3), axis=0).mean(weights=pops_weights, axis=1)
smooth_means = diff_gdp_pc.apply(lambda x: smooth(x, box_pts=3), axis=0).mean(axis=1)

smooth_stripped_diff_gdp = diff_gdp_pc.loc[timeRange].copy().apply(lambda x: x - smooth_means)

ax = smooth_stripped_diff_gdp.loc[range(startYear,endYear)].plot(kind='line', style='-', alpha=0.1, figsize=(20,15), fontsize=20, ylim=(-.25,.25), legend=False, title="Smoothed Mean normalized year over year change in GDP per capita")
smooth_stripped_diff_gdp.loc[range(startYear,endYear)].plot(kind='line', style='.', alpha=.4, figsize=(20,15), fontsize=20, ylim=(-.25,.25), legend=False, title="Smoothed Mean normalized year over year change in GDP per capita", ax=ax)
#ax = smooth_stripped_diff_gdp.ix[range(startYear,endYear)].mean(axis=1).plot(kind='line', ax = ax, style='-', linewidth=5, alpha=1, ylim=(-.5,.5), legend=False)
ax = invert_colors(ax, grid=False)
ax.set_ylabel("Percent Deviation from Smoothed Mean");

ecorr_stripped_diff_gdp = stripped_diff_gdp.ewm(span=10).corr()

countries = ['United States', 'South Africa', 'Ukraine', 'China', 'Germany']

for c in countries:
    #see who's most like a United States
    print(ecorr_stripped_diff_gdp.loc[endYear, c].dropna().sort_values(inplace=False, ascending=False).head())
    print('')

Country Name_y
United States                     1.000000
St. Vincent and the Grenadines    0.969383
Puerto Rico                       0.944064
Hong Kong SAR, China              0.931545
El Salvador                       0.929280
Name: (2021, United States), dtype: float64

Country Name_y
South Africa    1.000000
Eswatini        0.970859
Namibia         0.925197
Lesotho         0.869519
Chile           0.762598
Name: (2021, South Africa), dtype: float64

Country Name_y
Ukraine               1.000000
Russian Federation    0.794957
Moldova               0.760519
Brunei Darussalam     0.678051
Belarus               0.649224
Name: (2021, Ukraine), dtype: float64

Country Name_y
China          1.000000
Jordan         0.882434
Costa Rica     0.850011
Switzerland    0.842879
Haiti          0.841665
Name: (2021, China), dtype: float64

Country Name_y
Germany    1.000000
Belgium    0.989449
Austria    0.987472
Denmark    0.985498
France     0.984610
Name: (2021, Germany), dtype: float64

ecorr_stripped_diff_gdp.loc[endYear].sum().sort_values(inplace=False, ascending=False)

Country Name_y
Morocco                      107.475762
Vanuatu                      101.144210
Belgium                       98.974780
France                        98.576653
Italy                         98.308732
                                ...    
Turks and Caicos Islands     -11.561279
Northern Mariana Islands     -12.667563
Sint Maarten (Dutch part)    -15.154429
Curacao                      -16.125398
Iraq                        -114.589713
Length: 212, dtype: float64

ecorr_stripped_diff_gdp.head()

#prep stripped_diff_gdp
stripped_diff_gdp_T = stripped_diff_gdp.copy().T
diff_gdp_T = diff_gdp.copy().T
gdp_means = pandas.DataFrame(means, columns=["global mean (each country=1) gdp percent change"])
gdp_smoothed_means = pandas.DataFrame(smooth_means, columns=["global smoothed "])

gdp_pc = gdp.pivot(index="Year",columns="Country Name_y", values="GDP per capita")

gdp_pc.tail()

# #save things to mongo
# dataframes_to_save = ['gdp_pc', 'diff_gdp_T','stripped_diff_gdp_T','diff_gdp','diff_gdp_pc','stripped_diff_gdp','smooth_stripped_diff_gdp','gdp_means']
# db = MongoClient('mongodb')
# col_base = 'gdp.analysis.exploratory.'
# for t in dataframes_to_save:
#     df = eval(t).copy()
#     df.index = stringify(df.index)
#     df.columns = stringify(df.columns)
#     d = df.to_dict()
#     col = eval('db.'+col_base+t)
#     col.drop()
#     col.insert(d)
#     #col.insert_one(d)
    
# #panels have to be saved differently
# panels_to_save = ['ecorr_stripped_diff_gdp']
# for p in panels_to_save:
#     col = eval('db.'+col_base+p)
#     col.drop()
#     panel = eval(p).copy()
#     panel.items = stringify(panel.items)
#     panel.major_axis = stringify(panel.major_axis)
#     panel.minor_axis = stringify(panel.minor_axis)
#     for i in panel.items:
#         col.insert({'item':i, 'data': panel[str(i)].to_dict()})

	Country Name_x	Country Code	Year	GDP	Country Name_y	Population	GDP per capita
0	0	ABW	1986	4.055866e+08	Aruba	64553	6283.001443
1	1	ABW	1987	4.877095e+08	Aruba	64450	7567.253642
2	2	ABW	1988	5.966480e+08	Aruba	64332	9274.514156
3	3	ABW	1989	6.955307e+08	Aruba	64596	10767.396220
4	4	ABW	1990	7.648045e+08	Aruba	65712	11638.733706

	Country Name_x	Year	GDP	Population	GDP per capita
count	10333.000000	10333.000000	1.033300e+04	1.033300e+04	10333.000000
mean	6556.539824	1994.446240	1.845690e+11	3.026933e+07	9102.529468
std	3908.301440	17.132574	9.924763e+11	1.171713e+08	17999.782145
min	0.000000	1960.000000	8.824746e+06	9.182000e+03	12.786964
25%	2920.000000	1981.000000	1.501500e+09	9.890870e+05	578.450144
50%	6442.000000	1996.000000	7.870982e+09	5.434294e+06	2080.465147
75%	9891.000000	2009.000000	5.254400e+10	1.784401e+07	8787.553381
max	13363.000000	2021.000000	2.331508e+13	1.412360e+09	234317.084818

	Country Name_x	Country Code	Year	GDP	Country Name_y	Population	GDP per capita
9829	12774	USA	2021	2.331508e+13	United States	331893745	70248.629000
1842	2103	CHN	2021	1.782046e+13	China	1412360000	12617.504986
4917	6187	JPN	2021	5.005537e+12	Japan	125681593	39827.126768
2507	2842	DEU	2021	4.259935e+12	Germany	83196078	51203.554473
4394	5655	IND	2021	3.150307e+12	India	1407563842	2238.127139
3408	4211	GBR	2021	3.122480e+12	United Kingdom	67326569	46378.129649
3223	4023	FRA	2021	2.957880e+12	France	67749632	43658.978978
4736	6003	ITA	2021	2.114356e+12	Italy	59109668	35770.049612
1666	1875	CAN	2021	2.001487e+12	Canada	38246108	52331.775704
8046	10308	RUS	2021	1.836892e+12	Russian Federation	143449286	12805.167086

	Country Name_x	Country Code	Year	GDP	Country Name_y	Population	GDP per capita
5936	7627	MCO	2021	8.596157e+09	Monaco	36686	234317.084818
5540	7035	LIE	2021	7.186429e+09	Liechtenstein	39039	184083.321374
5753	7440	LUX	2021	8.550624e+10	Luxembourg	640064	133590.146976
1209	1411	BMU	2021	7.127200e+09	Bermuda	63867	111594.407127
4456	5718	IRL	2021	5.041826e+11	Ireland	5033165	100172.079253
1718	1961	CHE	2021	8.006402e+11	Switzerland	8703405	91991.600458
7103	9054	NOR	2021	4.902934e+11	Norway	5408320	90655.391023
2376	2709	CYM	2021	6.028374e+09	Cayman Islands	68136	88475.600879
8348	10678	SGP	2021	4.237970e+11	Singapore	5453566	77710.069957
9829	12774	USA	2021	2.331508e+13	United States	331893745	70248.629000

	Country Name_x	Country Code	Year	GDP	Country Name_y	Population	GDP per capita
9829	12774	USA	2021	2.331508e+13	United States	331893745	70248.629000
9827	12772	USA	2019	2.138098e+13	United States	328329953	65120.394663
9828	12773	USA	2020	2.106047e+13	United States	331501080	63530.633484
9826	12771	USA	2018	2.053306e+13	United States	326838199	62823.309438
9825	12770	USA	2017	1.947734e+13	United States	325122128	59907.754261
9824	12769	USA	2016	1.869511e+13	United States	323071755	57866.744934
9823	12768	USA	2015	1.820602e+13	United States	320738994	56762.729452
1842	2103	CHN	2021	1.782046e+13	China	1412360000	12617.504986
9822	12767	USA	2014	1.755068e+13	United States	318386329	55123.849787
9821	12766	USA	2013	1.684319e+13	United States	316059947	53291.127689

Global GDP Data Tear Down (Part 1: Exploration)¶

Initial Exploration¶

Load the Data¶

Tabular Structure Check¶

High-level View of Data¶

The Whole World¶

Rate of Change¶

Sneak peak at relationships¶

Store all the things¶

	WLD	Population
Year
1960	1.384857e+12	3031564839
1961	1.449221e+12	3072510552
1962	1.550815e+12	3126934725
1963	1.671447e+12	3193508879
1964	1.830492e+12	3260517816

Country Name_y	Afghanistan	Albania	Algeria	American Samoa	Andorra	Angola	Antigua and Barbuda	Argentina	Armenia	Aruba	...	Uruguay	Uzbekistan	Vanuatu	Venezuela, RB	Vietnam	Virgin Islands (U.S.)	West Bank and Gaza	Yemen, Rep.	Zambia	Zimbabwe
Year
2017	0.046400	0.088982	0.059159	-0.096405	0.034515	0.277389	0.021370	0.133767	0.085129	0.035050	...	0.115762	-0.387507	0.112688	NaN	0.086218	-0.001054	0.044804	-0.166737	0.189969	-0.168542
2018	-0.025924	0.140976	0.027522	0.042254	0.067815	0.113380	0.085255	-0.226380	0.074690	0.056166	...	0.003022	-0.174223	0.037907	NaN	0.092719	0.032636	0.009130	-0.242342	0.016643	0.485161
2019	0.025689	0.015933	-0.018343	0.012365	-0.020053	-0.122406	0.042159	-0.172115	0.085272	0.035222	...	-0.050839	0.122976	0.023266	NaN	0.072552	0.047365	0.050013	NaN	-0.128829	-0.564479
2020	0.061506	-0.015768	-0.178509	0.096369	-0.091369	-0.379523	-0.182904	-0.161369	-0.077331	-0.301051	...	-0.156180	-0.000976	-0.029805	NaN	0.035343	0.020695	-0.103131	NaN	-0.287015	-0.014995
2021	-0.381284	0.154364	0.108450	-0.009873	0.130564	0.235122	0.092386	0.208705	0.087993	0.165060	...	0.126121	0.134710	0.064031	NaN	0.053318	NaN	0.142321	NaN	0.182277	0.241848

Country Name_y	Afghanistan	Albania	Algeria	American Samoa	Andorra	Angola	Antigua and Barbuda	Argentina	Armenia	Aruba	...	Uruguay	Uzbekistan	Vanuatu	Venezuela, RB	Vietnam	Virgin Islands (U.S.)	West Bank and Gaza	Yemen, Rep.	Zambia	Zimbabwe
Year
2017	530.149863	4531.032207	4134.936099	12372.884783	40632.231554	2283.214233	16110.312400	14613.035715	4041.995072	29326.708058	...	18995.397020	1916.764625	3032.197020	NaN	2992.071746	35365.069304	3620.360487	893.716573	1495.752138	1192.107012
2018	502.057099	5287.660817	4171.795309	13195.935900	42904.828456	2487.500996	17514.355864	11795.162885	4391.923274	30918.515218	...	19026.049611	1604.258642	3076.835315	NaN	3267.225009	36653.863048	3562.330943	701.714878	1475.199836	2269.177012
2019	500.522981	5396.214227	4021.983608	13672.576657	41328.600499	2142.238757	18187.779712	9963.674231	4828.504889	31902.762582	...	18098.361549	1795.201768	3076.589886	NaN	3491.091279	38596.030712	3656.858271	NaN	1268.120941	1421.868596
2020	516.866797	5343.037704	3354.157303	15501.526337	37207.222000	1502.950754	15284.772384	8496.428157	4505.867746	24487.863569	...	15650.499427	1759.307471	2917.756849	NaN	3586.347297	39552.168595	3233.568638	NaN	956.831747	1372.696674
2021	363.674087	6377.203096	3700.311195	15743.310758	42072.341103	1903.717405	16740.348196	10636.115530	4966.513471	29342.100730	...	17923.995333	1993.424478	3044.573640	NaN	3756.489122	NaN	3678.635657	NaN	1137.344395	1773.920411