*Path has to reflect where the Excel file (WDI.xlsx downloaded from the World Bank website) and label do file (created based on that file) are located
cd "C:\Users\sarkisin\Documents\Teaching Grad Statistics\SC706\Spring 2017\"

set more off
set excelxlsxlargefile on
*the file is too large to import otherwise; that is why I use "set excelxlsxlargefile on" - it is an undocumented setting that allows for larger Excel files to be read

clear all
import excel using WDI.xlsx, sheet("Data") firstrow
save WDI_data.dta, replace 
clear all
import excel using WDI.xlsx, sheet("Country") firstrow
keep CountryCode ShortName
rename CountryCode countrycode
rename ShortName countryname
save WDI_country_names.dta, replace

use WDI_data.dta, clear

*Since the year variable names are just numbers, they weren't read it Stata, so I rename them
rename E     y1960
rename F     y1961
rename G     y1962
rename H     y1963
rename I     y1964
rename J     y1965
rename K     y1966
rename L     y1967
rename M     y1968
rename N     y1969
rename O     y1970
rename P     y1971
rename Q     y1972
rename R     y1973
rename S     y1974
rename T     y1975
rename U     y1976
rename V     y1977
rename W     y1978
rename X     y1979
rename Y     y1980
rename Z     y1981
rename AA    y1982
rename AB    y1983
rename AC    y1984
rename AD    y1985
rename AE    y1986
rename AF    y1987
rename AG    y1988
rename AH    y1989
rename AI    y1990
rename AJ    y1991
rename AK    y1992
rename AL    y1993
rename AM    y1994
rename AN    y1995
rename AO    y1996
rename AP    y1997
rename AQ    y1998
rename AR    y1999
rename AS    y2000
rename AT    y2001
rename AU    y2002
rename AV    y2003
rename AW    y2004
rename AX    y2005
rename AY    y2006
rename AZ    y2007
rename BA    y2008
rename BB    y2009
rename BC    y2010
rename BD    y2011
rename BE    y2012
rename BF    y2013
rename BG    y2014
rename BH    y2015
rename BI    y2016

*names of indicators contain dots, which is not acceptable for variable names, and since I plan to use them as variable names, replace dot with _
gen code= subinstr( IndicatorCode ,".","_",.)
drop IndicatorCode

*limiting years based on data availability for key variables in the paper
for num 1960/1997  2015/2016: drop yX

*Taking data one year at a time and reshaping so that indicators become variables; then renaming variables and appending all years back together
for num 1998/2014: preserve \ keep CountryCode code yX \ reshape wide yX, i(CountryCode) j(code) string \ rename yX* * \gen year=X \ save WDI_yearX.dta, replace \ restore  
use WDI_year1998.dta, clear
for num 1999/2014: append using WDI_yearX.dta

*labeling variables -- label commands created in Excel by combining text with @CONCATENATE function
lab var year "Year"
do labels_for_WDI.do

*country names are from another sheet in Excel file
rename CountryCode countrycode
merge m:1 countrycode using WDI_country_names.dta, gen(merge_names)
drop merge_names 


*getting rid of observations that are not individual countries
local dropcountry ARB CEB CSS EAP EAR EAS ECA ECS EMU EUU FCS HIC HPC IBD IBT IDA IDB IDX INX LAC LCN LDC LIC LMC LMY LTE MEA MIC MNA NAC OED OSS PRE PSS PST SAS SSA SSF SST TEA TEC TLA TMN TSA TSS UMC WLD
foreach drop of local dropcountry {
   drop if countrycode=="`drop'"
}

*need a numeric code for countries because otherwise xtset won't work
encode countrycode, gen(countrynum)
xtset countrynum year
lab var countrynum "Country Numeric Code"
lab var countryname "Country Name"

*creating some variables needed for paper
*recalculating expenditure into actual constant dollars using GDP values
for var SE_XPD_PRIM_P~S  SE_XPD_SECO_P~S SE_XPD_TERT_P~S: gen Xgdp=X*NY_GDP_PCAP_KD
lab var SE_XPD_TERT_PC_ZSgdp "Government expenditure per student, tertiary, constant 2010 US$"
lab var SE_XPD_PRIM_PC_ZSgdp  "Government expenditure per student, primary, constant 2010 US$"
lab var SE_XPD_SECO_PC_ZSgdp  "Government expenditure per student, secondary, constant 2010 US$"

*calculating percent in secondary vocational based on vocational vs total secondary enrollment
gen SE_SEC_ENRL_VO_pct= (SE_SEC_ENRL_VO/SE_SEC_ENRL)*100
lab var SE_SEC_ENRL_VO_pct "Enrolment in secondary vocational as % of total secondary enrolment, both sexes"

*renaming variables so that names are easier to use
rename SE_TER_ENRR  enrol_tert
rename SE_XPD_TOT~D_ZS govt_expend
rename SE_XPD_PRIM_P~p govt_prim
rename SE_XPD_SECO_P~p govt_second
rename SE_XPD_TERT_P~p  govt_tert
rename NY_GDP_PCAP_KD gdp_pc
rename SE_PRM_ENRR  enrol_prim
rename SE_SEC_ENRR enrol_second
rename SE_SEC_ENRL_V~t vocation
rename SE_ENR_PRIM_F~S gender_prim
rename SE_ENR_SECO_F~S gender_second
rename SE_ENR_TERT_FM_ZS gender_tert
rename SP_POP_65UP_T~S pop_older
rename SP_POP_TOTL_F~S  pop_female
rename SP_POP_TOTL  pop_total

*keeping only the subset of variables used in the analysis, reshaping the dataset into wide (so that you can practice reshaping into long)
keep countrycode countryname countrynum year enrol_tert govt_expend govt_prim govt_second govt_tert gdp_pc enrol_prim enrol_second vocation gender_prim gender_second gender_tert pop_older pop_female pop_total
order countrycode countryname countrynum year enrol_tert govt_expend govt_prim govt_second govt_tert gdp_pc enrol_prim enrol_second vocation gender_prim gender_second gender_tert pop_older pop_female pop_total
reshape wide enrol_tert govt_expend govt_prim govt_second govt_tert gdp_pc enrol_prim enrol_second vocation gender_prim gender_second gender_tert pop_older pop_female pop_total, i( countrycode) j(year)
reshape clear
order countrycode countryname countrynum
save wdi_education.dta, replace