# LONGRUN-WEALTH: Combine datasets for cross-country analysis
#
# Date: Sept 25, 2018
# Author: Jonas Markgraf
# last edited: Dec 09, 2018
# R version 3.5.1 (2018-07-02) -- "Feather Spray"
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
#########################
# logs: -----------------
# JM, 09/12/18: harmonized wealth data to variable bounded by [0;1].
# JM, 09/12/18: generate data reports.
# JM, 10/01/19: add tax data from Scheve & Stasavage.


rm(list = ls())

library(readxl)  # to load data
library(repmis)  # to set wd
library(countrycode)   # to harmonize country codes
library(plyr)
library(dplyr)
library(purrr)  # to load multiple sheets from xls
library(tidyr)
library(reshape2)  # long to wide format (melt function)
library(readstata13)
library(stringi)
library(stringr)
library(zoo)  # to interpolate data
library(dataMaid)  # to generate summary reports
library(data.table)
library(DataCombine)  # to fill down data

# set wd
possibles <- c("~/Nexus365/Ben Ansell - WEALTHPOL_RESEARCH/Papers/longrunWealth/data/")
set_valid_wd(possibles)

# load political datasets
QOG <- read.csv("masterFiles/QOG/QOG_std_ts_jan18.csv", header=T)

BMR <- read.csv("masterFiles/BoixMillerRosato/democracy-v3.0.csv", header=T)

polity4 <- read_excel("masterFiles/Polity4/p4v2017-2.xls", sheet = 1)

vanhanen <- read.csv2("masterFiles/Vanhanen_LandIneq/Study/data/daF1216e.csv")

schularick <- read_excel("masterFiles/JordaSchularickTaylor/JSTdatasetR4.xlsx", sheet = 2)

houleDunning <- read.dta13("../../../Data/Crossnational/Historical/Capital Share Data Houle and Dunning.dta"
  , convert.factors = F)

morrisson <- read.csv2("../../../Data/Crossnational/Historical/Vanhanen plus Bouguignon and Morrisson.csv"
  , sep = ",")

cnts <- read_excel("masterFiles/CNTSDATA/2015 Edition CNTSDATA.xlsx", sheet = 1)

lindvall <- read_excel("masterFiles/Lindvall_HeadsGovernment/HoG_dataset_country_year_v1.5.xls"
  , sheet = 1)

trebesch <- read.dta13("masterFiles/TrebeschFunkeSchularick/EER_2849_data.dta"
  , convert.factors = F)

KAopen <- read_excel("masterFiles/ChinnIto/kaopen_2016.xls", sheet = 1)

COWtrade <- read.csv("masterFiles/COW_Trade_4.0/National_COW_4.0.csv")

wealth.Waldenstrom <- read.csv("wealth_Au-Ch-Dk-Fi-Nl-Se.csv")

wealth.portugal <- read.csv("wealth_por.csv")

# przeworski <- read.dta13("masterFiles/Przeworski_PIPE/PIPE_081813.dta", convert.factors = F)

vdem <- read.csv("masterFiles/V-Dem/V-Dem-CY+Others-v8.csv")

SSrepl <- read.dta13("../code/scheve_stasavage_replications/SSWorldPolitics2009Replication/iqimpl.dta"
  , convert.factors = F)

capitalshare.Waldenstrom <- read_excel("masterFiles/Bengtsson-Waldenström Historical Capital Shares Database, 1875-2015/Capital Share Database.xlsx"
  , sheet = 1)

albertus <- read.dta13("masterFiles/Albertus/Table4.dta", convert.factors = F)

leeLee <- read.dta13("masterFiles/LeeLee_Education/LeeLee_v1.dta", convert.factors = F)

# piketty.income <- read.csv2("masterFiles/WID_fulldataset_01-10-2018_09_04_11/WID_incomeData/WID_Data_24012019-101249.csv"
#   , sep = ";", header = F)

# taxation data is saved in different sheets
pathIncomeTax <- "masterFiles/taxationScheveStasavage/incomeTaxation/Income taxation - Masterfile.xlsx"

incomeTax <- pathIncomeTax %>%
  excel_sheets() %>%
  set_names() %>%
  map(read_excel, path = pathIncomeTax)

pathInheritTax <- "masterFiles/taxationScheveStasavage/inheritanceTaxation/Inheritance_taxation_Masterfile.xls"

inheritanceTax <- pathInheritTax %>%
  excel_sheets() %>%
  set_names() %>%
  map(read_excel, path = pathInheritTax)

# Credit Suisse data is saved in different sheets
pathCreditSuisse <- "../../../Data/Crossnational/Credit Suisse - Data/CreditSuisse_OCR.xlsx"

crdtCH <- pathCreditSuisse %>%
  excel_sheets() %>%
  set_names() %>%
    map(read_excel, path = pathCreditSuisse)

# WID data is also a bit more cumbersome as saved in separate country files.

### COMMENTED OUT because only needed when master files are downloaded again ###############
# zipFiles <- list.files(path = "masterFiles/WID_fulldataset_01-10-2018_09_04_11/WID_full_dataset"
#   , pattern = ".zip", full.names = T)
# outDir <- "masterFiles/WID_fulldataset_01-10-2018_09_04_11/unzippedData/"
# piketty <- ldply(.data = zipFiles, .fun = unzip, exdir = outDir)  
###############
# NOTE: doesn't unpack subnational files for DE

# load all macro data #### NOT USED FOR NOW ####
piketty.path <- "masterFiles/WID_fulldataset_01-10-2018_09_04_11/unzippedData/"
# files.macro <- list.files (path = "masterFiles/WID_fulldataset_01-10-2018_09_04_11/unzippedData", pattern = "MacroData.csv", recursive = T)
# piketty.macro <- list()
# for(i in 1:length(files.macro)){
#   piketty.macro[[i]] <- read.csv2(paste0(piketty.path, files.macro[i]))
# }

# load all inequality data
files.inequality <- list.files (path = "masterFiles/WID_fulldataset_01-10-2018_09_04_11/unzippedData", pattern = "InequalityData.csv", recursive = T)
piketty.inequality <- list()
for(i in 1:length(files.inequality)){
  piketty.inequality[[i]] <- read.csv2(paste0(piketty.path, files.inequality[i]))
}

#################################################################
# clean-up all data for merge -----------------------------------
#################################################################

# clean and resrict QUG dataset ----------------------------
QOG <- QOG[,c(grep("pwt_|year|ccodecow", names(QOG), value=TRUE))]  # note: that only loads Penn World Tables. Can be changed later. Just done to make file 'workable'.

colnames(QOG)[colnames(QOG) == "ccode"] <- "ccodeiso"

# clean and restructure Vanhanen dataset ----------------------------
vanhanen.varname <- c("competition", "participation", "democratization", "urbanPop"
, "nonagriPop", "occupDiversification", "totStudents", "percStudents", "percLiterate"
, "knowledgeDist", "familyFarm", "econResourceDecentral", "econPowerDist", "powerResource", "mean")

# wide to short
vanhanen <- melt(setDT(vanhanen),
  measure.vars = list(grep("_1$", colnames(vanhanen)), grep("_2$", colnames(vanhanen)),grep("_3$", colnames(vanhanen))
    ,grep("_4$", colnames(vanhanen)),grep("_5$", colnames(vanhanen)),grep("_6$", colnames(vanhanen))
    ,grep("_7$", colnames(vanhanen)),grep("_8$", colnames(vanhanen)),grep("_9$", colnames(vanhanen))
    ,grep("_10$", colnames(vanhanen)),grep("_11$", colnames(vanhanen)),grep("_12$", colnames(vanhanen))
    ,grep("_13$", colnames(vanhanen)),grep("_14$", colnames(vanhanen)),grep("_15$", colnames(vanhanen))),
  variable.name='year', value.name=vanhanen.varname, var = c(1:16))

# change year values.
vanhanen$year <- recode(vanhanen$year,`1`=1858,`2`=1868,`3`=1878,`4`=1888,`5`=1898,`6`=1908
  ,`7`=1918,`8`=1928,`9`=1938,`10`=1948,`11`=1958,`12`=1968,`13`=1978,`14`=1988,`15`=1998,`16`=2000)

# add ccode
vanhanen$ccode <- countrycode(vanhanen$bv1, "country.name", "cown")  # some not matched unambiguously (esp Serbia)

# restrict to key variables
vanhanen <- vanhanen[, c("ccode", "year", "familyFarm", "percStudents", "percLiterate"
  , "urbanPop", "nonagriPop")]

######################################################
### note: we might want to interpolate values later
######################################################

# clean and restructure Credit Suisse dataset -----------------------------------------------------

crdtCH <- crdtCH[c(10:27)]  # these are the relevant sheets

crdtCH <- lapply(crdtCH, function(x) x[-1,])  # remove first row from all dfs

# clean names of dfs
names(crdtCH) <- str_extract_all(names(crdtCH), "\\([^()]+\\)")
names(crdtCH) <- substring(names(crdtCH), 2, nchar(names(crdtCH))-1)

# rbind all elements of list into single df
crdtCH <- ldply(crdtCH, data.frame)

# clean up df.
names(crdtCH)[names(crdtCH) == ".id"] <- "year"
#crdtCH$year <- stri_sub(crdtCH$year,-4,-1)

crdtCH <- crdtCH[, c(1:2, 7:11)]  # restrict to key vars


crdtCH <- plyr::rename(crdtCH, c("Wealth.per.adult" = "pcWealth"
  , "Financial.wealth.per.adult" = "pcFinWealth"
  , "Nonfinancial.wealth.per.adult" = "pcNonfinWealth"
  , "Debt.per..adult" = "pcDebt"
  , "Median.wealth.per.adult" = "pcWealth.median"))

crdtCH$ccode <- countrycode(crdtCH$Country, "country.name", "cown")  # Serbia not unambigiously matched; needs to be solved. 

for(i in 3:ncol(crdtCH)){
  crdtCH[,i] <- as.numeric(crdtCH[,i])
}  # turn variables into numeric

crdtCH <- crdtCH[!is.na(crdtCH$ccode),]  # for now, I exclude all non-matched.

# clean and restructure taxation datasets -----------------------------------------------------

## INCOME TAX
incomeTax <- lapply(incomeTax, as.data.frame)

# remove columns that are not needed
incomeTax <- lapply(incomeTax, function(x) colnames(x) <- x[,-c(3:ncol(x))])

# remove first row from all dfs
incomeTax <- lapply(incomeTax, function(x) x[-1,])

# raname columns
incomeTax <- lapply(incomeTax, setNames, c("year", "topIncomeTax"))

# turn into single dataframe
incomeTax <- ldply (incomeTax, data.frame)

# clean and harmonize country names
incomeTax$country <- str_sub(incomeTax$.id, end = - 10)
incomeTax$country <- trimws(incomeTax$country)
incomeTax[incomeTax$country == "UK 17",]$country <- "United Kingdom"
incomeTax$ccode <- countrycode(incomeTax$country, "country.name", "cown")

# re-classify tax
incomeTax$topIncomeTax <- as.numeric(incomeTax$topIncomeTax)

## INHERITANCE TAX
inheritanceTax <- lapply(inheritanceTax, as.data.frame)

# remove columns that are not needed
inheritanceTax <- lapply(inheritanceTax, function(x) colnames(x) <- x[,-c(3:ncol(x))])

# remove first row from all dfs
inheritanceTax <- lapply(inheritanceTax, function(x) x[-1,])

# raname columns
inheritanceTax <- lapply(inheritanceTax, setNames, c("year", "topInheritTax"))

# turn into single dataframe
inheritanceTax <- ldply (inheritanceTax, data.frame)

# clean and harmonize country names
inheritanceTax$country <- str_sub(inheritanceTax$.id, end = - 10)
inheritanceTax$country <- trimws(inheritanceTax$country)
inheritanceTax$ccode <- countrycode(inheritanceTax$country, "country.name", "cown")

# re-classify tax
inheritanceTax$topInheritTax <- as.numeric(inheritanceTax$topInheritTax)

# clean and restructure Piketty dataset ---------------------------------------------------------

# new colnames
piketty.inequality <- lapply(piketty.inequality, function(x) {
  colnames(x) <- unlist(x[7,]) # take values from row = 7 as colnames
  return(x)
  })

# drop rows that are not needed
piketty.inequality <- lapply(piketty.inequality, function(x) x[-c(1:7),])

# now rbind all dfs
piketty.inequality  <- ldply(piketty.inequality, data.frame)

# restrict to relevant percentiles
incPercentiles <- c("p90p100", "p95p100", "p99p100", "p0p50", "p0p90")
piketty.inequality <- piketty.inequality[is.element(piketty.inequality$perc
  , incPercentiles),]

# drop unnecessary column
piketty.inequality[, 1] <- NULL

# turn variables in numeric
piketty.inequality[,c(4:ncol(piketty.inequality))] <- apply (piketty.inequality[,c(4:ncol(piketty.inequality))]
  ,2, function(x) as.numeric(as.character(x)))

# bring to long format
piketty.inequality <- melt(piketty.inequality[,!(colnames(piketty.inequality) %in% c("Variable.Code"))]
  , id.vars=c("country", "year", "perc"), na.rm = F)

######## STRUCTURE OF VARIABLE NAMES ###########
# 1st letter: s = share; a = average; m = macroecon total; w = wealth/income ratio
# 'ptinc': pre-tax income
# 'hweal': net wealth
# 'fiinc': fiscal income
# 'pllin': labor income
# 'pkkin': capital income
# 992 = 20+y; 999 = all ages; 996 = 20-65y
# last letter: i = individuals; t = tax units; j = equal-split adults
################################################

# only keep columns for pre/post-fiscal income & wealth (decided based on data availability) and age=20+y
keyVars <- c("sptinc992", "shweal992", "sfiinc992"
  # ,"spllin992", "spkkin992" ## exluded for now: only 2 countries (US,FR) post-1962
  )
piketty.inequality <- piketty.inequality[grep(paste(keyVars, collapse = "|"), piketty.inequality$variable),]

# only keep observations that are measured at individual level or equal-split adults (based on data availability)
piketty.inequality <- subset(piketty.inequality, grepl("^.+(j|i)$", variable))

#drop NAs
piketty.inequality <- piketty.inequality[!is.na(piketty.inequality$value),]


# drop observations at the subnational level
piketty.inequality <- piketty.inequality[(which(nchar(as.character(piketty.inequality$country)) == 2)),]

# different countries have data in individual vs. equal-split adult format
# went through data and grouped countries based on data availability
spltData <- c("US", "TR", "BH", "BR", "CI", "CN", "CZ", "EG", "FR", "HR", "IN", "SA"
  , "IQ", "IR", "JO", "KW", "LB", "OM", "PS", "RU", "QA", "SI", "SY", "XM", "YE", "AE")

indData <- c("JP", "NO", "AU","AR", "CA", "CO", "DK", "ES", "FI", "GB", "HU", "IT", "KR"
  , "MY", "NZ", "SC", "SG", "UY", "ZA", "ZM", "ZW")

piketty.inequality$dataLvl <- ifelse(is.element(piketty.inequality$country, indData) == T
  , "ind", ifelse(is.element(piketty.inequality$country, spltData) == T, "splt",NA))

piketty.inequality <- subset(piketty.inequality, (grepl("^.+(i)$", variable) & piketty.inequality$dataLvl == "ind") | 
    (grepl("^.+(j)$", variable) & piketty.inequality$dataLvl == "splt"))

# rename measurements
piketty.inequality$variable <- revalue(piketty.inequality$variable
  , c("sptinc992i" = "incomePre"
    , "sptinc992j" = "incomePre"
    , "sfiinc992i" = "incomePost"
    , "sfiinc992j" = "incomePost"
    , "shweal992i" = "wealth"
    , "shweal992j" = "wealth"))

# bring into wide format
piketty.inequality$value <- as.numeric(as.character(piketty.inequality$value ))  # bring values in correct format

piketty.inequality <-  dcast(piketty.inequality, country + year + dataLvl ~ variable + perc)

# add cow country code
piketty.inequality$ccode <- countrycode(piketty.inequality$country, "iso2c", "cown")

# clean Cross-National Time-Series data -----------------------------------------------------------

colnames(cnts) <- cnts[1,]  # rename variables
cnts <- cnts[-1,]  # delete first row

cnts.var <- c("trade2", "trade4", "industry3", "industry4", "domestic2"
  , "domestic4", "domestic6", "domestic7", "domestic8","domestic9", "polit11", "polit12"
  , "year", "Wbcode")
cnts <- cnts[,cnts.var]  # restrict to key vars

cnts <- plyr::rename(cnts, c("trade2" = "pcImport"
  , "trade4" = "pcExport"
  , "industry3" = "agriWorkforce"
  , "industry4" = "industWorkforce"
  , "domestic2" = "genStrikes"
  , "domestic4" = "govCrises"
  , "domestic6" = "riots"
  , "domestic7" = "revolutions"
  , "domestic8" = "antigovDemo"
  , "domestic9" = "conflictIndex"
  , "polit11" = "executiveTurnover"
  , "polit12" = "cabinetTurnover"))

cnts$ccode <- countrycode(cnts$Wbcode, "wb", "cown")  # issues with ABW, NAU, ROM, SER, SIN, VAT, WBG, ZAR; no WB code for 32 countries (Czechoslovakia; USSR (Russia); Yugoslavia; Yemen; German DR; German FR; Netherlands Antilles; Vietnam DR; Vietnam REP; UAR; Dahomey; Malagasy Republic; Upper Volta; Tanganyika; Western Samoa; Yemen AR; Zanzibar; Rhodesia; Yemen PDR (So. Yemen); Yemen PDR; Comoros; Transkei; Bophutswana; Venda; Ciskei; Senegambia; Cyprus: Greek Sector; Cyprus: Turkish Sector; Somaliland; Anjouan; Serbia/Montenegro; Kosovo)

# clean Heads of Government data ------------------------------------------------------------------

# recode ideology
lindvall$hogideo <- recode(lindvall$hogideo
  , "C" = "center"
  , "R" = "right"
  , "L" = "left"
  , "O" = "other")

# clean Trebesch, Funke & Schularick data ---------------------------------------------------------

# unique country ID
trebesch$ccodecow <- countrycode(trebesch$iso, "wb", "cown")

# rename variables
trebesch <- plyr::rename(trebesch, c("frac" = "polFrac"
    , "right" = "rightVote"
    , "left" = "leftVote"
    , "extr" = "extrVote"
    , "crisisJST" = "crisisFin"
    , "pk_fin" = "recessionFin"
    , "pk_norm" = "recessionNonfin"
    , "pk_dis" = "crisisNonfin"))

# restrict to key vars
vars.trebesch <- c("ccodecow", "year", "govvote", "oppvote", "polFrac", "partycount"
  , "rightVote", "leftVote", "extrVote", "crisisFin", "crisisNonfin", "recessionFin"
  , "recessionNonfin", "vetopl", "election")

#### clean capital-openness data ------------------------------------------------------------------

KAopen$ccodecow <- countrycode(KAopen$ccode, "iso3c", "cown") # not matched for ABW, ANT, HKG, ROM, ZAR

# clean Waldenstrom's capital share data ----------------------------------------------------------

colnames(capitalshare.Waldenstrom) <- capitalshare.Waldenstrom[8,]
capitalshare.Waldenstrom <- capitalshare.Waldenstrom[9:nrow(capitalshare.Waldenstrom),c(1:4)]
capitalshare.Waldenstrom$ccodecow <- countrycode(capitalshare.Waldenstrom$Country
  , "country.name", "cown")

colnames(capitalshare.Waldenstrom)[3:4] <- c("capshare.gross", "capshare.net")

# clean Waldenstrom's wealth inequality data ------------------------------------------------------

colnames(wealth.Waldenstrom)[c(4,6,7)] <- c("wealth.top1", "wealth.top10", "wealth.top5")

# harmonize to 0-1 variables
wealth.Waldenstrom[, c(4,6,7)] <- wealth.Waldenstrom[, c(4,6,7)]/100

# clean Portugal wealth inequality data ------------------------------------------------------

# harmonize to 0-1 variables
wealth.portugal[, c(3:9)] <- wealth.portugal[, c(3:9)]/100


# clean Albertus data -----------------------------------------------------------------------------

colnames(albertus)[grepl("l_repressive_labor_pc", colnames(albertus))] <- "labordepAgri"

# clean Lee&Lee Education data --------------------------------------------------------------------

# change variable names
leeLee <- plyr::rename(leeLee, c("pri" = "educ.enrollPrim"
  , "sec" = "educ.enrollScnd"
  , "ter" = "educ.enrollTert"
  , "lu" = "educ.noSchool"
  , "lp" = "educ.prim"
  , "lpc" = "educ.primCompl"
  , "ls" = "educ.scnd"
  , "lsc" = "educ.scndCompl"
  , "lh" = "educ.tert"
  , "lhc" = "educ.tertCompl"
  , "tyr" = "educ.yrsTot"
  , "pyr" = "educ.yrsPrim"
  , "syr" = "educ.yrsScnd"
  , "hyr" = "educ.yrsTert"
  , "hc" = "humanCapital"
  , "hca" = "humanCapital2"))

# drop 'male' and 'female' observations -- only keep combined [might be changed later]
leeLee <- leeLee[leeLee$sex == "MF",]

# add CoW id
leeLee$ccodecow <- countrycode(leeLee$country, "country.name", "cown") # attention: Hong Kong; China; Reunion; Serbia couldn't be matched -- need to solve later.

# # clean Przeworki PIPE data -----------------------------------------------------------------------
# 
# przeworski <- przeworski %>%
#   group_by(cowcodes) %>%
#   mutate(PrzSuffrage = FillDown(Var = eligible_pr))
#   

# clean V-Dem data --------------------------------------------------------------------------------

names(vdem)[grepl("v2fsuffrage", names(vdem))] <- "vdem.femSuffrage"
names(vdem)[grepl("v2msuffrage", names(vdem))] <- "vdem.maleSuffrage"

# clean Scheve & Stasavage replication data -------------------------------------------------------

keyVar.SSrepl <- c("funisuf", "fempart", "nondem", "wagctl2", "totden2", "stnum", "year")

SSrepl <- SSrepl[, keyVar.SSrepl]

SSrepl <- plyr::rename(SSrepl, c("funisuf" = "SS.femSuffrage"
  , "fempart" = "SS.femParticipation"
  , "nondem" = "SS.nondem"
  , "wagctl2" = "SS.wageCntrl"
  , "totden2" = "SS.unionDens"))

###############################################################
# combine datasets -------------------------------------------
###############################################################

# combine Boix with QoG
fullData <- merge(BMR[ ,!(colnames(BMR) %in% c("X", "abbreviation"))], QOG, by.x = c("ccode", "year")
  , by.y = c("ccodecow", "year"), all.x = T)  # matched by Correlates of War ID

# add polity 4
fullData <- merge(fullData, polity4[,c("ccode", "year", "polity", "polity2")]
  , by = c("ccode", "year"), all.x = T)  # matched by Correlates of War ID

# add Schularick
schularick$ccode <- countrycode(schularick$iso, "iso3c", "cown")

fullData <- merge(fullData, schularick[ ,!(colnames(schularick) %in% c("country", "iso", "ifs", "pop"))]
  , by = c("ccode", "year"), all.x = T)  # matched by Correlates of War ID

# add Vanhanen
fullData <- merge(fullData, vanhanen, by = c("ccode", "year"), all.x = T)

# add Credit Suisse
fullData <- merge(fullData, crdtCH[,!(colnames(crdtCH) %in% c("Country"))], by = c("ccode", "year"), all.x = T)

# add Dunning & Houle
fullData <- merge(fullData, houleDunning[,c("ccode", "year", "kshare_2", "capshare")]
  , by = c("ccode", 'year'), all.x = T)

# add Bourguignon/Morrisson
morrisson$bmgini <- as.numeric(as.character(morrisson$bmgini))
fullData <- merge(fullData, morrisson[,c("ccode", "year", "bmgini")]
  , by = c("ccode", 'year'), all.x = T)

# add CNTS data
fullData <- merge(fullData, cnts, by = c("ccode", "year")
  , all.x = T)

# add Lindvall's Heads of Government data
fullData <- merge(fullData, lindvall[, c("ccodecow", "year", "hogideo", "hogtenure", "hogrel")]
  , by.x = c("ccode", "year"), by.y = c("ccodecow", "year"), all.x = T)

# add Trebesch, Funke & Schularick data
fullData <- merge(fullData, trebesch[,vars.trebesch], by.x = c("ccode", "year")
  , by.y = c("ccodecow", "year"), all.x = T)

# add Chinn-Ito data on capital openness
fullData <- merge(fullData, KAopen[,c("ccodecow", "year", "ka_open")], by.x = c("ccode", "year")
  , by.y = c("ccodecow", "year"), all.x = T)

# add Piketty data
fullData <- merge(fullData, piketty.inequality[, c("ccode", "year","incomePre_p90p100"
  , "incomePre_p99p100", "wealth_p0p50", "wealth_p0p90", "wealth_p90p100", "wealth_p95p100"
  , "wealth_p99p100")], by = c("ccode", 'year'), all.x = T)

# add CapShare data from Waldenstrom
fullData <- merge(fullData, capitalshare.Waldenstrom[, !(colnames(capitalshare.Waldenstrom) %in% c("Country"))]
  , by.x = c("ccode", "year"),by.y = c("ccodecow", "Year"), all.x = T)

# add wealth inequality data from Roine & Waldenstrom
fullData <- merge(fullData, wealth.Waldenstrom[, !(colnames(wealth.Waldenstrom) %in% c("X", "country", "p95p99"))]
  , by.x = c("ccode", "year"),by.y = c("ccodecow", "Year"), all.x = T)

# add wealth inequality data for Portugal
fullData <- merge(fullData, wealth.portugal[, !(colnames(wealth.portugal) %in% c("X"))]
  , by.x = c("ccode", "year"),by.y = c("cow", "Year"), all.x = T)


# add land and structural inequality data from Albertus
fullData <- merge(fullData, albertus[, c("hmccode", "year", "lwheatsugar", "labordepAgri")]
  , by.x = c("ccode", "year"),by.y = c("hmccode", "year"), all.x = T) # note: two countries not matched

# add education data from Lee & Lee
fullData <- merge(fullData, leeLee[, !(colnames(leeLee) %in% c("region_code", "BLcode","country", "sex", "pop"))]
  , by.x = c("ccode", "year"),by.y = c("ccodecow", "year"), all.x = T) # note: two countries not matched

# add top inheritance tax rate
fullData <- merge(fullData, inheritanceTax[, !(colnames(inheritanceTax) %in% c(".id", "country"))]
  , by.x = c("ccode", "year"),by.y = c("ccode", "year"), all.x = T)

# add top income tax rate
fullData <- merge(fullData, incomeTax[, !(colnames(incomeTax) %in% c(".id", "country"))]
  , by.x = c("ccode", "year"),by.y = c("ccode", "year"), all.x = T)

# add trade openness from COW
fullData <- merge(fullData, COWtrade[, c("ccode", "year", "imports", "exports")]
  , by = c("ccode", "year"), all.x = T)

# add suffrage data from V-Dem
fullData <- merge(fullData, vdem[, c("COWcode", "year", "vdem.maleSuffrage", "vdem.femSuffrage")]
  , by.x = c("ccode", "year"), by.y = c("COWcode", "year"), all.x = T)

# add replication from V-Dem
fullData <- merge(fullData, SSrepl
  , by.x = c("ccode", "year"), by.y = c("stnum", "year"), all.x = T)

######################################################################
# combine wealth inequality measures to single column ----------------
######################################################################

## top-10%
fullData$top10wealth <- ifelse(!is.na(fullData$wealth_p90p100), fullData$wealth_p90p100
  , ifelse(!is.na(fullData$wealth.top10), fullData$wealth.top10
    , ifelse(!is.na(fullData$P90), fullData$P90, NA)))

## top-5%
fullData$top5wealth <- ifelse(!is.na(fullData$wealth_p95p100), fullData$wealth_p95p100
  , ifelse(!is.na(fullData$wealth.top5), fullData$wealth.top5
    , ifelse(!is.na(fullData$P95), fullData$P95, NA)))

## top-1%
fullData$top1wealth <- ifelse(!is.na(fullData$wealth_p99p100), fullData$wealth_p99p100
  , ifelse(!is.na(fullData$wealth.top1), fullData$wealth.top1
    , ifelse(!is.na(fullData$P99), fullData$P99, NA)))


###############################################################
# interpolate data --------------------------------------------
###############################################################

# linear interpolation
fullData.linApprox <- fullData %>%
  group_by(ccode) %>%
  mutate(wealth.top1 = na.approx(wealth.top1, na.rm = F)
    , wealth.top5 = na.approx(wealth.top5, na.rm = F)
    , wealth.top10 = na.approx(wealth.top10, na.rm = F)
    , educ.enrollPrim = na.approx(educ.enrollPrim, na.rm = F)
    , educ.enrollScnd = na.approx(educ.enrollScnd, na.rm = F)
    , educ.enrollTert = na.approx(educ.enrollTert, na.rm = F)
    , educ.noSchool = na.approx(educ.noSchool, na.rm = F)
    , educ.prim = na.approx(educ.prim, na.rm = F)
    , educ.primCompl = na.approx(educ.primCompl, na.rm = F)
    , educ.scnd = na.approx(educ.scnd, na.rm = F)
    , educ.scndCompl = na.approx(educ.scndCompl, na.rm = F)
    , educ.tert = na.approx(educ.tert, na.rm = F)
    , educ.tertCompl = na.approx(educ.tertCompl, na.rm = F)
    , educ.yrsTot = na.approx(educ.yrsTot, na.rm = F)
    , educ.yrsPrim = na.approx(educ.yrsPrim, na.rm = F)
    , educ.yrsScnd = na.approx(educ.yrsScnd, na.rm = F)
    , educ.yrsTert = na.approx(educ.yrsTert, na.rm = F)
    , humanCapital = na.approx(humanCapital, na.rm = F)
    , humanCapital2 = na.approx(humanCapital2, na.rm = F)
    , familyFarm = na.approx(familyFarm, na.rm = F)
    , percStudents = na.approx(percStudents, na.rm = F)
    , percLiterate = na.approx(percLiterate, na.rm = F)
    , urbanPop = na.approx(urbanPop, na.rm = F)
    , nonagriPop = na.approx(nonagriPop, na.rm = F)
    , bmgini = na.approx(bmgini, na.rm = F)
    , capshare.net = na.approx(capshare.net, na.rm = F)
    , capshare.gross = na.approx(capshare.gross, na.rm = F)
    # , incomePre_p0p50 = na.approx(incomePre_p0p50, na.rm = F)
    # , incomePre_p0p90 = na.approx(incomePre_p0p90, na.rm = F)
    , incomePre_p90p100 = na.approx(incomePre_p90p100, na.rm = F)
    # , incomePre_p95p100 = na.approx(incomePre_p95p100, na.rm = F)
    , incomePre_p99p100 = na.approx(incomePre_p99p100, na.rm = F)
    # , incomePost_p0p50 = na.approx(incomePost_p0p50, na.rm = F)
    # , incomePost_p0p90 = na.approx(incomePost_p0p90, na.rm = F)
    # , incomePost_p90p100 = na.approx(incomePost_p90p100, na.rm = F)
    # , incomePost_p95p100 = na.approx(incomePost_p95p100, na.rm = F)
    # , incomePost_p99p100 = na.approx(incomePost_p99p100, na.rm = F)
    , wealth_p0p50 = na.approx(wealth_p0p50, na.rm = F)
    , wealth_p0p90 = na.approx(wealth_p0p90, na.rm = F)
    , wealth_p90p100 = na.approx(wealth_p90p100, na.rm = F)
    , wealth_p95p100 = na.approx(wealth_p95p100, na.rm = F)
    , wealth_p99p100 = na.approx(wealth_p99p100, na.rm = F)
    , top1wealth = na.approx(top1wealth, na.rm = F)
    , top5wealth = na.approx(top5wealth, na.rm = F)
    , top10wealth = na.approx(top10wealth, na.rm = F))

# # polynomial (spline) interpolation
# fullData.splnApprox <- fullData %>%
#   group_by(ccode) %>%
#   mutate(wealth.top1 = na.spline(wealth.top1, na.rm = F)
#     , wealth.top5 = na.spline(wealth.top5, na.rm = F)
#     , wealth.top10 = na.spline(wealth.top10, na.rm = F)
#     , educ.enrollPrim = na.spline(educ.enrollPrim, na.rm = F)
#     , educ.enrollScnd = na.spline(educ.enrollScnd, na.rm = F)
#     , educ.enrollTert = na.spline(educ.enrollTert, na.rm = F)
#     , educ.noSchool = na.spline(educ.noSchool, na.rm = F)
#     , educ.prim = na.spline(educ.prim, na.rm = F)
#     , educ.primCompl = na.spline(educ.primCompl, na.rm = F)
#     , educ.scnd = na.spline(educ.scnd, na.rm = F)
#     , educ.scndCompl = na.spline(educ.scndCompl, na.rm = F)
#     , educ.tert = na.spline(educ.tert, na.rm = F)
#     , educ.tertCompl = na.spline(educ.tertCompl, na.rm = F)
#     , educ.yrsTot = na.spline(educ.yrsTot, na.rm = F)
#     , educ.yrsPrim = na.spline(educ.yrsPrim, na.rm = F)
#     , educ.yrsScnd = na.spline(educ.yrsScnd, na.rm = F)
#     , educ.yrsTert = na.spline(educ.yrsTert, na.rm = F)
#     , humanCapital = na.spline(humanCapital, na.rm = F)
#     , humanCapital2 = na.spline(humanCapital2, na.rm = F)
#     , familyFarm = na.spline(familyFarm, na.rm = F)
#     , percStudents = na.spline(percStudents, na.rm = F)
#     , percLiterate = na.spline(percLiterate, na.rm = F)
#     , urbanPop = na.spline(urbanPop, na.rm = F)
#     , nonagriPop = na.spline(nonagriPop, na.rm = F)
#     , bmgini = na.spline(bmgini, na.rm = F)
#     , capshare.net = na.spline(capshare.net, na.rm = F)
#     , capshare.gross = na.spline(capshare.gross, na.rm = F)
#     , incomePre_p0p50 = na.spline(incomePre_p0p50, na.rm = F)
#     , incomePre_p0p90 = na.spline(incomePre_p0p90, na.rm = F)
#     , incomePre_p90p100 = na.spline(incomePre_p90p100, na.rm = F)
#     , incomePre_p95p100 = na.spline(incomePre_p95p100, na.rm = F)
#     , incomePre_p99p100 = na.spline(incomePre_p99p100, na.rm = F)
#     , incomePost_p0p50 = na.spline(incomePost_p0p50, na.rm = F)
#     , incomePost_p0p90 = na.spline(incomePost_p0p90, na.rm = F)
#     , incomePost_p90p100 = na.spline(incomePost_p90p100, na.rm = F)
#     , incomePost_p95p100 = na.spline(incomePost_p95p100, na.rm = F)
#     , incomePost_p99p100 = na.spline(incomePost_p99p100, na.rm = F)
#     , wealth_p0p50 = na.spline(wealth_p0p50, na.rm = F)
#     , wealth_p0p90 = na.spline(wealth_p0p90, na.rm = F)
#     , wealth_p90p100 = na.spline(wealth_p90p100, na.rm = F)
#     , wealth_p95p100 = na.spline(wealth_p95p100, na.rm = F)
#     , wealth_p99p100 = na.spline(wealth_p99p100, na.rm = F))
# 


# export in various formats
save.dta13(fullData[fullData$year >= 1870,], "longrunData_1870onwards.dta", version = 14)
save.dta13(fullData.linApprox[fullData.linApprox$year >= 1870,]
  , "longrunData_linInterpolation.dta", version = 14)
write.csv(fullData[fullData$year >= 1870,], "longrunData_1870onwards.csv")
save(fullData, file = "longrunData_full.RData")

# summary tables ----------------------------------------

# subset to countries with *any* wealth data
fullDataWealth <- fullData %>% group_by(ccode) %>% 
  filter(any(!is.na(top1wealth | top5wealth | top10wealth)))

fullDataWealthInterpolated <- fullData.linApprox %>% group_by(ccode) %>% 
  filter(any(!is.na(top1wealth | top5wealth | top10wealth)))


# generate summary documents

## entire sample
makeDataReport(fullDataWealth, output = "pdf", file = "../code/summaryReports/summary_allYears.Rmd"
  , replace = T
  , reportTitle = "Data Summary for countries with any wealth data, all years, not interpolated")

## entire sample, interpolated
makeDataReport(fullDataWealthInterpolated, output = "pdf"
  , file = "../code/summaryReports/summary_allYears-interpolated.Rmd", replace = T
  , reportTitle = "Data Summary for countries with any wealth data, all years, interpolated")

## pre-WW1 sample
makeDataReport(fullDataWealth[fullDataWealth$year <= 1914,]
  , output = "pdf", file = "../code/summaryReports/summary_preWW1.Rmd", replace = T
  , reportTitle = "Data Summary for countries with any wealth data, until WW1, not interpolated")

## inter-war sample
makeDataReport(fullDataWealth[fullDataWealth$year >= 1914 & fullDataWealth$year <= 1945,]
  , output = "pdf", file = "../code/summaryReports/summary_WW1toWW2.Rmd", replace = T
  , reportTitle = "Data Summary for countries with any wealth data, 1914--1945, not interpolated")

## post-WW2 sample
makeDataReport(fullDataWealth[fullDataWealth$year > 1945,]
  , output = "pdf", file = "../code/summaryReports/summary_postWW2.Rmd", replace = T
  , reportTitle = "Data Summary for countries with any wealth data, 1945-today, not interpolated")


################### CONTINUE HERE #######################
# CHECK WHAT'S GOING ON WITH KOREA, SERBIA, MONTENEGRO
#########################################################