# Create Main YouGov UK Survey Dataset June 9 2021 MAE #

library(tidyverse)
library(haven)
library(tidylog)



# Import main datasets ----------------------------------------------------


# using SPSS as main input dataset to keep labels
#  BA 28/7/21 changing file paths to work for all users - Adding constituency level data and constituency data for panel

yg_dat <-read_sav("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/OxfordUni_WealthInequality_June2021_Client.sav")

yg_dat <- zap_formats(yg_dat)

yg_dat <- zap_widths(yg_dat)

yg_dat_cons <-read_sav("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/Pcon_constituency_Client.sav")

cons_data <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/constituency_data_yg.csv")

yg_dat_cons <- yg_dat_cons %>% 
  left_join(cons_data, by = "profile_pcon_2010_constituency_Cal")

#write_csv(yg_dat_cons, "~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/yg_data_par_cons.csv")

# Using csvs instead

#yg_dat <-read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/OxfordUni_WealthInequality_June2021_Client.csv")
#yg_dat_cons <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/yg_data_par_cons.csv")

yg_dat <- yg_dat %>% 
  left_join(yg_dat_cons, by  = "ID")

# Recoding 

# Don't Knows coded as NAs 
# All number answers coded as integers
# Nominal answers coded as characters

# Adding label variables for demographics ---------------------------------


# Adding Codes for Sector, Region, Ethnicity, Occupation, HH Income, Tenure


ed_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  Oprofile_education_level = as.numeric(c("1",
           "2","3","4","5","6","7","8","9","10",
           "11","12","13","14","15","16","17","18",
           "19","20","98","99")),
  education_code = c("No formal qualifications",
                "Youth training certificate/skillseekers",
                "Recognised trade apprenticeship completed","Clerical and commercial",
                "City & Guilds certificate",
                "City & Guilds certificate - advanced","ONC",
                "CSE grades 2-5",
                "CSE grade 1, GCE O level, GCSE, School Certificate",
                "Scottish Ordinary/ Lower Certificate","GCE A level or Higher Certificate",
                "Scottish Higher Certificate",
                "Nursing qualification (e.g. SEN, SRN, SCM, RGN)",
                "Teaching qualification (not degree)",
                "University diploma",
                "University or CNAA first degree (e.g. BA, B.Sc, B.Ed)",
                "University or CNAA higher degree (e.g. M.Sc, Ph.D)",
                "Other technical, professional or higher qualification",
                "Don't know","Prefer not to say","Skipped",
                "Not Asked")
)

sector_codes <- data.frame(
  stringsAsFactors = FALSE,
  Oemployer_type_private_non_self = c("1","2","3","4","5","6",
           "7","8","9","10","11","12","13","14","15","16",
           "17","18","19","20","21","22","23","24","25","26",
           "27","28","29","30","31","32","33","34","97",
           "99","998","999"),
  sector_private = c("Agriculture","Natural resources",
             "Construction",
             "Manufacturing: Automobiles ","Manufacturing: Heavy industry",
             "Manufacturing: Technology",
             "Manufacturing: Other","Wholesaling",
             "Media and communications","Financial Services: Banking",
             "Financial Services: Insurance",
             "Financial Services: Brokerage & Investment",
             "Business services",
             "Real estate","Scientific research","Technology services",
             "Engineering, design, or architecture",
             "Legal services","Personal care","Education",
             "Health","Arts, entertainment, and recreation",
             "Restaurant services","Hotels or lodging",
             "Airlines or air travel","Other travel","Retail",
             "Transportation","Maintenance and repair","Accounting",
             "Financial Services: Other",
             "Leisure",
             "Tourism ",
             "Utilities and energy","Other","Not applicable","Skipped",
             "Not Asked")
)

sector_codes <- sector_codes %>% 
  mutate(Oemployer_type_private_non_self=as.numeric(Oemployer_type_private_non_self))

region_codes<- data.frame(
  stringsAsFactors = FALSE,
       check.names = FALSE,
            profile_GOR = c("1","2","3","4","5","6",
                       "7","8","9","10","11","12","13","98","99"),
         region_name = c("North East","North West",
                       "Yorkshire and the Humber","East Midlands","West Midlands",
                       "East of England","London","South East","South West",
                       "Wales","Scotland","Northern Ireland",
                       "Non UK & Invalid","Skipped","Not Asked")
)

region_codes <- region_codes %>% 
  mutate(profile_GOR = as.numeric(profile_GOR))

ethnicity_codes <- data.frame(
  stringsAsFactors = FALSE,
       check.names = FALSE,
       Oethnicity_new = as.numeric(c("1","2","3","4","5","6",
                                          "7","8","9","10","11","12","13",
                                          "14","15","16","17","18","19",
                                          "98","99")),
       ethnicity_name = c("White British","Irish",
                          "Gypsy or Irish Traveller",
                           "Other White",
                           "White and Black Caribbean","White and Black African",
                           "White and Asian",
                           "Other Mixed","Indian",
                           "Pakistani","Bangladeshi","Chinese",
                           "Other Asian ","African",
                           "Caribbean",
                           "Other Black","Arab",
                           "Any other ethnic group",
                           "Prefer not to say","Skipped","Not Asked")
                   )

work_stat_codes <- data.frame(
  stringsAsFactors = FALSE,
       check.names = FALSE,
        Oprofile_work_stat = as.numeric(c("1","2","3","4","5","6",
                                          "7","8","98","99")),
       work_status = c("Full time",
                       "Part time (8-29 hrs)",
                       "Part time (<8 hrs)",
                        "Student","Retired","Unemployed",
                        "Not working","Other","Skipped",
                         "Not Asked")
                   )

occ_codes <- data.frame(
  stringsAsFactors = FALSE,
       check.names = FALSE,
  Oprofile_work_type = as.numeric(c("1",
            "2","3","4","5","6","7","8","9",
            "98","99")),
  occ_type = c("Professional",
                "Manager",
                "Junior managerial",
                "Sales or Services ",
                "Supervisor",
                "Skilled Manual Work",
                "Low-skilled Manual Work", "Other", "Never Worked",
                "Skipped","Not Asked")
             )

industry_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  Oprofile_work_industry = as.numeric(c("1","2","3","4","5","6","7",
            "8","9","10","11","12","13","14",
            "15","16","17","18","19","20",
            "21","98","99")),
  Not.Asked = c("Advertising/Marketing/PR",
                "Aerospace",
                "Agriculture/Chemicals/Forest Products","Automotive",
                "Computers/Electronics","Construction",
                "Consumer Goods","Education","Energy/Mining",
                "Finance/Insurance/Real Estate",
                "Government/Military/Public Service",
                "Hospitality/Recreation",
                "Media/Publishing/Entertainment","Medical/Health Services",
                "Pharmaceuticals","Retail","Service",
                "Telecommunications/Networking",
                "Travel/Transportation","Other",
                "Have never worked","Skipped","Not Asked")
)

marital_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  Oprofile_marital_stat = as.numeric(c("1","2","3","4","5","6","7",
            "8","9")),
  marital_status= c("Married","Living as married",
                "Separated (after being married)","Divorced",
                "Widowed","Never married",
                "Civil Partnership","Skipped","Not Asked")
)

housing_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  Oprofile_house_tenure = as.numeric(c("1","2","3","4","5","6","7",
           "8","9","98","99")),
  house_tenure = c("Own outright",
                "Own mortgage",
                "Shared Ownership",
                "Rent: private","Rent:LA",
                "Rent:HA",
                "Live with others: pay",
                "Live with others: free","Other",
                "Skipped","Not Asked")
)
  
hh_income_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  Oprofile_gross_household = as.numeric(c("1","2","3","4","5","6",
            "7","8","9","10","11","12","13",
            "14","15","16","17","98","99")),
  hh_income_code = c("under £5,000 per year",
                "£5,000 to £9,999 per year",
                "£10,000 to £14,999 per year",
                "£15,000 to £19,999 per year","£20,000 to £24,999 per year",
                "£25,000 to £29,999 per year",
                "£30,000 to £34,999 per year",
                "£35,000 to £39,999 per year",
                "£40,000 to £44,999 per year",
                "£45,000 to £49,999 per year","£50,000 to £59,999 per year",
                "£60,000 to £69,999 per year",
                "£70,000 to £99,999 per year",
                "£100,000 to £149,999 per year","£150,000 and over",
                "Don't know","Prefer not to answer",
                "Skipped","Not Asked")
)

pers_inc_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  Oprofile_gross_personal = as.numeric(c("1","2","3","4","5","6","7",
            "8","9","10","11","12","13","14",
            "15","16","98","99")),
  pers_income_code = c("under £5,000 per year",
                "£5,000 to £9,999 per year",
                "£10,000 to £14,999 per year",
                "£15,000 to £19,999 per year","£20,000 to £24,999 per year",
                "£25,000 to £29,999 per year",
                "£30,000 to £34,999 per year",
                "£35,000 to £39,999 per year",
                "£40,000 to £44,999 per year","£45,000 to £49,999 per year",
                "£50,000 to £59,999 per year",
                "£60,000 to £69,999 per year",
                "£70,000 to £99,999 per year","£100,000 and over",
                "Don't know","Prefer not to answer","Skipped",
                "Not Asked")
)

yg_dat <- yg_dat %>% 
  left_join(ed_codes, by = "Oprofile_education_level")

yg_dat <- yg_dat %>% 
  left_join(sector_codes, by = "Oemployer_type_private_non_self")

yg_dat <- yg_dat %>% 
  left_join(region_codes, by = "profile_GOR")

yg_dat <- yg_dat %>% 
  left_join(ethnicity_codes, by = "Oethnicity_new")

yg_dat <- yg_dat %>% 
  left_join(work_stat_codes, by = "Oprofile_work_stat")

yg_dat <- yg_dat %>% 
  left_join(occ_codes, by = "Oprofile_work_type")

yg_dat <- yg_dat %>% 
  left_join(marital_codes, by = "Oprofile_marital_stat")

yg_dat <- yg_dat %>% 
  left_join(housing_codes, by = "Oprofile_house_tenure")

yg_dat <- yg_dat %>% 
  left_join(hh_income_codes, by = "Oprofile_gross_household")

yg_dat <- yg_dat %>% 
  left_join(pers_inc_codes, by = "Oprofile_gross_personal")

rm(ed_codes, ethnicity_codes, hh_income_codes, housing_codes, industry_codes, marital_codes, occ_codes,
   pers_inc_codes, region_codes, sector_codes, work_stat_codes)





# Adding New Variables for Demographics -----------------------------------



# Create New Variables with DK's removed etc  for DEMOGRAPHICS

yg_dat <- yg_dat %>% 
  mutate(turnout_ge_2019 = case_when(voted_ge_2019 == 1L ~ 1L,
                                   voted_ge_2019 == 2L ~ 0L,
                                   TRUE ~ NA_integer_),
         ge_2019 = case_when(pastvote_ge_2019 == 1L ~ "Conservative",
                                      pastvote_ge_2019 == 2L ~ "Labour",
                                      pastvote_ge_2019 == 3L ~ "Liberal Democrat",
                                      pastvote_ge_2019 == 4L ~ "SNP",
                                      pastvote_ge_2019 == 5L ~ "Plaid Cymru",
                                      pastvote_ge_2019 == 6L ~ "Brexit Party",
                                      pastvote_ge_2019 == 7L ~ "Green",
                                      pastvote_ge_2019 == 98 ~ "Other",
                                      TRUE ~ NA_character_),
         EURef = case_when(pastvote_EURef == 1L ~ "Remain",
                                    pastvote_EURef == 2L ~ "Leave",
                                    pastvote_EURef == 3L ~ "Did Not Vote",
                                    pastvote_EURef == 4L ~ "Can't Remember",
                                    TRUE ~ NA_character_),
         voted_leave = case_when(EURef == "Remain" ~ 0L, 
                                 EURef == "Leave" ~ 1L,
                                 EURef == "Did Not Vote" ~0L,
                                 TRUE ~ NA_integer_),
         voted_remain = case_when(EURef == "Remain" ~ 1L, 
                                 EURef == "Leave" ~ 0L,
                                 EURef == "Did Not Vote" ~0L,
                                 TRUE ~ NA_integer_),
         ref_didnt_vote = case_when(EURef == "Remain" ~ 0L, 
                                  EURef == "Leave" ~ 0L,
                                  EURef == "Did Not Vote" ~1L,
                                  TRUE ~ NA_integer_),
         gender = case_when(profile_gender == 1L ~ "Male",
                                    profile_gender == 2L ~ "Female",
                                    TRUE ~ NA_character_),
         social_grade_no = case_when (profile_socialgrade_cie < 7 ~ 6 - profile_socialgrade_cie,
                                       TRUE ~ NA_real_),
         socialgrade_cie = case_when(profile_socialgrade_cie == 1L ~ "A",
                                             profile_socialgrade_cie == 2L ~ "B",
                                             profile_socialgrade_cie == 3L ~ "C1",
                                             profile_socialgrade_cie == 4L ~ "C2",
                                             profile_socialgrade_cie == 5L ~ "D",
                                             profile_socialgrade_cie == 6L ~ "E",
                                             TRUE ~ NA_character_),
         UK_citizen = case_when(Ocitizenship < 3 ~ 1,
                                Ocitizenship == 3 ~ 0,
                                TRUE ~NA_real_),
         citizenship_code = case_when(Ocitizenship == 1 ~ "UK Citizen",
                                 Ocitizenship == 2 ~ "Dual National",
                                 Ocitizenship == 3 ~ "Non-Citizen",
                                 TRUE ~ NA_character_),
         education_no = case_when(Oprofile_education_level <19 ~ Oprofile_education_level,
                                  Oprofile_education_level> 18 ~ NA_real_),
         education_short = case_when(Oprofile_education_level == 1 ~ 1,
                                     Oprofile_education_level %in% 2:10 ~ 2,
                                     Oprofile_education_level %in% 11:12 ~ 3,
                                     Oprofile_education_level %in% 13:14 ~ 3,
                                     Oprofile_education_level %in% 15:16 ~ 4,
                                     Oprofile_education_level ==17 ~ 5,
                                     Oprofile_education_level ==18 ~ 4,
                                     Oprofile_education_level > 18 ~NA_real_),
         degree = case_when(Oprofile_education_level %in% 15:18 ~ 1,
                            Oprofile_education_level <15 ~ 0,
                            TRUE ~NA_real_),
         homeowner = case_when(Oprofile_house_tenure < 4 ~ 1,
                               Oprofile_house_tenure %in% 4:9 ~ 0,
                               TRUE ~NA_real_),
         renter = case_when(Oprofile_house_tenure < 4 ~ 0,
                            Oprofile_house_tenure %in% 4:6 ~ 1,
                            Oprofile_house_tenure %in% 7:9 ~0,
                            TRUE ~NA_real_), 
         parent_homeowner = case_when(Q10 < 4 ~ 1,
                                      Q10 %in% 4:7 ~ 0,
                               TRUE ~NA_real_),
         parent_renter = case_when(Q10 < 4 ~ 0,
                                   Q10 %in% 4:6 ~ 1,
                                   Q10 == 7 ~0,
                            TRUE ~NA_real_), 
         household_size = case_when(Oprofile_household_size < 9 ~ Oprofile_household_size,
                                    TRUE ~ NA_real_),
         hh_income = case_when(Oprofile_gross_household<16 ~ Oprofile_gross_household,
                               TRUE ~ NA_real_),
         pers_income = case_when(Oprofile_gross_personal<15 ~ Oprofile_gross_personal,
                               TRUE ~ NA_real_)
         
         ) 

# yg_dat %>% 
#   group_by(age) %>% 
#   count() %>% View()


#write_csv(yg_dat, "/Users/madselkjaer/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/yg_data_for_analysis.csv")


# ADDING LOCAL AUTHORITY INFORMATION

yg_local_authority_codes <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/housing data/median_house_prices_for_yg.csv")

yg_local_authority_codes <- yg_local_authority_codes %>% 
  rename(median_price = price,
         median_price_group = group,
         Oprofile_oslaua = yg_code)

yg_dat <- yg_dat %>% 
  left_join(yg_local_authority_codes, by = "Oprofile_oslaua")

la_wide <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/shiny_app/la_wide.csv")

la_wide <- la_wide %>% 
  select(reg_code:la_median_mar20, p_10, p_20, p_30, p_40, p_50, p_60, p_70, p_80, p_90, p_95)

yg_dat <- yg_dat %>% 
  left_join(la_wide, by = "la_code")

## Adding in Local Authority Pay

la_pay <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/labor_mkt_data/la_pay_long.csv")

la_pay <- la_pay %>% 
  dplyr::select(-(la_name)) %>% 
  filter(quantile == 50) %>% 
  select(-quantile)



yg_dat <- yg_dat %>% 
  left_join(la_pay, by= "la_code")


# Recoding Survey Questions -----------------------------------------------

# Remove all DKs and Skipped  - code as NA. 
# Coding is set-up so agreement is higher number except where noted

yg_dat <-  yg_dat %>% 
  mutate(pension_security = case_when(Q12 == 1 ~ 1,
                                      Q12 == 2 ~ 2,
                                      Q12 == 3 ~ 3,
                                      Q12 == 4 ~ 4,
                                        TRUE ~ NA_real_),
  perception_hh_inc = case_when(Q13_1 == 1 ~ 1,
                                Q13_1 == 2 ~ 2,
                                Q13_1 == 3 ~ 3,
                                Q13_1 == 4 ~ 4,
                                Q13_1 == 5 ~ 5,
                                TRUE ~ NA_real_),
  perception_hh_wea = case_when(Q13_2 == 1 ~ 1,
                                Q13_2 == 2 ~ 2,
                                Q13_2 == 3 ~ 3,
                                Q13_2 == 4 ~ 4,
                                Q13_2 == 5 ~ 5,
                                TRUE ~ NA_real_),        
  perception_val_hou = case_when(Q13_3 == 1 ~ 1,
                                 Q13_3 == 2 ~ 2,
                                 Q13_3 == 3 ~ 3,
                                 Q13_3 == 4 ~ 4,
                                 Q13_3 == 5 ~ 5,
                                 TRUE ~ NA_real_),       
  la_typ_hh_inc = case_when(Q14 == 1 ~ 1,
                            Q14 == 2 ~ 2,
                            Q14 == 3 ~ 3,
                            Q14 == 4 ~ 4,
                            Q14 == 5 ~ 5,
                            Q14 == 6 ~ 6,
                            TRUE ~ NA_real_),
  la_typ_house_cost = case_when(Q15 == 1 ~ 1,
                              Q15 == 2 ~ 2,
                              Q15 == 3 ~ 3,
                              Q15 == 4 ~ 4,
                              Q15 == 5 ~ 5,
                              Q15 == 6 ~ 6,
                              TRUE ~ NA_real_),
  diff_inc = case_when(Q23_1 == 1 ~ 1,
                     Q23_1 == 2 ~ 2,
                     Q23_1 == 3 ~ 3,
                     Q23_1 == 4 ~ 4,
                     Q23_1 == 5 ~ 5,
                     TRUE ~ NA_real_),
  diff_wea = case_when(Q23_2 == 1 ~ 1,
                     Q23_2 == 2 ~ 2,
                     Q23_2 == 3 ~ 3,
                     Q23_2 == 4 ~ 4,
                     Q23_2 == 5 ~ 5,
                     TRUE ~ NA_real_),
  diff_houses = case_when(Q23_3 == 1 ~ 1,
                        Q23_3 == 2 ~ 2,
                        Q23_3 == 3 ~ 3,
                        Q23_3 == 4 ~ 4,
                        Q23_3 == 5 ~ 5,
                        TRUE ~ NA_real_),
  diff_inc_la = case_when(Q24_1 == 1 ~ 1,
                        Q24_1 == 2 ~ 2,
                        Q24_1 == 3 ~ 3,
                        Q24_1 == 4 ~ 4,
                        Q24_1 == 5 ~ 5,
                        TRUE ~ NA_real_),
  diff_wea_la = case_when(Q24_2 == 1 ~ 1,
                        Q24_2 == 2 ~ 2,
                        Q24_2 == 3 ~ 3,
                        Q24_2 == 4 ~ 4,
                        Q24_2 == 5 ~ 5,
                        TRUE ~ NA_real_),
  diff_houses_la = case_when(Q24_3 == 1 ~ 1,
                           Q24_3 == 2 ~ 2,
                           Q24_3 == 3 ~ 3,
                           Q24_3 == 4 ~ 4,
                           Q24_3 == 5 ~ 5,
                           TRUE ~ NA_real_),
  tax_slider = case_when(Q25_1 == 0 ~ 0,
                       Q25_1 == 1 ~ 1,
                       Q25_1 == 2 ~ 2,
                       Q25_1 == 3 ~ 3,
                       Q25_1 == 4 ~ 4, 
                       Q25_1 == 5 ~ 5, 
                       Q25_1 == 6 ~ 6,
                       Q25_1 == 7 ~ 7,
                       Q25_1 == 8 ~ 8,
                       Q25_1 == 9 ~ 9, 
                       Q25_1 == 10 ~ 10,
                       TRUE ~ NA_real_),
  like_taxing_wealth = as.numeric(Q25_1<5),
  support_redistribution_inc = case_when(Q27 == 1 ~ 1,
                                       Q27 == 2 ~ 2,
                                       Q27 == 3 ~ 3,
                                       Q27 == 4 ~ 4,
                                       TRUE ~ NA_real_),
  support_redistribution_la = case_when(Q29 == 1 ~ 1,
                                      Q29 == 2 ~ 2,
                                      Q29 == 3 ~ 3,
                                      Q29 == 4 ~ 4,
                                      TRUE ~ NA_real_),
  support_redistribution_wea_la = case_when(Q33 == 1 ~ 1,
                                          Q33 == 2 ~ 2,
                                          Q33 == 3 ~ 3,
                                          Q33 == 4 ~ 4,
                                          TRUE ~ NA_real_),
  pol_complicated = case_when(Q36_1 == 1 ~ 1,
                            Q36_1 == 2 ~ 2,
                            Q36_1 == 3 ~ 3, 
                            Q36_1 == 4 ~ 4,
                            Q36_1 == 5 ~ 5,
                            TRUE ~ NA_real_),
  pol_say = case_when(Q36_2 == 1 ~ 1,
                    Q36_2 == 2 ~ 2,
                    Q36_2 == 3 ~ 3,
                    Q36_2 == 4 ~ 4,
                    Q36_2 == 5 ~ 5,
                    TRUE ~ NA_real_),
  pol_care = case_when(Q36_3 == 1 ~ 1,
                     Q36_3 == 2 ~ 2,
                     Q36_3 == 3 ~ 3,
                     Q36_3 == 4 ~ 4,
                     Q36_3 == 5 ~ 5,
                     TRUE ~ NA_real_),
  inh_overall = case_when(Q37_1 == 1 ~ 1,
                        Q37_1 == 2 ~ 2,
                        Q37_1 == 3 ~ 3,
                        Q37_1 == 4 ~ 4,
                        Q37_1 == 5 ~ 5,
                        TRUE ~ NA_real_),
  inh_high = case_when(Q37_2 == 1 ~ 1,
                     Q37_2 == 2 ~ 2,
                     Q37_2 == 3 ~ 3,
                     Q37_2 == 4 ~ 4,
                     Q37_2 == 5 ~ 5,
                     TRUE ~ NA_real_),
  inh_moderate = case_when(Q37_3 == 1 ~ 1,
                         Q37_3 == 2 ~ 2,
                         Q37_3 == 3 ~ 3,
                         Q37_3 == 4 ~ 4,
                         Q37_3 == 5 ~ 5,
                         TRUE ~ NA_real_),
  inh_low = case_when(Q37_4 == 1 ~ 1,
                    Q37_4 == 2 ~ 2,
                    Q37_4 == 3 ~ 3,
                    Q37_4 == 4 ~ 4,
                    Q37_4 == 5 ~ 5,
                    TRUE ~ NA_real_),
  inh_you_pay = case_when(Q37_5 == 1 ~ 1,
                        Q37_5 == 2 ~ 2,
                        Q37_5 == 3 ~ 3,
                        Q37_5 == 4 ~ 4,
                        Q37_5 == 5 ~ 5,
                        TRUE ~ NA_real_),
  inh_heirs_pay = case_when(Q37_6 == 1 ~ 1,
                          Q37_6 == 2 ~ 2,
                          Q37_6 == 3 ~ 3,
                          Q37_6 == 4 ~ 4,
                          Q37_6 == 5 ~ 5,
                          TRUE ~ NA_real_),
  inc_overall = case_when(Q38_1 == 1 ~ 1,
                        Q38_1 == 2 ~ 2,
                        Q38_1 == 3 ~ 3,
                        Q38_1 == 4 ~ 4,
                        Q38_1 == 5 ~ 5,
                        TRUE ~ NA_real_),
  inc_high = case_when(Q38_2 == 1 ~ 1,
                     Q38_2 == 2 ~ 2,
                     Q38_2 == 3 ~ 3,
                     Q38_2 == 4 ~ 4,
                     Q38_2 == 5 ~ 5,
                     TRUE ~ NA_real_),
  inc_moderate = case_when(Q38_3 == 1 ~ 1,
                         Q38_3 == 2 ~ 2,
                         Q38_3 == 3 ~ 3,
                         Q38_3 == 4 ~ 4,
                         Q38_3 == 5 ~ 5,
                         TRUE ~ NA_real_),
  inc_low = case_when(Q38_4 == 1 ~ 1,
                    Q38_4 == 2 ~ 2,
                    Q38_4 == 3 ~ 3,
                    Q38_4 == 4 ~ 4,
                    Q38_4 == 5 ~ 5,
                    TRUE ~ NA_real_),
  inc_you_pay = case_when(Q38_5 == 1 ~ 1,
                        Q38_5 == 2 ~ 2,
                        Q38_5 == 3 ~ 3,
                        Q38_5 == 4 ~ 4,
                        Q38_5 == 5 ~ 5,
                        TRUE ~ NA_real_),
  inc_very_high = case_when(Q38_6 == 1 ~ 1,
                          Q38_6 == 2 ~ 2,
                          Q38_6 == 3 ~ 3,
                          Q38_6 == 4 ~ 4,
                          Q38_6 == 5 ~ 5,
                          TRUE ~ NA_real_),
  supp_local_housing = case_when(Q39 == 1 ~ 5,
                               Q39 == 2 ~ 4,
                               Q39 == 3 ~ 3,
                               Q39 == 4 ~ 2,
                               Q39 == 5 ~ 1,
                               TRUE ~ NA_real_),
  greenbelt = case_when(Q42 == 1 ~ 5,
                      Q42 == 2 ~ 4,
                      Q42 == 3 ~ 3,
                      Q42 == 4 ~ 2,
                      Q42 == 5 ~ 1,
                      TRUE ~ NA_real_),
  housing_slider = case_when(Q43_1 == 0 ~ 0,
                           Q43_1 == 1 ~ 1,
                           Q43_1 == 2 ~ 2,
                           Q43_1 == 3 ~ 3,
                           Q43_1 == 4 ~ 4, 
                           Q43_1 == 5 ~ 5, 
                           Q43_1 == 6 ~ 6,
                           Q43_1 == 7 ~ 7,
                           Q43_1 == 8 ~ 8,
                           Q43_1 == 9 ~ 9, 
                           Q43_1 == 10 ~ 10,
                           TRUE ~ NA_real_)
)


yg_dat <- yg_dat %>% 
  mutate(house_price_written = Q8a) %>% 
  mutate(house_price_written = case_when(house_price_written %in% c("__NA__", "__DK__") ~NA_character_,
                                         TRUE ~ house_price_written)) %>% 
  mutate(parsed_house = parse_number(house_price_written)) %>% 
  mutate(est_house_price = case_when(between(parsed_house, 20, 999) ~ 1000*parsed_house,
                                     between(parsed_house, 0, 19) ~ 1000000*parsed_house,
                                     TRUE ~ parsed_house)) %>% 
  mutate(est_house_price = case_when (house_price_written == "one hundred and sixty thousand approx"~ 160000,
                                      house_price_written == "Two hundred and fifty five thousand." ~250000,
                                      house_price_written == "Two hundred thousand" ~ 200000,
                                      house_price_written == "Three hundred and twenty thousand" ~320000,
                                      house_price_written == "£1,2 million" ~1200000,
                                      house_price_written == "£500,00 ish" ~500000,
                                      house_price_written == "£1,1 million" ~1100000,
                                      house_price_written == "3 to 400,000" ~350000,
                                      house_price_written == "One million" ~1000000,
                                      house_price_written == "1,000.000" ~1000000,
                                      house_price_written == "200,00" ~200000,
                                      house_price_written == "350,00" ~350000,
                                      house_price_written == "£450,00" ~450000,
                                      house_price_written == "£130,00" ~130000,
                                      house_price_written == "450,00" ~450000,
                                      house_price_written == "£18500)" ~185000,
                                      house_price_written == "135,00" ~135000,
                                      house_price_written == "150,00" ~150000,
                                      house_price_written == "£1900" ~ NA_real_, # 1.9 million?
                                      house_price_written == "£1500.00" ~ NA_real_, # 1.5 million?
                                      house_price_written == "£300,00" ~  300000,
                                      house_price_written == "£150,00.00" ~150000,
                                      house_price_written == "£1000000000000000000" ~ NA_real_,
                                      house_price_written == "0" ~NA_real_,
                                      house_price_written =="4000" ~NA_real_,
                                      house_price_written == "Flat - less than 100k" ~100000,
                                      house_price_written == "£1-£1.2m" ~1100000,
                                      house_price_written == "£115,000-£125,000" ~120000,
                                      house_price_written == "£25000 to. 30000" ~27500,
                                      house_price_written == "£300k-320k" ~310000, 
                                      house_price_written == "Not sure but think between 290k and 320k" ~305000, 
                                      house_price_written == "£230,000-250,000" ~240000,
                                      house_price_written == "450,000 to £500,000" ~475000,
                                      house_price_written == "Between £100K and £150K" ~125000,
                                      house_price_written == "350 - 400k" ~375000,
                                      house_price_written == "£380 to 400 k" ~390000,
                                      house_price_written == "£350K - 360K" ~355000,
                                      TRUE ~ est_house_price
  )) %>%
  mutate(log_est_house_price = log(est_house_price),
         homeowner_X_price = case_when(homeowner==1 ~ est_house_price,
                                       homeowner == 0 ~ 0,
                                       TRUE ~NA_real_),
         homeowner_X_log_price = case_when(homeowner==1 ~ log_est_house_price,
                                           homeowner == 0 ~ 0,
                                           TRUE ~NA_real_),
         log_median_price = log(median_price),
         house_rent_groups  = case_when(homeowner==0 ~ "Don't Own",
                                        est_house_price <=100000 ~ "Own, £100k and under",
                                        between(est_house_price, 100001, 200000) ~ "Own, £100k to £200k",
                                        between(est_house_price, 200001, 300000) ~ "Own, £200k to £300k",
                                        between(est_house_price, 300001, 400000) ~ "Own, £300k to £400k",
                                        between(est_house_price, 400001, 500000) ~ "Own, £400k to £500k",
                                        between(est_house_price, 500001, 750000) ~ "Own, £500k to £750k",
                                        est_house_price >750000 ~ "Own, £750k and up",
                                        TRUE ~NA_character_
         ),
         house_rent_groups = factor(house_rent_groups, 
                                    levels = c("Don't Own", "Own, £100k and under", "Own, £100k to £200k",
                                               "Own, £200k to £300k", "Own, £300k to £400k", "Own, £400k to £500k",
                                               "Own, £500k to £750k", "Own, £750k and up"))
  )

yg_dat <- yg_dat %>% 
  mutate(house_price_written_parents = Q11a) %>% 
  mutate(house_price_written_parents = case_when(house_price_written_parents %in% c("__NA__", "__DK__") ~NA_character_,
                                                 TRUE ~ house_price_written_parents)) %>% 
  mutate(parsed_house_parents = parse_number(house_price_written_parents)) %>% 
  mutate(est_house_price_parents = case_when(between(parsed_house_parents, 20, 999) ~ 1000*parsed_house_parents,
                                             between(parsed_house_parents, 0, 19) ~ 1000000*parsed_house_parents,
                                             TRUE ~ parsed_house_parents))%>% 
  mutate(est_house_price_parents = case_when (house_price_written_parents == "-£300,00"~ -300000,
                                              house_price_written_parents == "290,00"~ 290000,
                                              house_price_written_parents == "1,00000"~ 1000000,
                                              house_price_written_parents == "18000.00"~ 1800000,
                                              house_price_written_parents == "350,00"~ 350000,
                                              house_price_written_parents == "£350,000-£375,000"~ 362500,
                                              house_price_written_parents == "300.000/350.000"~ 325000,
                                              house_price_written_parents == "Zero"~ 0,
                                              house_price_written_parents == "5000.000"~ 5000000,
                                              house_price_written_parents == "-£400,000"~ -400000,
                                              house_price_written_parents == "My dads £450,000 My Mums £225,000"~ 675000,
                                              house_price_written_parents == "£270,000-£300,000"~ 285000,
                                              house_price_written_parents == "300.00"~ 300000,
                                              house_price_written_parents == "£250,000 - £300,000"~ 275000,
                                              house_price_written_parents == "£170.00"~ 170000,
                                              house_price_written_parents == "£180,000-mother's only, (£150,000, fathers)"~ 330000,
                                              house_price_written_parents == "Parents deceased, house sold in 2009 for £350,000"~ NA_real_,
                                              house_price_written_parents == "Sold earlier this year for £435,000"~ NA_real_,
                                              house_price_written_parents == "£280,000 (for each of the houses)"~ 560000,
                                              house_price_written_parents == "Two hundred and fifty thousand"~ 250000,
                                              house_price_written_parents == "They both live in their own houses. I think my dad's would be around 400.00 euro, my mums possibly slightly more, like 450.000 euro"~ 713000,
                                              house_price_written_parents == "c. £200k"~ 200000,
                                              house_price_written_parents == "75,000 euro"~ 63000,
                                              house_price_written_parents == "Mums £180000. Dads £400000"~ 580000,
                                              house_price_written_parents == "£280k sold for 3 years ago"~ NA_real_,
                                              house_price_written_parents == "500.000 €"~ 420000,
                                              house_price_written_parents == "350 - 400k"~ 375000,
                                              house_price_written_parents == "1.5million Naira"~ 2700,
                                              house_price_written_parents == "One hundred and eighty thousand"~ 180000,
                                              house_price_written_parents == "Difficult to answer as it is a farmhouse integral to the farm (c. 220 acres)"~ 7500001,
                                              house_price_written_parents == "30k INR"~ 300,
                                              house_price_written_parents == "3000 grand"~ 300000,
                                              house_price_written_parents == "Like around 250k to 300k, comparable ones in the area sell for as much as 400k but ours needs some work doing"~ 275000,
                                              house_price_written_parents == "£400 to 500k"~ 450000,
                                              house_price_written_parents == "over a million"~ 1000000,
                                              house_price_written_parents == "300000€"~ 250000,
                                              house_price_written_parents == "Same house - £260,000"~ 260000,
                                              TRUE ~ est_house_price_parents)) %>%
  mutate(log_est_house_price_parents = log(est_house_price_parents),
         homeowner_X_price_parents = case_when(parent_homeowner==1 ~ est_house_price_parents,
                                               parent_homeowner == 0 ~ 0,
                                               TRUE ~NA_real_),
         homeowner_X_log_price_parents = case_when(parent_homeowner==1 ~ log_est_house_price_parents,
                                                   parent_homeowner == 0 ~ 0,
                                                   TRUE ~NA_real_),
         house_rent_groups_parents  = case_when(parent_homeowner==0 ~ "Don't Own",
                                                est_house_price_parents <=100000 ~ "Own, £100k and under",
                                                between(est_house_price_parents, 100001, 200000) ~ "Own, £100k to £200k",
                                                between(est_house_price_parents, 200001, 300000) ~ "Own, £200k to £300k",
                                                between(est_house_price_parents, 300001, 400000) ~ "Own, £300k to £400k",
                                                between(est_house_price_parents, 400001, 500000) ~ "Own, £400k to £500k",
                                                between(est_house_price_parents, 500001, 750000) ~ "Own, £500k to £750k",
                                                est_house_price_parents >750000 ~ "Own, £750k and up",
                                                TRUE ~NA_character_),
         house_rent_groups_parents = factor(house_rent_groups_parents, 
                                            levels = c("Don't Own", "Own, £100k and under", "Own, £100k to £200k",
                                                       "Own, £200k to £300k", "Own, £300k to £400k", "Own, £400k to £500k",
                                                       "Own, £500k to £750k", "Own, £750k and up")))  

# COMPREHENSION CHECKS


# filter responses to include only individuals who have answered the comprehension checks correctly

# comprehension checks for national survey: Q18 and Q19

yg_dat <- yg_dat %>% mutate(CC1_n = case_when(Q18 == 2 ~ 1,
                                              Q18 == 1 ~ 0,
                                              Q18 == 3 ~ 0,
                                              Q18 == 4 ~ 0,
                                              TRUE ~ NA_real_),
                            CC2_n = case_when(Q19 == 3 ~ 1,
                                              Q19 == 1 ~ 0,
                                              Q19 == 2 ~ 0,
                                              Q19 == 4 ~ 0, 
                                              TRUE ~ NA_real_),
                            CC_n = case_when(CC1_n == 1 & CC2_n == 1 ~ 1,
                                             CC1_n == 0 & CC2_n == 1 ~ 0,
                                             CC1_n == 1 & CC2_n == 0 ~ 0,
                                             CC1_n == 0 & CC2_n == 0 ~ 0,
                                             TRUE ~ NA_real_))

# insert dummies right after the questions
yg_dat <- yg_dat %>% relocate(CC1_n, .after = Q18) %>% relocate(CC2_n, .after = Q19) %>%
  relocate(CC_n, .after = CC2_n)

# comprehension checks for local authority questions: Q21 and Q22

yg_dat <- yg_dat %>% mutate(CC1_l = case_when(Q21 == 1 & median_price > 230000 ~ 1,
                                              Q21 == 1 & median_price <= 230000 ~ 0,
                                              Q21 == 2 & median_price > 230000 ~ 0,
                                              Q21 == 2 & median_price <= 230000 ~ 1,
                                              Q21 == 3 ~ 0, 
                                              TRUE ~ NA_real_),
                            CC2_l = case_when(Q22 == 3 ~ 1,
                                              Q22 == 1 ~ 0,
                                              Q22 == 2 ~ 0,
                                              Q22 == 4 ~ 0, 
                                              TRUE ~ NA_real_),
                            CC_l = case_when(CC1_l == 1 & CC2_l == 1 ~ 1,
                                             CC1_l == 0 & CC2_l == 1 ~ 0,
                                             CC1_l == 1 & CC2_l == 0 ~ 0,
                                             CC1_l == 0 & CC2_l == 0 ~ 0,
                                             TRUE ~ NA_real_))

# there are some local authorities where the median house price is pretty much exaclty 230k
# residents there often answered "don't know" which is currently coded as incorrect
# could think about changing that

yg_dat <- yg_dat %>% relocate(CC1_l, .after = Q21) %>% relocate(CC2_l, .after = Q22) %>%
  relocate(CC_l, .after = CC2_l) %>% relocate(median_price, .after = CC1_l)

# Treatment group variable that only includes respondents in treatment group if both 
# comprehension checks were answered correctly

yg_dat <- yg_dat %>% mutate(treatment_cc = case_when(Qtreatment == 1 ~ 1,
                                                     Qtreatment == 2 & CC_n == 1 ~ 2,
                                                     Qtreatment == 3 & CC_l == 1 ~ 3,
                                                     TRUE ~ NA_real_)) %>%
  relocate(treatment_cc, .after = Qtreatment)

yg_dat <- yg_dat %>% mutate(treatment_cc_f = case_when(treatment_cc == 1 ~ "Control",
                                                       treatment_cc == 2 ~ "National",
                                                       treatment_cc == 3 ~ "Local",
                                                       TRUE ~ NA_character_)) %>%
  relocate(treatment_cc_f, .after = treatment_cc)

#stat.desc(yg_dat$treatment_cc)

#-------------------------------------------------------------------------------

# ADDITIONAL VARIABLES

# add numeric variable for house value groups

yg_dat <- yg_dat %>% mutate(house_value_bins = case_when(house_rent_groups == "Don't Own" ~ 1,
                                                         house_rent_groups == "Own, £100k and under" ~ 2,
                                                         house_rent_groups == "Own, £100k to £200k" ~ 3,
                                                         house_rent_groups == "Own, £200k to £300k" ~ 4,
                                                         house_rent_groups == "Own, £300k to £400k" ~ 5,
                                                         house_rent_groups == "Own, £400k to £500k" ~ 6,
                                                         house_rent_groups == "Own, £500k to £750k" ~ 7,
                                                         house_rent_groups == "Own, £750k and up" ~ 8,
                                                         TRUE ~ NA_real_))

yg_dat <- yg_dat %>% mutate(house_value_bins_parents = case_when(house_rent_groups_parents == "Don't Own" ~ 1,
                                                                 house_rent_groups_parents == "Own, £100k and under" ~ 2,
                                                                 house_rent_groups_parents == "Own, £100k to £200k" ~ 3,
                                                                 house_rent_groups_parents == "Own, £200k to £300k" ~ 4,
                                                                 house_rent_groups_parents == "Own, £300k to £400k" ~ 5,
                                                                 house_rent_groups_parents == "Own, £400k to £500k" ~ 6,
                                                                 house_rent_groups_parents == "Own, £500k to £750k" ~ 7,
                                                                 house_rent_groups_parents == "Own, £750k and up" ~ 8,
                                                                 TRUE ~ NA_real_))


# add factor variables for Q12 (pension security) and Q13 (economic self-perception)

yg_dat <- yg_dat %>%
  mutate(pension_security_code = case_when(Q12 == 1 ~ "Very confident",
                                           Q12 == 2 ~ "Somewhat confident",
                                           Q12 == 3 ~ "Somewhat uncertain",
                                           Q12 == 4 ~ "Very uncertain",
                                           TRUE ~ NA_character_),
         perception_hh_inc_f = case_when(Q13_1 == 1 ~ "Bottom quintile",
                                         Q13_1 == 2 ~ "2nd quintile",
                                         Q13_1 == 3 ~ "Middle quintile",
                                         Q13_1 == 4 ~ "4th quintile",
                                         Q13_1 == 5 ~ "Top quintile",
                                         TRUE ~ NA_character_),
         perception_hh_wea_f = case_when(Q13_2 == 1 ~ "Bottom quintile",
                                         Q13_2 == 2 ~ "2nd quintile",
                                         Q13_2 == 3 ~ "Middle quintile",
                                         Q13_2 == 4 ~ "4th quintile",
                                         Q13_2 == 5 ~ "Top quintile",
                                         TRUE ~ NA_character_),        
         perception_val_hou_f = case_when(Q13_3 == 1 ~ "Bottom quintile",
                                          Q13_3 == 2 ~ "2nd quintile",
                                          Q13_3 == 3 ~ "Middle quintile",
                                          Q13_3 == 4 ~ "4th quintile",
                                          Q13_3 == 5 ~ "Top quintile",
                                          TRUE ~ NA_character_))

# create relative house price variable: house value minus local median house price

yg_dat <- yg_dat %>%
  mutate(rel_house_price = (homeowner_X_price - median_price)/1000)


#POLITICAL EFFICACY INDICES

# index with all three items
# yg_dat <- yg_dat %>%
#  mutate(eff_index = case_when(!is.na(pol_complicated) & !is.na(pol_say) & !is.na(pol_care) ~ (pol_complicated + pol_say + pol_care)/3,
#                               TRUE ~ NA_real_)) %>%
#  relocate(eff_index, .after = pol_care)

# reverse code questions so that higher number signals higher efficacy

yg_dat <- yg_dat %>%
  mutate(int_eff = case_when(Q36_1 == 1 ~ 5,
                             Q36_1 == 2 ~ 4,
                             Q36_1 == 3 ~ 3, 
                             Q36_1 == 4 ~ 2,
                             Q36_1 == 5 ~ 1,
                             TRUE ~ NA_real_)) %>%
  relocate(int_eff, .after = pol_care)

yg_dat <- yg_dat %>%
  mutate(ext_eff_1 = case_when(Q36_2 == 1 ~ 5,
                               Q36_2 == 2 ~ 4,
                               Q36_2 == 3 ~ 3, 
                               Q36_2 == 4 ~ 2,
                               Q36_2 == 5 ~ 1,
                               TRUE ~ NA_real_))

yg_dat <- yg_dat %>%
  mutate(ext_eff_2 = case_when(Q36_3 == 1 ~ 5,
                               Q36_3 == 2 ~ 4,
                               Q36_3 == 3 ~ 3, 
                               Q36_3 == 4 ~ 2,
                               Q36_3 == 5 ~ 1,
                               TRUE ~ NA_real_))

yg_dat <- yg_dat %>%
  mutate(ext_eff = case_when(!is.na(ext_eff_1) & !is.na(ext_eff_2) ~ (ext_eff_1 + ext_eff_2)/2,
                             TRUE ~ NA_real_)) %>%
  relocate(ext_eff, .after = int_eff)

yg_dat = subset(yg_dat, select = -c(ext_eff_1, ext_eff_2))

# dummy for local authorities with low/high house prices

yg_dat <- yg_dat %>%
  mutate(expensive = case_when(p_50 > 224975 ~ 1,
                               TRUE ~ 0))


# Last saved Feb 14 2022
write_csv(yg_dat, "~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/yg_data_ready.csv")
#write_csv(yg_dat, "~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/yg_data_for_analysis.csv")


## MAE June 21, 2020: loading the long data for analysis of conjoints

#df <-read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/yg_data/yg_data_long_for_conjoint.csv")
