#############################################################
###   Wealthpol YG social mobility survey data cleaning   ###
#############################################################


# created: 15.08.22
# updated: 30.08.22

pacman::p_load(knitr, tidyverse, mediocrethemes, haven, validate, vtable, expss,
               dataMaid, sjlabelled, cobalt, stable, crosstable, gmodels, diffdf, 
               arsenal, readxl, gmodels)
options(scipen = 999)
set_mediocre_all()

##
## 1. Load data ##
## 

# load data
data <- read.csv("/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/UniversityofOxford_WealthInequality_August22_Client_CSV.csv")
names(data)

# import variable labels from YG codebook
codebook <- read_xlsx("/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/UniversityofOxford_WealthInequality_August22_Client_Codebook.xlsx",
                      sheet = 2) %>% 
  dplyr::select(c("Variable", "Label"))
# keep labels so they are displayed in datamaid
codebook_wide <- codebook %>% 
  pivot_wider(names_from = Variable, values_from = Label)
labels <- codebook_wide[1,]
data <- set_label(data, codebook_wide[,]) 

##
## 2. Clean data ## 
## 

# convert to factor where relevant
data <- data %>% 
  sjlabelled::as_factor() %>% 
  mutate(Weight = sjlabelled::as_numeric(Weight),
         age = sjlabelled::as_numeric(age),
         profile_household_size = sjlabelled::as_numeric(profile_household_size),
         Q16a_other = sjlabelled::as_numeric(Q16a_other),
         Q24a_other = sjlabelled::as_character(Q24a_other),
         Q30aa_other = sjlabelled::as_character(Q30aa_other),
         Q52 = sjlabelled::as_character(Q52), 
         profile_education_level = as.numeric(profile_education_level), 
         Q10 = as.numeric(Q10), 
         profile_house_tenure = as.numeric(profile_house_tenure), 
         profile_gross_household = as.numeric(profile_gross_household),
         profile_gross_personal = as.numeric(profile_gross_personal))

# RECODING DEMOGRAPHICS QUESTIONS (from BA's script for other YG survey)  ------

# Adding Codes for Sector, Region, Ethnicity, Occupation, HH Income, Tenure 
ed_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_education_level = as.numeric(c("1","2","3","4","5","6","7","8","9","10",
                                         "11","12","13","14","15","16","17","18",
                                         "19","20","98","99")),
  education_code = c("No formal qualifications",
                     "Youth training certificate/skillseekers",
                     "Recognised trade apprenticeship completed","Clerical and commercial",
                     "City & Guilds certificate",
                     "City & Guilds certificate - advanced","ONC",
                     "CSE grades 2-5",
                     "CSE grade 1, GCE O level, GCSE, School Certificate",
                     "Scottish Ordinary/ Lower Certificate","GCE A level or Higher Certificate",
                     "Scottish Higher Certificate",
                     "Nursing qualification (e.g. SEN, SRN, SCM, RGN)",
                     "Teaching qualification (not degree)",
                     "University diploma",
                     "University or CNAA first degree (e.g. BA, B.Sc, B.Ed)",
                     "University or CNAA higher degree (e.g. M.Sc, Ph.D)",
                     "Other technical, professional or higher qualification",
                     "Don't know","Prefer not to say","Skipped",
                     "Not Asked")
)

# sector_codes <- data.frame(
#  stringsAsFactors = FALSE,
#  Oemployer_type_private_non_self = c("1","2","3","4","5","6",
#                                       "7","8","9","10","11","12","13","14","15","16",
#                                       "17","18","19","20","21","22","23","24","25","26",
#                                       "27","28","29","30","31","32","33","34","97",
#                                       "99","998","999"),
#   sector_private = c("Agriculture","Natural resources",
#                      "Construction",
#                      "Manufacturing: Automobiles ","Manufacturing: Heavy industry",
#                      "Manufacturing: Technology",
#                      "Manufacturing: Other","Wholesaling",
#                      "Media and communications","Financial Services: Banking",
#                      "Financial Services: Insurance",
#                      "Financial Services: Brokerage & Investment",
#                      "Business services",
#                      "Real estate","Scientific research","Technology services",
#                      "Engineering, design, or architecture",
#                      "Legal services","Personal care","Education",
#                      "Health","Arts, entertainment, and recreation",
#                      "Restaurant services","Hotels or lodging",
#                      "Airlines or air travel","Other travel","Retail",
#                      "Transportation","Maintenance and repair","Accounting",
#                      "Financial Services: Other",
#                      "Leisure",
#                      "Tourism ",
#                      "Utilities and energy","Other","Not applicable","Skipped",
#                      "Not Asked")
# )

# sector_codes <- sector_codes %>% 
#  mutate(Oemployer_type_private_non_self=as.numeric(Oemployer_type_private_non_self))

region_codes<- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_GOR = c("1","2","3","4","5","6",
                  "7","8","9","10","11","12","13","98","99"),
  region_name = c("North East","North West",
                  "Yorkshire and the Humber","East Midlands","West Midlands",
                  "East of England","London","South East","South West",
                  "Wales","Scotland","Northern Ireland",
                  "Non UK & Invalid","Skipped","Not Asked")
)

region_codes <- region_codes %>% 
  mutate(profile_GOR = as.factor(profile_GOR))

ethnicity_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  ethnicity_new = as.factor(c("1","2","3","4","5","6",
                              "7","8","9","10","11","12","13",
                              "14","15","16","17","18","19",
                              "98","99")),
  ethnicity_name = c("White British","Irish",
                     "Gypsy or Irish Traveller",
                     "Other White",
                     "White and Black Caribbean","White and Black African",
                     "White and Asian",
                     "Other Mixed","Indian",
                     "Pakistani","Bangladeshi","Chinese",
                     "Other Asian ","African",
                     "Caribbean",
                     "Other Black","Arab",
                     "Any other ethnic group",
                     "Prefer not to say","Skipped","Not Asked")
)

work_stat_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_work_stat = as.factor(c("1","2","3","4","5","6",
                                  "7","8","98","99")),
  work_status = c("Full time",
                  "Part time (8-29 hrs)",
                  "Part time (<8 hrs)",
                  "Student","Retired","Unemployed",
                  "Not working","Other","Skipped",
                  "Not Asked")
)

occ_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_work_type = as.factor(c("1",
                                  "2","3","4","5","6","7","8","9",
                                  "98","99")),
  occ_type = c("Professional",
               "Manager",
               "Junior managerial",
               "Sales or Services ",
               "Supervisor",
               "Skilled Manual Work",
               "Low-skilled Manual Work", "Other", "Never Worked",
               "Skipped","Not Asked")
)

industry_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_work_industry = as.factor(c("1","2","3","4","5","6","7",
                                      "8","9","10","11","12","13","14",
                                      "15","16","17","18","19","20",
                                      "21","98","99")),
  Not.Asked = c("Advertising/Marketing/PR",
                "Aerospace",
                "Agriculture/Chemicals/Forest Products","Automotive",
                "Computers/Electronics","Construction",
                "Consumer Goods","Education","Energy/Mining",
                "Finance/Insurance/Real Estate",
                "Government/Military/Public Service",
                "Hospitality/Recreation",
                "Media/Publishing/Entertainment","Medical/Health Services",
                "Pharmaceuticals","Retail","Service",
                "Telecommunications/Networking",
                "Travel/Transportation","Other",
                "Have never worked","Skipped","Not Asked")
)

marital_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_marital_stat = as.factor(c("1","2","3","4","5","6","7",
                                     "8","9")),
  marital_status= c("Married","Living as married",
                    "Separated (after being married)","Divorced",
                    "Widowed","Never married",
                    "Civil Partnership","Skipped","Not Asked")
)

housing_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_house_tenure = as.numeric(c("1","2","3","4","5","6","7",
                                      "8","9","98","99")),
  house_tenure = c("Own outright",
                   "Own mortgage",
                   "Shared Ownership",
                   "Rent: private","Rent:LA",
                   "Rent:HA",
                   "Live with others: pay",
                   "Live with others: free","Other",
                   "Skipped","Not Asked")
)

hh_income_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_gross_household = as.numeric(c("1","2","3","4","5","6",
                                         "7","8","9","10","11","12","13",
                                         "14","15","16","17","98","99")),
  hh_income_code = c("under £5,000 per year",
                     "£5,000 to £9,999 per year",
                     "£10,000 to £14,999 per year",
                     "£15,000 to £19,999 per year","£20,000 to £24,999 per year",
                     "£25,000 to £29,999 per year",
                     "£30,000 to £34,999 per year",
                     "£35,000 to £39,999 per year",
                     "£40,000 to £44,999 per year",
                     "£45,000 to £49,999 per year","£50,000 to £59,999 per year",
                     "£60,000 to £69,999 per year",
                     "£70,000 to £99,999 per year",
                     "£100,000 to £149,999 per year","£150,000 and over",
                     "Don't know","Prefer not to answer",
                     "Skipped","Not Asked")
)

pers_inc_codes <- data.frame(
  stringsAsFactors = FALSE,
  check.names = FALSE,
  profile_gross_personal = as.numeric(c("1","2","3","4","5","6","7",
                                        "8","9","10","11","12","13","14",
                                        "15","16","98","99")),
  pers_income_code = c("under £5,000 per year",
                       "£5,000 to £9,999 per year",
                       "£10,000 to £14,999 per year",
                       "£15,000 to £19,999 per year","£20,000 to £24,999 per year",
                       "£25,000 to £29,999 per year",
                       "£30,000 to £34,999 per year",
                       "£35,000 to £39,999 per year",
                       "£40,000 to £44,999 per year","£45,000 to £49,999 per year",
                       "£50,000 to £59,999 per year",
                       "£60,000 to £69,999 per year",
                       "£70,000 to £99,999 per year","£100,000 and over",
                       "Don't know","Prefer not to answer","Skipped",
                       "Not Asked")
)

data <- data %>% 
  left_join(ed_codes, by = "profile_education_level")

# data <- data %>% 
#   left_join(sector_codes, by = "employer_type_private_non_self")

data <- data %>% 
  left_join(region_codes, by = "profile_GOR")

data <- data %>% 
  left_join(ethnicity_codes, by = "ethnicity_new")

data <- data %>% 
  left_join(work_stat_codes, by = "profile_work_stat")

data <- data %>% 
  left_join(occ_codes, by = "profile_work_type")

data <- data %>% 
  left_join(marital_codes, by = "profile_marital_stat")

data <- data %>% 
  left_join(housing_codes, by = "profile_house_tenure")

data <- data %>% 
  left_join(hh_income_codes, by = "profile_gross_household")

data <- data %>% 
  left_join(pers_inc_codes, by = "profile_gross_personal")

# Create New Variables with DK's removed etc  for DEMOGRAPHICS

data <- data %>% 
  mutate(turnout_ge_2019 = case_when(voted_ge_2019 == 1L ~ 1L,
                                     voted_ge_2019 == 2L ~ 0L,
                                     TRUE ~ NA_integer_),
         ge_2019 = case_when(pastvote_ge_2019 == 1L ~ "Conservative",
                             pastvote_ge_2019 == 2L ~ "Labour",
                             pastvote_ge_2019 == 3L ~ "Liberal Democrat",
                             pastvote_ge_2019 == 4L ~ "SNP",
                             pastvote_ge_2019 == 5L ~ "Plaid Cymru",
                             pastvote_ge_2019 == 6L ~ "Brexit Party",
                             pastvote_ge_2019 == 7L ~ "Green",
                             pastvote_ge_2019 == 98 ~ "Other",
                             TRUE ~ NA_character_),
         EURef = case_when(pastvote_EURef == 1L ~ "Remain",
                           pastvote_EURef == 2L ~ "Leave",
                           pastvote_EURef == 3L ~ "Did Not Vote",
                           pastvote_EURef == 4L ~ "Can't Remember",
                           TRUE ~ NA_character_),
         voted_leave = case_when(EURef == "Remain" ~ 0L, 
                                 EURef == "Leave" ~ 1L,
                                 EURef == "Did Not Vote" ~0L,
                                 TRUE ~ NA_integer_),
         voted_remain = case_when(EURef == "Remain" ~ 1L, 
                                  EURef == "Leave" ~ 0L,
                                  EURef == "Did Not Vote" ~0L,
                                  TRUE ~ NA_integer_),
         ref_didnt_vote = case_when(EURef == "Remain" ~ 0L, 
                                    EURef == "Leave" ~ 0L,
                                    EURef == "Did Not Vote" ~1L,
                                    TRUE ~ NA_integer_),
         gender = case_when(profile_gender == 1L ~ "Male",
                            profile_gender == 2L ~ "Female",
                            TRUE ~ NA_character_),
         # social_grade_no = case_when (profile_socialgrade_cie < 7 ~ 6 - profile_socialgrade_cie,
         #                              TRUE ~ NA_real_),
         # socialgrade_cie = case_when(profile_socialgrade_cie == 1L ~ "A",
         #                             profile_socialgrade_cie == 2L ~ "B",
         #                             profile_socialgrade_cie == 3L ~ "C1",
         #                             profile_socialgrade_cie == 4L ~ "C2",
         #                             profile_socialgrade_cie == 5L ~ "D",
         #                             profile_socialgrade_cie == 6L ~ "E",
         #                             TRUE ~ NA_character_),
         # UK_citizen = case_when(citizenship < 3 ~ 1,
         #                        citizenship == 3 ~ 0,
         #                        TRUE ~NA_real_),
         # citizenship_code = case_when(citizenship == 1 ~ "UK Citizen",
         #                              citizenship == 2 ~ "Dual National",
         #                              citizenship == 3 ~ "Non-Citizen",
         #                             TRUE ~ NA_character_),
         education_no = case_when(profile_education_level <19 ~ profile_education_level,
                                  profile_education_level> 18 ~ NA_real_),
         education_short = case_when(profile_education_level == 1 ~ 1,
                                     profile_education_level %in% 2:10 ~ 2,
                                     profile_education_level %in% 11:12 ~ 3,
                                     profile_education_level %in% 13:14 ~ 3,
                                     profile_education_level %in% 15:16 ~ 4,
                                     profile_education_level == 17 ~ 5,
                                     profile_education_level == 18 ~ 4,
                                     profile_education_level > 18 ~ NA_real_),
         degree = case_when(profile_education_level %in% 15:18 ~ 1,
                            profile_education_level <15 ~ 0,
                            TRUE ~NA_real_),
         homeowner = case_when(profile_house_tenure < 4 ~ 1,
                               profile_house_tenure %in% 4:9 ~ 0,
                               TRUE ~NA_real_),
         renter = case_when(profile_house_tenure < 4 ~ 0,
                            profile_house_tenure %in% 4:6 ~ 1,
                            profile_house_tenure %in% 7:9 ~0,
                            TRUE ~NA_real_), 
         parent_homeowner = case_when(Q10 < 4 ~ 1,
                                      Q10 %in% 4:7 ~ 0,
                                      TRUE ~NA_real_),
         parent_renter = case_when(Q10 < 4 ~ 0,
                                   Q10 %in% 4:6 ~ 1,
                                   Q10 == 7 ~0,
                                   TRUE ~NA_real_), 
         household_size = case_when(profile_household_size < 9 ~ profile_household_size,
                                    TRUE ~ NA_real_),
         hh_income = case_when(profile_gross_household<16 ~ profile_gross_household,
                               TRUE ~ NA_real_),
         pers_income = case_when(profile_gross_personal<15 ~ profile_gross_personal,
                                 TRUE ~ NA_real_)
         
  ) 

# saved 22.08.22
# write.csv(data, "/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/yg_data_clean.csv")

# ADDING LOCAL AUTHORITY INFORMATION ------------------------------------------
# data <- read_csv("/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/yg_data_clean.csv")
yg_local_authority_codes <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/housing data/median_house_prices_for_yg.csv")

yg_local_authority_codes <- yg_local_authority_codes %>% 
  rename(median_price = price,
         median_price_group = group,
         profile_oslaua = yg_code) %>% 
  mutate(profile_oslaua = profile_oslaua,
         profile_oslaua = as.factor(profile_oslaua)) 

data <- data%>% 
  left_join(yg_local_authority_codes, by = "profile_oslaua")

la_wide <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/shiny_app/la_wide.csv")

la_wide <- la_wide %>% 
  dplyr::select(reg_code:la_median_mar20, p_10, p_20, p_30, p_40, p_50, p_60, p_70, p_80, p_90, p_95) %>% 
  dplyr::select(-(la_name))

data <- data %>% 
  left_join(la_wide, by = "la_code")

## Adding in Local Authority Pay
la_pay <- read_csv("~/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/labor_mkt_data/la_pay_long.csv")

la_pay <- la_pay %>% 
  dplyr::select(-(la_name)) %>% 
  filter(quantile == 50) %>% 
  dplyr::select(-quantile)

data <- data %>% 
  left_join(la_pay, by= "la_code")

## Adding in Local Authority housing affordability
la_afford <- read_csv("/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/data/median_affordability_2021.csv")

la_afford <- la_afford %>% 
  filter((str_sub(la_code, 1, 1)=="E")) %>% 
  mutate(affordability_ratio = as.numeric(affordability_ratio),
         afford_rank = rank(affordability_ratio, ties.method = "min"),
         y = 1)

la_afford <- la_afford %>% 
  mutate(id = la_code) %>% 
  filter(!is.na(affordability_ratio))

# add YG LA codes
yg_codes <- read_csv("/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/shiny_app/profile_oslaua_correspondence.csv")

yg_codes <-  yg_codes %>% dplyr::select(code, Region) %>% 
  dplyr::rename(la_name = Region,
                yg_code = code)

yg_codes <- yg_codes %>% 
  mutate(la_name = case_when(la_name== "Rhondda, Cynon, Taff"~ "Rhondda Cynon Taf",
                             la_name=="The Vale of Glamorgan" ~ "Vale of Glamorgan",
                             TRUE ~ la_name)) 

la_afford <-  la_afford %>% 
  left_join(yg_codes, by = "la_name") %>% 
  mutate(yg_code = case_when(la_name=="Folkestone and Hythe" ~ 268,
                             la_name=="North Northamptonshire"~ 450,
                             la_name == "West Northamptonshire" ~451,
                             TRUE ~ yg_code
  )) %>% 
  dplyr::select(-c("la_name", "y", "id", "yg_code"))


data <- data %>% 
  left_join(la_afford, by = "la_code")

## Add TUNDRA data on university attendance
la_tundra <-  read_csv("/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/data/la_tundra_estimates_21.csv")

yg_codes <- yg_codes %>% 
  mutate(la_name = case_when(la_name== "Rhondda, Cynon, Taff"~ "Rhondda Cynon Taf",
                             la_name=="The Vale of Glamorgan" ~ "Vale of Glamorgan",
                             TRUE ~ la_name)) %>% 
  mutate(la_name_he = la_name)


la_tundra <- la_tundra %>% 
  filter(!(la_name %in% c("West Dorset", "North Dorset", "East Dorset", "Purbeck", "Weymouth and Portland",
                          "Suffolk Coastal", "Waveney",
                          "South Bucks", "Aylesbury Vale", "Chiltern", "Wycombe",
                          "St Edmundsbury", "Forest Heath",
                          "Kettering", "Daventry", "Corby", "Northampton",
                          "South Northamptonshire", "East Northamptonshire", "Wellingborough",
                          "Taunton Deane", "West Somerset",
                          "Bournemouth", "Poole", "Christchurch")))

la_tundra <- la_tundra %>% 
  rename(id = la_code ) %>% 
  rename(la_name_he = la_name) %>% 
  group_by(id) %>% 
  slice(1) %>% 
  ungroup()

la_tundra <- la_tundra %>% 
  mutate(he_rank = rank(la_stud_progression, ties.method = "min")) %>% 
  left_join(yg_codes, by="la_name_he") %>% 
  mutate(profile_oslaua = case_when(la_name_he=="Folkestone and Hythe" ~ 268,
                                    la_name_he=="North Northamptonshire"~ 450,
                                    la_name_he == "West Northamptonshire" ~451,
                                    TRUE ~ yg_code), 
         profile_oslaua = as.factor(profile_oslaua)) %>% 
  dplyr::select(-la_name)

data <- data %>% 
  left_join(la_tundra, by = "profile_oslaua")


# save 22.08.22
# write.csv(data, "/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/yg_data_la_clean.csv")



## RECODING SURVEY QUESTIONS -----------------------------------------------

# Remove all DKs and Skipped  - code as NA. 
# Coding is set-up so agreement is higher number except where noted
data <-  data %>% 
  mutate(perception_hh_inc = case_when(Q32_1 == 1 ~ 1,
                                       Q32_1 == 2 ~ 2,
                                       Q32_1 == 3 ~ 3,
                                       Q32_1 == 4 ~ 4,
                                       Q32_1 == 5 ~ 5, # 1 = least wealthy / 5 = wealthiest
                                       TRUE ~ NA_real_),
         perception_hh_wea = case_when(Q32_2 == 1 ~ 1,
                                       Q32_2 == 2 ~ 2,
                                       Q32_2 == 3 ~ 3,
                                       Q32_2 == 4 ~ 4,
                                       Q32_2 == 5 ~ 5,
                                       TRUE ~ NA_real_),
         perception_hh_houseval = case_when(Q32_3 == 1 ~ 1,
                                            Q32_3 == 2 ~ 2,
                                            Q32_3 == 3 ~ 3,
                                            Q32_3 == 4 ~ 4,
                                            Q32_3 == 5 ~ 5,
                                            TRUE ~ NA_real_),
         perception_la_he = case_when(Q33_1 == 5 ~ 1, # changed so higher number is better/lower number is worse
                                      Q33_1 == 4 ~ 2,
                                      Q33_1 == 3 ~ 3,
                                      Q33_1 == 2 ~ 4,
                                      Q33_1 == 1 ~ 5,
                                      TRUE ~ NA_real_),
         perception_la_afford = case_when(Q33_2 == 5 ~ 1, # changed so higher number is better/lower number is worse
                                          Q33_2 == 4 ~ 2,
                                          Q33_2 == 3 ~ 3,
                                          Q33_2 == 2 ~ 4,
                                          Q33_2 == 1 ~ 5,
                                          TRUE ~ NA_real_),
         perception_la_getahead = case_when(Q33_3 == 5 ~ 1, # changed so higher number is better/lower number is worse
                                            Q33_3 == 4 ~ 2,
                                            Q33_3 == 3 ~ 3,
                                            Q33_3 == 2 ~ 4,
                                            Q33_3 == 1 ~ 5,
                                            TRUE ~ NA_real_),
         fairshare_general = case_when(Q40 == 5 ~ 1, # changed so higher number is more than fair share/lower number is less than fair share
                                       Q40 == 4 ~ 2,
                                       Q40 == 3 ~ 3,
                                       Q40 == 2 ~ 4,
                                       Q40 == 1 ~ 5,
                                       TRUE ~ NA_real_),
         fairchance_own_edu = case_when(Q41_1 == 5 ~ 1, # changed so higher number is high agreement/lower number is low agreement
                                        Q41_1 == 4 ~ 2,
                                        Q41_1 == 3 ~ 3,
                                        Q41_1 == 2 ~ 4,
                                        Q41_1 == 1 ~ 5,
                                        TRUE ~ NA_real_),
         fairchance_own_job = case_when(Q41_2 == 5 ~ 1, # changed so higher number is high agreement/lower number is low agreement
                                        Q41_2 == 4 ~ 2,
                                        Q41_2 == 3 ~ 3,
                                        Q41_2 == 2 ~ 4,
                                        Q41_2 == 1 ~ 5,
                                        TRUE ~ NA_real_),
         fairchance_own_house = case_when(Q41_3 == 5 ~ 1, # changed so higher number is high agreement/lower number is low agreement
                                          Q41_3 == 4 ~ 2,
                                          Q41_3 == 3 ~ 3,
                                          Q41_3 == 2 ~ 4,
                                          Q41_3 == 1 ~ 5,
                                          TRUE ~ NA_real_),
         fairchance_others_edu = case_when(Q42_1 == 5 ~ 1, # changed so higher number is high agreement/lower number is low agreement
                                           Q42_1 == 4 ~ 2,
                                           Q42_1 == 3 ~ 3,
                                           Q42_1 == 2 ~ 4,
                                           Q42_1 == 1 ~ 5,
                                           TRUE ~ NA_real_),
         fairchance_others_job = case_when(Q42_2 == 5 ~ 1, # changed so higher number is high agreement/lower number is low agreement
                                           Q42_2 == 4 ~ 2,
                                           Q42_2 == 3 ~ 3,
                                           Q42_2 == 2 ~ 4,
                                           Q42_2 == 1 ~ 5,
                                           TRUE ~ NA_real_),
         fairchance_others_house = case_when(Q42_3 == 5 ~ 1, # changed so higher number is high agreement/lower number is low agreement
                                             Q42_3 == 4 ~ 2,
                                             Q42_3 == 3 ~ 3,
                                             Q42_3 == 2 ~ 4,
                                             Q42_3 == 1 ~ 5,
                                             TRUE ~ NA_real_),
         compare_parents_edu = case_when(Q43_1 == 5 ~ 1, # changed so higher number is better/lower number is worse
                                         Q43_1 == 4 ~ 2,
                                         Q43_1 == 3 ~ 3,
                                         Q43_1 == 2 ~ 4,
                                         Q43_1 == 1 ~ 5,
                                         TRUE ~ NA_real_),
         compare_parents_finan = case_when(Q43_2 == 5 ~ 1, # changed so higher number is better/lower number is worse
                                           Q43_2 == 4 ~ 2,
                                           Q43_2 == 3 ~ 3,
                                           Q43_2 == 2 ~ 4,
                                           Q43_2 == 1 ~ 5,
                                           TRUE ~ NA_real_),
         compare_parents_house = case_when(Q43_4 == 5 ~ 1, # changed so higher number is better/lower number is worse
                                           Q43_4 == 4 ~ 2,
                                           Q43_4 == 3 ~ 3,
                                           Q43_4 == 2 ~ 4,
                                           Q43_4 == 1 ~ 5,
                                           TRUE ~ NA_real_),
         fairness_buy_educ = case_when(Q48 == 5 ~ 1, # changed so higher number is very just/lower number is very unjust
                                       Q48 == 4 ~ 2,
                                       Q48 == 3 ~ 3,
                                       Q48 == 2 ~ 4,
                                       Q48 == 1 ~ 5,
                                       TRUE ~ NA_real_),
         gov_spend_educ = case_when(Q49_1 == 1 ~ 1, # already higher number = much more
                                    Q49_1 == 2 ~ 2,
                                    Q49_1 == 3 ~ 3,
                                    Q49_1 == 4 ~ 4,
                                    Q49_1 == 5 ~ 5,
                                    TRUE ~ NA_real_),
         gov_spend_pubhous = case_when(Q49_4 == 1 ~ 1, # already higher number = much more
                                       Q49_4 == 2 ~ 2,
                                       Q49_4 == 3 ~ 3,
                                       Q49_4 == 4 ~ 4,
                                       Q49_4 == 5 ~ 5,
                                       TRUE ~ NA_real_),
         gov_spend_indust = case_when(Q49_5 == 1 ~ 1, # already higher number = much more
                                      Q49_5 == 2 ~ 2,
                                      Q49_5 == 3 ~ 3,
                                      Q49_5 == 4 ~ 4,
                                      Q49_5 == 5 ~ 5,
                                      TRUE ~ NA_real_),
         diff_inc = case_when(Q50_1 == 1 ~ 1, # already higher number = much more
                              Q50_1 == 2 ~ 2,
                              Q50_1 == 3 ~ 3,
                              Q50_1 == 4 ~ 4,
                              Q50_1 == 5 ~ 5,
                              TRUE ~ NA_real_),
         diff_wea = case_when(Q50_2 == 1 ~ 1, # already higher number = much more
                              Q50_2 == 2 ~ 2,
                              Q50_2 == 3 ~ 3,
                              Q50_2 == 4 ~ 4,
                              Q50_2 == 5 ~ 5,
                              TRUE ~ NA_real_),
         diff_houses = case_when(Q50_3 == 1 ~ 1, # already higher number = much more
                                 Q50_3 == 2 ~ 2,
                                 Q50_3 == 3 ~ 3,
                                 Q50_3 == 4 ~ 4,
                                 Q50_3 == 5 ~ 5,
                                 TRUE ~ NA_real_),
         tax_slider = case_when(q51_1 == 0 ~ 0,
                                q51_1 == 1 ~ 1,
                                q51_1 == 2 ~ 2,
                                q51_1 == 3 ~ 3,
                                q51_1 == 4 ~ 4, 
                                q51_1 == 5 ~ 5, 
                                q51_1 == 6 ~ 6,
                                q51_1 == 7 ~ 7,
                                q51_1 == 8 ~ 8,
                                q51_1 == 9 ~ 9, 
                                q51_1 == 10 ~ 10,
                                TRUE ~ NA_real_),
         like_taxing_wealth = as.numeric(as.numeric(q51_1) < 5),
         support_redistribution_inc = case_when(Q53_1 == 1 ~ 1,
                                                Q53_1 == 2 ~ 2,
                                                Q53_1 == 3 ~ 3,
                                                Q53_1 == 4 ~ 4,
                                                TRUE ~ NA_real_),
         support_redistribution_wea = case_when(Q53_2 == 1 ~ 1,
                                                Q53_2 == 2 ~ 2,
                                                Q53_2 == 3 ~ 3,
                                                Q53_2 == 4 ~ 4,
                                                TRUE ~ NA_real_),
         support_redistribution_educ = case_when(Q53_3 == 1 ~ 1,
                                                 Q53_3 == 2 ~ 2,
                                                 Q53_3 == 3 ~ 3,
                                                 Q53_3 == 4 ~ 4,
                                                 TRUE ~ NA_real_),
         pol_complicated = case_when(Q54_1 == 5 ~ 1,
                                     Q54_1 == 4 ~ 2,
                                     Q54_1 == 3 ~ 3, 
                                     Q54_1 == 2 ~ 4,
                                     Q54_1 == 1 ~ 5,
                                     TRUE ~ NA_real_),
         pol_say = case_when(Q54_2 == 5 ~ 1,
                             Q54_2 == 4 ~ 2,
                             Q54_2 == 3 ~ 3,
                             Q54_2 == 2 ~ 4,
                             Q54_2 == 1 ~ 5,
                             TRUE ~ NA_real_),
         pol_care = case_when(Q54_3 == 5 ~ 1,
                              Q54_3 == 4 ~ 2,
                              Q54_3 == 3 ~ 3,
                              Q54_3 == 2 ~ 4,
                              Q54_3 == 1 ~ 5,
                              TRUE ~ NA_real_),
         effort_slider = case_when(q45_1 == 0 ~ 0,
                                   q45_1 == 1 ~ 1,
                                   q45_1 == 2 ~ 2,
                                   q45_1 == 3 ~ 3,
                                   q45_1 == 4 ~ 4, 
                                   q45_1 == 5 ~ 5, 
                                   q45_1 == 6 ~ 6,
                                   q45_1 == 7 ~ 7,
                                   q45_1 == 8 ~ 8,
                                   q45_1 == 9 ~ 9, 
                                   q45_1 == 10 ~ 10,
                                   TRUE ~ NA_real_), 
         priveduc = case_when(Q48 == 5 ~ 1,
                              Q54_1 == 4 ~ 2,
                              Q54_1 == 3 ~ 3, 
                              Q54_1 == 2 ~ 4,
                              Q54_1 == 1 ~ 5,
                              TRUE ~ NA_real_))


# parse house price questions
data <- data %>% 
  mutate(house_price_written = Q24a_other) %>% 
  mutate(house_price_written = case_when(house_price_written %in% c("__NA__", "__DK__", 
                                                                    "No idea", "Dk", 
                                                                    "million", "Dont know", 
                                                                    "DK", "I don't know", 
                                                                    "No idea", "Dk", 
                                                                    "Do not know", "no idea", 
                                                                    "Don't know", "I don't know", 
                                                                    "A lot", "No idea!", 
                                                                    "Not sure", "????", 
                                                                    "I m not sure", "Don't know ",
                                                                    "I don’t know", "Don’t know") ~ NA_character_,
                                         TRUE ~ house_price_written)) %>% 
  mutate(parsed_house = parse_number(house_price_written)) %>% 
  mutate(est_house_price = case_when(between(parsed_house, 20, 999) ~ 1000*parsed_house,
                                     between(parsed_house, 0, 19) ~ 1000000*parsed_house,
                                     TRUE ~ parsed_house)) %>% 
  mutate(est_house_price = case_when(house_price_written == "0" ~ NA_real_,
                                     TRUE ~ est_house_price),
         log_est_house_price = log(est_house_price),
         homeowner_X_price = case_when(homeowner==1 ~ est_house_price,
                                       homeowner == 0 ~ 0,
                                       TRUE ~NA_real_),
         homeowner_X_log_price = case_when(homeowner==1 ~ log_est_house_price,
                                           homeowner == 0 ~ 0,
                                           TRUE ~NA_real_),
         log_median_price = log(median_price),
         house_rent_groups  = case_when(homeowner==0 ~ "Don't Own",
                                        est_house_price <=100000 ~ "Own, £100k and under",
                                        between(est_house_price, 100001, 200000) ~ "Own, £100k to £200k",
                                        between(est_house_price, 200001, 300000) ~ "Own, £200k to £300k",
                                        between(est_house_price, 300001, 400000) ~ "Own, £300k to £400k",
                                        between(est_house_price, 400001, 500000) ~ "Own, £400k to £500k",
                                        between(est_house_price, 500001, 750000) ~ "Own, £500k to £750k",
                                        est_house_price >750000 ~ "Own, £750k and up",
                                        TRUE ~NA_character_),
         house_rent_groups = factor(house_rent_groups, 
                                    levels = c("Don't Own", "Own, £100k and under", "Own, £100k to £200k",
                                               "Own, £200k to £300k", "Own, £300k to £400k", "Own, £400k to £500k",
                                               "Own, £500k to £750k", "Own, £750k and up")))


# parse parent's house price question 
data <- data %>% 
  mutate(house_price_written_parents = Q30aa_other) %>% 
  mutate(house_price_written_parents = case_when(house_price_written_parents %in% c("__NA__", "__DK__", "d/k", 
                                                                                    "No idea", "Dont know", "I don’t know",
                                                                                    "no idea", "Unsure", "don't know",
                                                                                    "No clue", "Nothing", "Don’t know",
                                                                                    "Don't know", "None of your business",
                                                                                    "No idea!", "?", "No Idea",
                                                                                    "DK", "Several million. No idea exactly.",
                                                                                    "????", "Dk", "dont know", "I do not know",
                                                                                    "Which parent? They are divorced") ~NA_character_,
                                                 TRUE ~ house_price_written_parents)) %>% 
  mutate(parsed_house_parents = parse_number(house_price_written_parents)) %>% 
  mutate(est_house_price_parents = case_when(between(parsed_house_parents, 20, 999) ~ 1000*parsed_house_parents,
                                             between(parsed_house_parents, 0, 19) ~ 1000000*parsed_house_parents,
                                             TRUE ~ parsed_house_parents)) %>% 
  mutate(est_house_price_parents = case_when (house_price_written_parents == "3Mum’s - 265k"~ 265000,
                                              TRUE ~ est_house_price_parents)) %>%
  mutate(log_est_house_price_parents = log(est_house_price_parents),
         homeowner_X_price_parents = case_when(parent_homeowner==1 ~ est_house_price_parents,
                                               parent_homeowner == 0 ~ 0,
                                               TRUE ~NA_real_),
         homeowner_X_log_price_parents = case_when(parent_homeowner==1 ~ log_est_house_price_parents,
                                                   parent_homeowner == 0 ~ 0,
                                                   TRUE ~NA_real_),
         house_rent_groups_parents  = case_when(parent_homeowner==0 ~ "Don't Own",
                                                est_house_price_parents <=100000 ~ "Own, £100k and under",
                                                between(est_house_price_parents, 100001, 200000) ~ "Own, £100k to £200k",
                                                between(est_house_price_parents, 200001, 300000) ~ "Own, £200k to £300k",
                                                between(est_house_price_parents, 300001, 400000) ~ "Own, £300k to £400k",
                                                between(est_house_price_parents, 400001, 500000) ~ "Own, £400k to £500k",
                                                between(est_house_price_parents, 500001, 750000) ~ "Own, £500k to £750k",
                                                est_house_price_parents >750000 ~ "Own, £750k and up",
                                                TRUE ~NA_character_),
         house_rent_groups_parents = factor(house_rent_groups_parents, 
                                            levels = c("Don't Own", "Own, £100k and under", "Own, £100k to £200k",
                                                       "Own, £200k to £300k", "Own, £300k to £400k", "Own, £400k to £500k",
                                                       "Own, £500k to £750k", "Own, £750k and up")))  


# save 22.08.22
# write.csv(data, "/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/yg_data_finalclean.csv")

## COMPREHENSION CHECKS -------------------------------------------------------

# comprehension checks for housing affordability: Q36, Q37
data <- data %>% mutate(CC_housing_1 = case_when(Q36 == 2 ~ 1,
                                                 Q36 == 1 ~ 0,
                                                 Q36 == 3 ~ 0,
                                                 Q36 == 4 ~ 0,
                                                 TRUE ~ NA_real_),
                        CC_housing_2_v1 = case_when(Q37 == 1 & affordability_ratio > 7.69 ~ 0,
                                                        Q37 == 3 & affordability_ratio < 7.69 ~ 0,
                                                        Q37 == 4 ~ 0, 
                                                       Q37 %in% c(1, 2) & affordability_ratio <= 7.69 ~ 1,
                                                       Q37 %in% c(2, 3) & affordability_ratio >= 7.69 ~ 1,
                                                        TRUE ~ NA_real_), # CC_housing_2_v1 = they get it wrong if they go the other way
                        CC_housing_2_v2 = case_when(Q37 == 1 & affordability_ratio <= 7.69 ~ 1, 
                                                    Q37 == 3 & affordability_ratio >= 7.69 ~ 1,
                                                    Q37 == 2 & affordability_ratio <= 9.61 & affordability_ratio >= 5.77 ~ 1,
                                                    Q37 == 4 ~ 0,
                                                    TRUE ~ 0), # CC_housing_2_v2 = "as affordable" = +/- 25% of national average  
                        CC_housing_all_v1= case_when(CC_housing_1 == 1 & CC_housing_2_v1 == 1 ~ 1,
                                                     CC_housing_1 == 0 & CC_housing_2_v1 == 1 ~ 0,
                                                     CC_housing_1 == 1 & CC_housing_2_v1 == 0 ~ 0,
                                                     CC_housing_1 == 0 & CC_housing_2_v1 == 0 ~ 0,
                                                     TRUE ~ NA_real_), 
                        CC_housing_all_v2 = case_when(CC_housing_1 == 1 & CC_housing_2_v2 == 1 ~ 1,
                                                      CC_housing_1 == 0 & CC_housing_2_v2 == 1 ~ 0,
                                                      CC_housing_1 == 1 & CC_housing_2_v2 == 0 ~ 0,
                                                      CC_housing_1 == 0 & CC_housing_2_v2 == 0 ~ 0,
                                                      TRUE ~ NA_real_))
          
# comprehension checks for university attendance: Q112, Q39
data <- data %>% mutate(CC_educ_1 = case_when(Q112 == 2 ~ 1,
                                              Q112 == 1 ~ 0,
                                              Q112 == 3 ~ 0,
                                              Q112 == 4 ~ 0,
                                              TRUE ~ NA_real_),
                        CC_educ_2_v1= case_when(Q39 == 1 & 100*la_stud_progression > 42 ~ 0,
                                                Q39 == 3 &  100*la_stud_progression < 42 ~ 0,
                                                Q39 == 4 ~ 0, 
                                                Q39 %in% c(1, 2) & 100*la_stud_progression <= 42 ~ 1,
                                                Q39 %in% c(2, 3) & 100*la_stud_progression >= 42 ~ 1,
                                                TRUE ~ NA_real_),  # CC_educ_2_v1 = they get it wrong if they go the other way
                        CC_educ_2_v2 = case_when(Q39 == 1 & 100*la_stud_progression <= 42 ~ 1,
                                                 Q39 == 3 &  100*la_stud_progression >= 42 ~ 1,
                                                 Q39 == 2 & 100*la_stud_progression <= 52.5 & 100*la_stud_progression >= 31.5 ~ 1,
                                                 Q39 == 4 ~ 0, 
                                                 TRUE ~ NA_real_), # CC_educ_2_v2 = "as many students" = +/- 25% of national average
                        CC_educ_all_v1= case_when(CC_educ_1 == 1 & CC_educ_2_v1 == 1 ~ 1,
                                                     CC_educ_1 == 0 & CC_educ_2_v1 == 1 ~ 0,
                                                     CC_educ_1 == 1 & CC_educ_2_v1 == 0 ~ 0,
                                                     CC_educ_1 == 0 & CC_educ_2_v1 == 0 ~ 0,
                                                     TRUE ~ NA_real_), 
                        CC_educ_all_v2 = case_when(CC_educ_1 == 1 & CC_educ_2_v2 == 1 ~ 1,
                                                      CC_educ_1 == 0 & CC_educ_2_v2 == 1 ~ 0,
                                                      CC_educ_1 == 1 & CC_educ_2_v2 == 0 ~ 0,
                                                      CC_educ_1 == 0 & CC_educ_2_v2 == 0 ~ 0,
                                                      TRUE ~ NA_real_))
# save 30.08.22
# write.csv(data, "/Users/laurebokobza/Dropbox/WEALTHPOL_Research/Papers/uk_survey_experiments/social_mobility/yg_data/yg_data_finalclean.csv")


