#################
# INTRO
#################
# Initial data preparation script:
# - Locates and imports
# -- (1) SHP long data files and
# -- (2) Year-level SHP metadata files
# - Merges user long file with metadata
# - Performs initial variable construction
# - Subsets the data to variables of (potential) interest
# - Exports result as Parquet so as to drastically speed up r/w times

############################
# Path definitions
############################
# If any of the path variables used below is missing, run the runner script
# (presumably the place where they are defined -- verify against 10_runner.R).
if (!exists("pqtfilepath_full") ||
  !exists("paths_meta") ||
  !exists("pqtfilepath_1423") ||
  !exists("path_spss_p")) {
  source("10_runner.R")
}

###################
# Imports
###################

### Metadata
#------------
# Canton var is not included in personal long data, so need to grab from CNEF.
# There is a CNEF long file, but it unfortunately only goes up to
# 2019, so the individual files need to be read.

### Import
frames_meta <- lapply(paths_meta, \(x) {
  haven::read_spss(x, user_na = TRUE)
})

### Year cleanup:
# Meta files have years in var names, we instead want them in
# columns so that we can join on the names
metayears <- paths_meta |>
  stringr::str_extract("\\d{4}")
names(frames_meta) <- metayears

# Store the year extracted from each file path as a numeric column
for (i in names(frames_meta)) {
  frames_meta[[i]]$year <- as.numeric(i)
}

# Strip the "_YYYY" suffix from the variable names so they match across years
frames_meta <- lapply(frames_meta, \(x) {
  colnames(x) <- colnames(x) |>
    gsub(x = _, "_\\d{4}", "")
  return(x)
})

### Merge
meta_combined <- frames_meta |>
  data.table::rbindlist(
    fill = TRUE,
    ignore.attr = TRUE
  ) |>
  tibble::tibble() |>
  dplyr::rename(idpers = x11101ll)

### Long data
#------------
df_p_long <- haven::read_spss(path_spss_p, user_na = TRUE)

### Initial merge
#----------------
df_merge <- dplyr::full_join(
  df_p_long, meta_combined,
  by = c("idpers", "year")
)

#####################
# Wrangling
#####################

### Helpers
#----------

### Recode and collapse factor levels
# Takes a labelled party column, sets negative codes to NA, converts the
# values to their labels, collapses the labels into party abbreviations,
# lumps everything outside the kept set into forcats' "other" level and makes
# "FDP" the first level. Returns the resulting factor.
# This will throw warnings upon executing because not all levels are present
# in all input factors
CollapseParties <- function(col) {
  # Coded missings to NA
  col[col < 0] <- NA

  out <- col |>
    sjlabelled::to_label() |>
    forcats::fct_collapse(
      AVF = "avf socialist green alternative and women groups",
      CSP = "pcs swiss christian social party",
      DM = c(
        "bdp conservative democratic party",
        "pdc swiss christian-democrat party",
        "the centre"
      ),
      EDU = "udf federal democratic union",
      EVP = "pev swiss pop evang party/swiss evang party",
      FDP = c(
        "prd swiss radical-democratic party",
        "pls swiss liberal party",
        "plr les libéraux-radicaux",
        "adi independent alliance",
        "fdp the liberals"
      ),
      FPS = "psl swiss freedom party (former swiss car party )",
      GLP = "gl green liberals",
      GP = "pes swiss ecology party",
      Lega = "lega dei ticinesi",
      MCG = "mcg mouvement citoyens genevois",
      Other = "other party",
      PDA = "pst swiss labour party/popular labour party/solidarity",
      SD = "ds swiss democrats (former national action)",
      SP = "pss swiss socialist party",
      SVP = "udc democratic union of the centre",
      None = c(
        "vote for a candidate, not for a party",
        "for no party",
        "wouldn't vote"
      )
    ) |>
    # Condense further
    forcats::fct_other(
      keep = c("SP", "DM", "SVP", "FDP", "GP", "GLP", "None")
    ) |>
    forcats::fct_relevel("FDP")

  return(out)
}

### Data
#-------

### New combined party variable based on heuristics
# membership > hypothetical vote > ID
df_r <- df_merge |>
  dplyr::rename(
    canton = l11101,
    fed_trust = pp04,
    armypos = pp12,
    army_op = pp49,
    army_spend = pp58,
    aid_spend = pp62,
    feel_sec = pl101,
    feel_insec = pc103,
    ptyid = pp67,
    ptymbr = pp11,
    ptyhypo = pp19
  ) |>
  dplyr::mutate(
    # First source with a positive (i.e. non-missing-coded) value wins
    pty = data.table::fcase(
      labelled::remove_labels(ptymbr) > 0, ptymbr,
      labelled::remove_labels(ptyhypo) > 0, ptyhypo,
      labelled::remove_labels(ptyid) > 0, ptyid,
      default = NA
    )
  )

# Collapse
df_r <- df_r |>
  dplyr::mutate(
    dplyr::across(
      .cols = tidyselect::starts_with("pty"),
      .fns = ~ CollapseParties(.x)
    )
  )

### Armypos cleaning
#-------------------
# Drop non-answers
df_r$armypos[df_r$armypos < 0] <- NA

# Relabel in German for subsequent steps
df_r$armypos <- df_r$armypos |>
  sjlabelled::set_labels(
    labels = c("Starke Armee" = 1, "Weder noch" = 2, "Keine Armee" = 3)
  )

### Factorisations
#-----------------
# Figure out at which age the armypos question starts being asked so that
# we can correctly label the lower bound of the agecat variable
df_r |>
  dplyr::filter(age > 0 & age < 18 & !is.na(armypos) & armypos > 0) |>
  dplyr::pull(age) |>
  min()
# -> 13

df_r <- df_r |>
  dplyr::mutate(
    # Age
    agecat = data.table::fcase(
      age %in% 0:17, "13-17",
      age %in% 18:35, "18-35",
      age %in% 36:45, "36-45",
      age %in% 46:55, "46-55",
      age %in% 56:65, "56-65",
      age > 65, "65+"
    ) |>
      as.factor() |>
      forcats::fct_relevel("36-45"),
    # Year
    yearfac = as.factor(year),
    # Army position
    apfac = sjlabelled::as_label(armypos),
    # Sex
    sexfac = as.factor(
      data.table::fcase(
        sex == 1, "Männlich",
        sex == 2, "Weiblich",
        default = NA
      )
    ),
    # Household/contact language; region-coded per doc, hence desirable for
    # röstigraben estimates
    hsh_lang = sjlabelled::as_label(l11102),
    canton = sjlabelled::as_label(canton)
  )

# Canton level cleaning: Abbreviation is sufficient
levels(df_r$canton) <- substr(levels(df_r$canton), 1, 2)

# With this cleaned, add a region variable
df_r <- df_r |>
  dplyr::mutate(
    # cf. https://forscenter.ch/wp-content/uploads/2022/02/shp_user-guide-w22.pdf
    nutsiireg = data.table::fcase(
      canton %in% c("VD", "VS", "GE"), "Genferseeregion",
      canton %in% c("BE", "FR", "SO", "NE", "JU"), "Mittelland",
      canton %in% c("BS", "BL", "AG"), "Nordwestschweiz",
      canton == "ZH", "Zürich",
      canton %in% c("GL", "SH", "AR", "AI", "SG", "GR", "TG"), "Ostschweiz",
      canton %in% c("LU", "UR", "SZ", "OW", "NW", "ZG"), "Zentralschweiz",
      canton == "TI", "Tessin",
      default = NA
    )
  )

### Dummifications
#-----------------
df_r <- df_r |>
  dplyr::mutate(
    isStrongerprop = data.table::fcase(
      apfac == "Starke Armee", TRUE,
      apfac %in% c("Weder noch", "Keine Armee"), FALSE,
      default = NA
    ),
    isWeakerprop = data.table::fcase(
      apfac == "Keine Armee", TRUE,
      apfac %in% c("Weder noch", "Starke Armee"), FALSE,
      default = NA
    ),
    # "Yes" if any of the three nationality variables equals 8100
    isCH = data.table::fcase(
      nat_1_ == 8100, "Yes",
      nat_2_ == 8100, "Yes",
      nat_3_ == 8100, "Yes",
      default = "No"
    ) |>
      factor(),
    isMale = data.table::fcase(
      sexfac == "Männlich", "Yes",
      sexfac == "Weiblich", "No",
      default = NA
    ) |>
      factor()
  ) |>
  sjlabelled::var_labels(
    isStrongerprop = "Für Armee-Stärkung?",
    isWeakerprop = "Für Armee-Schwächung?",
    isCH = "Schweizer:in?",
    isMale = "Mann?"
  )

# Age and cohort transformations
#-------------------------------
# One row per respondent (the first one with a valid age and year), from which
# an estimated birth year and a cohort label are derived
df_cohorts <- df_r |>
  dplyr::select(year, age, idpers) |>
  dplyr::mutate(age = sjlabelled::remove_all_labels(age)) |>
  dplyr::filter(!is.na(age) & !is.na(year) & age > 0) |>
  dplyr::distinct(idpers, .keep_all = TRUE) |>
  dplyr::mutate(
    estby = year - age,
    cohort = as.factor(data.table::fcase(
      estby <= 1945, "Pre1945",
      estby %in% 1946:1964, "Boomer",
      estby %in% 1965:1980, "GenX",
      estby %in% 1981:1996, "Millenial",
      estby >= 1997, "Post1996"
    ))
  ) |>
  dplyr::select(estby, cohort, idpers)

# Attach the cohort info and set non-positive ages to NA
df_r <- dplyr::full_join(df_r, df_cohorts, by = "idpers") |>
  dplyr::mutate(
    age = sjlabelled::remove_all_labels(age),
    age = ifelse(age > 0, age, NA)
  )

##################
# Subset
##################

### Save the full data
# df_r |> arrow::write_parquet("shplong_combined_modified_full.parquet")

### Select the variables we actually need to cut down on size
df_r <- df_r |>
  dplyr::select(
    idpers, year, yearfac, estby, age, agecat, cohort, sex, isMale, isCH,
    canton, hsh_lang, nutsiireg, fed_trust, armypos, isStrongerprop,
    isWeakerprop, apfac, army_op, army_spend, aid_spend, feel_sec, feel_insec,
    pty, ptyid, ptymbr, ptyhypo, wicss
  )

#########################
# Alt data frame
#########################
# In addition to a full longitudinal model, we also wish to model switching
# probabilities between 2014 and 2023, based _only_ on respondents who are
# present in both waves. This requires subsetting and recoding.

# Subset
#-------
# Construct a vector containing the ids of respondents present in both years
vec_compids <- df_r |>
  dplyr::filter(year %in% c(2014, 2023)) |>
  dplyr::filter(!is.na(armypos)) |>
  dplyr::count(idpers) |>
  dplyr::filter(n == 2) |>
  dplyr::pull(idpers) |>
  unique()

# Subset the df based on this vector
df_comp <- df_r |>
  dplyr::filter(
    idpers %in% vec_compids,
    year %in% c(2014, 2023)
  )

# Pivot
#-------
df_comp <- df_comp |>
  tidyr::pivot_wider(
    names_from = yearfac,
    values_from = !c(yearfac, idpers)
  ) |>
  dplyr::mutate(
    army_delta = armypos_2023 - armypos_2014,
    # Pairs with a missing party in either wave are set to NA right here, so
    # that no placeholder level is left behind in the resulting factor
    ptycomb = data.table::fcase(
      is.na(pty_2014) | is.na(pty_2023), NA_character_,
      pty_2014 == pty_2023, as.character(pty_2014),
      pty_2014 != pty_2023, "Switched"
    ) |>
      factor() |>
      forcats::fct_relevel("FDP"),
    cntcomb = data.table::fcase(
      canton_2014 == canton_2023, as.character(canton_2014),
      !canton_2014 == canton_2023, "Moved"
    ) |>
      factor() |>
      forcats::fct_relevel("AG"),
    hshlang_comb = data.table::fcase(
      hsh_lang_2014 == hsh_lang_2023, as.character(hsh_lang_2014),
      !hsh_lang_2014 == hsh_lang_2023, "Moved"
    ) |>
      factor(),
    # armypos is coded 1 = "Starke Armee" ... 3 = "Keine Armee", so a negative
    # delta is a move towards "Starke Armee"
    strengthened = ifelse(army_delta < 0, 1, 0),
    weakened = ifelse(army_delta > 0, 1, 0)
  )

df_comp$idpers <- sjlabelled::remove_all_labels(df_comp$idpers)

##################
# Export
##################
df_r |> arrow::write_parquet(pqtfilepath_full)
df_comp |> arrow::write_parquet(pqtfilepath_1423)