data_preparation

##############
### INTRO ###
############

# Initial data preparation script:
# - Locates and imports
# -- (1) SHP long data files and
# -- (2) Year-level SHP metadata files
# - Merges user long file with metadata
# - Performs initial variable construction
# - Subsets the data to variables of (potential) interest
# - Exports result as Parquet so as to drastically speed up r/w times

##########################
### Path definitions ####
########################

if (
  !exists("pqtfilepath_full") ||
    !exists("paths_meta") ||
    !exists("pqtfilepath_full") ||
    !exists("pqtfilepath_1423") ||
    !exists("path_spss_p")
) {
  source("10_runner.R")
}

################
### Imports ###
##############

### Metadata
#------------

# Canton var is not included in personal long data, so need to grab from CNEF.
# There is a CNEF long file, but it unfortunately only goes up to
# 2019, so the individual files need to be read.

### Import
frames_meta <- lapply(paths_meta, \(x) {
  haven::read_spss(x, user_na = TRUE)
})

### Year cleanup:
# Meta files have years in var names, we instead want them in
# columns so that we can join on the names

metayears <- paths_meta |> stringr::str_extract("\\d{4}")

names(frames_meta) <- metayears

for (i in names(frames_meta)) {
  frames_meta[[i]]$year <- as.numeric(i)
}

frames_meta <- lapply(frames_meta, \(x) {
  colnames(x) <- colnames(x) |> gsub(x = _, "_\\d{4}", "")
  return(x)
})

### Merge
meta_combined <- frames_meta |>
  data.table::rbindlist(
    fill = TRUE,
    ignore.attr = TRUE
  ) |>
  tibble::tibble() |>
  dplyr::rename(idpers = x11101ll)


### Long data
#------------

df_p_long <- haven::read_spss(path_spss_p, user_na = TRUE)


### Initial merge
#----------------

df_merge <- dplyr::full_join(
  df_p_long,
  meta_combined,
  by = c("idpers", "year")
)


##################
### Wrangling ###
################

### Helpers
#----------

### Recode and collapse factor levels
# This will throw warnings upon executing because not all levels are present
# in all input factors

CollapseParties <- function(col) {
  # Coded missings to NA
  col[col < 0] <- NA

  out <- col |>
    sjlabelled::to_label() |>
    forcats::fct_collapse(
      AVF = "avf socialist green alternative and women groups",
      CSP = "pcs swiss christian social party",
      DM = c(
        "bdp conservative democratic party",
        "pdc swiss christian-democrat party",
        "the centre"
      ),
      EDU = "udf federal democratic union",
      EVP = "pev swiss pop evang party/swiss evang party",
      FDP = c(
        "prd swiss radical-democratic party",
        "pls swiss liberal party",
        "plr les libéraux-radicaux",
        "adi independent alliance",
        "fdp the liberals"
      ),
      FPS = "psl swiss freedom party  (former swiss car party )",
      GLP = "gl green liberals",
      GP = "pes swiss ecology party",
      Lega = "lega dei ticinesi",
      MCG = "mcg mouvement citoyens genevois",
      Other = "other party",
      PDA = "pst swiss labour party/popular labour party/solidarity",
      SD = "ds swiss democrats (former national action)",
      SP = "pss swiss socialist party",
      SVP = "udc democratic union of the centre",
      None = c(
        "vote for a candidate, not for a party",
        "for no party",
        "wouldn't vote"
      )
    ) |>
    # Condense further
    forcats::fct_other(
      keep = c("SP", "DM", "SVP", "FDP", "GP", "GLP", "None")
    ) |>
    forcats::fct_relevel("FDP")

  return(out)
}

### Data
#-------

### New combined party variable based on heuristics
# membership > hypothetical vote > ID

df_r <- df_merge |>
  dplyr::rename(
    canton = l11101,
    fed_trust = pp04,
    armypos = pp12,
    army_op = pp49,
    army_spend = pp58,
    aid_spend = pp62,
    feel_sec = pl101,
    feel_insec = pc103,
    ptyid = pp67,
    ptymbr = pp11,
    ptyhypo = pp19
  ) |>
  dplyr::mutate(
    pty = data.table::fcase(
      labelled::remove_labels(ptymbr) > 0,
      ptymbr,
      labelled::remove_labels(ptyhypo) > 0,
      ptyhypo,
      labelled::remove_labels(ptyid) > 0,
      ptyid,
      default = NA
    )
  )

# Collapse
df_r <- df_r |>
  dplyr::mutate(
    dplyr::across(
      .cols = tidyselect::starts_with("pty"),
      .fns = ~ CollapseParties(.x)
    )
  )


### Armypos cleaning
#-------------------

# Drop non-answers
df_r$armypos[df_r$armypos < 0] <- NA

# Relabel to DE for subseq
df_r$armypos <- df_r$armypos |>
  sjlabelled::set_labels(
    labels = c("Starke Armee" = 1, "Weder noch" = 2, "Keine Armee" = 3)
  )


### Factorisations
#-----------------

# Figure out at which age the armypos question starts being asked so that
# we can correctly label the lower bound of the agecat variable

df_r |>
  dplyr::filter(age > 0 & age < 18 & !is.na(armypos) & armypos > 0) |>
  dplyr::pull(age) |>
  min()

# -> 13

df_r <-
  df_r |>
  dplyr::mutate(
    # Age
    agecat = data.table::fcase(
      age %in% 0:17,
      "13-17",
      age %in% 18:35,
      "18-35",
      age %in% 36:45,
      "36-45",
      age %in% 46:55,
      "46-55",
      age %in% 56:65,
      "56-65",
      age > 65,
      "65+"
    ) |>
      as.factor() |>
      forcats::fct_relevel("36-45"),
    # Year
    yearfac = as.factor(year),
    # Army position
    apfac = sjlabelled::as_label(armypos),
    # Sex
    sexfac = as.factor(
      data.table::fcase(
        sex == 1,
        "Männlich",
        sex == 2,
        "Weiblich",
        default = NA
      )
    ),
    # Household/contact language; region-coded per doc, hence desirable for
    # röstigraben estimates
    hsh_lang = sjlabelled::as_label(l11102),
    canton = sjlabelled::as_label(canton)
  )


# Canton level cleaning: Abbreviation is sufficient
levels(df_r$canton) <- substr(levels(df_r$canton), 1, 2)

# With this cleaned, add a region variable
df_r <- df_r |>
  dplyr::mutate(
    # cf. https://forscenter.ch/wp-content/uploads/2022/02/shp_user-guide-w22.pdf
    nutsiireg = data.table::fcase(
      canton %in% c("VD", "VS", "GE"),
      "Genferseeregion",
      canton %in% c("BE", "FR", "SO", "NE", "JU"),
      "Mittelland",
      canton %in% c("BS", "BL", "AG"),
      "Nordwestschweiz",
      canton == "ZH",
      "Zürich",
      canton %in% c("GL", "SH", "AR", "AI", "SG", "GR", "TG"),
      "Ostschweiz",
      canton %in% c("LU", "UR", "SZ", "OW", "NW", "ZG"),
      "Zentralschweiz",
      canton == "TI",
      "Tessin",
      default = NA
    )
  )


### Dummifications
#-----------------

df_r <- df_r |>
  dplyr::mutate(
    isStrongerprop = data.table::fcase(
      apfac == "Starke Armee",
      TRUE,
      apfac %in% c("Weder noch", "Keine Armee"),
      FALSE,
      default = NA
    ),
    isWeakerprop = data.table::fcase(
      apfac == "Keine Armee",
      TRUE,
      apfac %in% c("Weder noch", "Starke Armee"),
      FALSE,
      default = NA
    ),
    isCH = data.table::fcase(
      nat_1_ == 8100,
      "Yes",
      nat_2_ == 8100,
      "Yes",
      nat_3_ == 8100,
      "Yes",
      default = "No"
    ) |>
      factor(),
    isMale = data.table::fcase(
      sexfac == "Männlich",
      "Yes",
      sexfac == "Weiblich",
      "No",
      default = NA
    ) |>
      factor()
  ) |>
  sjlabelled::var_labels(
    isStrongerprop = "Für Armee-Stärkung?",
    isWeakerprop = "Für Armee-Schwächung?",
    isCH = "Schweizer:in?",
    isMale = "Mann?"
  )


# Age and cohort transformations
#-------------------------------
df_cohorts <-
  df_r |>
  dplyr::select(year, age, idpers) |>
  dplyr::mutate(age = sjlabelled::remove_all_labels(age)) |>
  dplyr::filter(!is.na(age) & !is.na(year) & age > 0) |>
  dplyr::distinct(idpers, .keep_all = TRUE) |>
  dplyr::mutate(
    estby = year - age,
    cohort = as.factor(data.table::fcase(
      estby <= 1945,
      "Pre1945",
      estby %in% 1946:1964,
      "Boomer",
      estby %in% 1965:1980,
      "GenX",
      estby %in% 1981:1996,
      "Millenial",
      estby >= 1997,
      "Post1996"
    ))
  ) |>
  dplyr::select(estby, cohort, idpers)

df_r <- dplyr::full_join(df_r, df_cohorts, by = "idpers") |>
  dplyr::mutate(
    age = sjlabelled::remove_all_labels(age),
    age = ifelse(age > 0, age, NA)
  )


###############
### Subset ###
#############

### Save the full data
# df_r |> arrow::write_parquet("shplong_combined_modified_full.parquet")

### Select the variables we actually need to cut down on size
df_r <- df_r |>
  dplyr::select(
    idpers,
    year,
    yearfac,
    estby,
    age,
    agecat,
    cohort,
    sex,
    isMale,
    isCH,
    canton,
    hsh_lang,
    nutsiireg,
    fed_trust,
    armypos,
    isStrongerprop,
    isWeakerprop,
    apfac,
    army_op,
    army_spend,
    aid_spend,
    feel_sec,
    feel_insec,
    pty,
    ptyid,
    ptymbr,
    ptyhypo,
    wicss
  )

#######################
### Alt data frame ###
#####################

# In addition to a full longitudinal model, we also wish to model switching prob-
# abilities between 2014 and 2023, based _only_ on respondents who are present
# in both waves. This requires subsetting and recoding.

# Subset
#-------

# Construct a vector containing the ids of respondents present in both years
vec_compids <- df_r |>
  dplyr::filter(year %in% c(2014, 2023)) |>
  dplyr::filter(!is.na(armypos)) |>
  dplyr::count(idpers) |>
  dplyr::filter(n == 2) |>
  dplyr::pull(idpers) |>
  unique()


# Subset the df based on this vector
df_comp <- df_r |>
  dplyr::filter(
    idpers %in% vec_compids,
    year %in% c(2014, 2023)
  )

# Pivot
#-------

df_comp <- df_comp |>
  tidyr::pivot_wider(
    names_from = yearfac,
    values_from = !c(yearfac, idpers)
  ) |>
  dplyr::mutate(
    army_delta = armypos_2023 - armypos_2014,
    ptycomb = data.table::fcase(
      (is.na(pty_2014) | is.na(pty_2023)),
      "setmetona",
      pty_2014 == pty_2023,
      as.character(pty_2014),
      !pty_2014 == pty_2023,
      "Switched"
    ) |>
      factor() |>
      forcats::fct_relevel("FDP"),
    cntcomb = data.table::fcase(
      canton_2014 == canton_2023,
      as.character(canton_2014),
      !canton_2014 == canton_2023,
      "Moved"
    ) |>
      factor() |>
      forcats::fct_relevel("AG"),
    hshlang_comb = data.table::fcase(
      hsh_lang_2014 == hsh_lang_2023,
      as.character(hsh_lang_2014),
      !hsh_lang_2014 == hsh_lang_2023,
      "Moved"
    ) |>
      factor(),
    strengthened = ifelse(army_delta < 0, 1, 0),
    weakened = ifelse(army_delta > 0, 1, 0)
  )

df_comp$ptycomb[which(df_comp$ptycomb == "setmetona")] <- NA

df_comp$idpers <- sjlabelled::remove_all_labels(df_comp$idpers)


###############
### Export ###
#############

df_r |>
  arrow::write_parquet(pqtfilepath_full)

df_comp |>
  arrow::write_parquet(pqtfilepath_1423)