predictmodels

##############
### INTRO ###
############

# Data modeling script
# - Imports data generated in 00_data_preparation.R
# - Runs models with glmmTMB
# - Caches models to ./cache using `memoise`
# - This section only includes models that were directly referenced in the paper,
# for model construction and validation, see 04_model_validation.R

###############
### Import ###
#############

# Ensure that init script has been run

if (
  !exists("pqtfilepath_full") ||
    !exists("pqtfilepath_1423") ||
    !exists("plot_output_dir") ||
    !exists("m1predictspath")
) {
  source("10_runner.R")
}

if (!file.exists(pqtfilepath_full) || !file.exists(pqtfilepath_1423)) {
  source("00_data_preparation.R")
}
df <- arrow::read_parquet(pqtfilepath_full)

df2 <- arrow::read_parquet(pqtfilepath_1423)


# Import explicitly to make sure all necessary methods are available
library(glmmTMB)

###############
### Models ###
#############

### M1: Full longitudinal
#------------------------

dfreg <- df |>
  sjlabelled::remove_all_labels() |>
  dplyr::select(
    isStrongerprop,
    isWeakerprop,
    idpers,
    age,
    yearfac,
    pty,
    isMale,
    isCH,
    canton,
    wicss
  ) |>
  dplyr::mutate(
    # ggeffects does not like logicals
    dplyr::across(c(isStrongerprop, isWeakerprop), ~ as.integer(.x))
  )


m1 <-
  glmmTMB(
    isStrongerprop ~
      (1 | idpers) +
        age +
        I(age^2) +
        yearfac +
        pty +
        isMale +
        isCH +
        canton,
    data = dfreg,
    weights = wicss,
    family = binomial(link = "logit")
  )


m1a <-
  glmmTMB(
    isWeakerprop ~
      (1 | idpers) +
        age +
        I(age^2) +
        yearfac +
        pty +
        isMale +
        isCH +
        canton,
    data = dfreg,
    weights = wicss,
    family = binomial(link = "logit")
  )

### M1 Predicts:
#---------------
# Generate predictions and prediction intervals for various
# subsets of the population, then join. predict_response can take a maximum of
# five terms simultaneously, so CH citizen/noncitizen has to be done separately
#
# Using the "empirical" margins in ggeffects would probably be preferable;
# unfortunately, there are too many terms (and too many datapoints) for that to
# work without crashing my R session (and/or computer). Proportional weighting
# on the means should get us a bit closer:
#
# https://strengejacke.github.io/ggeffects/articles/technical_differencepredictemmeans.html

agevec <- c(20, 30, 45, 60, 80)

commonterms <-
  c(
    "yearfac [all]",
    "pty [all]",
    "canton [all]",
    "age [agevec]",
    "isMale [all]"
  )

### isStrongerpop ~ model
pm1_ch <- ggeffects::predict_response(
  m1,
  terms = commonterms,
  weights = "proportional",
  condition = c(
    isCH = "Yes"
  ),
  bias_correction = TRUE
)

### ~isWeakerprop model
# CH citizens
pm1_ch_a <- ggeffects::predict_response(
  m1a,
  terms = commonterms,
  weights = "proportional",
  condition = c(
    isCH = "Yes"
  ),
  bias_correction = TRUE
)

### Clean predictions

# Helper
CleanPredictions <- function(preds) {
  df_pred <- as.data.frame(preds)

  df_pred <- df_pred |>
    dplyr::rename(
      "age" = "panel",
      "party" = "group",
      "year" = "x",
      "isMale" = "grid",
      "canton" = "facet"
    ) |>
    # GLP does not exist in the data pre-2009. This doesn't stop us from making
    # predictions, but discussing them would probably be misleading.
    dplyr::filter(!(as.numeric(as.character(year)) < 2009 & party == "GLP"))

  levels(df_pred$age) <- agevec

  return(df_pred)
}


df_pch <- pm1_ch |>
  CleanPredictions() |>
  dplyr::mutate(
    outcome = "Starke Armee"
  )

df_pch_a <- pm1_ch_a |>
  CleanPredictions() |>
  dplyr::mutate(
    outcome = "Keine Armee"
  )

# Join
m1_predicts <- list(df_pch, df_pch_a) |>
  purrr::reduce(dplyr::full_join) |>
  tibble::tibble() |>
  dplyr::mutate(
    dplyr::across(
      .cols = c(predicted, conf.low, conf.high),
      .fns = ~ .x * 100,
      .names = "{.col}_pct"
    )
  )


# Save (for plotting)
arrow::write_parquet(m1_predicts, m1predictspath)