##############
### INTRO ###
############

# Data model construction script
# - Imports data generated in 00_data_preparation.R
# - Sets up an initial model, then investigates the effects of various
# alternative specifications

###############
### IMPORT ###
#############

# Ensure that init script has been run: all three path/config objects must
# already exist in the session, otherwise run the runner script first
required_objs <- c("pqtfilepath_full", "pqtfilepath_1423", "plot_output_dir")
if (!all(vapply(required_objs, exists, logical(1)))) {
  source("10_runner.R")
}


# Regenerate the prepared parquet files if either is missing from disk
if (!all(file.exists(c(pqtfilepath_full, pqtfilepath_1423)))) {
  source("00_data_preparation.R")
}

# Full cross-sectional dataset (written by 00_data_preparation.R)
df <- arrow::read_parquet(pqtfilepath_full)

# Strip sjlabelled/haven label attributes from the person id; leftover labels
# can cause errors further down (see the weight handling in
# GetPredictionAccuracy below)
df$idpers <- sjlabelled::remove_all_labels(df$idpers)


# Second dataset; presumably the 2014-vs-2023 panel subset, judging by the
# _1423 suffix and the *_2014/*_2023 variables used in the Model 2 section --
# TODO confirm against 00_data_preparation.R
df2 <- arrow::read_parquet(pqtfilepath_1423)

df2$idpers <- sjlabelled::remove_all_labels(df2$idpers)


# Load glmmTMB to make S3 methods available
library(glmmTMB)


###############
### HELPER ###
#############

### Cache helper
# We use glmmTMB over lme4 because the latter proves to be agonizingly slow on
# a dataset this massive. It's still not fast, however; we'll speed things up
# further by memoising the function call to avoid unnecessary re-execution.
#
# Note that memoise will always return the cache object unless the input
# expression (!= Data!) changes; for our purposes, this means that any changes
# to the dataset have to be accompanied by flushing the cache
#
# Unlike lme4, glmmTMB defaults to non-REML (i.e. ordinary ML) estimation,
# so no additional arguments are needed to allow for valid comparisons.
# It will remain turned off for the "main models" as well, since the focus
# is on the fixed components of the model
# (See https://tobiasroth.github.io/BDAEcology/lmer.html)

# Disk-backed cache shared by all memoised model fits below, stored in
# <project root>/cache
cd <- cachem::cache_disk(
  paste0(here::here(), "/cache"),
  # glmmTMB models are quite big; allocate maximum of 10GB cache disk
  max_size = 10 * 1024 * 1024^2
)

# Memoised wrapper around glmmTMB::glmmTMB(): identical calls return the
# cached fit instead of re-estimating. Per the note above, flush ./cache
# manually whenever the underlying data changes.
CacheGLM <- memoise::memoise(
  function(...) {
    glmmTMB::glmmTMB(...)
  },
  cache = cd
)

################
### Model 1 ###
##############

### Variable selection
#---------------------

# To allow for reasonable comparisons between models, we have to do this on a
#  complete dataset, so we subset to cases that include all (potentially)
# interesting variables.
# Even with only fully complete cases included, the resulting dataset is still
# very large.

# Restrict to the candidate model variables, then keep only rows that are
# complete on every one of them, so all candidate specifications are fitted
# on the same observations
df_s <- df |>
  dplyr::select(
    isStrongerprop,
    idpers,
    yearfac,
    pty,
    agecat,
    isMale,
    isCH,
    hsh_lang,
    canton,
    nutsiireg
  ) |>
  dplyr::filter(
    dplyr::if_all(tidyselect::everything(), \(col) !is.na(col))
  )


# Outcome, year, personid and pty are of inherent primary theoretical interest,
# and gender and age are relevant as controls (since we're using party.)
# We treat this as our baseline:

# Baseline: per-person random intercept plus year, party, gender and age
# category as fixed effects; binary logit outcome
vm1_init <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat,
  data = df_s,
  family = binomial(link = "logit")
)


modelsummary::modelsummary(vm1_init, stars = TRUE)

# Check variance inflation among the fixed effects
performance::check_collinearity(vm1_init)

# We also know from the descriptive analysis that nationality matters,
# so we'll add that as well.

# Baseline + Swiss-nationality dummy
vm1_addch <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH,
  data = df_s,
  family = binomial(link = "logit")
)

# Side-by-side comparison against the baseline
modelsummary::modelsummary(list(vm1_init, vm1_addch), stars = TRUE)

performance::check_collinearity(vm1_addch)

# Fit improves, albeit only slightly; we'll keep the addition.

### Regional aspects
# There are three potential ways to analyse the effect of geography:
# - Household language (DE/FR/IT; presumably coded based on municipality language?)
# - Canton
# - NUTS 2 region
# Including more than one is infeasible since they are almost certainly highly
# correlated.
#
# Let's compare our options (fixed effects only for the time being):

# Option 1: canton as fixed effect
vm1_canton <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      canton,
  data = df_s,
  family = binomial(link = "logit")
)


# Option 2: NUTS 2 region as fixed effect
vm1_nuts <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      nutsiireg,
  data = df_s,
  family = binomial(link = "logit")
)

# Option 3: household language as fixed effect
vm1_hshl <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      hsh_lang,
  data = df_s,
  family = binomial(link = "logit")
)

# Compare all three regional specifications
modelsummary::modelsummary(list(vm1_canton, vm1_nuts, vm1_hshl), stars = TRUE)

# Canton has the best overall fit. Do we need a second random intercept?

# Same specification, but canton as a random intercept instead of a fixed
# effect
vm1_ricant <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      (1 | canton),
  data = df_s,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(
  list("no cant. RI" = vm1_canton, "cant. RI" = vm1_ricant),
  stars = TRUE
)

# -> Fit gets worse, we keep it as-is.

### With initial variable selection settled, rerun on full data
# Note: fitted on df (all available cases), not the complete-case subset
# df_s used for the specification comparisons above
vm1 <-
  CacheGLM(
    isStrongerprop ~
      (1 | idpers) +
        yearfac +
        pty +
        isMale +
        agecat +
        isCH +
        canton,
    data = df,
    family = binomial(link = "logit")
  )

### Residual diagnostics
#-----------------------

### Simulated vs observed residuals with DHARMa
# See:
# - https://rdrr.io/cran/glmmTMB/f/inst/doc/model_evaluation.pdf
# - https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMa.html
# - https://stats.stackexchange.com/questions/610084/dispersion-parameter-in-dharma
# Simulate scaled residuals; use FALSE (not the reassignable alias F) and
# defer plotting to the explicit test call below
resid_vm1 <- DHARMa::simulateResiduals(fittedModel = vm1, plot = FALSE)

DHARMa::testResiduals(resid_vm1)

# Some underdispersion is observable. This would appear acceptable, since
# (1) DHARMa docs suggest that significant deviations are inherently likely on all
# large datasets
# (2) underdispersion should lead to (overly) conservative significance estimates, which seems
# acceptable considering that we can still observe lots of significant predictors
# despite our power loss

# Alternative outcome specification
#----------------------------------

# Instead of doing Stronger vs. Indifferent/None, do None vs. Indifferent/Stronger;
# This is equivalent to coding "Neither" as "stronger" and then flipping the sign on all
# predictors.
#
# Both models will be useful for the actual text. Comparing them also allows us to
# ensure that the "message" of the primary interesting predictors (especially party)
# does not change drastically based on how the outcome is operationalised.
#
# No explicit subsetting is performed since the two outcomes represent
# dummifications of the same variable (so the respective subsets are identical)
# Same specification as vm1, but with the flipped outcome coding
# (None vs. Indifferent/Stronger)
vm1a <- CacheGLM(
  isWeakerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      canton,
  data = df,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(
  list("DV: Strong" = vm1, "DV: Abolish" = vm1a),
  stars = TRUE
)

# The substantive meaning of the parameter estimates doesn't change much,
# insofar as it can be compared. Fit is tricky to compare since
# we're switching out the DV, but -- based on adjusted R² -- appears to be better
# when abolishment is taken as the DV. This is fairly unsurprising if we look at
# the descriptive plots.
#
# The residuals look slightly better as well:

# Residual diagnostics for the alternative-outcome model (FALSE instead of
# the reassignable alias F)
resid_vm1a <- DHARMa::simulateResiduals(
  fittedModel = vm1a,
  plot = FALSE
)

DHARMa::testResiduals(resid_vm1a)
# -> Initial test suggests potential outlier issues but warns of type I errors
# when using the default method; rerun with bootstrap as recommended

DHARMa::testOutliers(resid_vm1a, type = "bootstrap")
# -> Should be ok.

# Impact of weighting
#---------------------

# Introduce cross-sectional survey weight provided by SHP.
# Since the model is interested in change at the population level, this is the
# appropriate choice per
#  https://forscenter.ch/wp-content/uploads/2018/07/faq_weights1.pdf

### Models
# Weighted variants of vm1/vm1a using the SHP cross-sectional weight wicss.
# NOTE(review): the weight is passed as the external vector df$wicss rather
# than a column reference; this assumes glmmTMB keeps it aligned with
# data = df -- confirm, especially if rows contain NAs
vm1_w <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

vm1a_w <- CacheGLM(
  isWeakerprop ~
    (1 | idpers) +
      yearfac +
      pty +
      isMale +
      agecat +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

### Summary
# Compare weighted vs. unweighted fits for both outcome codings
# (fixed a doubled space in the "weighted (alt outcome)" display label)
modelsummary::modelsummary(
  list(
    "unweighted" = vm1,
    "weighted" = vm1_w,
    "unweighted (alt outcome)" = vm1a,
    "weighted (alt outcome)" = vm1a_w
  ),
  stars = TRUE
)

# -> some significance levels drop and some coefficients get smaller;
# directions stay the same.

# Check sample predictions
#--------------------------

### Helper

# Turn response-scale predictions into a proportion table of prediction
# accuracy: probabilities are rounded to a logical class prediction and
# compared against the observed outcome column of the model frame.
.PredictionAccuracyTable <- function(frame, outcomevar, preds) {
  classpred <- as.logical(round(preds))
  table(frame[[outcomevar]] == classpred) |>
    prop.table()
}

# Report in-sample prediction accuracy of a fitted glmmTMB model, both with
# random effects included and with fixed effects only (re.form = NA).
#
# Prints both accuracy tables and invisibly returns a list with the raw
# response-scale predictions and the two proportion tables.
GetPredictionAccuracy <- function(
  x
) {
  # Make sure there are no leftover labels on the weight since
  # this can result in errors
  if ("(weights)" %in% colnames(x$frame)) {
    x$frame$`(weights)` <- sjlabelled::remove_all_labels(x$frame$`(weights)`)
  }

  # Name of the response column inside the model frame
  outcomevar <- names(x$modelInfo$respCol)

  # Use the S3 generic rather than glmmTMB:::predict.glmmTMB; glmmTMB is
  # attached (see library call near the top), so dispatch finds the method
  # without reaching into package internals
  preds_rand <- predict(x, type = "response")

  # re.form = NA: predictions from the fixed effects only
  preds_norand <- predict(x, type = "response", re.form = NA)

  t1 <- .PredictionAccuracyTable(x$frame, outcomevar, preds_rand)
  t2 <- .PredictionAccuracyTable(x$frame, outcomevar, preds_norand)

  cli::cli_alert_info("With random effects:\n")
  print(t1)
  cat("\n\n\n")
  cli::cli_alert_info("Without random effects:\n")
  print(t2)

  ol <- list(
    "preds_rand" = preds_rand,
    "preds_norand" = preds_norand,
    "accuracy_withrand" = t1,
    "accuracy_norand" = t2
  )

  invisible(ol)
}


# In-sample accuracy for both weighted models
pa_vm1w <- GetPredictionAccuracy(vm1_w)

pa_vm1a_w <- GetPredictionAccuracy(vm1a_w)


### Alternative age specifications
#---------------------------------
# We initially use agecat to capture the impact of age; try
# - Substituting age + age^2, the latter to account for the fact that
# the model thus far suggests a non-linear component
# - Substituting cohort
# - Including cohort as ranef

### Substitute age for agecat
# Continuous age plus a quadratic term replaces the categorical agecat
# With DV: Stronger
vm1_w_age <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      age +
      I(age^2) +
      yearfac +
      pty +
      isMale +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

# With DV: Weaker
vm1a_w_age <- CacheGLM(
  isWeakerprop ~
    (1 | idpers) +
      age +
      I(age^2) +
      yearfac +
      pty +
      isMale +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)


### Summaries
# Compare agecat vs. age/age² specifications on both outcome codings
modelsummary::modelsummary(
  list(
    "agecat" = vm1_w,
    "age/age²" = vm1_w_age,
    "agecat (alt DV)" = vm1a_w,
    "age/age² (alt DV)" = vm1a_w_age
  ),
  stars = TRUE
)

pa_vm1_w_age <- GetPredictionAccuracy(vm1_w_age)

pa_vm1a_w_age <- GetPredictionAccuracy(vm1a_w_age)

performance::model_performance(vm1_w_age)

performance::model_performance(vm1a_w_age)

# Collinearity check matters here: age and I(age^2) are mechanically
# correlated
performance::check_collinearity(vm1_w_age)

performance::check_collinearity(vm1a_w_age)

# Fit is a bit better on either outcome variable.

### Substitute Cohort for agecat

# With DV: Stronger
vm1_w_co <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      cohort +
      pty +
      isMale +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

# With DV: Weaker
vm1a_w_co <- CacheGLM(
  isWeakerprop ~
    (1 | idpers) +
      yearfac +
      cohort +
      pty +
      isMale +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

# Summaries
# Compare all three age specifications on both outcome codings.
# BUG FIX: the "cohort (alt DV)" column previously repeated vm1_w_co; it must
# show the alt-DV cohort model vm1a_w_co
modelsummary::modelsummary(
  list(
    "agecat" = vm1_w,
    "age/age²" = vm1_w_age,
    "cohort" = vm1_w_co,
    "agecat (alt DV)" = vm1a_w,
    "age/age² (alt DV)" = vm1a_w_age,
    "cohort (alt DV)" = vm1a_w_co
  ),
  stars = TRUE
)

pa_vm1_w_co <- GetPredictionAccuracy(vm1_w_co)

pa_vm1a_w_co <- GetPredictionAccuracy(vm1a_w_co)

# -> Cohort only is worse than either alternative

### Cohort as ranef
# Keep the age/age² fixed effects and add cohort as a second random intercept

# With DV: Stronger
vm1_w_age_randco <- CacheGLM(
  isStrongerprop ~
    (1 | idpers) +
      yearfac +
      (1 | cohort) +
      age +
      I(age^2) +
      pty +
      isMale +
      isCH +
      canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

# Creates singularity issues:
performance::check_singularity(vm1_w_age_randco)

# Given their good performance, the _w_age variants are decided upon.

################
### Model 2 ###
##############
# Model 2 aims to separately model switching probabilities based only on panel
# data.

### Initial model
#-----------------

# Plain (uncached) glmmTMB fit on the panel-comparison dataset df2; no random
# intercept -- presumably one row per person here, TODO confirm in
# 00_data_preparation.R
vm2_1 <- glmmTMB::glmmTMB(
  strengthened ~
    ptycomb +
      age_2014 +
      I(age_2014^2) +
      cntcomb +
      isMale_2014 +
      isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(vm2_1, stars = TRUE)


# Convergence failure. Drop the quadratic term:
# vm2_1 without the quadratic age term (vm2_1 fails to converge)
vm2_2 <- glmmTMB::glmmTMB(
  strengthened ~
    ptycomb +
      age_2014 +
      cntcomb +
      isMale_2014 +
      isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(vm2_2, stars = TRUE)

# Additionally try to use hsh_lang instead of canton for grouping

# Household language (combined) replaces canton (combined) as the regional
# grouping
vm2_3 <- glmmTMB::glmmTMB(
  strengthened ~
    ptycomb +
      age_2014 +
      hshlang_comb +
      isMale_2014 +
      isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(list(vm2_2, vm2_3), stars = TRUE)

# hshlang is better based on AIC, but a bit of a bummer for predictions

# + cohort
# vm2_3 plus the 2014 cohort variable
vm2_4 <- glmmTMB::glmmTMB(
  strengthened ~
    ptycomb +
      age_2014 +
      cohort_2014 +
      hshlang_comb +
      isMale_2014 +
      isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(list(vm2_3, vm2_4), stars = TRUE)

# -> not an improvement

# With RIs at l2
# Household language as random intercept instead of fixed effect
vm2_5 <- glmmTMB::glmmTMB(
  strengthened ~
    age_2014 +
      ptycomb +
      (1 | hshlang_comb) +
      isMale_2014 +
      isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)

modelsummary::modelsummary(list(vm2_3, vm2_5), stars = TRUE)

# No apparent reason to go beyond 2_3.
# BUG FIX: test the simulated-residual object; previously the raw model was
# passed to testResiduals, leaving resid_vm2_3 unused (and forcing DHARMa to
# re-simulate internally). Also FALSE instead of the reassignable alias F.
resid_vm2_3 <- DHARMa::simulateResiduals(fittedModel = vm2_3, plot = FALSE)

DHARMa::testResiduals(resid_vm2_3)

### With flipped outcome
### With flipped outcome
# Same specification as vm2_3, but modelling the probability of weakening
vm2_3a <- glmmTMB::glmmTMB(
  weakened ~
    ptycomb +
      age_2014 +
      hshlang_comb +
      isMale_2014 +
      isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)


modelsummary::modelsummary(list(vm2_3, vm2_3a), stars = TRUE)

# Fits substantially better than on the other outcome variable

# Residual diagnostics for both panel models.
# NOTE(review): the vm2_3 simulation/test duplicates the check a few lines
# above; kept for the side-by-side reading, but consider removing one.
# BUG FIX: pass the simulated-residual object to testResiduals (previously
# the raw model vm2_3 was passed, leaving resid_vm2_3 unused).
resid_vm2_3 <- DHARMa::simulateResiduals(fittedModel = vm2_3, plot = FALSE)

DHARMa::testResiduals(resid_vm2_3)


resid_vm2_3a <- DHARMa::simulateResiduals(fittedModel = vm2_3a, plot = FALSE)

DHARMa::testResiduals(resid_vm2_3a)

# Nonetheless, these models yield fairly large confidence intervals if grouped by
# language and party, and they don't allow us to make predictions based on age:

# Predicted responses by party, language and nationality.
# BUG FIX: the condition names must match the model terms -- vm2_3/vm2_3a use
# isMale_2014 and isCH_2023, not isMale/isCH; with the old names the
# conditions were silently ignored by ggeffects.
ggeffects::predict_response(
  vm2_3,
  terms = c(
    "ptycomb",
    "hshlang_comb",
    "isCH_2023"
  ),
  condition = c(
    isMale_2014 = "Yes"
  ),
  margin = "empirical"
)

ggeffects::predict_response(
  vm2_3a,
  terms = c(
    "ptycomb",
    "hshlang_comb"
  ),
  condition = c(
    isMale_2014 = "Yes",
    isCH_2023 = "Yes"
  ),
  margin = "empirical"
)

# I thus decide to employ the cross-sectional model for my "example individuals".