################
# INTRO
################
# Data model construction script
# - Imports data generated in 00_data_preparation.R
# - Sets up an initial model, then investigates the effects of various
#   alternative specifications

################
# IMPORT
################
# Ensure that the init script has been run (it defines the paths used below)
if (!exists("pqtfilepath_full") ||
      !exists("pqtfilepath_1423") ||
      !exists("plot_output_dir")) {
  source("10_runner.R")
}
# Ensure that the prepared datasets actually exist on disk
if (!file.exists(pqtfilepath_full) ||
      !file.exists(pqtfilepath_1423)) {
  source("00_data_preparation.R")
}

# Full dataset and the 2014-2023 subset; strip value labels from the person
# identifier since labelled columns can cause problems downstream
df <- arrow::read_parquet(pqtfilepath_full)
df$idpers <- sjlabelled::remove_all_labels(df$idpers)
df2 <- arrow::read_parquet(pqtfilepath_1423)
df2$idpers <- sjlabelled::remove_all_labels(df2$idpers)

# Load glmmTMB to make S3 methods available
library(glmmTMB)

################
# HELPER
################

## Cache helper
# We use glmmTMB over lme4 because the latter proves to be agonizingly slow on
# a dataset this massive. It's still not fast, however; we'll speed things up
# further by memoising the function call to avoid unnecessary re-execution.
#
# Note that memoise will always return the cache object unless the input
# expression (!= Data!) changes; for our purposes, this means that any changes
# to the dataset have to be accompanied by flushing the cache.
#
# Unlike lme4, glmmTMB defaults to non-REML (i.e. ordinary ML) estimation,
# so no additional arguments are needed to allow for valid comparisons.
# It will remain turned off for the "main models" as well, since the focus
# is on the fixed components of the model
# (See https://tobiasroth.github.io/BDAEcology/lmer.html)
cd <- cachem::cache_disk(
  paste0(here::here(), "/cache"),
  # glmmTMB models are quite big; allocate a maximum of 10 GiB of cache disk
  # (10 * 1024 * 1024^2 bytes)
  max_size = 10 * 1024 * 1024^2
)

# Memoised wrapper around glmmTMB::glmmTMB(): identical calls are served from
# the disk cache instead of being re-fitted
CacheGLM <- memoise::memoise(function(...) {
  glmmTMB::glmmTMB(...)
}, cache = cd)

#################
## Model 1
#################

### Variable selection
# ---------------------
# To allow for reasonable comparisons between models, we have to do this on a
# complete dataset, so we subset to cases that include all (potentially)
# interesting variables.
# Even with only fully complete cases included, the resulting dataset is still
# very large.
df_s <- df |>
  dplyr::select(
    isStrongerprop,
    idpers,
    yearfac,
    pty,
    agecat,
    isMale,
    isCH,
    hsh_lang,
    canton,
    nutsiireg
  ) |>
  dplyr::filter(
    (dplyr::if_all(
      tidyselect::everything(),
      complete.cases
    ))
  )

# Outcome, year, personid and pty are of inherent primary theoretical interest,
# and gender and age are relevant as controls (since we're using party.)
# We treat this as our baseline:
vm1_init <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat,
  data = df_s,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(vm1_init, stars = TRUE)
performance::check_collinearity(vm1_init)

# We also know from the descriptive analysis that nationality matters,
# so we'll add that as well.
vm1_addch <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH,
  data = df_s,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(list(vm1_init, vm1_addch), stars = TRUE)
performance::check_collinearity(vm1_addch)
# Fit improves, albeit only slightly; we'll keep the addition.

### Regional aspects
# There are three potential ways to analyse the effect of geography:
# - Household language (DE/FR/IT; presumably coded based on municipality language?)
# - Canton
# - NUTS 2 region
# Including more than one is infeasible since they are almost certainly highly
# correlated.
#
# Let's compare our options (fixed effects only for the time being):
vm1_canton <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + canton,
  data = df_s,
  family = binomial(link = "logit")
)
vm1_nuts <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + nutsiireg,
  data = df_s,
  family = binomial(link = "logit")
)
vm1_hshl <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + hsh_lang,
  data = df_s,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(list(vm1_canton, vm1_nuts, vm1_hshl), stars = TRUE)

# Canton has the best overall fit. Do we need a second random intercept?
vm1_ricant <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + (1 | canton),
  data = df_s,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(
  list("no cant. RI" = vm1_canton, "cant. RI" = vm1_ricant),
  stars = TRUE
)
# -> Fit gets worse, we keep it as-is.

### With initial variable selection settled, rerun on full data
vm1 <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + canton,
  data = df,
  family = binomial(link = "logit")
)

### Residual diagnostics
# -----------------------

### Simulated vs observed residuals with DHARMa
# See:
# - https://rdrr.io/cran/glmmTMB/f/inst/doc/model_evaluation.pdf
# - https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMa.html
# - https://stats.stackexchange.com/questions/610084/dispersion-parameter-in-dharma
resid_vm1 <- DHARMa::simulateResiduals(fittedModel = vm1, plot = FALSE)
DHARMa::testResiduals(resid_vm1)
# Some underdispersion is observable. This would appear acceptable, since
# (1) DHARMa docs suggest that significant deviations are inherently likely on
#     all large datasets
# (2) underdispersion should lead to (overly) conservative significance
#     estimates, which seems acceptable considering that we can still observe
#     lots of significant predictors despite our power loss

# Alternative outcome specification
# ----------------------------------
# Instead of doing Stronger vs. Indifferent/None, do None vs. Indifferent/Stronger;
# This is equivalent to coding "Neither" as "stronger" and then flipping the
# sign on all predictors.
#
# Both models will be useful for the actual text.
# Comparing them also allows us to ensure that the "message" of the primary
# interesting predictors (especially party) does not change drastically based
# on how the outcome is operationalised.
#
# No explicit subsetting is performed since the two outcomes represent
# dummifications of the same variable (so the respective subsets are identical)
vm1a <- CacheGLM(
  isWeakerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + canton,
  data = df,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(
  list("DV: Strong" = vm1, "DV: Abolish" = vm1a),
  stars = TRUE
)
# The substantive meaning of the parameter estimates doesn't change much,
# insofar as it can be compared. Fit is tricky to compare since
# we're switching out the DV, but -- based on adjusted R² -- appears to be
# better when abolishment is taken as the DV. This is fairly unsurprising if we
# look at the descriptive plots.
#
# The residuals look slightly better as well:
resid_vm1a <- DHARMa::simulateResiduals(fittedModel = vm1a, plot = FALSE)
DHARMa::testResiduals(resid_vm1a)
# -> Initial test suggests potential outlier issues but warns of type I errors
#    when using the default method; rerun with bootstrap as recommended
DHARMa::testOutliers(resid_vm1a, type = "bootstrap")
# -> Should be ok.

# Impact of weighting
# ---------------------
# Introduce cross-sectional survey weight provided by SHP.
# Since the model is interested in change at the population level, this is the
# appropriate choice per
# https://forscenter.ch/wp-content/uploads/2018/07/faq_weights1.pdf

### Models
vm1_w <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)
vm1a_w <- CacheGLM(
  isWeakerprop ~ (1 | idpers) + yearfac + pty + isMale + agecat + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)

### Summary
modelsummary::modelsummary(
  list(
    "unweighted" = vm1,
    "weighted" = vm1_w,
    "unweighted (alt outcome)" = vm1a,
    "weighted (alt outcome)" = vm1a_w
  ),
  stars = TRUE
)
# -> some significance levels drop and some coefficients get smaller;
#    directions stay the same.

# Check sample predictions
# --------------------------

### Helper
# Compute in-sample classification accuracy for a fitted binary glmmTMB model,
# both with random effects included and with population-level predictions only.
#
# Side effects: prints both accuracy proportion tables to the console.
#
# @param x A fitted glmmTMB model with a logical/binary response.
# @return Invisibly, a list with elements preds_rand, preds_norand (raw
#   response-scale predictions) and accuracy_withrand, accuracy_norand
#   (proportion tables of correct vs. incorrect classifications).
GetPredictionAccuracy <- function(x) {
  # Make sure there are no leftover labels on the weight since
  # this can result in errors
  if ("(weights)" %in% colnames(x$frame)) {
    x$frame$`(weights)` <- sjlabelled::remove_all_labels(x$frame$`(weights)`)
  }
  outcomevar <- names(x$modelInfo$respCol)
  # Use the S3 generic instead of reaching into glmmTMB internals via `:::`;
  # dispatch works because glmmTMB is attached at the top of the script
  predicts_reform <- predict(x, type = "response")
  # re.form = NA -> predictions with random effects set to zero
  preds_noreform <- predict(
    x,
    type = "response",
    re.form = NA
  )
  # Classify at the 0.5 threshold and compare against the observed outcome
  x$frame$preds <- as.logical(round(predicts_reform))
  x$frame$pred_correct <- x$frame[[outcomevar]] == x$frame$preds
  t1 <- x$frame$pred_correct |>
    table() |>
    prop.table()
  x$frame$predsnr <- as.logical(round(preds_noreform))
  x$frame$pred_correctnr <- x$frame[[outcomevar]] == x$frame$predsnr
  t2 <- x$frame$pred_correctnr |>
    table() |>
    prop.table()
  cli::cli_alert_info("With random effects:\n")
  print(t1)
  cat("\n\n\n")
  cli::cli_alert_info("Without random effects:\n")
  print(t2)
  ol <- list(
    "preds_rand" = predicts_reform,
    "preds_norand" = preds_noreform,
    "accuracy_withrand" = t1,
    "accuracy_norand" = t2
  )
  invisible(ol)
}

pa_vm1w <- GetPredictionAccuracy(vm1_w)
pa_vm1a_w <- GetPredictionAccuracy(vm1a_w)

### Alternative age specifications
# ---------------------------------
# We initially use agecat to capture the impact of age; try
# - Substituting age + age^2, the latter to account for the fact that
#   the model thus far suggests a non-linear component
# - Substituting cohort
# - Including cohort as ranef

### Substitute age for agecat
# With DV: Stronger
vm1_w_age <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + age + I(age^2) + yearfac + pty + isMale + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)
# With DV: Weaker
vm1a_w_age <- CacheGLM(
  isWeakerprop ~ (1 | idpers) + age + I(age^2) + yearfac + pty + isMale + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)
### Summaries
modelsummary::modelsummary(
  list(
    "agecat" = vm1_w,
    "age/age²" = vm1_w_age,
    "agecat (alt DV)" = vm1a_w,
    "age/age² (alt DV)" = vm1a_w_age
  ),
  stars = TRUE
)
pa_vm1_w_age <- GetPredictionAccuracy(vm1_w_age)
pa_vm1a_w_age <- GetPredictionAccuracy(vm1a_w_age)
performance::model_performance(vm1_w_age)
performance::model_performance(vm1a_w_age)
performance::check_collinearity(vm1_w_age)
performance::check_collinearity(vm1a_w_age)
# Fit is a bit better on either outcome variable.

### Substitute Cohort for agecat
# With DV: Stronger
vm1_w_co <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + cohort + pty + isMale + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)
# With DV: Weaker
vm1a_w_co <- CacheGLM(
  isWeakerprop ~ (1 | idpers) + yearfac + cohort + pty + isMale + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)
# Summaries
modelsummary::modelsummary(
  list(
    "agecat" = vm1_w,
    "age/age²" = vm1_w_age,
    "cohort" = vm1_w_co,
    "agecat (alt DV)" = vm1a_w,
    "age/age² (alt DV)" = vm1a_w_age,
    # BUGFIX: this slot previously repeated vm1_w_co; the alt-DV cohort
    # model is vm1a_w_co (as used in the accuracy check below)
    "cohort (alt DV)" = vm1a_w_co
  ),
  stars = TRUE
)
pa_vm1_w_co <- GetPredictionAccuracy(vm1_w_co)
pa_vm1a_w_co <- GetPredictionAccuracy(vm1a_w_co)
# -> Cohort only is worse than either alternative

### Cohort as ranef
# With DV: Stronger
vm1_w_age_randco <- CacheGLM(
  isStrongerprop ~ (1 | idpers) + yearfac + (1 | cohort) + age + I(age^2) + pty + isMale + isCH + canton,
  weights = df$wicss,
  data = df,
  family = binomial(link = "logit")
)
# Creates singularity issues:
performance::check_singularity(vm1_w_age_randco)

# Given their good performance, the _w_age variants are decided upon.

#################
## Model 2
#################
# Model 2 aims to separately model switching probabilities based only on panel
# data.

### Initial model
# -----------------
vm2_1 <- glmmTMB::glmmTMB(
  strengthened ~ ptycomb + age_2014 + I(age_2014^2) + cntcomb + isMale_2014 + isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(vm2_1, stars = TRUE)
# Convergence failure.
# Drop the quadratic term:
vm2_2 <- glmmTMB::glmmTMB(
  strengthened ~ ptycomb + age_2014 + cntcomb + isMale_2014 + isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(vm2_2, stars = TRUE)

# Additionally try to use hsh_lang instead of canton for grouping
vm2_3 <- glmmTMB::glmmTMB(
  strengthened ~ ptycomb + age_2014 + hshlang_comb + isMale_2014 + isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(list(vm2_2, vm2_3), stars = TRUE)
# hshlang is better based on AIC, but a bit of a bummer for predictions

# + cohort
vm2_4 <- glmmTMB::glmmTMB(
  strengthened ~ ptycomb + age_2014 + cohort_2014 + hshlang_comb + isMale_2014 + isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(list(vm2_3, vm2_4), stars = TRUE)
# -> not an improvement

# With RIs at l2
vm2_5 <- glmmTMB::glmmTMB(
  strengthened ~ age_2014 + ptycomb + (1 | hshlang_comb) + isMale_2014 + isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(list(vm2_3, vm2_5), stars = TRUE)
# No apparent reason to go beyond 2_3.

resid_vm2_3 <- DHARMa::simulateResiduals(fittedModel = vm2_3, plot = FALSE)
# BUGFIX: previously this passed the model (vm2_3) instead of the simulated
# residuals, leaving resid_vm2_3 unused (cf. the resid_vm1 usage above)
DHARMa::testResiduals(resid_vm2_3)

### With flipped outcome
vm2_3a <- glmmTMB::glmmTMB(
  weakened ~ ptycomb + age_2014 + hshlang_comb + isMale_2014 + isCH_2023,
  data = df2,
  family = binomial(link = "logit")
)
modelsummary::modelsummary(list(vm2_3, vm2_3a), stars = TRUE)
# Fits substantially better than on the other outcome variable

# Re-simulate (kept as in the original flow) and test both outcome variants
resid_vm2_3 <- DHARMa::simulateResiduals(fittedModel = vm2_3, plot = FALSE)
DHARMa::testResiduals(resid_vm2_3)
resid_vm2_3a <- DHARMa::simulateResiduals(fittedModel = vm2_3a, plot = FALSE)
# BUGFIX: same as above — test the simulated residuals, not the raw model
DHARMa::testResiduals(resid_vm2_3a)

# Nonetheless, these models yield fairly large confidence intervals if grouped
# by language and party, and they don't allow us to make predictions based on
# age:
# NOTE(review): the condition names below use "isMale"/"isCH", but the vm2
# models' terms are isMale_2014/isCH_2023 — verify that ggeffects matches
# these as intended.
ggeffects::predict_response(
  vm2_3,
  terms = c(
    "ptycomb",
    "hshlang_comb",
    "isCH_2023"
  ),
  condition = c(
    isMale = "Yes"
  ),
  margin = "empirical"
)
ggeffects::predict_response(
  vm2_3a,
  terms = c(
    "ptycomb",
    "hshlang_comb"
  ),
  condition = c(
    isMale = "Yes",
    isCH = "Yes"
  ),
  margin = "empirical"
)

# I thus decide to employ the cross-sectional model for my "example individuals".