library(tidyverse)
library(maps)
library(mapproj)
library(sf)
library(viridis)
library(cowplot)
library(raster)
library(stars)
library(swissdd)
library(ggplot2)
library(foreign)
library(stringr)
library(conflicted)
library(rnaturalearth)
# Several attached packages export functions with the same name. With the
# `conflicted` package loaded, the preferred implementation has to be declared
# explicitly; here the dplyr versions of filter(), select() and mutate() win.
conflicts_prefer(
dplyr::filter,
# dplyr::lag,
dplyr::select,
dplyr::mutate
)
# Set the working directory.
# NOTE(review): every path in the visible part of this script is absolute, so
# this call appears to have no effect on it; the path also contains a doubled
# slash ("Users//camil") - confirm whether the call is still needed.
setwd("C:/Users//camil/Desktop/Aa_MA-Semester1/Forschungsseminar/Swisshouseholdpanel")
Forschungsseminar Politischer Datenjournalismus: Data-Cleaning
Da die Datenbereinigung lange Ladezeiten am Arbeitsgerät braucht, wurde sie in einem separaten Skript durchgeführt. Alle verwendeten Daten wurden von der SHP Group (2024) bezogen.
Libraries & set up
Reading datasets für 2021
# Wave 23 (2021): load the household-level file ...
shp_house_2021 <- haven::read_sav(
  file = "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/Swisshouseholdpanel/swissubase_932_14_0/data/Data_SPSS/Data_SPSS/SHP-Data-W1-W24-SPSS/W23_2021/SHP21_H_USER.sav"
)
# ... and the person-level file.
shp_pers_2021 <- haven::read_sav(
  file = "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/Swisshouseholdpanel/swissubase_932_14_0/data/Data_SPSS/Data_SPSS/SHP-Data-W1-W24-SPSS/W23_2021/SHP21_P_USER.sav"
)
# Combine person and household data via the household id. A right join keeps
# every household row, including households without a matching person row.
shp_2021 <- shp_pers_2021 |>
  right_join(shp_house_2021, by = "IDHOUS21")
Auswahl und Umbenennung der Variablen
# Keep only the variables that are needed and give them readable names.
# Each entry reads `new_name = "SHP name"`; all_of() with a named vector both
# selects and renames, and fails loudly if a column is missing.
shp_21_breve <- dplyr::select(
  shp_2021,
  dplyr::all_of(c(
    # identifiers and socio-demographics
    id_person = "IDPERS",
    id_house = "IDHOUS21",
    id_partner = "IDSPOU21",
    age = "AGE21",
    sex = "SEX21",
    civstatus = "CIVSTA21",
    child = "P21D80",
    Educ_17 = "EDCAT21",
    Educ_19 = "EDGR21",
    Educ_11_plusgrid = "EDUCAT21",
    Educ_11_grid = "EDUGR21",
    Educ_11 = "EDU_1_21",
    # NOTE(review): the SHP name NBADUL21 reads like a count of adults rather
    # than of children - verify against the codebook before using kids_num.
    kids_num = "NBADUL21",
    canton = "CANTON21",
    house_type = "HLDFFS21",
    cohab = "COHAST21",
    # work, care and household variables
    tz = "P21W39",
    tz_p = "P21W42",
    tz_reason = "P21W43",
    # opin_working_mom = "P21D92",
    care_work = "P21F63",
    opin_work = "P21F64",
    satis_together_house = "P21F03",
    satis_housework_share = "P21F04",
    household_time = "P21F08",
    # female tasks
    # opin_working_mom = "P21D92",
    volunt = "P21N35",
    volunt_time = "P21N38",
    # variables for the regression models
    lr_pos = "P21P10", # 0 = left, 10 = right
    party_vote = "P21P19",
    # wo_penalised = "P21P20", # only asked in 2020
    # religion
    confession = "P21R01",
    rel_freq = "P21R02",
    rel_party = "P21R03",
    rel_frequ = "P21R04",
    christ_att = "P21R23",
    islam_att = "P21R24",
    hindu_att = "P21R25",
    budda_att = "P21R26",
    jew_att = "P21R27",
    atheism_att = "P21R28",
    rel_shift = "P21R34",
    # parents
    lr_dad = "P21P46",
    lr_mom = "P21P47",
    # alive_dad = "P21N82",
    # parctical_dad = "P21N88",
    # emo_dad = "P21N89",
    # alive_mom = "P21N73",
    # practical_mom = "P21N79",
    # emotional_mom = "P21N80",
    # weights
    weights = "WI21LS20",
    weights_2 = "WI21CSS"
  ))
)
Recoding der Variablen für Lesbarkeit.
# Recode the 2021 extract for readability and derive helper variables, then
# write the result to CSV.
# Codes -1 to -8 are mostly "don't know", "inapplicable" etc.
#
# Fix: the earlier conditions `civstatus == 2 & 6` and `civstatus == 3 & 4 & 7`
# only ever matched the first code, because the bare numbers 6, 4 and 7 are
# coerced to TRUE by `&`. Codes 6, 4 and 7 therefore ended up as NA. Every code
# is now compared explicitly.
shp_21_breve_rc <- shp_21_breve |>
  mutate(
    sex = case_when(
      sex == 1 ~ "male",
      sex == 2 ~ "female",
      sex == 3 ~ "other",
      TRUE ~ NA_character_
    ),
    # The label spellings ("seperated", "widowER") are kept unchanged so that
    # code relying on the existing labels keeps working.
    civstatus = case_when(
      civstatus == 1 ~ "single",
      # registered partnership is treated the same as married
      civstatus == 2 | civstatus == 6 ~ "married",
      # separated, divorced and dissolved partnership are treated the same
      civstatus == 3 | civstatus == 4 | civstatus == 7 ~ "seperated",
      civstatus == 5 ~ "widowER",
      TRUE ~ NA_character_
    ),
    lr_mom = case_when(lr_mom < 0 ~ NA_real_, TRUE ~ lr_mom),
    lr_dad = case_when(lr_dad < 0 ~ NA_real_, TRUE ~ lr_dad),
    # NOTE(review): only values above 80 are set to NA; negative codes stay in
    # household_time - confirm that this is intended.
    household_time = case_when(
      household_time > 80 ~ NA_real_,
      TRUE ~ household_time
    ),
    confession = case_when(confession < 0 ~ NA_real_, TRUE ~ confession),
    canton = case_when(canton < 0 ~ NA_real_, TRUE ~ canton),
    cohab = case_when(
      cohab == 1 ~ "married",
      cohab == 2 ~ "not_married",
      TRUE ~ NA_character_
    )
  ) |>
  # Convert to factors so that the value labels (e.g. the confession names)
  # are shown instead of bare numbers.
  mutate(
    confession = haven::as_factor(confession),
    canton = haven::as_factor(canton),
    house_type = haven::as_factor(house_type)
  ) |>
  mutate(
    # tz_rate: 100 for full-time workers (tz == 2); for part-time workers
    # (tz == 1) the exact part-time percentage tz_p.
    tz_rate = case_when(
      tz == 2 ~ 100,
      tz == 1 ~ tz_p,
      TRUE ~ NA_real_
    ),
    # cant_nr: canton number 1-26 (same numbering as the BFS, used for the
    # maps). The position in this vector is the number; labels that do not
    # occur in the vector give NA.
    cant_nr = as.numeric(match(canton, c(
      "ZH Zurich", "BE Berne", "LU Lucerne", "UR Uri", "SZ Schwyz",
      "OW Obwalden", "NW Nidwalden", "GL Glarus", "ZG Zug", "FR Fribourg",
      "SO Solothurn", "BS Basle-Town", "BL Basle-Country", "SH Schaffhausen",
      "AR Appenzell Outer-Rhodes", "AI Appenzell Inner-Rhodes", "SG St. Gall",
      "GR Grisons", "AG Argovia", "TG Thurgovia", "TI Ticino", "VD Vaud",
      "VS Valais", "NE Neuchatel", "GE Geneva", "JU Jura"
    ))),
    # sex_bin: binary version of sex; "other" becomes NA.
    sex_bin = case_when(
      sex == "female" ~ "female",
      sex == "male" ~ "male",
      sex == "other" ~ NA_character_
    ),
    # kmk: "children" if kids_num > 0, otherwise the cohabitation status.
    # The variable is ambiguous: kids_num is NA for persons without children,
    # so real NAs cannot be told apart from childless persons.
    kmk = case_when(
      kids_num > 0 ~ "children",
      kids_num == 0 & cohab == "not_married" ~ "together",
      kids_num == 0 & cohab == "married" ~ "married",
      TRUE ~ NA_character_
    )
  )
# unique(shp_21_breve_rc$house_type)
write.csv(shp_21_breve_rc, file = "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_21_recoded.csv", row.names = FALSE)
Couple-Data
Für die Within Couple Analyse muss man zuerst *_person* und *_partner* ergänzen
# Build the couple data set for 2021: each respondent row is matched with the
# row of the person they name as partner, provided that person names them back
# (the key columns are swapped between x and y). Columns of the respondent get
# the suffix "_person", those of the partner "_partner".
# merge() is used on purpose; an inner join caused problems here.
shp_21_partners <- merge(
  x = shp_21_breve_rc,
  y = shp_21_breve_rc,
  by.x = c("id_person", "id_partner"),
  by.y = c("id_partner", "id_person"),
  suffixes = c("_person", "_partner"),
  all = FALSE
)
shp_21_partners <- mutate(
  shp_21_partners,
  # within-couple differences, always respondent minus partner
  hw_widif_person = household_time_person - household_time_partner,
  tz_widif_person = tz_rate_person - tz_rate_partner,
  care_widif_person = care_work_person - care_work_partner,
  hwsat_widif_person = satis_housework_share_person - satis_housework_share_partner,
  # same-sex ("gleich") vs. different-sex ("hetero") couple; NA if either
  # sex_bin value is missing
  beziehung = if_else(sex_bin_partner == sex_bin_person, "gleich", "hetero"),
  # do both partners report the same household id?
  live_together = if_else(id_house_person == id_house_partner, "same", "dif")
)
unique(shp_21_partners$kmk_person)
# NOTE(review): unlike the CSV export of shp_21_breve_rc, this one does not set
# row.names = FALSE, so the file starts with a row-name column.
write.csv(shp_21_partners, file = "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_21_partners.csv")
haven::write_sav(shp_21_partners, "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_21_partners.sav")
Zeitraum längere Ladedauer
Loop, der alle Wellen durchgeht und jene Variablen übernimmt (falls vorhanden), die für die Berechnungen benötigt werden. (Hinweis für die Replikation: Hat auf meinem Rechner relativ lange gedauert.)
# Build one harmonised extract per SHP wave (1999-2022) and store the extracts
# in shp_data_list_2 (element i = wave i), then save the list as RDS.
#
# For every wave the person and household files are read and joined on the
# wave-specific household id. Each required variable is copied under its
# harmonised name; a variable that does not exist in a wave becomes an all-NA
# character column, so every extract has the same columns in the same order.
#
# Changes compared with the earlier version of this loop:
# * the placeholder mutate() tested literal strings such as "id_house" and so
#   always produced NA columns; together with the base-R renaming this left
#   duplicate column names, and select() only worked because it picks the
#   first match. The mapping below states the intent directly.
# * the wave counter and the two-digit year are derived from the year vector
#   instead of being maintained by hand.
shp_years <- 1999:2022
shp_data_list_2 <- vector("list", length(shp_years))
# Not used in the visible part of the script; kept in case later code expects it.
shp_time_2 <- data.frame()

shp_wave_root <- "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/Swisshouseholdpanel/swissubase_932_14_0/data/Data_SPSS/Data_SPSS/SHP-Data-W1-W24-SPSS"

# harmonised name = SHP name, where "$$" stands for the two-digit year
shp_name_patterns <- c(
  id_person = "IDPERS",
  id_house = "IDHOUS$$",
  id_partner = "IDSPOU$$",
  age = "AGE$$",
  sex = "SEX$$",
  civstatus = "CIVSTA$$",
  canton = "CANTON$$",
  kids_num = "NBADUL$$",
  cohab = "COHAST$$",
  tz = "P$$W39",
  tz_p = "P$$W42",
  opin_working_mom = "P$$D92",
  care_work = "P$$F63",
  satis_housework_share = "P$$F04",
  household_time = "P$$F08",
  volunt = "P$$N35",
  lr_pos = "P$$P10",
  party_vote = "P$$P19",
  wo_penalised = "P$$P20",
  volunt_time = "P$$N38",
  confession = "P$$R01",
  rel_freq = "P$$R02",
  rel_party = "P$$R03",
  rel_frequ = "P$$R04"
)

for (i in seq_along(shp_years)) {
  year <- shp_years[i]
  yy <- sprintf("%02d", year %% 100) # "99", "00", ..., "22"

  # Wave folders are named W<wave>_<year>, the files SHP<yy>_P_USER.sav and
  # SHP<yy>_H_USER.sav.
  wave_dir <- paste0(shp_wave_root, "/W", i, "_", year)
  shp_person <- haven::read_sav(paste0(wave_dir, "/SHP", yy, "_P_USER.sav"))
  shp_house <- haven::read_sav(paste0(wave_dir, "/SHP", yy, "_H_USER.sav"))

  # Join person and household data on the wave-specific household id.
  shp_joined <- left_join(shp_person, shp_house, by = paste0("IDHOUS", yy))

  # Wave-specific SHP names, still named by their harmonised names.
  source_names <- sub("$$", yy, shp_name_patterns, fixed = TRUE)
  names(source_names) <- names(shp_name_patterns)

  # Select and rename the variables that exist in this wave ...
  available <- source_names[source_names %in% names(shp_joined)]
  shp_data_2 <- dplyr::select(shp_joined, dplyr::all_of(available))

  # ... and add an all-NA placeholder for those that do not.
  for (missing_name in setdiff(names(source_names), names(available))) {
    shp_data_2[[missing_name]] <- NA_character_
  }

  shp_data_2$year <- year
  shp_data_2$wave <- i
  shp_data_2 <- shp_data_2[, c("year", "wave", names(source_names))]

  shp_data_list_2[[i]] <- shp_data_2
}
saveRDS(shp_data_list_2, file = "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/shp_time_some-Variables_2RDS.rds")
# Stack the per-wave extracts into one long data frame and save it as CSV and
# RDS.
#
# Each extract is turned into a plain data frame and passed through
# haven::as_factor() before binding. All extracts are bound in a single
# bind_rows() call instead of growing the result inside a loop; the former
# `i <- 24` and `i = (i - 1)` had no effect inside the for loop and are gone.
shp_long_CB <- shp_data_list_2 |>
  lapply(\(wave_df) haven::as_factor(as.data.frame(wave_df))) |>
  dplyr::bind_rows()
write.csv(shp_long_CB, file = "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_time-long_CB.csv", row.names = FALSE)
saveRDS(shp_long_CB, "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_time-long_CB_rds.rds" )
Recoding der Zeitanalyse:
# Recode the stacked 1999-2022 extract. The data are read back from the CSV
# export, so sex, civil status and canton are compared as label text here.
shp_long_CB <- read.csv("C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_time-long_CB.csv")
shp_long_CB_rc <- mutate(
  shp_long_CB,
  sex = case_when(
    sex == "man" ~ "male",
    sex == "woman" ~ "female",
    sex == "other" ~ "other",
    TRUE ~ NA_character_
  ),
  civstatus = case_when(
    civstatus == "single, never married" ~ "single",
    civstatus %in% c("married", "registered partnership") ~ "married",
    civstatus %in% c("dissolved partnership", "divorced", "separated") ~ "separated",
    civstatus == "widower/widow" ~ "widower",
    TRUE ~ NA_character_
  ),
  # values above 80 are treated as missing
  household_time = case_when(
    household_time > 80 ~ NA_real_,
    TRUE ~ household_time
  ),
  # tz_rate: 100 if tz == 2, tz_p if tz == 1, otherwise NA.
  # NOTE(review): sex and civstatus are matched on label text in this block,
  # whereas tz is matched on the numeric codes 1/2 - verify that tz still holds
  # numeric codes after the CSV round trip, otherwise tz_rate is always NA.
  tz_rate = case_when(
    tz == 2 ~ 100,
    tz == 1 ~ tz_p,
    TRUE ~ NA_real_
  ),
  # cant_nr: canton number 1-26, i.e. the position of the canton label in this
  # vector; labels that do not occur in the vector give NA.
  cant_nr = as.numeric(match(canton, c(
    "ZH Zurich", "BE Berne", "LU Lucerne", "UR Uri", "SZ Schwyz",
    "OW Obwalden", "NW Nidwalden", "GL Glarus", "ZG Zug", "FR Fribourg",
    "SO Solothurn", "BS Basle-Town", "BL Basle-Country", "SH Schaffhausen",
    "AR Appenzell Outer-Rhodes", "AI Appenzell Inner-Rhodes", "SG St. Gall",
    "GR Grisons", "AG Argovia", "TG Thurgovia", "TI Ticino", "VD Vaud",
    "VS Valais", "NE Neuchatel", "GE Geneva", "JU Jura"
  ))),
  # sex_bin: binary version of sex; "other" becomes NA
  sex_bin = case_when(
    sex == "female" ~ "female",
    sex == "male" ~ "male",
    sex == "other" ~ NA_character_
  ),
  # kmk: cohabitation status for rows with kids_num == 0, "children" for rows
  # with kids_num > 0
  kmk = case_when(
    cohab == "not married" & kids_num == 0 ~ "together",
    cohab == "married" & kids_num == 0 ~ "married",
    kids_num > 0 ~ "children",
    TRUE ~ NA_character_
  )
)
write.csv(shp_long_CB_rc, "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_time-long_CB_recoded.csv", row.names = FALSE)
Couple-Data-Zeit
# Build the couple data set over time: each person-year row is matched with
# the partner's row of the SAME year, provided the partner names the person
# back (the id columns are swapped between x and y). Columns of the respondent
# get the suffix "_person", those of the partner "_partner".
#
# Fix: `year` is now part of the merge key. Without it every person-year row
# was paired with the partner's rows from all years, so the within-couple
# differences below mixed values from different years.
shp_long_CB_rc <- read.csv("C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_time-long_CB_recoded.csv")
shp_long_partners <- merge(
  shp_long_CB_rc,
  shp_long_CB_rc,
  by.x = c("id_person", "id_partner", "year"),
  by.y = c("id_partner", "id_person", "year"),
  suffixes = c("_person", "_partner"),
  all = FALSE
)
#openxlsx::write.xlsx(shp_21_partners, "partners_shp.xlsx")
shp_long_partners <- shp_long_partners |>
  mutate(
    # `year` is a key now and therefore no longer suffixed; the two suffixed
    # columns are kept so that code written against the old layout still works.
    year_person = year,
    year_partner = year,
    # within-couple differences, always respondent minus partner
    hw_widif_person = household_time_person - household_time_partner,
    tz_widif_person = tz_rate_person - tz_rate_partner,
    care_widif_person = care_work_person - care_work_partner,
    #hwsat_widif_person = satis_housework_share_person - satis_housework_share_partner,
    # same-sex ("gleich") vs. different-sex ("hetero") couple
    beziehung = case_when(
      sex_bin_partner == sex_bin_person ~ "gleich",
      sex_bin_partner != sex_bin_person ~ "hetero"
    ),
    # do both partners report the same household id?
    live_together = case_when(
      id_house_person == id_house_partner ~ "same",
      id_house_person != id_house_partner ~ "dif"
    )
  )
write.csv(shp_long_partners, "C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/finale-Arbeit/Data/shp_time-long_CB_rc-pair.csv", row.names = FALSE)
Gewichtung für die Zeitanalyse
Die Gewichtungsvariablen konnten nicht übernommen werden. Es gibt keine einheitliche Gewichtungsvariable über die Zeit. Die IDINT entspricht nicht den IDs der einzelnen Wellen. Die Gewichtungen sind nicht für die Wellen 1 bis 24 einheitlich. Deshalb muss auf eine Gewichtung verzichtet werden. Der Gewichtungsreport der SHP Group (2024) legt nahe, dass einheitliche Gewichtungen erst ab Welle 16 vorhanden sind.
# Attempt to attach weights from the SHP long files to the stacked extract.
#
# Fixes:
# * shp_person_long was read from the household file (SHPLONG_H_USER.sav) a
#   second time. It now reads SHPLONG_P_USER.sav, following the _H_/_P_ naming
#   of the wave files. NOTE(review): confirm that this file name exists in the
#   long-file folder.
# * merge() has no `copy` or `suffix` argument; both were dropped. The two
#   inputs share only the key columns, so no suffixes are needed.
shp_house_long <- haven::read_sav("C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/Swisshouseholdpanel/swissubase_932_14_0/data/Data_SPSS/Data_SPSS/SHP-Data-Longfile-SPSS/SHPLONG_H_USER.sav")
shp_person_long <- haven::read_sav("C:/Users/camil/Desktop/Aa_MA-Semester1/Forschungsseminar/Swisshouseholdpanel/swissubase_932_14_0/data/Data_SPSS/Data_SPSS/SHP-Data-Longfile-SPSS/SHPLONG_P_USER.sav")
# Join household and individual data on household id and year.
# suffix = c("", "") leaves column names untouched even if both files share a
# column, so HWEIGHT and IDINT can still be addressed by their plain names.
shp_long <- left_join(shp_house_long, shp_person_long, by = c("IDHOUS", "YEAR"), copy = FALSE, suffix = c("", ""))
summary(shp_long$HWEIGHT) # weighting variable
# NOTE(review): IDINT and id_person do not correspond to each other (remark
# from the original author). Presumably the person identifier of the long file
# has to be used instead of IDINT - check the codebook.
shp_weights <- shp_long |>
  dplyr::select(
    id_person = IDINT,
    year = YEAR,
    weights_long = HWEIGHT
  )
shp_time_w <- merge(shp_long_CB, shp_weights, by = c("id_person", "year"))
# Original remark: not possible, too few observations?
variable.names(shp_person_long)