Libraries

list.of.packages <- c("writexl", "tidyverse", "readxl", "dplyr", "swissparl", "quanteda", "stm", "data.table", "geometry", "tm", "Rtsne", "rsvd", "wordcloud", "tidytext", "ggplot2", "ggraph", "hrbrthemes")
lapply(list.of.packages, require, character.only = TRUE)

getwd()
set.seed(123)

Data

# Business Text #291
BR.TextUkraine <- get_data(table = "Business", SubmittedText = "~Ukraine", Language = "DE", SubmissionDate = c(">2020-01-01", "<2023-03-21"))

# Reason Text #168
BR.ReasonUkraine <- get_data(table = "Business", ReasonText = "~Ukraine", Language = "DE", SubmissionDate = c(">2020-01-01", "<2023-03-21"))

# Federal Council Response Text #256
BR.FCRUkraine <- get_data(table = "Business", FederalCouncilResponseText = "~Ukraine", Language = "DE", SubmissionDate = c(">2020-01-01", "<2023-03-21"))

# Combine the datasets & remove duplicates #499
DataRaw <- merge(merge(BR.TextUkraine, BR.ReasonUkraine, all = TRUE), BR.FCRUkraine, all = TRUE)

# Backup Data
# write.csv(DataRaw, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/Data_Raw_Backup_neu.csv")

# Fraction membership data #243
FractionMembership <- swissparl::get_data(table = "MemberParlGroup", Language = "DE")

Grafik Geschäfte raw

## Auswahl der nötigen Variablen
GraphRaw <- DataRaw %>%
  select(BusinessTypeName, SubmittedBy, SubmissionDate)

##Auf Monat runden
GraphRaw$SubmissionDate <- substr(GraphRaw$SubmissionDate, 1,7)

## Absolute Freq pro Monat
Raw.freq <- count(GraphRaw, SubmissionDate)

# write_xlsx(Raw.freq, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/ukraine_graph_beste2.xlsx")

Preparation of Data for analysis with stm

### Prepare Fraction membership data
FractionMembership <- FractionMembership %>% 
  select(FirstName, LastName, ParlGroupName)

FractionMembership <- data.table(FractionMembership, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)

FractionMembership[,SubmittedBy := paste(LastName, FirstName)][,c("LastName", "FirstName") := NULL]

### Prepare Business Data
# Creation of Data Table
DataRawTable <- data.table(DataRaw, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)

# Nur interessante Variablen behalten
DataStm <- DataRawTable[,.(ID, BusinessShortNumber, BusinessTypeName, Title, SubmittedText, ReasonText, SubmittedBy, SubmissionDate)]

# Mit Fraction membership mergen
DataStm <- merge(DataStm, FractionMembership, by = "SubmittedBy", all.x = TRUE)

# Add Fraction manual for NAs
### Kommissionen
DataStm$ParlGroupName[DataStm$ID == "20220023"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$ID == "20220024"] <- "Ständerat"
DataStm$SubmittedBy[DataStm$ID == "20220023"] <- "Nationalrat"
DataStm$SubmittedBy[DataStm$ID == "20220024"] <- "Ständerat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Aussenpolitische Kommission-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Aussenpolitische Kommission-Ständerat"] <- "Ständerat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Kommission für Rechtsfragen-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Kommission für Wirtschaft und Abgaben-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Sicherheitspolitische Kommission-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Sicherheitspolitische Kommission-Ständerat"] <- "Ständerat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Genf"] <- "Genf" ##Standesinitiative
### Fraktionen
DataStm$ParlGroupName[DataStm$SubmittedBy == "Die Mitte-Fraktion. Die Mitte. EVP."] <- "Die Mitte-Fraktion. Die Mitte. EVP."
DataStm$ParlGroupName[DataStm$SubmittedBy == "FDP-Liberale Fraktion"] <- "FDP-Liberale Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Fraktion der Schweizerischen Volkspartei"] <- "Fraktion der Schweizerischen Volkspartei"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Grüne Fraktion"] <- "Grüne Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Grünliberale Fraktion"] <- "Grünliberale Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Sozialdemokratische Fraktion"] <- "Sozialdemokratische Fraktion"
## Personen
DataStm$ParlGroupName[DataStm$SubmittedBy == "Humbel Ruth"] <- "Die Mitte-Fraktion. Die Mitte. EVP."
DataStm$ParlGroupName[DataStm$SubmittedBy == "Carobbio Guscetti Marina"] <- "Sozialdemokratische Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Rechsteiner Paul"] <- "Sozialdemokratische Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Streiff-Feller Marianne"] <- "Die Mitte-Fraktion. Die Mitte. EVP."

# Titel und Text(e) zusammenlegen
DataStm[,txt := paste(Title, SubmittedText, ReasonText)][,c("Title", "SubmittedText", "ReasonText") := NULL]

# Leerzeichen aufräumen
DataStm[,txt := gsub("[[:space:]]+", " ", txt)] #löscht html text und so 

##### Preprocessing von Texten #####

# Preprocessing
corpus_stm <- textProcessor(DataStm$txt, metadata = as.data.frame(DataStm), #Metadata = covariates
                            lowercase = T, removestopwords = T, removenumbers = T,
                            removepunctuation = T, stem = T, wordLengths = c(3, Inf),
                            sparselevel = 1, language = 'de', verbose = T,
                            onlycharacter = T, striphtml = F,
                            customstopwords = c("dass", "text"))

# Modelle brauchen drei Inputs: Dokumente, Vokabular, Metadaten
corpus_stm <- prepDocuments(documents = corpus_stm$documents,
                            vocab = corpus_stm$vocab, 
                            meta = corpus_stm$meta,
                            lower.thresh = 2, upper.thresh = Inf,
                            verbose = T)
# Test removed words
plotRemoved(documents = corpus_stm$document, lower.thresh = seq(1, 200, by = 20))

##############################################
##### STM #####

## Diagnostik --> 15 Topics scheinen gut zu sein
stm_search <- searchK(corpus_stm$documents, corpus_stm$vocab,
                       K = seq(5, 55, by = 10), max.em.its = 100) ##
plot(stm_search)

## 15 Topics erzeugen
stm_15 <- stm(corpus_stm$documents, corpus_stm$vocab, K = 15)

## Backup Model
# saveRDS(stm_15, "stm_15_DATUM.RDS")
# stm_15_BUP <- readRDS("stm_15_230512.RDS")
# stm_15 <- stm_15_BUP

## Häufigkeit & Übersicht der Topics
plot.STM(stm_15, n = 7, topics = 1:15, type = "summary")
labelTopics(stm_15, n = 7)

# Themen vergleichen
plot.STM(stm_15, topics=c(2,15), type = "perspectives")

## word clouds
cloud(stm_15, topic = 2)
cloud(stm_15, topic = 5)
cloud(stm_15, topic = 15)

## Select Model evaluation
swissparl_stm_select <- selectModel(corpus_stm$documents,
                                    corpus_stm$vocab,
                                    K = 15, runs = 20)

plotModels(swissparl_stm_select, legend.position = "bottomleft")

selected <- swissparl_stm_select$runout[[4]]

plot(selected)

plot(unlist(stm_search$results$K),
     unlist(stm_search$results$exclus),
     xlab = "K", ylab = "Exclusivity")


## Testplots
plot.STM(stm_15, n = 7, topics = c(2,5,7), type = "labels")
plot.STM(stm_15, n = 7, topics = c(1,2), type = "perspectives")

##### Prevalence #####

corpus_stm$meta$ParlGroupName <- as.factor(corpus_stm$meta$ParlGroupName)

stm_15_covar <- stm(corpus_stm$documents,
                    corpus_stm$vocab,
                    K = 15,
                    data=corpus_stm$meta,
                    prevalence =~ ParlGroupName)

########## Zusammenhänge ##########
stm_15_covar_est <- estimateEffect(1:15 ~ ParlGroupName, stm_15_covar,
                                   metadata = corpus_stm$meta)

plot.estimateEffect(stm_15_covar_est, topics = 15,
                    covariate = "ParlGroupName", method = "pointestimate",
                    verbose.labels = T)

plot.estimateEffect(stm_15_covar_est, topics = c(9,11),
                    covariate = "ParlGroupName", method = "pointestimate",
                    verbose.labels = T)

names(stm_15$metadata)
attributes(stm_15)

table <- table(stm_15_covar_est$metadata$ParlGroupName)

Visualisation

# Visualisation of 10 most used words per topic

betas <- tidy(stm_15)

betaplot1 <- betas %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() + scale_x_reordered() +
  labs(x = NULL, y = expression(beta), title = "Wörter mit der höchsten Häufigkeit pro Topic") +
  theme_bw()

print(betaplot1)


########## Visualisation of percentages ##########

topicprop <- make.dt(stm_15)

head(topicprop)[,1:6]

## Add "docnum" to DataStm - preparation for merge
DataStm_Final <- DataStm %>% 
  mutate(docnum = row_number()) %>%
  filter(docnum <= 499)

## Merge

STM_merged_data <- merge(DataStm_Final, topicprop, by = "docnum")

table(STM_merged_data$ParlGroupName) 
## Remove NR/SR & Geneva
STM_merged_data2 <- subset(STM_merged_data, !(ParlGroupName == "Nationalrat" | ParlGroupName == "Ständerat" | ParlGroupName == "Genf"))


recoded_data <- STM_merged_data2[, c("Topic1", "Topic2", "Topic3", "Topic4", "Topic5", "Topic6", "Topic7", "Topic8", "Topic9", "Topic10", "Topic11", "Topic12", "Topic13", "Topic14", "Topic15")]

### Benchmark 0.1 gibt gute Ergebnisse
recoded_data[recoded_data >= 0.1] <- 1
recoded_data[recoded_data < 0.1] <- 0

## merge
docnum <- seq(from = 1, to = 479)

# Add the docnum column to your dataset
recoded_data <- cbind(docnum, recoded_data)

STM_merged_data2$docnum <- NULL
STM_merged_data2 <- cbind(docnum, STM_merged_data2)

STM_merged_data3 <- STM_merged_data2 %>% 
  select(docnum, SubmittedBy, SubmissionDate, ParlGroupName, txt, trend, BusinessShortNumber, ID) ## ID ergänzt

indices <- merge(STM_merged_data3, recoded_data, by="docnum")

########## Frequencies ##########

sum_topics <- aggregate(indices[,9:23], by=list(indices$ParlGroupName), FUN=sum)
colnames(sum_topics)[1] <- "ParlGroupName"
## write sum topics for count
# write_xlsx(sum_topics, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/sum_topics_count.xlsx")

### Standardization nach Fraktionsgrösse - Macht das überhaupt sinn? --> Ja! Aber achtung bei Interpretation 
## Fraktionsgrössen
Fraktionsgrossen <- as.data.frame(table(FractionMembership$ParlGroupName))
colnames(Fraktionsgrossen) <- c("ParlGroupName", "Fraktionsgrosse")

SumStand <- merge(sum_topics, Fraktionsgrossen, by="ParlGroupName")
SumStand[,2:16] <- SumStand[,2:16] / SumStand$Fraktionsgrosse

## Save for Smartspiders
# write_xlsx(SumStand, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/Frequencies2.xlsx")

Overview Vorstösse & Network Analysis

## Overview alle Vorstösse zum Thema "Ukraine"

# Plot by type ##
indices %>% 
  group_by(ParlGroupName) %>% 
  count() %>% 
  ungroup() %>% 
  ggplot(aes(reorder(ParlGroupName, n), n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Vorstösse im Zusammenhang mit der Ukraine",
    subtitle = "Vorstösse nach Fraktionen",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
    ) +
  theme_ipsum_rc(grid="X") +
  theme(axis.title = element_blank())

SanktionenSubset <- subset(indices, Topic2 == 1)

## First overview Sanktionen

SanktionenSubset %>% 
  group_by(SubmittedBy) %>% 
  count() %>% 
  ungroup() %>% 
  top_n(10, n) %>% 
  mutate(SubmittedBy = stringr::str_remove_all(SubmittedBy, "\\s.*$")) %>% 
  ggplot(aes(reorder(SubmittedBy, n), n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Who with Whom in the Council of States?",
    subtitle = "Vorstösse zum Topic 'Sanktionen' eingereicht",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
    ) +
  theme_ipsum_rc(grid="X") +
  theme(axis.title = element_blank())


########## Network Analysis ##########
######################################

## All Business roles
biz.roles <- swissparl::get_data(
  table = "BusinessRole", 
  BusinessNumber = SanktionenSubset$ID,
  Language = "DE"
  )

# Plot by role
biz.roles %>% 
  group_by(RoleName) %>% 
  count() %>% 
  ungroup() %>% 
  ggplot(aes(reorder(RoleName, n), n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Roles",
    subtitle = "Rollen in den Vorstössen im Zusammenhang mit Sanktionen",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
    ) +
  theme_ipsum_rc(grid="X") +
  theme(axis.title = element_blank())

## get names
council.members <- swissparl::get_data(
  table = "MemberCouncil", 
  ID = biz.roles$MemberCouncilNumber,
  Language = "DE"
  )

## join data
biz.roles <- left_join(
  biz.roles,
  council.members,
  by = c("MemberCouncilNumber" = "ID")
)  

### PLOT
# Plot by role
biz.roles %>%
  filter(Role %in% c(3, 7)) %>% 
  mutate(name_canton = paste0(LastName, " (", CantonAbbreviation, ")")) %>% 
  group_by(name_canton, RoleName) %>%
  count() %>% 
  ungroup() %>% 
  mutate(RoleName = stringr::str_remove_all(RoleName, "\\(.*?\\)")) %>% 
  pivot_wider(names_from = RoleName, values_from = n) %>% 
  ggplot(aes(Mitunterzeichner, Urheber, label = name_canton)) +
  geom_smooth(method = lm, color = "black", fill = "grey80") +
  geom_point() +
  ggrepel::geom_text_repel() +
  labs(
    title = "Who with Whom in the Council of States?",
    subtitle = "Roles in Political Businesses of the 50th Legislative Period",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
    ) +
  scale_y_continuous(limits = c(0, 40)) +
  theme_ipsum_rc()


## Finally, start of the network analysis

######################################################
## Co-signing-behaviour ALL Vorstösse

## All Business roles
biz.roles.all <- swissparl::get_data(
  table = "BusinessRole", 
  BusinessNumber = indices$ID,
  Language = "DE"
  )

## get names
council.members.all <- swissparl::get_data(
  table = "MemberCouncil", 
  ID = biz.roles.all$MemberCouncilNumber,
  Language = "DE"
  )

## join data
biz.roles.all <- left_join(
  biz.roles.all,
  council.members.all,
  by = c("MemberCouncilNumber" = "ID")
)  

## Ständerat entfernen
biz.roles.all <- biz.roles.all %>%
  filter(Council != 2)

# Business authors
authors.all <- biz.roles.all %>%
  filter(Role == 7) %>% 
  filter(!is.na(MemberCouncilNumber)) %>% 
  select(BusinessNumber, MemberCouncilNumber) %>% 
  distinct(BusinessNumber, MemberCouncilNumber, .keep_all = T)

# Business cosigners
cosigners.all <- biz.roles.all %>%
  filter(Role == 3) %>% 
  filter(!is.na(MemberCouncilNumber)) %>% 
  select(BusinessNumber, MemberCouncilNumber) %>% 
  distinct(BusinessNumber, MemberCouncilNumber, .keep_all = T)

# Author-cosigner-pair
acp.all <- left_join(
  authors.all, 
  cosigners.all, 
  by = "BusinessNumber", 
  suffix = c(".author", ".cosigner")
  )

# define edges
edges.all <- acp.all %>% 
  rename(
    from = "MemberCouncilNumber.cosigner",
    to = "MemberCouncilNumber.author"
    ) %>% 
  group_by(from, to) %>% 
  count() %>% 
  ungroup() %>% 
  filter(!is.na(from))

# author co-signatory cooperations
edges.all %>%
  complete(from, to, fill = list(n = 0)) %>% 
  ggplot(aes(n)) +
  geom_bar() +
  labs(
    x = "Num. of\ncooperations",
    y = "Freq.",
    title = "Who with Whom in the Council of States?",
    subtitle = "Frequency of Author-Co-Signatory Cooperations",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
    ) +
  theme_ipsum_rc() +
  theme(panel.grid.minor = element_blank())

# creation of nodes
nodes.all <- left_join(
  tibble::tibble(ID = unique(c(edges.all$from, edges.all$to))), 
  council.members.all %>% 
    mutate(ID = ID) %>% 
    select(ID, LastName, GenderAsString, CantonAbbreviation, ParlGroupAbbreviation),
  by = "ID"
  )

#remove duplicate nodes
nodes.all <- nodes.all %>% distinct()


## create tbl
cs50.all <- tidygraph::tbl_graph(
  nodes = nodes.all %>% mutate(ID = as.character(ID)), 
  edges = edges.all %>% mutate_at(vars(from, to), as.character), 
  directed = F
  )


## remove distorting names

nodes2.all <- nodes.all %>% 
  filter(!ID %in% c(4325, 4237, 3871, 4181, 1267, 1120, 4239, 525, 3830)) %>% 
  mutate(ID = as.character(ID))

edges2.all <- edges.all %>% 
  filter(!from %in% c(4325, 4237, 3871, 4181, 1267, 1120, 4239, 525, 3830)) %>% 
  filter(!to %in% c(4325, 4237, 3871, 4181, 1267, 1120, 4239, 525, 3830)) %>% 
  mutate_at(vars(from, to), as.character)

cs50.all <- tidygraph::tbl_graph(nodes = nodes2.all, edges = edges2.all, directed = F)


# For reproducibility reasons
set.seed(15)

# Network plot
network_colour.all <- cs50.all %>%
  mutate(fraction = factor(ParlGroupAbbreviation, levels = c("G", "RL", "GL", "M-E", "S", "V", "-"))) %>% 
  mutate(importance = tidygraph::centrality_eigen(weights = n)) %>%
  mutate(label = paste0(LastName, " (", CantonAbbreviation, ")")) %>% 
  ggraph() + 
  geom_edge_fan(aes(alpha = n, width = sqrt(n)), edge_colour = "gray60") + 
  scale_edge_width(range = c(0.001, 1)) +
  scale_edge_alpha(range = c(0.5, 2)) +
  geom_node_point(aes(size = importance^2, color = fraction)) +
  geom_node_text(aes(label = label), size = 1, nudge_y = -0.08) +
  scale_color_manual(values = c("chartreuse3", "blue3", "darkmagenta", "darkorange", "brown2", "darkgreen")) +
  scale_size(range = c(0.1, 3)) +
  labs(
    title = "Who with Whom in the Council of States?",
    subtitle = "Co-Signing Behaviour",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern\nNode Size: Eigenvector Centrality"
    ) +
  theme_graph() + 
  theme_ipsum_rc(
    grid = FALSE,
    axis_text_size = 0
  ) + 
  theme(
    legend.position = "none",  
    panel.grid = element_blank(), 
    axis.text = element_blank(), 
    axis.title = element_blank()
    )

# ggsave("network_colour_noname_name.png", bg = "white", plot = network_colour.all, width = 6, height = 4, dpi = 1200)

### Without names
network_colour_noname <- cs50.all %>%
  mutate(fraction = factor(ParlGroupAbbreviation, levels = c("G", "RL", "GL", "M-E", "S", "V", "-"))) %>% 
  mutate(importance = tidygraph::centrality_eigen(weights = n)) %>%
  ggraph() + 
  geom_edge_fan(aes(alpha = n, width = sqrt(n)), edge_colour = "gray60") + 
  scale_edge_width(range = c(0.001, 1)) +
  scale_edge_alpha(range = c(0.5, 2)) +
  geom_node_point(aes(size = importance^2, color = fraction)) +
  scale_color_manual(values = c("chartreuse3", "blue3", "darkmagenta", "darkorange", "brown2", "darkgreen")) +
  scale_size(range = c(0.1, 3)) +
  labs(
    title = "Who with Whom in the Council of States?",
    subtitle = "Co-Signing Behaviour in the 50th Legislative Period",
    caption = "Data: Parliamentary Services of the Federal Assembly, Bern\nNode Size: Eigenvector Centrality"
  ) +
  theme_graph() + 
  theme_ipsum_rc(
    grid = FALSE,
    axis_text_size = 0
  ) + 
  theme(
    legend.position = "none",  
    panel.grid = element_blank(), 
    axis.text = element_blank(), 
    axis.title = element_blank()
  )

# ggsave("network_colour_noname.png", bg = "white", plot = network_colour_noname, width = 6, height = 4, dpi = 1200)