Libraries
list.of.packages <- c("writexl", "tidyverse", "readxl", "dplyr", "swissparl", "quanteda", "stm", "data.table", "geometry", "tm", "Rtsne", "rsvd", "wordcloud", "tidytext", "ggplot2", "ggraph", "hrbrthemes")
lapply(list.of.packages, require, character.only = TRUE)
getwd()
set.seed(123)
Data
# Business Text #291
BR.TextUkraine <- get_data(table = "Business", SubmittedText = "~Ukraine", Language = "DE", SubmissionDate = c(">2020-01-01", "<2023-03-21"))
# Reason Text #168
BR.ReasonUkraine <- get_data(table = "Business", ReasonText = "~Ukraine", Language = "DE", SubmissionDate = c(">2020-01-01", "<2023-03-21"))
# Federal Council Response Text #256
BR.FCRUkraine <- get_data(table = "Business", FederalCouncilResponseText = "~Ukraine", Language = "DE", SubmissionDate = c(">2020-01-01", "<2023-03-21"))
# Combine the datasets & remove duplicates #499
DataRaw <- merge(merge(BR.TextUkraine, BR.ReasonUkraine, all = TRUE), BR.FCRUkraine, all = TRUE)
# Backup Data
# write.csv(DataRaw, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/Data_Raw_Backup_neu.csv")
# Fraction membership data #243
FractionMembership <- swissparl::get_data(table = "MemberParlGroup", Language = "DE")
Grafik Geschäfte raw
## Auswahl der nötigen Variablen
GraphRaw <- DataRaw %>%
select(BusinessTypeName, SubmittedBy, SubmissionDate)
##Auf Monat runden
GraphRaw$SubmissionDate <- substr(GraphRaw$SubmissionDate, 1,7)
## Absolute Freq pro Monat
Raw.freq <- count(GraphRaw, SubmissionDate)
# write_xlsx(Raw.freq, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/ukraine_graph_beste2.xlsx")
Preparation of Data for analysis with stm
### Prepare Fraction membership data
FractionMembership <- FractionMembership %>%
select(FirstName, LastName, ParlGroupName)
FractionMembership <- data.table(FractionMembership, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
FractionMembership[,SubmittedBy := paste(LastName, FirstName)][,c("LastName", "FirstName") := NULL]
### Prepare Business Data
# Creation of Data Table
DataRawTable <- data.table(DataRaw, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
# Nur interessante Variablen behalten
DataStm <- DataRawTable[,.(ID, BusinessShortNumber, BusinessTypeName, Title, SubmittedText, ReasonText, SubmittedBy, SubmissionDate)]
# Mit Fraction membership mergen
DataStm <- merge(DataStm, FractionMembership, by = "SubmittedBy", all.x = TRUE)
# Add Fraction manual for NAs
### Kommissionen
DataStm$ParlGroupName[DataStm$ID == "20220023"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$ID == "20220024"] <- "Ständerat"
DataStm$SubmittedBy[DataStm$ID == "20220023"] <- "Nationalrat"
DataStm$SubmittedBy[DataStm$ID == "20220024"] <- "Ständerat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Aussenpolitische Kommission-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Aussenpolitische Kommission-Ständerat"] <- "Ständerat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Kommission für Rechtsfragen-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Kommission für Wirtschaft und Abgaben-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Sicherheitspolitische Kommission-Nationalrat"] <- "Nationalrat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Sicherheitspolitische Kommission-Ständerat"] <- "Ständerat"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Genf"] <- "Genf" ##Standesinitiative
### Fraktionen
DataStm$ParlGroupName[DataStm$SubmittedBy == "Die Mitte-Fraktion. Die Mitte. EVP."] <- "Die Mitte-Fraktion. Die Mitte. EVP."
DataStm$ParlGroupName[DataStm$SubmittedBy == "FDP-Liberale Fraktion"] <- "FDP-Liberale Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Fraktion der Schweizerischen Volkspartei"] <- "Fraktion der Schweizerischen Volkspartei"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Grüne Fraktion"] <- "Grüne Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Grünliberale Fraktion"] <- "Grünliberale Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Sozialdemokratische Fraktion"] <- "Sozialdemokratische Fraktion"
## Personen
DataStm$ParlGroupName[DataStm$SubmittedBy == "Humbel Ruth"] <- "Die Mitte-Fraktion. Die Mitte. EVP."
DataStm$ParlGroupName[DataStm$SubmittedBy == "Carobbio Guscetti Marina"] <- "Sozialdemokratische Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Rechsteiner Paul"] <- "Sozialdemokratische Fraktion"
DataStm$ParlGroupName[DataStm$SubmittedBy == "Streiff-Feller Marianne"] <- "Die Mitte-Fraktion. Die Mitte. EVP."
# Titel und Text(e) zusammenlegen
DataStm[,txt := paste(Title, SubmittedText, ReasonText)][,c("Title", "SubmittedText", "ReasonText") := NULL]
# Leerzeichen aufräumen
DataStm[,txt := gsub("[[:space:]]+", " ", txt)] #löscht html text und so
##### Preprocessing von Texten #####
# Preprocessing
corpus_stm <- textProcessor(DataStm$txt, metadata = as.data.frame(DataStm), #Metadata = covariates
lowercase = T, removestopwords = T, removenumbers = T,
removepunctuation = T, stem = T, wordLengths = c(3, Inf),
sparselevel = 1, language = 'de', verbose = T,
onlycharacter = T, striphtml = F,
customstopwords = c("dass", "text"))
# Modelle brauchen drei Inputs: Dokumente, Vokabular, Metadaten
corpus_stm <- prepDocuments(documents = corpus_stm$documents,
vocab = corpus_stm$vocab,
meta = corpus_stm$meta,
lower.thresh = 2, upper.thresh = Inf,
verbose = T)
# Test removed words
plotRemoved(documents = corpus_stm$document, lower.thresh = seq(1, 200, by = 20))
##############################################
##### STM #####
## Diagnostik --> 15 Topics scheinen gut zu sein
stm_search <- searchK(corpus_stm$documents, corpus_stm$vocab,
K = seq(5, 55, by = 10), max.em.its = 100) ##
plot(stm_search)
## 15 Topics erzeugen
stm_15 <- stm(corpus_stm$documents, corpus_stm$vocab, K = 15)
## Backup Model
# saveRDS(stm_15, "stm_15_DATUM.RDS")
# stm_15_BUP <- readRDS("stm_15_230512.RDS")
# stm_15 <- stm_15_BUP
## Häufigkeit & Übersicht der Topics
plot.STM(stm_15, n = 7, topics = 1:15, type = "summary")
labelTopics(stm_15, n = 7)
# Themen vergleichen
plot.STM(stm_15, topics=c(2,15), type = "perspectives")
## word clouds
cloud(stm_15, topic = 2)
cloud(stm_15, topic = 5)
cloud(stm_15, topic = 15)
## Select Model evaluation
swissparl_stm_select <- selectModel(corpus_stm$documents,
corpus_stm$vocab,
K = 15, runs = 20)
plotModels(swissparl_stm_select, legend.position = "bottomleft")
selected <- swissparl_stm_select$runout[[4]]
plot(selected)
plot(unlist(stm_search$results$K),
unlist(stm_search$results$exclus),
xlab = "K", ylab = "Exclusivity")
## Testplots
plot.STM(stm_15, n = 7, topics = c(2,5,7), type = "labels")
plot.STM(stm_15, n = 7, topics = c(1,2), type = "perspectives")
##### Prevalence #####
corpus_stm$meta$ParlGroupName <- as.factor(corpus_stm$meta$ParlGroupName)
stm_15_covar <- stm(corpus_stm$documents,
corpus_stm$vocab,
K = 15,
data=corpus_stm$meta,
prevalence =~ ParlGroupName)
########## Zusammenhänge ##########
stm_15_covar_est <- estimateEffect(1:15 ~ ParlGroupName, stm_15_covar,
metadata = corpus_stm$meta)
plot.estimateEffect(stm_15_covar_est, topics = 15,
covariate = "ParlGroupName", method = "pointestimate",
verbose.labels = T)
plot.estimateEffect(stm_15_covar_est, topics = c(9,11),
covariate = "ParlGroupName", method = "pointestimate",
verbose.labels = T)
names(stm_15$metadata)
attributes(stm_15)
table <- table(stm_15_covar_est$metadata$ParlGroupName)
Visualisation
# Visualisation of 10 most used words per topic
betas <- tidy(stm_15)
betaplot1 <- betas %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
mutate(topic = paste0("Topic ", topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~ topic, scales = "free_y") +
coord_flip() + scale_x_reordered() +
labs(x = NULL, y = expression(beta), title = "Wörter mit der höchsten Häufigkeit pro Topic") +
theme_bw()
print(betaplot1)
########## Visualisation of percentages ##########
topicprop <- make.dt(stm_15)
head(topicprop)[,1:6]
## Add "docnum" to DataStm - preparation for merge
DataStm_Final <- DataStm %>%
mutate(docnum = row_number()) %>%
filter(docnum <= 499)
## Merge
STM_merged_data <- merge(DataStm_Final, topicprop, by = "docnum")
table(STM_merged_data$ParlGroupName)
## Remove NR/SR & Geneva
STM_merged_data2 <- subset(STM_merged_data, !(ParlGroupName == "Nationalrat" | ParlGroupName == "Ständerat" | ParlGroupName == "Genf"))
recoded_data <- STM_merged_data2[, c("Topic1", "Topic2", "Topic3", "Topic4", "Topic5", "Topic6", "Topic7", "Topic8", "Topic9", "Topic10", "Topic11", "Topic12", "Topic13", "Topic14", "Topic15")]
### Benchmark 0.1 gibt gute Ergebnisse
recoded_data[recoded_data >= 0.1] <- 1
recoded_data[recoded_data < 0.1] <- 0
## merge
docnum <- seq(from = 1, to = 479)
# Add the docnum column to your dataset
recoded_data <- cbind(docnum, recoded_data)
STM_merged_data2$docnum <- NULL
STM_merged_data2 <- cbind(docnum, STM_merged_data2)
STM_merged_data3 <- STM_merged_data2 %>%
select(docnum, SubmittedBy, SubmissionDate, ParlGroupName, txt, trend, BusinessShortNumber, ID) ## ID ergänzt
indices <- merge(STM_merged_data3, recoded_data, by="docnum")
########## Frequencies ##########
sum_topics <- aggregate(indices[,9:23], by=list(indices$ParlGroupName), FUN=sum)
colnames(sum_topics)[1] <- "ParlGroupName"
## write sum topics for count
# write_xlsx(sum_topics, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/sum_topics_count.xlsx")
### Standardization nach Fraktionsgrösse - Macht das überhaupt sinn? --> Ja! Aber achtung bei Interpretation
## Fraktionsgrössen
Fraktionsgrossen <- as.data.frame(table(FractionMembership$ParlGroupName))
colnames(Fraktionsgrossen) <- c("ParlGroupName", "Fraktionsgrosse")
SumStand <- merge(sum_topics, Fraktionsgrossen, by="ParlGroupName")
SumStand[,2:16] <- SumStand[,2:16] / SumStand$Fraktionsgrosse
## Save for Smartspiders
# write_xlsx(SumStand, "C:/Users/julia/Desktop/Uni 10/Forschungsseminar Datenjournalismus/Blogbeitrag/Frequencies2.xlsx")
Overview Vorstösse & Network Analysis
## Overview alle Vorstösse zum Thema "Ukraine"
# Plot by type ##
indices %>%
group_by(ParlGroupName) %>%
count() %>%
ungroup() %>%
ggplot(aes(reorder(ParlGroupName, n), n)) +
geom_col() +
coord_flip() +
labs(
title = "Vorstösse im Zusammenhang mit der Ukraine",
subtitle = "Vorstösse nach Fraktionen",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
) +
theme_ipsum_rc(grid="X") +
theme(axis.title = element_blank())
SanktionenSubset <- subset(indices, Topic2 == 1)
## First overview Sanktionen
SanktionenSubset %>%
group_by(SubmittedBy) %>%
count() %>%
ungroup() %>%
top_n(10, n) %>%
mutate(SubmittedBy = stringr::str_remove_all(SubmittedBy, "\\s.*$")) %>%
ggplot(aes(reorder(SubmittedBy, n), n)) +
geom_col() +
coord_flip() +
labs(
title = "Who with Whom in the Council of States?",
subtitle = "Vorstösse zum Topic 'Sanktionen' eingereicht",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
) +
theme_ipsum_rc(grid="X") +
theme(axis.title = element_blank())
########## Network Analysis ##########
######################################
## All Business roles
biz.roles <- swissparl::get_data(
table = "BusinessRole",
BusinessNumber = SanktionenSubset$ID,
Language = "DE"
)
# Plot by role
biz.roles %>%
group_by(RoleName) %>%
count() %>%
ungroup() %>%
ggplot(aes(reorder(RoleName, n), n)) +
geom_col() +
coord_flip() +
labs(
title = "Roles",
subtitle = "Rollen in den Vorstössen im Zusammenhang mit Sanktionen",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
) +
theme_ipsum_rc(grid="X") +
theme(axis.title = element_blank())
## get names
council.members <- swissparl::get_data(
table = "MemberCouncil",
ID = biz.roles$MemberCouncilNumber,
Language = "DE"
)
## join data
biz.roles <- left_join(
biz.roles,
council.members,
by = c("MemberCouncilNumber" = "ID")
)
### PLOT
# Plot by role
biz.roles %>%
filter(Role %in% c(3, 7)) %>%
mutate(name_canton = paste0(LastName, " (", CantonAbbreviation, ")")) %>%
group_by(name_canton, RoleName) %>%
count() %>%
ungroup() %>%
mutate(RoleName = stringr::str_remove_all(RoleName, "\\(.*?\\)")) %>%
pivot_wider(names_from = RoleName, values_from = n) %>%
ggplot(aes(Mitunterzeichner, Urheber, label = name_canton)) +
geom_smooth(method = lm, color = "black", fill = "grey80") +
geom_point() +
ggrepel::geom_text_repel() +
labs(
title = "Who with Whom in the Council of States?",
subtitle = "Roles in Political Businesses of the 50th Legislative Period",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
) +
scale_y_continuous(limits = c(0, 40)) +
theme_ipsum_rc()
## Finally, start of the network analysis
######################################################
## Co-signing-behaviour ALL Vorstösse
## All Business roles
biz.roles.all <- swissparl::get_data(
table = "BusinessRole",
BusinessNumber = indices$ID,
Language = "DE"
)
## get names
council.members.all <- swissparl::get_data(
table = "MemberCouncil",
ID = biz.roles.all$MemberCouncilNumber,
Language = "DE"
)
## join data
biz.roles.all <- left_join(
biz.roles.all,
council.members.all,
by = c("MemberCouncilNumber" = "ID")
)
## Ständerat entfernen
biz.roles.all <- biz.roles.all %>%
filter(Council != 2)
# Business authors
authors.all <- biz.roles.all %>%
filter(Role == 7) %>%
filter(!is.na(MemberCouncilNumber)) %>%
select(BusinessNumber, MemberCouncilNumber) %>%
distinct(BusinessNumber, MemberCouncilNumber, .keep_all = T)
# Business cosigners
cosigners.all <- biz.roles.all %>%
filter(Role == 3) %>%
filter(!is.na(MemberCouncilNumber)) %>%
select(BusinessNumber, MemberCouncilNumber) %>%
distinct(BusinessNumber, MemberCouncilNumber, .keep_all = T)
# Author-cosigner-pair
acp.all <- left_join(
authors.all,
cosigners.all,
by = "BusinessNumber",
suffix = c(".author", ".cosigner")
)
# define edges
edges.all <- acp.all %>%
rename(
from = "MemberCouncilNumber.cosigner",
to = "MemberCouncilNumber.author"
) %>%
group_by(from, to) %>%
count() %>%
ungroup() %>%
filter(!is.na(from))
# author co-signatory cooperations
edges.all %>%
complete(from, to, fill = list(n = 0)) %>%
ggplot(aes(n)) +
geom_bar() +
labs(
x = "Num. of\ncooperations",
y = "Freq.",
title = "Who with Whom in the Council of States?",
subtitle = "Frequency of Author-Co-Signatory Cooperations",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern"
) +
theme_ipsum_rc() +
theme(panel.grid.minor = element_blank())
# creation of nodes
nodes.all <- left_join(
tibble::tibble(ID = unique(c(edges.all$from, edges.all$to))),
council.members.all %>%
mutate(ID = ID) %>%
select(ID, LastName, GenderAsString, CantonAbbreviation, ParlGroupAbbreviation),
by = "ID"
)
#remove duplicate nodes
nodes.all <- nodes.all %>% distinct()
## create tbl
cs50.all <- tidygraph::tbl_graph(
nodes = nodes.all %>% mutate(ID = as.character(ID)),
edges = edges.all %>% mutate_at(vars(from, to), as.character),
directed = F
)
## remove distorting names
nodes2.all <- nodes.all %>%
filter(!ID %in% c(4325, 4237, 3871, 4181, 1267, 1120, 4239, 525, 3830)) %>%
mutate(ID = as.character(ID))
edges2.all <- edges.all %>%
filter(!from %in% c(4325, 4237, 3871, 4181, 1267, 1120, 4239, 525, 3830)) %>%
filter(!to %in% c(4325, 4237, 3871, 4181, 1267, 1120, 4239, 525, 3830)) %>%
mutate_at(vars(from, to), as.character)
cs50.all <- tidygraph::tbl_graph(nodes = nodes2.all, edges = edges2.all, directed = F)
# For reproducibility reasons
set.seed(15)
# Network plot
network_colour.all <- cs50.all %>%
mutate(fraction = factor(ParlGroupAbbreviation, levels = c("G", "RL", "GL", "M-E", "S", "V", "-"))) %>%
mutate(importance = tidygraph::centrality_eigen(weights = n)) %>%
mutate(label = paste0(LastName, " (", CantonAbbreviation, ")")) %>%
ggraph() +
geom_edge_fan(aes(alpha = n, width = sqrt(n)), edge_colour = "gray60") +
scale_edge_width(range = c(0.001, 1)) +
scale_edge_alpha(range = c(0.5, 2)) +
geom_node_point(aes(size = importance^2, color = fraction)) +
geom_node_text(aes(label = label), size = 1, nudge_y = -0.08) +
scale_color_manual(values = c("chartreuse3", "blue3", "darkmagenta", "darkorange", "brown2", "darkgreen")) +
scale_size(range = c(0.1, 3)) +
labs(
title = "Who with Whom in the Council of States?",
subtitle = "Co-Signing Behaviour",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern\nNode Size: Eigenvector Centrality"
) +
theme_graph() +
theme_ipsum_rc(
grid = FALSE,
axis_text_size = 0
) +
theme(
legend.position = "none",
panel.grid = element_blank(),
axis.text = element_blank(),
axis.title = element_blank()
)
# ggsave("network_colour_noname_name.png", bg = "white", plot = network_colour.all, width = 6, height = 4, dpi = 1200)
### Without names
network_colour_noname <- cs50.all %>%
mutate(fraction = factor(ParlGroupAbbreviation, levels = c("G", "RL", "GL", "M-E", "S", "V", "-"))) %>%
mutate(importance = tidygraph::centrality_eigen(weights = n)) %>%
ggraph() +
geom_edge_fan(aes(alpha = n, width = sqrt(n)), edge_colour = "gray60") +
scale_edge_width(range = c(0.001, 1)) +
scale_edge_alpha(range = c(0.5, 2)) +
geom_node_point(aes(size = importance^2, color = fraction)) +
scale_color_manual(values = c("chartreuse3", "blue3", "darkmagenta", "darkorange", "brown2", "darkgreen")) +
scale_size(range = c(0.1, 3)) +
labs(
title = "Who with Whom in the Council of States?",
subtitle = "Co-Signing Behaviour in the 50th Legislative Period",
caption = "Data: Parliamentary Services of the Federal Assembly, Bern\nNode Size: Eigenvector Centrality"
) +
theme_graph() +
theme_ipsum_rc(
grid = FALSE,
axis_text_size = 0
) +
theme(
legend.position = "none",
panel.grid = element_blank(),
axis.text = element_blank(),
axis.title = element_blank()
)
# ggsave("network_colour_noname.png", bg = "white", plot = network_colour_noname, width = 6, height = 4, dpi = 1200)