library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textstats)
library(quanteda.textplots)
library(quanteda.textmodels)
library(stm)
library(hrbrthemes)
library(ggthemes)
library(tidytext)
library(formattable)
library(htmlwidgets)
setwd("~/Downloads/")
zeitungen <- readRDS("zeitungsartikel_de_2020.RDS")
# create a month variable and define the different periods;
# pubDateTime parses as a date (it is converted with as.Date() below),
# so the month can be read off directly instead of via twelve nested ifelse() calls
zeitungen <- zeitungen %>%
  mutate(month = format(as.Date(pubDateTime), "%m"))
zeitungen <- zeitungen %>%
  mutate(zeitpunkt = case_when(
    month %in% c("01", "02") ~ "vor 1. Welle",
    month %in% c("03", "04", "05") ~ "1. Welle",
    month %in% c("06", "07", "08", "09") ~ "nach 1. Welle",
    month %in% c("10", "11", "12") ~ "2. Welle"))
zeitungen <- zeitungen %>%
  mutate(zeitpunkt_corona = ifelse(zeitpunkt == "vor 1. Welle", "Vor", "Nach"))
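As a quick sanity check, cross-tabulating month against the two period variables confirms that every month landed in exactly one period; this uses only dplyr's count(), nothing beyond what is already loaded:
# each month should map to exactly one zeitpunkt/zeitpunkt_corona combination
zeitungen %>%
  count(month, zeitpunkt, zeitpunkt_corona)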
The goal is to show when Corona dominated news coverage and at which points coverage returned to normal.
# search the article text for "Corona" and "Covid" --> form two categories
zeitungen <- zeitungen %>%
  mutate(corona_thema = ifelse(grepl("Corona|Covid", tx), "Corona", "Anderes"))
# group by day and by Corona category, then compute each category's daily share
zeitungen_tag <- zeitungen %>%
  group_by(pubDateTime) %>%
  count()
zeitungen_corona_tag <- zeitungen %>%
  group_by(pubDateTime, corona_thema) %>%
  count()
zeitungen_corona_tag <- left_join(zeitungen_corona_tag, zeitungen_tag,
                                  by = c("pubDateTime"))
zeitungen_corona_tag <- zeitungen_corona_tag %>%
  mutate(anteil = n.x / n.y * 100) # n.x = articles in the category, n.y = all articles that day
zeitungen_corona_tag$pubDateTime <- as.Date(zeitungen_corona_tag$pubDateTime)
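The same daily shares can also be computed in a single pipeline without the intermediate join; this equivalent sketch is shown for reference only and is not used below:
# count per day and category, then divide by the daily total
zeitungen_corona_tag_alt <- zeitungen %>%
  count(pubDateTime, corona_thema) %>%
  group_by(pubDateTime) %>%
  mutate(anteil = n / sum(n) * 100) %>%
  ungroup()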
# visualise the share of Corona articles
zeitungen_corona_tag %>%
filter(corona_thema == "Corona")%>%
ggplot(aes(pubDateTime, anteil, fill = corona_thema))+
  geom_area(alpha = 0.8, size = .5, colour = "#C5462B") +
scale_x_date(date_breaks = '1 month',
labels = scales::date_format("%b"))+
scale_y_continuous(labels = scales::percent_format(scale = 1), limits = c(0,100))+
labs(x = "",
y = "",
title = "Corona dominiert während dem ersten Lockdown die Medienberichterstattung",
subtitle = "Anteil Artikel mit Corona Nennungen")+
scale_fill_manual(values = "#C5462B")+
  # annotation texts for key pandemic events
annotate("text", x=as.Date("2020-02-22"), y=82,
label="Erster Coronafall Schweiz",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-03-13"), y=86,
label="Beginn 1. Lockdown",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-04-24"), y=84.5,
label="Erste Öffnungsschritte",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-06-19"), y=73,
label="Aufhebung Grossteil der Massnahmen",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-09-28"), y=76.5,
label="Zulassung Grossveranstaltungen",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-10-16"), y=85,
label="Erneute Massnahmen",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-12-09"), y=83.5,
label="Einführung Sperrstunde",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
annotate("text", x=as.Date("2020-12-19"), y=84,
label="Erneute Schliessungen",
colour = "black", angle = 90, size = 3, family = "Helvetica") +
  # dashed reference lines at the event dates
geom_vline(xintercept=as.Date("2020-03-16"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-02-25"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-04-27"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-06-22"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-10-01"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-10-19"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-12-12"), size=.5, alpha = 0.6, linetype = "dashed") +
geom_vline(xintercept=as.Date("2020-12-22"), size=.5, alpha = 0.6, linetype = "dashed") +
  guides(fill = "none") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 16, family = "Helvetica"),
        plot.subtitle = element_text(size = 14, family = "Helvetica"),
        axis.text.x = element_text(family = "Helvetica", size = 8),
        axis.text.y = element_text(family = "Helvetica", size = 8))
# save separately: chaining ggsave() onto the plot with + is a bug-prone pattern
ggsave("Corona_1.png", height = 6, width = 11)
Which topics lost ground because of Corona, and how often does Corona appear in the articles:
# select the relevant topic areas and group by month
zeitungen_thema<- zeitungen %>%
filter(selectsclass %in% c("EU_Europa", "LabourMarket", "Agriculture", "Economy", "Finances_Taxes",
"InternationalRelations",
"Law_Order", "Education_Culture", "Environment_Energy", "Immigration_Asylum",
"Public_Health",
"GenderIssues_Discrimination", "PublicHealth"))%>%
group_by(month) %>%
count()
# select the relevant topic areas and group by month and topic area
zeitungen_thema_2<- zeitungen %>%
filter(selectsclass %in% c("EU_Europa", "LabourMarket", "Agriculture", "Economy", "Finances_Taxes",
"InternationalRelations",
"Law_Order", "Education_Culture", "Environment_Energy", "Immigration_Asylum",
"Public_Health",
"GenderIssues_Discrimination", "PublicHealth"))%>%
group_by(month, selectsclass, corona_thema)%>%
count()
zeitungen_thema_2 <- left_join(zeitungen_thema, zeitungen_thema_2,
by=c("month"))
# relabel the topic areas with German display names
zeitungen_thema_2$selectsclass <- factor(zeitungen_thema_2$selectsclass,
levels = c("LabourMarket", "Education_Culture",
"EU_Europa", "Finances_Taxes",
"GenderIssues_Discrimination",
"PublicHealth", "Immigration_Asylum",
"InternationalRelations", "Agriculture", "Law_Order",
"Environment_Energy", "Economy"),
labels = c("Arbeitsmarkt", "Bildung & Kultur",
"EU & Europa", "Finanzen & Steuern",
"Geschlechterthemen & Diskriminierung",
"Gesundheitswesen", "Immigration & Asyl",
"Internationale Beziehungen",
"Landwirtschaft", "Recht & Ordnung",
"Umwelt & Energie", "Wirtschaft"))
# visualisation
zeitungen_thema_2 %>%
mutate(anteil = n.y/n.x*100)%>%
ggplot(aes(fill=corona_thema, y=anteil, x=factor(month, labels = c("Jan", "Feb", "Mar",
"Apr", "Mai", "Jun",
"Jul", "Aug", "Sep",
"Okt", "Nov", "Dez")))) +
geom_bar(position="stack", stat="identity", alpha = 0.8) +
scale_fill_manual(values = c("#FBB7A9", "#C5462B"), labels = c("Ohne Corona Nennungen",
"Corona Nennungen")) +
scale_y_continuous(labels = scales::percent_format(scale = 1))+
labs(title = "Die verschiedenen Themebereiche sind unterschiedlich stark betroffen", x = "", y = "",
subtitle = "Anteil Artikel pro Thema") +
facet_wrap(~selectsclass, ncol = 3, scales = "free") +
theme_ipsum() +
theme(legend.position='top',
legend.justification='left',
legend.direction='horizontal',
legend.title = element_blank(),
legend.text = element_text(size = 12, family = "Helvetica"),
plot.title = element_text(size = 16, family = "Helvetica"),
plot.subtitle = element_text(size = 14, family = "Helvetica"),
axis.text.x = element_text(family = "Helvetica", size = 8),
axis.text.y = element_text(family = "Helvetica", size = 8),
        strip.text.x = element_text(family = "Helvetica"))
# save separately rather than chaining ggsave() with +
ggsave("Corona_2.png", height = 10, width = 12)
How did coverage within a specific topic change because of Corona? Which keywords were used before Corona, and which during Corona?
# filter articles on the labour market before Corona
zeitungen_labour_vor <- zeitungen %>%
  filter(selectsclass == "LabourMarket",
         zeitpunkt == "vor 1. Welle") %>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)
# build the corpus
zeitungen_corpus_labour_vor <- corpus(zeitungen_labour_vor, text_field = "tx")
# tokenise and drop irrelevant features (punctuation, numbers, symbols, stopwords)
zeitungen_corpus_labour_tokens_vor <- zeitungen_corpus_labour_vor %>%
  tokens("word", remove_punct = TRUE,
         remove_numbers = TRUE,
         remove_symbols = TRUE) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()
zeitungen_corpus_labour_dfm_vor <- dfm(zeitungen_corpus_labour_tokens_vor) %>%
  dfm_wordstem(language = "de")
zeitungen_corpus_labour_dfm_trimmed_vor <- zeitungen_corpus_labour_dfm_vor %>%
  dfm_trim(min_docfreq = 0.001, max_docfreq = 0.55, docfreq_type = "prop", verbose = TRUE)
# topic model
topic.count <- 15 # number of topics
# fit the STM
zeitungen_corpus_labour_dfm_trimmed_vor_stm <- convert(zeitungen_corpus_labour_dfm_trimmed_vor, to = "stm")
zeitungen_corpus_labour_dfm_trimmed_vor_model_stm <- stm(
zeitungen_corpus_labour_dfm_trimmed_vor_stm$documents,
zeitungen_corpus_labour_dfm_trimmed_vor_stm$vocab,
K = topic.count,
data = zeitungen_corpus_labour_dfm_trimmed_vor_stm$meta,
init.type = "Spectral"
)
# top 10 highest-probability words per topic
as.data.frame(t(labelTopics(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, n = 10)$prob))
# estimated share of each topic
plot(
zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
type = "summary",
text.cex = 0.5,
main = "STM topic shares",
xlab = "Share estimation"
)
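The choice of K = 15 topics is set by hand here. For a data-driven check, stm's searchK() fits models for several candidate K values and reports held-out likelihood and other diagnostics; a minimal sketch (the candidate grid c(10, 15, 20) is an arbitrary choice, and this is slow on large corpora):
# compare candidate topic counts; plot() on the result visualises the diagnostics
k_search <- searchK(zeitungen_corpus_labour_dfm_trimmed_vor_stm$documents,
                    zeitungen_corpus_labour_dfm_trimmed_vor_stm$vocab,
                    K = c(10, 15, 20),
                    data = zeitungen_corpus_labour_dfm_trimmed_vor_stm$meta,
                    init.type = "Spectral")
plot(k_search)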
# filter articles on the labour market during the 1st wave
zeitungen_labour_nach <- zeitungen %>%
  filter(selectsclass == "LabourMarket",
         zeitpunkt == "1. Welle") %>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)
# build the corpus
zeitungen_corpus_labour_nach <- corpus(zeitungen_labour_nach, text_field = "tx")
# tokenise and drop irrelevant features (punctuation, numbers, symbols, stopwords)
zeitungen_corpus_labour_tokens_nach <- zeitungen_corpus_labour_nach %>%
  tokens("word", remove_punct = TRUE,
         remove_numbers = TRUE,
         remove_symbols = TRUE) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()
zeitungen_corpus_labour_dfm_nach <- dfm(zeitungen_corpus_labour_tokens_nach) %>%
  dfm_wordstem(language = "de")
zeitungen_corpus_labour_dfm_trimmed_nach <- zeitungen_corpus_labour_dfm_nach %>%
  dfm_trim(min_docfreq = 0.001, max_docfreq = 0.55, docfreq_type = "prop", verbose = TRUE)
# topic model
topic.count <- 15 # number of topics
# fit the STM
zeitungen_corpus_labour_dfm_trimmed_nach_stm <- convert(zeitungen_corpus_labour_dfm_trimmed_nach, to = "stm")
zeitungen_corpus_labour_dfm_trimmed_nach_model_stm <- stm(
zeitungen_corpus_labour_dfm_trimmed_nach_stm$documents,
zeitungen_corpus_labour_dfm_trimmed_nach_stm$vocab,
K = topic.count,
data = zeitungen_corpus_labour_dfm_trimmed_nach_stm$meta,
init.type = "Spectral"
)
# top 10 highest-probability words per topic
as.data.frame(t(labelTopics(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, n = 10)$prob))
# estimated share of each topic
plot(
zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
type = "summary",
text.cex = 0.5,
main = "STM topic shares",
xlab = "Share estimation"
)
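The vor and nach pipelines above (and the EU ones below) repeat the same corpus → tokens → dfm → stm steps; a small wrapper would remove that duplication. A sketch of such a helper (fit_stm_for is a hypothetical name and is not used elsewhere in this script):
# hypothetical helper wrapping the repeated preprocessing and model fit
fit_stm_for <- function(df, k = 15) {
  dfm_trimmed <- corpus(df, text_field = "tx") %>%
    tokens("word", remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
    tokens_remove(stopwords('de')) %>%
    tokens_tolower() %>%
    dfm() %>%
    dfm_wordstem(language = "de") %>%
    dfm_trim(min_docfreq = 0.001, max_docfreq = 0.55, docfreq_type = "prop")
  stm_input <- convert(dfm_trimmed, to = "stm")
  stm(stm_input$documents, stm_input$vocab, K = k,
      data = stm_input$meta, init.type = "Spectral")
}
# e.g. fit_stm_for(zeitungen_labour_vor) reproduces the first model above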
# extract beta (per-topic word probabilities) and gamma (per-document topic proportions)
labour_beta_nach <- tidy(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
matrix = "beta")
labour_gamma_nach <- tidy(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
matrix = "gamma")
labour_beta_vor <- tidy(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
matrix = "beta")
labour_gamma_vor <- tidy(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
matrix = "gamma")
# top keywords per topic
top_terms_labour_nach <- labour_beta_nach %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(4, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>%
  summarise(terms = list(term)) %>%
  mutate(terms = map_chr(terms, paste, collapse = ", "))
# topic shares: keep the 5 largest topics, renumbered 1-5 for display
gamma_terms_labour_nach <- labour_gamma_nach %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms_labour_nach, by = "topic") %>%
  top_n(5, gamma) %>%
  mutate(topic = 1:5,
         time = "1. Welle")
# top keywords per topic
top_terms_labour_vor <- labour_beta_vor %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(4, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>%
  summarise(terms = list(term)) %>%
  mutate(terms = map_chr(terms, paste, collapse = ", "))
# topic shares: keep the 5 largest topics, renumbered 1-5 for display
gamma_terms_labour_vor <- labour_gamma_vor %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms_labour_vor, by = "topic") %>%
  top_n(5, gamma) %>%
  mutate(topic = 1:5,
         time = "Vor Corona")
# combine keywords and shares
terms_topics_labour <- bind_rows(gamma_terms_labour_vor, gamma_terms_labour_nach)
terms_topics_labour$topic <- as.numeric(terms_topics_labour$topic)
# example articles for each topic
thoughts2_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_labour_nach$tx, 206),
n = 2, topics = 2)$docs[[1]]
thoughts2_nach <- thoughts2_nach[1]
thoughts14_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_labour_nach$tx, 162),
n = 2, topics = 14)$docs[[1]]
thoughts14_nach <- thoughts14_nach[1]
thoughts1_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_labour_nach$tx, 225),
n = 2, topics = 1)$docs[[1]]
thoughts1_nach <- thoughts1_nach[1]
thoughts11_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_labour_nach$tx, 152),
n = 4, topics = 11)$docs[[1]]
thoughts11_nach <- thoughts11_nach[4]
thoughts15_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_labour_nach$tx, 290),
n = 2, topics = 15)$docs[[1]]
thoughts15_nach <- thoughts15_nach[1]
thoughts5_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_labour_vor$tx, 257),
n = 2, topics = 5)$docs[[1]]
thoughts5_vor <- thoughts5_vor[1]
thoughts1_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_labour_vor$tx, 230),
n = 2, topics = 1)$docs[[1]]
thoughts1_vor <- thoughts1_vor[1]
thoughts4_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_labour_vor$tx, 133),
n = 2, topics = 4)$docs[[1]]
thoughts4_vor <- thoughts4_vor[1]
thoughts7_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_labour_vor$tx, 101),
n = 2, topics = 7)$docs[[1]]
thoughts7_vor <- thoughts7_vor[2]
thoughts13_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_labour_vor$tx, 157),
n = 2, topics = 13)$docs[[1]]
thoughts13_vor <- thoughts13_vor[1]
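The findThoughts() calls above differ only in topic number, snippet length, and which of the returned documents is kept; a small helper (get_thought is a hypothetical name) captures the pattern:
# hypothetical helper: return the pick-th most representative snippet for one topic
get_thought <- function(model, texts, topic, width = 200, n = 2, pick = 1) {
  findThoughts(model, texts = strtrim(texts, width),
               n = n, topics = topic)$docs[[1]][pick]
}
# e.g. get_thought(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
#                  zeitungen_labour_nach$tx, topic = 2, width = 206)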
# build a data frame of the example articles
thoughts <- c(thoughts2_nach, thoughts14_nach, thoughts1_nach, thoughts11_nach,
thoughts15_nach, thoughts5_vor,
thoughts1_vor, thoughts4_vor, thoughts7_vor, thoughts13_vor)
topic <- c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
time <- c("1. Welle", "1. Welle", "1. Welle", "1. Welle", "1. Welle",
"Vor Corona", "Vor Corona","Vor Corona", "Vor Corona", "Vor Corona")
labour_thoughts <- data.frame(thoughts, topic, time)
# build a table with topic shares, keywords and example snippets
labour_all <- full_join(terms_topics_labour, labour_thoughts, by = c("topic", "time"))%>%
mutate(gamma = percent(gamma))
colnames(labour_all) <- c("Topic", "Topic-Anteil", "Topic Schlagworte",
"Zeitpunkt", "Ausschnitt aus Zeitungsartikel zum Topic")
formattable(labour_all, align = "l",
            list('Zeitpunkt' = formatter("span",
                                         style = x ~ style(color = ifelse(x == "1. Welle",
                                                                          "#C5462B", "#FBB7A9"))))) %>%
  as.htmlwidget() %>%
  htmlwidgets::saveWidget(file = "example.html")
# filter articles on the EU before Corona
zeitungen_eu_vor <- zeitungen %>%
  filter(selectsclass == "EU_Europa",
         zeitpunkt == "vor 1. Welle") %>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)
# build the corpus
zeitungen_corpus_eu_vor <- corpus(zeitungen_eu_vor, text_field = "tx")
# tokenise and drop irrelevant features (punctuation, numbers, symbols, stopwords)
zeitungen_corpus_eu_tokens_vor <- zeitungen_corpus_eu_vor %>%
  tokens("word", remove_punct = TRUE,
         remove_numbers = TRUE,
         remove_symbols = TRUE) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()
zeitungen_corpus_eu_dfm_vor <- dfm(zeitungen_corpus_eu_tokens_vor) %>%
  dfm_wordstem(language = "de")
zeitungen_corpus_eu_dfm_trimmed_vor <- zeitungen_corpus_eu_dfm_vor %>%
  dfm_trim(min_docfreq = 0.001, max_docfreq = 0.55, docfreq_type = "prop", verbose = TRUE)
# topic model
topic.count <- 15 # number of topics
# fit the STM
zeitungen_corpus_eu_dfm_trimmed_vor_stm <- convert(zeitungen_corpus_eu_dfm_trimmed_vor, to = "stm")
zeitungen_corpus_eu_dfm_trimmed_vor_model_stm <- stm(
zeitungen_corpus_eu_dfm_trimmed_vor_stm$documents,
zeitungen_corpus_eu_dfm_trimmed_vor_stm$vocab,
K = topic.count,
data = zeitungen_corpus_eu_dfm_trimmed_vor_stm$meta,
init.type = "Spectral"
)
# top 10 highest-probability words per topic
as.data.frame(t(labelTopics(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, n = 10)$prob))
# estimated share of each topic
plot(
zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
type = "summary",
text.cex = 0.5,
main = "STM topic shares",
xlab = "Share estimation"
)
# filter articles on the EU during the 1st wave
zeitungen_eu_nach <- zeitungen %>%
  filter(selectsclass == "EU_Europa",
         zeitpunkt == "1. Welle") %>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)
# build the corpus
zeitungen_corpus_eu_nach <- corpus(zeitungen_eu_nach, text_field = "tx")
# tokenise and drop irrelevant features (punctuation, numbers, symbols, stopwords)
zeitungen_corpus_eu_tokens_nach <- zeitungen_corpus_eu_nach %>%
  tokens("word", remove_punct = TRUE,
         remove_numbers = TRUE,
         remove_symbols = TRUE) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()
zeitungen_corpus_eu_dfm_nach <- dfm(zeitungen_corpus_eu_tokens_nach) %>%
  dfm_wordstem(language = "de")
zeitungen_corpus_eu_dfm_trimmed_nach <- zeitungen_corpus_eu_dfm_nach %>%
  dfm_trim(min_docfreq = 0.001, max_docfreq = 0.55, docfreq_type = "prop", verbose = TRUE)
# topic model
topic.count <- 15 # number of topics
# fit the STM
zeitungen_corpus_eu_dfm_trimmed_nach_stm <- convert(zeitungen_corpus_eu_dfm_trimmed_nach, to = "stm")
zeitungen_corpus_eu_dfm_trimmed_nach_model_stm <- stm(
zeitungen_corpus_eu_dfm_trimmed_nach_stm$documents,
zeitungen_corpus_eu_dfm_trimmed_nach_stm$vocab,
K = topic.count,
data = zeitungen_corpus_eu_dfm_trimmed_nach_stm$meta,
init.type = "Spectral"
)
# top 10 terms per topic in tidy form (exploratory; not used further below)
eu_topics_nach <- tidy(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, matrix = "beta")
eu_topics_terms <- eu_topics_nach %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
# top 10 highest-probability words per topic
as.data.frame(t(labelTopics(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, n = 10)$prob))
# estimated share of each topic
plot(
zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
type = "summary",
text.cex = 0.5,
main = "STM topic shares",
xlab = "Share estimation"
)
# extract beta (per-topic word probabilities) and gamma (per-document topic proportions)
eu_beta_nach <- tidy(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
matrix = "beta")
eu_gamma_nach <- tidy(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
matrix = "gamma")
eu_beta_vor <- tidy(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
matrix = "beta")
eu_gamma_vor <- tidy(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
matrix = "gamma")
# top keywords per topic
top_terms_eu_nach <- eu_beta_nach %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(4, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>%
  summarise(terms = list(term)) %>%
  mutate(terms = map_chr(terms, paste, collapse = ", "))
# topic shares: keep the 5 largest topics, renumbered 1-5 for display
gamma_terms_eu_nach <- eu_gamma_nach %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms_eu_nach, by = "topic") %>%
  top_n(5, gamma) %>%
  mutate(topic = 1:5,
         time = "1. Welle")
# top keywords per topic
top_terms_eu_vor <- eu_beta_vor %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(4, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>%
  summarise(terms = list(term)) %>%
  mutate(terms = map_chr(terms, paste, collapse = ", "))
# topic shares: keep the 5 largest topics, renumbered 1-5 for display
gamma_terms_eu_vor <- eu_gamma_vor %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms_eu_vor, by = "topic") %>%
  top_n(5, gamma) %>%
  mutate(topic = 1:5,
         time = "Vor Corona")
# combine keywords and shares
terms_topics_eu <- bind_rows(gamma_terms_eu_vor, gamma_terms_eu_nach)
terms_topics_eu$topic <- as.numeric(terms_topics_eu$topic)
# example articles for each topic (object names carried over from the labour section; the topic numbers differ)
thoughts2_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_eu_nach$tx, 221),
n = 2, topics = 9)$docs[[1]]
thoughts2_nach_eu <- thoughts2_nach_eu[1]
thoughts14_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_eu_nach$tx, 212),
n = 8, topics = 12)$docs[[1]]
thoughts14_nach_eu <- thoughts14_nach_eu[1]
thoughts1_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_eu_nach$tx, 200),
n = 9, topics = 8)$docs[[1]]
thoughts1_nach_eu <- thoughts1_nach_eu[9]
thoughts11_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_eu_nach$tx, 229),
n = 2, topics = 7)$docs[[1]]
thoughts11_nach_eu <- thoughts11_nach_eu[1]
thoughts15_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
texts = strtrim(zeitungen_eu_nach$tx, 147),
n = 2, topics = 6)$docs[[1]]
thoughts15_nach_eu <- thoughts15_nach_eu[1]
thoughts5_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_eu_vor$tx, 229),
n = 2, topics = 12)$docs[[1]]
thoughts5_vor_eu <- thoughts5_vor_eu[1]
thoughts1_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_eu_vor$tx, 170),
n = 2, topics = 8)$docs[[1]]
thoughts1_vor_eu <- thoughts1_vor_eu[1]
thoughts4_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_eu_vor$tx, 114),
n = 2, topics = 5)$docs[[1]]
thoughts4_vor_eu <- thoughts4_vor_eu[1]
thoughts7_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_eu_vor$tx, 118),
n = 2, topics = 10)$docs[[1]]
thoughts7_vor_eu <- thoughts7_vor_eu[1]
thoughts13_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
texts = strtrim(zeitungen_eu_vor$tx, 92),
n = 3, topics = 4)$docs[[1]]
thoughts13_vor_eu <- thoughts13_vor_eu[3]
# build a data frame of the example articles
thoughts <- c(thoughts2_nach_eu, thoughts14_nach_eu, thoughts1_nach_eu,
thoughts11_nach_eu, thoughts15_nach_eu, thoughts5_vor_eu,
thoughts1_vor_eu, thoughts4_vor_eu, thoughts7_vor_eu, thoughts13_vor_eu)
topic <- c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
time <- c("1. Welle", "1. Welle", "1. Welle", "1. Welle", "1. Welle",
"Vor Corona", "Vor Corona","Vor Corona", "Vor Corona", "Vor Corona")
eu_thoughts <- data.frame(thoughts, topic, time)
# build a table with example articles, keywords and shares
eu_all <- full_join(terms_topics_eu, eu_thoughts, by = c("topic", "time"))%>%
mutate(gamma = percent(gamma))
colnames(eu_all) <- c("Topic", "Topic-Anteil", "Topic Schlagworte",
"Zeitpunkt", "Ausschnitt aus Zeitungsartikel zum Topic")
formattable(eu_all, align = "l",
            list('Zeitpunkt' = formatter("span",
                                         style = x ~ style(color = ifelse(x == "1. Welle",
                                                                          "#C5462B", "#FBB7A9"))))) %>%
  as.htmlwidget() %>%
  htmlwidgets::saveWidget(file = "example2.html")