library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textstats)
library(quanteda.textplots)
library(quanteda.textmodels)
library(stm)
library(hrbrthemes)
library(ggthemes)
library(tidytext)
library(formattable)
library(htmlwidgets)

setwd("~/Downloads/")
zeitungen <- readRDS("zeitungsartikel_de_2020.RDS")
#neue variable monat erstellen, verschiedene Zeitpunkte definieren
zeitungen <- zeitungen %>%
  mutate(month = ifelse(grepl("2020-01", pubDateTime)==T, "01",
         ifelse(grepl("2020-02", pubDateTime)==T, "02", ifelse(
           grepl("2020-03", pubDateTime)==T, "03", ifelse(
             grepl("2020-04", pubDateTime)==T, "04", ifelse(
               grepl("2020-05", pubDateTime)==T, "05", ifelse(
                 grepl("2020-06", pubDateTime)==T, "06", ifelse(
                   grepl("2020-07", pubDateTime)==T, "07", ifelse(
                     grepl("2020-08", pubDateTime)==T, "08", ifelse(
                       grepl("2020-09", pubDateTime)==T, "09", ifelse(
                         grepl("2020-10", pubDateTime)==T, "10", ifelse(
                           grepl("2020-11", pubDateTime)==T, "11", "12"))))))))))))

zeitungen <- zeitungen %>%
  mutate(zeitpunkt = case_when(
      month == "01" ~"vor 1. Welle",
      month == "02" ~"vor 1. Welle",
      month == "03" ~"1. Welle",
      month == "04" ~"1. Welle",
      month == "05" ~"1. Welle",
      month == "06" ~"nach 1. Welle",
      month == "07" ~"nach 1. Welle",
      month == "08" ~"nach 1. Welle",
      month == "09" ~"nach 1. Welle",
      month == "10" ~"2. Welle",
      month == "11" ~"2. Welle",
      month == "12" ~"2. Welle"))

zeitungen <-zeitungen %>% 
  mutate(zeitpunkt_corona = ifelse(zeitpunkt == "vor 1. Welle", "Vor", "Nach")
  )

Rubriken: Corona vs. Andere Artikel für alle Zeitungen und Zeitungen einzeln

Es soll gezeigt werden, wann Corona die Berichterstattung dominiert hat und zu welchen Zeitpunkten sich die Berichterstattung wieder normalisiert hat.

# Im Zeitungstext nach "Corona" und "Covid" suchen --> 2 Kategorien bilden
zeitungen <- zeitungen %>%
  mutate(corona_thema = ifelse(grepl("Corona", tx)==T, "Corona", 
                               ifelse(grepl("Covid", tx)==T, "Corona", 
                                      "Anderes")))
#nach Tag und Corona gruppieren und Anteil berechnen
zeitungen_tag <- zeitungen %>%
  group_by(pubDateTime)%>%
  count()

zeitungen_corona_tag <- zeitungen %>%
  group_by(pubDateTime, corona_thema)%>%
  count()

zeitungen_corona_tag <- left_join(zeitungen_corona_tag, zeitungen_tag, 
                                    by=c("pubDateTime"))

zeitungen_corona_tag <- zeitungen_corona_tag %>%
  mutate(anteil = n.x/n.y*100)

zeitungen_corona_tag$pubDateTime <- as.Date(zeitungen_corona_tag$pubDateTime)

#Visualisierung Anteil Corona Artikel
zeitungen_corona_tag %>%
  filter(corona_thema == "Corona")%>%
  ggplot(aes(pubDateTime, anteil, fill = corona_thema))+
  geom_area(alpha=0.8 , size=.5, colour="#C5462B")+
  scale_x_date(date_breaks = '1 month', 
               labels = scales::date_format("%b"))+
  scale_y_continuous(labels = scales::percent_format(scale = 1), limits = c(0,100))+
  labs(x = "",
       y = "",
       title = "Corona dominiert während dem ersten Lockdown die Medienberichterstattung",
       subtitle = "Anteil Artikel mit Corona Nennungen")+
  scale_fill_manual(values = "#C5462B")+
  #texts
  annotate("text", x=as.Date("2020-02-22"), y=82, 
             label="Erster Coronafall Schweiz", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-03-13"), y=86, 
             label="Beginn 1. Lockdown", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-04-24"), y=84.5, 
             label="Erste Öffnungsschritte", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-06-19"), y=73, 
             label="Aufhebung Grossteil der Massnahmen", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-09-28"), y=76.5, 
             label="Zulassung Grossveranstaltungen", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-10-16"), y=85, 
             label="Erneute Massnahmen", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-12-09"), y=83.5, 
             label="Einführung Sperrstunde", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  annotate("text", x=as.Date("2020-12-19"), y=84, 
             label="Erneute Schliessungen", 
            colour = "black", angle = 90, size = 3, family = "Helvetica") +
  #lines
  geom_vline(xintercept=as.Date("2020-03-16"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-02-25"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-04-27"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-06-22"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-10-01"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-10-19"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-12-12"), size=.5, alpha = 0.6, linetype = "dashed") +
  geom_vline(xintercept=as.Date("2020-12-22"), size=.5, alpha = 0.6, linetype = "dashed") +
  guides(fill = FALSE)+
  theme_ipsum()+
  theme(plot.title = element_text(size = 16, family = "Helvetica"),
        plot.subtitle = element_text(size = 14, family = "Helvetica"),
        axis.text.x = element_text(family = "Helvetica", size = 8),
        axis.text.y = element_text(family = "Helvetica", size = 8))+
  ggsave("Corona_1.png", height = 6, width = 11)

Klassifizierungen: welche Themen haben durch Corona abgenommen & Anteil Corona Nennungen im Thema

Welche Themen haben durch Corona abgenommen und wie oft kommt Corona vor in den Artikeln:

#relevante Themenbereiche auswählen, nach monat gruppieren
zeitungen_thema<- zeitungen %>%
  filter(selectsclass %in% c("EU_Europa", "LabourMarket", "Agriculture", "Economy", "Finances_Taxes", 
                       "InternationalRelations",
                       "Law_Order", "Education_Culture", "Environment_Energy", "Immigration_Asylum", 
                       "Public_Health",
                       "GenderIssues_Discrimination", "PublicHealth"))%>%
  group_by(month) %>%
  count()
#relevante themenbereiche auswählen, nach monat und themenbereiche gruppieren
zeitungen_thema_2<- zeitungen %>%
  filter(selectsclass %in% c("EU_Europa", "LabourMarket", "Agriculture", "Economy", "Finances_Taxes", 
                       "InternationalRelations",
                       "Law_Order", "Education_Culture", "Environment_Energy", "Immigration_Asylum", 
                       "Public_Health",
                       "GenderIssues_Discrimination", "PublicHealth"))%>%
  group_by(month, selectsclass, corona_thema)%>%
  count()

zeitungen_thema_2 <- left_join(zeitungen_thema, zeitungen_thema_2, 
                                    by=c("month"))
#Themenbereiche übersetzen
zeitungen_thema_2$selectsclass <- factor(zeitungen_thema_2$selectsclass, 
                                         levels = c("LabourMarket", "Education_Culture", 
                                                    "EU_Europa", "Finances_Taxes",
                                                    "GenderIssues_Discrimination", 
                                                    "PublicHealth", "Immigration_Asylum",
                                                    "InternationalRelations", "Agriculture", "Law_Order", 
                                                    "Environment_Energy", "Economy"), 
                                         labels = c("Arbeitsmarkt", "Bildung & Kultur", 
                                                    "EU & Europa", "Finanzen & Steuern", 
                                                    "Geschlechterthemen & Diskriminierung",
                                                    "Gesundheitswesen", "Immigration & Asyl", 
                                                    "Internationale Beziehungen", 
                                                    "Landwirtschaft", "Recht & Ordnung", 
                                                    "Umwelt & Energie", "Wirtschaft"))

#Visualisierung

zeitungen_thema_2 %>%
  mutate(anteil = n.y/n.x*100)%>%
  ggplot(aes(fill=corona_thema, y=anteil, x=factor(month, labels = c("Jan", "Feb", "Mar",
                                                                     "Apr", "Mai", "Jun",
                                                                     "Jul", "Aug", "Sep",
                                                                     "Okt", "Nov", "Dez")))) + 
    geom_bar(position="stack", stat="identity", alpha = 0.8) +
    scale_fill_manual(values = c("#FBB7A9", "#C5462B"), labels = c("Ohne Corona Nennungen", 
                                                                   "Corona Nennungen")) +
    scale_y_continuous(labels = scales::percent_format(scale = 1))+
    labs(title = "Die verschiedenen Themebereiche sind unterschiedlich stark betroffen", x = "", y = "",
         subtitle = "Anteil Artikel pro Thema") +
    facet_wrap(~selectsclass, ncol = 3, scales = "free") +
    theme_ipsum() +
    theme(legend.position='top', 
        legend.justification='left',
        legend.direction='horizontal', 
        legend.title = element_blank(),
        legend.text = element_text(size = 12, family = "Helvetica"),
        plot.title = element_text(size = 16, family = "Helvetica"),
        plot.subtitle = element_text(size = 14, family = "Helvetica"),
        axis.text.x = element_text(family = "Helvetica", size = 8),
        axis.text.y = element_text(family = "Helvetica", size = 8),
        strip.text.x = element_text(family = "Helvetica"))+
  ggsave("Corona_2.png", height = 10, width = 12)

Textanalyse

Wie hat sich die Berichterstattung in einem bestimmten Thema verändert durch Corona? Welche Schlagwörter werden gebraucht vor Corona, während Corona?

labour

#Artikel zu Arbeitsmarkt und vor corona filtern
zeitungen_labour_vor <- zeitungen %>%
  filter(selectsclass == "LabourMarket",
         zeitpunkt == "vor 1. Welle")%>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)

#corpus erstellen
zeitungen_corpus_labour_vor <- corpus(zeitungen_labour_vor,text_field="tx")

#in features teilen, entfernen von irrelevanten Features
zeitungen_corpus_labour_tokens_vor <- zeitungen_corpus_labour_vor %>%
  tokens("word", remove_punct = T, 
                                         remove_numbers = T,
                                         remove_symbols=T) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()


zeitungen_corpus_labour_dfm_vor <- dfm(zeitungen_corpus_labour_tokens_vor) %>% 
  dfm_wordstem(language="de")

zeitungen_corpus_labour_dfm_trimmed_vor <- zeitungen_corpus_labour_dfm_vor %>%
  dfm_trim(min_docfreq=0.001,max_docfreq=0.55,docfreq_type="prop",verbose=T)


#topic model

topic.count <- 15 # anzahl topics

# Cstm rechnen
zeitungen_corpus_labour_dfm_trimmed_vor_stm <- convert(zeitungen_corpus_labour_dfm_trimmed_vor, to = "stm")

zeitungen_corpus_labour_dfm_trimmed_vor_model_stm <- stm(
  zeitungen_corpus_labour_dfm_trimmed_vor_stm$documents,
  zeitungen_corpus_labour_dfm_trimmed_vor_stm$vocab,
  K = topic.count,
  data = zeitungen_corpus_labour_dfm_trimmed_vor_stm$meta,
  init.type = "Spectral"
)


as.data.frame(t(labelTopics(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, n = 10)$prob))
#Anteile für jeden Themenbereich
plot(
  zeitungen_corpus_labour_dfm_trimmed_vor_model_stm,
  type = "summary",
  text.cex = 0.5,
  main = "STM topic shares",
  xlab = "Share estimation"
)

#filtern für Arbeitsmarkt und 1. Welle
zeitungen_labour_nach <- zeitungen %>%
  filter(selectsclass == "LabourMarket",
         zeitpunkt == "1. Welle")%>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)

#corpus erstellen
zeitungen_corpus_labour_nach <- corpus(zeitungen_labour_nach,text_field="tx")

#in features teilen, entfernen von irrelevanten Features
zeitungen_corpus_labour_tokens_nach <- zeitungen_corpus_labour_nach %>%
  tokens("word", remove_punct = T, 
                                         remove_numbers = T,
                                         remove_symbols=T) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()


zeitungen_corpus_labour_dfm_nach <- dfm(zeitungen_corpus_labour_tokens_nach) %>% 
  dfm_wordstem(language="de")

zeitungen_corpus_labour_dfm_trimmed_nach <- zeitungen_corpus_labour_dfm_nach %>%
  dfm_trim(min_docfreq=0.001,max_docfreq=0.55,docfreq_type="prop",verbose=T)



#topic model

topic.count <- 15 # Anzahl topics

#stm rechnen
zeitungen_corpus_labour_dfm_trimmed_nach_stm <- convert(zeitungen_corpus_labour_dfm_trimmed_nach, to = "stm")

zeitungen_corpus_labour_dfm_trimmed_nach_model_stm <- stm(
  zeitungen_corpus_labour_dfm_trimmed_nach_stm$documents,
  zeitungen_corpus_labour_dfm_trimmed_nach_stm$vocab,
  K = topic.count,
  data = zeitungen_corpus_labour_dfm_trimmed_nach_stm$meta,
  init.type = "Spectral"
)


as.data.frame(t(labelTopics(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, n = 10)$prob))
#anteile für jeden Themenbereich
plot(
  zeitungen_corpus_labour_dfm_trimmed_nach_model_stm,
  type = "summary",
  text.cex = 0.5,
  main = "STM topic shares",
  xlab = "Share estimation"
)

#exrahieren beta and gamma
labour_beta_nach <- tidy(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                         matrix = "beta")
labour_gamma_nach <- tidy(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                          matrix = "gamma")

labour_beta_vor <- tidy(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                        matrix = "beta")
labour_gamma_vor <- tidy(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                         matrix = "gamma")


#schlagwörter pro thema 
top_terms_labour_nach <- labour_beta_nach %>%
  arrange(beta)%>%
  group_by(topic)%>%
  top_n(4, beta)%>%
  arrange(-beta)%>%
  select(topic, term)%>%
  summarise(terms = list(term))%>%
  mutate(terms = map(terms, paste, collapse = ", "))%>%
  unnest()
#anteile pro thema
gamma_terms_labour_nach <- labour_gamma_nach %>%
  group_by(topic)%>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma))%>%
  left_join(top_terms_labour_nach, by = "topic")%>%
  top_n(5, gamma)%>%
  mutate(topic = 1:5,
         time = "1. Welle")

#schlagwörter pro thema
top_terms_labour_vor <- labour_beta_vor %>%
  arrange(beta)%>%
  group_by(topic)%>%
  top_n(4, beta)%>%
  arrange(-beta)%>%
  select(topic, term)%>%
  summarise(terms = list(term))%>%
  mutate(terms = map(terms, paste, collapse = ", "))%>%
  unnest()
#anteile pro thema
gamma_terms_labour_vor <- labour_gamma_vor %>%
  group_by(topic)%>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma))%>%
  left_join(top_terms_labour_vor, by = "topic")%>%
  top_n(5, gamma)%>%
  mutate(topic = 1:5,
         time = "Vor Corona")
  
#schlagwörter und anteile zusammenfügen
terms_topics_labour <- bind_rows(gamma_terms_labour_vor, gamma_terms_labour_nach)

terms_topics_labour$topic <- as.numeric(terms_topics_labour$topic)
#Beispielsartikel für jedes topic
thoughts2_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_labour_nach$tx, 206), 
                          n = 2, topics = 2)$docs[[1]]

thoughts2_nach <- thoughts2_nach[1]

thoughts14_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_labour_nach$tx, 162), 
                          n = 2, topics = 14)$docs[[1]]

thoughts14_nach <- thoughts14_nach[1]

thoughts1_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_labour_nach$tx, 225), 
                          n = 2, topics = 1)$docs[[1]]

thoughts1_nach <- thoughts1_nach[1]

thoughts11_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_labour_nach$tx, 152), 
                          n = 4, topics = 11)$docs[[1]]

thoughts11_nach <- thoughts11_nach[4]

thoughts15_nach <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_labour_nach$tx, 290), 
                          n = 2, topics = 15)$docs[[1]]

thoughts15_nach <- thoughts15_nach[1]

thoughts5_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_labour_vor$tx, 257), 
                          n = 2, topics = 5)$docs[[1]]

thoughts5_vor <- thoughts5_vor[1]

thoughts1_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_labour_vor$tx, 230), 
                          n = 2, topics = 1)$docs[[1]]

thoughts1_vor <- thoughts1_vor[1]

thoughts4_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_labour_vor$tx, 133), 
                          n = 2, topics = 4)$docs[[1]]

thoughts4_vor <- thoughts4_vor[1]

thoughts7_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_labour_vor$tx, 101), 
                          n = 2, topics = 7)$docs[[1]]

thoughts7_vor <- thoughts7_vor[2]

thoughts13_vor <- findThoughts(zeitungen_corpus_labour_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_labour_vor$tx, 157), 
                          n = 2, topics = 13)$docs[[1]]

thoughts13_vor <- thoughts13_vor[1]


#dataframe der beispielsartikel erstellen
thoughts <- c(thoughts2_nach, thoughts14_nach, thoughts1_nach, thoughts11_nach, 
              thoughts15_nach, thoughts5_vor,
              thoughts1_vor, thoughts4_vor, thoughts7_vor, thoughts13_vor)

topic <- c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5)

time <- c("1. Welle", "1. Welle", "1. Welle", "1. Welle", "1. Welle", 
          "Vor Corona", "Vor Corona","Vor Corona", "Vor Corona", "Vor Corona")

labour_thoughts <- data.frame(thoughts, topic, time)
#tabelle erstellen mit Infos
labour_all <- full_join(terms_topics_labour, labour_thoughts, by = c("topic", "time"))%>%
  mutate(gamma = percent(gamma))

colnames(labour_all) <- c("Topic", "Topic-Anteil", "Topic Schlagworte", 
                      "Zeitpunkt", "Ausschnitt aus Zeitungsartikel zum Topic")

formattable(labour_all, align = "l",
            list('Zeitpunkt' =formatter("span",
                                 style = x ~ style(color = ifelse(x == "1. Welle", 
                                                                  "#C5462B", "#FBB7A9")))))%>%
  as.htmlwidget() %>%
  htmlwidgets::saveWidget(file="example.html")

EU & Europa

#filtern für eu und vor corona
zeitungen_eu_vor <- zeitungen %>%
  filter(selectsclass == "EU_Europa",
         zeitpunkt == "vor 1. Welle")%>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)

#corpus erstellen
zeitungen_corpus_eu_vor <- corpus(zeitungen_eu_vor,text_field="tx")

#in features teilen, entfernen von irrelevanten Features
zeitungen_corpus_eu_tokens_vor <- zeitungen_corpus_eu_vor %>%
  tokens("word", remove_punct = T, 
                                         remove_numbers = T,
                                         remove_symbols=T) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()


zeitungen_corpus_eu_dfm_vor <- dfm(zeitungen_corpus_eu_tokens_vor) %>% 
  dfm_wordstem(language="de")

zeitungen_corpus_eu_dfm_trimmed_vor <- zeitungen_corpus_eu_dfm_vor %>%
  dfm_trim(min_docfreq=0.001,max_docfreq=0.55,docfreq_type="prop",verbose=T)



#topic model

topic.count <- 15 # anzahl topics zuweisen

# rechnen STM 
zeitungen_corpus_eu_dfm_trimmed_vor_stm <- convert(zeitungen_corpus_eu_dfm_trimmed_vor, to = "stm")

zeitungen_corpus_eu_dfm_trimmed_vor_model_stm <- stm(
  zeitungen_corpus_eu_dfm_trimmed_vor_stm$documents,
  zeitungen_corpus_eu_dfm_trimmed_vor_stm$vocab,
  K = topic.count,
  data = zeitungen_corpus_eu_dfm_trimmed_vor_stm$meta,
  init.type = "Spectral"
)


as.data.frame(t(labelTopics(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, n = 10)$prob))
#anteile für jeden Themenbereich
plot(
  zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
  type = "summary",
  text.cex = 0.5,
  main = "STM topic shares",
  xlab = "Share estimation"
)

#filtern für eu und 1. Welle
zeitungen_eu_nach <- zeitungen %>%
  filter(selectsclass == "EU_Europa",
         zeitpunkt == "1. Welle")%>%
  select(selectsclass, zeitpunkt, month, pubDateTime, so, tx)

#corpus erstellen
zeitungen_corpus_eu_nach <- corpus(zeitungen_eu_nach,text_field="tx")

#in features teilen, entfernen von irrelevanten Features
zeitungen_corpus_eu_tokens_nach <- zeitungen_corpus_eu_nach %>%
  tokens("word", remove_punct = T, 
                                         remove_numbers = T,
                                         remove_symbols=T) %>%
  tokens_remove(stopwords('de')) %>%
  tokens_tolower()


zeitungen_corpus_eu_dfm_nach <- dfm(zeitungen_corpus_eu_tokens_nach) %>% 
  dfm_wordstem(language="de")

zeitungen_corpus_eu_dfm_trimmed_nach <- zeitungen_corpus_eu_dfm_nach %>%
  dfm_trim(min_docfreq=0.001,max_docfreq=0.55,docfreq_type="prop",verbose=T)



#topic model

topic.count <- 15 # anzahl topics zuweisen

# rechnene STM 
zeitungen_corpus_eu_dfm_trimmed_nach_stm <- convert(zeitungen_corpus_eu_dfm_trimmed_nach, to = "stm")

zeitungen_corpus_eu_dfm_trimmed_nach_model_stm <- stm(
  zeitungen_corpus_eu_dfm_trimmed_nach_stm$documents,
  zeitungen_corpus_eu_dfm_trimmed_nach_stm$vocab,
  K = topic.count,
  data = zeitungen_corpus_eu_dfm_trimmed_nach_stm$meta,
  init.type = "Spectral"
)

eu_topics_nach <- tidy(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, matrix = "beta")

eu_topics_terms <- eu_topics_nach %>%
  group_by(topic)%>%
  top_n(10, beta)%>%
  ungroup()%>%
  arrange(topic, -beta)




as.data.frame(t(labelTopics(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, n = 10)$prob))
#anteile für jeden Themenbereich
plot(
  zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
  type = "summary",
  text.cex = 0.5,
  main = "STM topic shares",
  xlab = "Share estimation"
)

#extrahieren beta and gamma
eu_beta_nach <- tidy(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm,
                     matrix = "beta")
eu_gamma_nach <- tidy(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, 
                      matrix = "gamma")

eu_beta_vor <- tidy(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm,
                    matrix = "beta")
eu_gamma_vor <- tidy(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, 
                     matrix = "gamma")


#schlagwörter pro thema 
top_terms_eu_nach <- eu_beta_nach %>%
  arrange(beta)%>%
  group_by(topic)%>%
  top_n(4, beta)%>%
  arrange(-beta)%>%
  select(topic, term)%>%
  summarise(terms = list(term))%>%
  mutate(terms = map(terms, paste, collapse = ", "))%>%
  unnest()
#anteile pro thema 
gamma_terms_eu_nach <- eu_gamma_nach %>%
  group_by(topic)%>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma))%>%
  left_join(top_terms_eu_nach, by = "topic")%>%
  top_n(5, gamma)%>%
  mutate(topic = 1:5,
         time = "1. Welle")

#schlagwörter pro thema 
top_terms_eu_vor <- eu_beta_vor %>%
  arrange(beta)%>%
  group_by(topic)%>%
  top_n(4, beta)%>%
  arrange(-beta)%>%
  select(topic, term)%>%
  summarise(terms = list(term))%>%
  mutate(terms = map(terms, paste, collapse = ", "))%>%
  unnest()
#anteile pro thema 
gamma_terms_eu_vor <- eu_gamma_vor %>%
  group_by(topic)%>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma))%>%
  left_join(top_terms_eu_vor, by = "topic")%>%
  top_n(5, gamma)%>%
  mutate(topic = 1:5,
         time = "Vor Corona")
  
#schlagwörter und anteile zusammenfügen
terms_topics_eu <- bind_rows(gamma_terms_eu_vor, gamma_terms_eu_nach)

terms_topics_eu$topic <- as.numeric(terms_topics_eu$topic)
#Beispielsartikel für jedes topic
thoughts2_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_eu_nach$tx, 221), 
                          n = 2, topics = 9)$docs[[1]]

thoughts2_nach_eu <- thoughts2_nach_eu[1]

thoughts14_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_eu_nach$tx, 212), 
                          n = 8, topics = 12)$docs[[1]]

thoughts14_nach_eu <- thoughts14_nach_eu[1]

thoughts1_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_eu_nach$tx, 200), 
                          n = 9, topics = 8)$docs[[1]]

thoughts1_nach_eu <- thoughts1_nach_eu[9]

thoughts11_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_eu_nach$tx, 229), 
                          n = 2, topics = 7)$docs[[1]]

thoughts11_nach_eu <- thoughts11_nach_eu[1]

thoughts15_nach_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_nach_model_stm, 
                          texts = strtrim(zeitungen_eu_nach$tx, 147), 
                          n = 2, topics = 6)$docs[[1]]

thoughts15_nach_eu <- thoughts15_nach_eu[1]

thoughts5_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_eu_vor$tx, 229), 
                          n = 2, topics = 12)$docs[[1]]

thoughts5_vor_eu <- thoughts5_vor_eu[1]

thoughts1_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_eu_vor$tx, 170), 
                          n = 2, topics = 8)$docs[[1]]

thoughts1_vor_eu <- thoughts1_vor_eu[1]

thoughts4_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_eu_vor$tx, 114), 
                          n = 2, topics = 5)$docs[[1]]

thoughts4_vor_eu <- thoughts4_vor_eu[1]

thoughts7_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_eu_vor$tx, 118), 
                          n = 2, topics = 10)$docs[[1]]

thoughts7_vor_eu <- thoughts7_vor_eu[1]

thoughts13_vor_eu <- findThoughts(zeitungen_corpus_eu_dfm_trimmed_vor_model_stm, 
                          texts = strtrim(zeitungen_eu_vor$tx, 92), 
                          n = 3, topics = 4)$docs[[1]]

thoughts13_vor_eu <- thoughts13_vor_eu[3]


#dataframe mit beispielsartikeln erstellen
thoughts <- c(thoughts2_nach_eu, thoughts14_nach_eu, thoughts1_nach_eu, 
              thoughts11_nach_eu, thoughts15_nach_eu, thoughts5_vor_eu,
              thoughts1_vor_eu, thoughts4_vor_eu, thoughts7_vor_eu, thoughts13_vor_eu)

topic <- c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5)

time <- c("1. Welle", "1. Welle", "1. Welle", "1. Welle", "1. Welle", 
          "Vor Corona", "Vor Corona","Vor Corona", "Vor Corona", "Vor Corona")

eu_thoughts <- data.frame(thoughts, topic, time)
#tabelle erstellen mit Infos (beispielsartikel und schlagwörter und anteile)
eu_all <- full_join(terms_topics_eu, eu_thoughts, by = c("topic", "time"))%>%
  mutate(gamma = percent(gamma))

colnames(eu_all) <- c("Topic", "Topic-Anteil", "Topic Schlagworte", 
                      "Zeitpunkt", "Ausschnitt aus Zeitungsartikel zum Topic")


formattable(eu_all, align = "l",
            list('Zeitpunkt' =formatter("span",
                                 style = x ~ style(color = ifelse(x == "1. Welle", 
                                                                  "#C5462B", "#FBB7A9")))))%>%
  as.htmlwidget() %>%
  htmlwidgets::saveWidget(file="example2.html")