setwd("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\1. R-Code\\1. R-Code") library(foreign) set.seed(123456) #------------------------------------------------------------------------------------------------------- ## TWITTER FOLLOWERS SCRAPEN #------------------------------------------------------------------------------------------------------- library(twitteR) library(wordcloud) library(tm) library(NLP) options(stringsAsFactors=F) # get rid of factors ## LOG IN TWITTER API #necessary file for Windows download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem") ## TWITTER ANMELDUNG #------------------------------------------------------------------------------------------------------ #to get your consumerKey and consumerSecret see the twitteR documentation for instructions consumer_key <- '????' #------------> NEU consumer_secret <- '????' #------------> NEU access_token <- '????-????' #------------> NEU access_secret <- '????' #------------> NEU setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) ## ANZAHL FOLLOWERS PRO PARTEI #----------------------------------------------------------------------------------------------------- # Liste von Accounts (screen names) accounts <- c("SVPch", "spschweiz", "FDP_Liberalen", "CVP_PDC", "GrueneCH", "BDPSchweiz", "evppev", "grunliberale", "LEGAdeiTicinesi") # Einen leeren data frame als container definieren results.party.followers <- data.frame() #--------------- (SCRAPEN WÄRE AUCH HIER DIREKT MÖGLICH) # Ueber account-Liste loopen for (account in accounts) { user <- getUser(account) # Twitter-Anfrage fuer ein Userobjekt print(user) fr <- user$getFriendsCount() # Anzahl Freunde extrahieren fo <- user$getFollowersCount() # Anzahl Followers extrahieren get # Resultate zusammenfuegen results.party.followers <- rbind(results.party.followers, c(account, fr, fo)) } results.party.followers <- setNames(results.party.followers, c("Party", "Follows", "Followers")) write.csv(results.party.followers, file = "party_followers.csv") ## Dieser Datensatz hilft die Anzahl Followers pro Partei zu sehen #---------------------------------------------------------------------------------------------------- ## SCRAPING FOLLOWER INFORMATIONEN #---------------------------------------------------------------------------------------------------- library(data.table) # Anfrage fuer ein Benutzerobjekt user <- getUser("FDP_Liberalen") #------------> NEU BEI ANDERER PARTEI # Followers extrahieren und Anzahl fetstellen fo <- user$getFollowers(retryOnRateLimit=180) FDP_followers <- rbindlist(lapply(fo, as.data.frame)) write.csv(FDP_followers, "FDP.Followers.csv") #------------> NEU BEI ANDERER PARTEI #----------------------------------------------------------------------------------------------------- # DATENSATZ ERSTELLEN #----------------------------------------------------------------------------------------------------- library(dplyr) cvp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\CVP.Followers.csv",stringsAsFactors = FALSE) bdp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\BDP.Followers1.csv",stringsAsFactors = FALSE) evp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\EVP.Followers1.csv",stringsAsFactors = FALSE) lega <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\Lega.Followers_neu.csv",stringsAsFactors = FALSE) gruene <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\GRUENE.Followers.csv",stringsAsFactors = FALSE) glp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\GLP.Followers.csv",stringsAsFactors = FALSE) svp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\SVP.Followers_neu.csv",stringsAsFactors = FALSE) sp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\spschweiz.Followers_neu.csv",stringsAsFactors = FALSE) fdp <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\3. Daten Follower Parteien\\FDP.Followers.csv" ,stringsAsFactors = FALSE) # Methavariable: PARTEI cvp$party <- "CVP" bdp$party <- "BDP" evp$party <- "EVP" lega$party <- "LEGA" gruene$party <- "GRUENE" glp$party <- "GLP" svp$party <- "SVP" sp$party <- "SP" fdp$party <- "FDP" head(bdp) head(fdp) # DATA FRAME df <- rbind(cvp, bdp, evp, lega, gruene, glp, svp, sp, fdp) df <- df %>% select(X, users, FollowersCount, FriendsCount, Name, ID, StatusesCount,FavoritesCount, Location, Description, Language, ProfileImageUrl, party) write.csv(df, "C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\1. Daten Komplett\\Followers_neu.csv") ###################################################################################################### #----------------------------------------------------------------------------------------------------- # Start Daten Analyse #----------------------------------------------------------------------------------------------------- ###################################################################################################### library(foreign) library(tidytext) library(dplyr) library(stringr) library(dplyr) df <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\1. Daten Komplett\\Followers_neu.csv",encoding = "utf-8") df6 <- df %>% select(users, Name, party, Description, Language) test <- df6$Description test1 <- str_replace_all(test, "(?<=\\<).*?(?=\\>)", "") test1[14] # Text Cleaning german text_vector_clean <- str_replace_all(test1,"", "") text_vector_clean1 <- str_replace_all(text_vector_clean, "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&|<|>|RT|https|<[A-Za-z\\d]>|\\|and", "") text_vector_clean2 <- str_replace_all(text_vector_clean1,"[><[:punct:]]", " ") text_vector_clean3 <- str_replace_all(text_vector_clean2,"[^[:alnum:]]", " ") text_vector_clean4 <- gsub("[[:space:]]", " ", text_vector_clean3) text_vector_clean5 <- gsub("\r?\n|\r", " ", text_vector_clean4) library(stringr) text_vector_clean6 <- str_replace(gsub("\\s+", " ", str_trim(text_vector_clean5)), "B", "b") # Delete empty chars text_vector_clean7 <- text_vector_clean6[-which(text_vector_clean6=="")] #Test text_vector_clean6[14] text_vector_clean7[14] x <- sample(text_vector_clean7, 13) x # Ready for Translation text_vector_clean7 ######################################################################################################### #----------------------------------------------------------------------------------------------------- # TRANSLATE THE DESCRIPTIONS IN ENGLISH #----------------------------------------------------------------------------------------------------- ###################################################################################################### library(RSelenium) library(foreach) library(wdman) library(dplyr) # open selenium server and get port information selServ <- selenium(jvmargs = c("-Dwebdriver.chrome.verboseLogging=true"), port = 4567L) selServ$log() # evaluate remDr <- remoteDriver(port = 4567L, browserName = "chrome") # driver remDr$open() # open headless chrome remDr$navigate('https://www.deepl.com/translator') # navigate to deepL ip <- remDr$findElement(using='css', '.lmt__source_textarea') # get input container as element ip1 <- remDr$findElement(using='css', '.lmt__language_select--source') ip2 <- remDr$findElement(using = 'css', '.lmt__language_select.lmt__language_select--target') ip2$clickElement() Sys.sleep(3) option <- remDr$findElement("css", "#dl_translator .lmt__language_select.lmt__language_select--target.lmt__language_select--open.lmt__language_select--open_2 li[dl-value='EN']") option$clickElement() Sys.sleep(3) #empty character vector to fetch results translated <- c() t1 <- text_vector_clean7 # loop for(i in t2){ print(paste('Iteration:',i,'/ ', length(t1))) # for every tenth iteration we wanna refresh the browser if(is.integer(i/10)){ remDr$refresh() Sys.sleep(10) } start_t <- Sys.time() # try translating ip$clickElement() ip$sendKeysToElement(list(t1[i])) # send the snippet to the input container Sys.sleep(8) tes<- remDr$findElement(using='css selector',".lmt__language_select--source")$getElementText() if(any(tes=='?bersetze Englisch (erkannt)')) { ip$clearElement() # clear the input container (no need to reload the page) # save the batch translated <- c(translated, t1[i]) stop_t <- Sys.time() print(paste('Timediff:', stop_t - start_t)) next(i) } {Sys.sleep(5) # inject js: extract the value out of the output container clsf <- remDr$executeScript(script = 'return $(".lmt__target_textarea").val();', args = list("dummy"))[[1]] # if that value (transl.) contains '...' (loading) or no text, then we wait and fetch the value again if(any(c('[...]', "") %in% clsf)) { Sys.sleep(10) clsf <- remDr$executeScript(script = 'return $(".lmt__target_textarea").val();', args = list("dummy"))[[1]] } ip$clearElement() # clear the input container (no need to reload the page) # save the batch translated <- c(translated, clsf) stop_t <- Sys.time() print(paste('Timediff:', stop_t - start_t))} } translated translated1 <- as.data.frame(translated) write.csv(translated1, "Englische_Uebersetzung.csv") # close browser remDr$close() # stop the selenium server selServ$stop() ########################################################################################### #------------------------------------------------------------------------------------------- #PREPROCESSING #------------------------------------------------------------------------------------------- ############################################################################################ library(foreign) library(tidytext) library(dplyr) library(stringr) library(dplyr) library(stringr) df_en <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\1. Daten Komplett\\Englische Uebersetzung1.csv",encoding = "utf-8", stringsAsFactors = F) df <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\1. Daten Komplett\\Followers_neu.csv",encoding = "utf-8", stringsAsFactors = F) #---------------------------------------------------------------------------- df6 <- df %>% select(users, Name, party, Description, Language) df_neu <- df6 df_neu$Description <- str_replace_all(df_neu$Description, "(?<=\\<).*?(?=\\>)", "") df_neu$Description <- str_replace_all(df_neu$Description ,"", "") df_neu$Description <- str_replace_all(df_neu$Description , "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&|<|>|RT|https|<[A-Za-z\\d]>|\\|and", "") df_neu$Description <- str_replace_all(df_neu$Description ,"[><[:punct:]]", " ") df_neu$Description <- str_replace_all(df_neu$Description ,"[^[:alnum:]]", " ") df_neu$Description <- gsub("[[:space:]]", " ", df_neu$Description ) df_neu$Description <- gsub("\r?\n|\r", " ", df_neu$Description ) df_neu$Description <- str_replace(gsub("\\s+", " ", str_trim(df_neu$Description )), "B", "b") df_neu1 <- df_neu[!(is.na(df_neu$Description) | df_neu$Description==""), ] df_stem <- as.data.frame(cbind(df_neu1,df_en$translated)) colnames(df_stem)[6] <- "translated" head(df_stem) write.csv(df_stem, "C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\1. Daten Komplett\\topic_data_Neu(final).csv") ########################################################################################### #------------------------------------------------------------------------------------------- #CORPUS #------------------------------------------------------------------------------------------- ############################################################################################ ## Dataset ready df_stem <- read.csv("C:\\Users\\mauro\\Documents\\Master Semester 2\\Datenjournalismus\\Blog\\2. Daten\\1. Daten Komplett\\topic_data_Neu(final).csv" ,encoding = "utf-8", stringsAsFactors = F) ####### STM #basic settings library(RSQLite) library(tm) library(stm) library(parallel) library(ggplot2) library(car) library(NLP) library(dplyr) library(tidytext) library(ggplot2) library(car) options(stringsAsFactors = F) set.seed(123456) lang <- "english" #------------------------------------------------------------------------------------------- head(df_stem) docs <- df_stem %>% select(users, party, translated) docs <- docs %>% filter(!docs$translated == "") docs$id <- rownames(docs) # sample a subset (in order to make demo faster) #docs <- docs[sample(nrow(docs), 6000), ] #remove Umlaute docs$translated <- sapply(docs$translated , function(x) iconv(x, "latin1", "ASCII", sub="")) enc2utf8(docs$translated) #preprocess texts myCorpus <- Corpus(VectorSource(docs$translated)) myCorpus <- tm_map(myCorpus, tolower) myCorpus <- tm_map(myCorpus, removePunctuation) myCorpus <- tm_map(myCorpus, removeNumbers) myCorpus <- tm_map(myCorpus, stripWhitespace) myCorpus <- tm_map(myCorpus, removeWords, c("twitter", "tweet", "tweets", "swiss", "switzerland", "im", "account", "ch", "de")) for (i in 1:length(docs$translated)) { docs$txt[i] <- gsub("[[:space:]]", " ", myCorpus[[i]]) } #Save Data saveRDS(docs, "docs(Corpus-final_mit Parteikürzel).rds") #------------------------------------------------------------------------------------------- #generate a corpus for the stm corpus <- textProcessor(docs[,c("txt")], metadata=docs[,c("party", "users", "id")], stem=T, language=lang, removestopwords=T, lowercase=F, removenumbers=F, removepunctuation=F, customstopwords=NULL) corpus <- prepDocuments(corpus$documents, corpus$vocab, corpus$meta, lower.thresh = 3) writeLines(as.character(corpus), con="corpus.txt") saveRDS(corpus, "corpus.RDS") #Removing 5320 of 13645 terms (12508 of 260887 tokens) due to frequency #Removing 419 Documents with No Words #Your corpus now has 35543 documents, 8325 terms and 248379 tokens. ############################################################################################ #------------------------------------------------------------------------------------------- # STRUCTURAL TOPIC MODEL #------------------------------------------------------------------------------------------- ############################################################################################ # search for optimal number of topics (k) srchK <- searchK(corpus$documents, corpus$vocab, 4:20, verbose = TRUE, init.type = "Spectral", cores = 1) srchK srchK1 <- searchK(corpus$documents, corpus$vocab, c(25,45,65,85,100), verbose = TRUE, init.type = "Spectral", cores = 1) srchK1 df_searchK <- as.data.frame(rbind(srchK$results,srchK1$results)) write.csv(df_searchK, "df_searchK(final).csv") saveRDS(srchK, file = "SearchK(final).RDS") plot(srchK) stm::plot.searchK(srchK) help(exclusivity) help(semanticCoherence) ggplot_SearchK <- ggplot(df_searchK, aes(x=semcoh, y=exclus)) + geom_point(col="red", size=df_searchK$em.its*0.04)+ geom_text(label=paste(df_searchK$K, "Topics, Iterations:", df_searchK$em.its) , check_overlap = T, size=3)+ scale_x_continuous(expand = c(0.7,0)) + geom_smooth(method = 'loess', span=1, level=0)+ labs(x="Semcantic Coherence: Maximised when top words of one topic \ntend to co-occur in a document of the same topic", y="Exclusivity: Captures if the top words of one topic differ \nfrom the top words of other topics", title="SearchK: Best N of Topics") ggplot_SearchK #------------------------------------------------------------------------------------------ # MODEL 5, 8, 11 UND 17 WERDEN NÄHER BETRACHTET #------------------------------------------------------------------------------------------ # Fit stm with optimal k set.seed(123456) STM5 <- stm(corpus$documents, corpus$vocab, 5, prevalence =~party , content = NULL, data = corpus$meta, max.em.its = 500, verbose = TRUE, init.type = "Spectral") saveRDS(STM5, file="stm5.RDS") STM8 <- stm(corpus$documents, corpus$vocab, 8, prevalence =~party , content = NULL, data = corpus$meta, max.em.its = 500, verbose = TRUE, init.type = "Spectral") saveRDS(STM8, file="stm8.RDS") STM11 <- stm(corpus$documents, corpus$vocab, 11, prevalence =~party , content = NULL, data = corpus$meta, max.em.its = 500, verbose = TRUE, init.type = "Spectral") saveRDS(STM11, file="stm11.RDS") STM17 <- stm(corpus$documents, corpus$vocab, 17, prevalence =~party , content = NULL, data = corpus$meta, max.em.its = 500, verbose = TRUE, init.type = "Spectral") saveRDS(STM17, file="stm17.RDS") #### Topic Quality png("END_TopicQuality.png", width=850,height=850, type = c("windows")) par(mfrow=c(2,2)) topicQuality(STM5, corpus$documents, xlab = "Semantic Coherence", ylab = "Exclusivity", labels = 1:ncol(STM5$theta), M = 10) topicQuality(STM8, corpus$documents, xlab = "Semantic Coherence", ylab = "Exclusivity", labels = 1:ncol(STM8$theta), M = 10) topicQuality(STM11, corpus$documents, xlab = "Semantic Coherence", ylab = "Exclusivity", labels = 1:ncol(STM11$theta), M = 10) topicQuality(STM17, corpus$documents, xlab = "Semantic Coherence", ylab = "Exclusivity", labels = 1:ncol(STM17$theta), M = 10) dev.off() par(mfrow=c(1,1)) # MODEL 11 HAT BESTE TOPIC QUALITY labelTopics(STM5, topics = 1:5, n = 10) labelTopics(STM8, topics = 1:8, n = 10) labelTopics(STM11, topics = 1:11, n = 10, frexweight = 0.7) labelTopics(STM17, topics = 1:17, n = 5) windows() plot.STM(STM11, type = "labels", labeltype = "prob", text.cex = 0.8) dev.off() # Find best text for topic myCorpus1 <- Corpus(VectorSource(docs$txt)) myCorpus1 <- tm_map(myCorpus1, removeWords, stopwords('english')) myCorpus1 <- tm_map(myCorpus1, stemDocument) for (i in 1:length(docs$txt)) { docs$txt1[i] <- gsub("[[:space:]]", " ", myCorpus1[[i]]) } text <- merge(corpus$meta, docs, by="id") plot.STM(STM11,type = "labels", topics = 11, labeltype = "prob") findThoughts(STM11, texts = text$txt1, n = 30, topics = 11, meta = id)$docs[[1]] ################################################################################################## # topic descriptives labelTopics(STM11, topics = 1:11, n = 10) pal <- c("#1EA1F3","#FFAC1E", "#F65D23") png("END_TopicCloud.png", width=700,height=1900, type = c("windows")) par(mfrow=c(6,2)) #Politics cloud(STM11, topic = 1, max.words = 100, colors = pal, scale=c(4,1)) cloud(STM11, topic = 2, max.words = 100, colors = pal, scale=c(4,1)) cloud(STM11, topic = 3, max.words = 100, colors = pal, scale=c(4,1)) cloud(STM11, topic = 4, max.words = 100, colors = pal, scale=c(4,1)) cloud(STM11, topic = 5, max.words = 100, colors = pal, scale=c(4,1)) cloud(STM11, topic = 6, max.words = 20, colors = pal, scale=c(4,1)) cloud(STM11, topic = 7, max.words = 20, colors = pal, scale=c(4,1)) cloud(STM11, topic = 8, max.words = 40, colors = pal, scale=c(4,1)) cloud(STM11, topic = 9, max.words = 30, colors = pal, scale=c(4,1)) cloud(STM11, topic = 10, max.words = 50, colors = pal, scale=c(4,1)) # Online Medien cloud(STM11, topic = 11, max.words = 50, colors = pal, scale=c(4,1)) # Meinungen dev.off() par(mfrow=c(1,1)) help("wordcloud") ## estimates # topic proportions TopicNames <- c("1. POLITIK ","2. (JUNG-) JOURNALISMUS ", "3. LIFESTYLE ", "4. POLITIKWISSENSCHAFT ", "5. ENTREPRENEURSHIP ", "6. PROFILBESCHREIBUNG ", "7. LOVE & FRIENDS ", "8. VERBÄNDE & ORGANISATIONEN ", "9. BERUF & INNOVATION ", "10. SOCIAL MEDIA & MARKETING ", "11. KOMMUNIKATION & MEINUNG ") # Plot Props plot(STM11, type = 'summary', topics = 1:11, n=4, xlim = c(0,0.4), topic.names = TopicNames ) # Plot Labels png("END1_Labels.png", width=750,height=1000, type = c("windows")) plot.STM(STM11,type = "labels", topics = 1:11, labeltype = "prob", topic.names = TopicNames) dev.off() ######################################################################################## library(igraph) library(huge) pal <- c("#1EA1F3","#FFAC1E", "#F65D23") mod <- topicCorr(STM11, cutoff = 0.06) plot(mod, vlabels = TopicNames,vertex.color = "gray", vertex.label.cex = 1, vertex.label.color = "black", vertex.size = 10, xlim =c(-1.3,1.3), ylim=c(-0.3,0.3)) ?plot.topicCorr #------------------------------------------------------------------------------------------ ############################################################################################ ## COMPARSION BETWEEN PARTIES AND TOPICS ############################################################################################ #------------------------------------------------------------------------------------------ par(mfrow=c(1,1)) prep <- estimateEffect(1:11 ~ party, STM11, meta = corpus$meta) summary(prep) PartyNames <- c("CVP", "BDP", "EVP", "LEGA", "GRUENE", "GLP", "SVP", "SP", "FDP") png("END1_TopicComparsion.png", width=1000,height=1200, type = c("windows")) par(mfrow=c(3,2)) plot.estimateEffect(prep, covariate = "party", topics = 1, xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic: 1. POLITIK", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") plot.estimateEffect(prep, covariate = "party", topics = 8, xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic: 8. VERBÄNDE & ORGANISATIONENS", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") plot.estimateEffect(prep, covariate = "party", topics = 10,xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic 10. SOCIAL MEDIA & MARKETING", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") plot.estimateEffect(prep, covariate = "party", topics = 9, xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic: 9. BERUF & INNOVATION", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") plot.estimateEffect(prep, covariate = "party", topics = 3, xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic: 3. LIFESTYLE", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") plot.estimateEffect(prep, covariate = "party", topics = 7, xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic: 7. LOVE & FRIENDS", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") dev.off() par(mfrow=c(1,1)) plot.estimateEffect(prep, covariate = "party", topics = 7, xlim = c(0.04,0.17), labeltype = "custom", custom.labels = PartyNames, main = "Topic 2. (JUNG-) JOURNALISMUS ", xlab = "Erwartete Häufigkeit des Topics in den descriptions, aufgeteilt nach Partei") # DIFFERENCEs BETWEEN PARTIES plot(prep, "party", method = 'difference', cov.value1 = "SVP", topics = 1:11, cov.value2 = "SP", xlim = c(-0.042,0.042), ylim = c(1,11), main = "SP-descriptions verglichen mit SVP-descriptions", xlab = " In SP-Follower Profilen häufigere Themen <= | => In SVP-Follower Profilen häufigere Themen", labeltype = "custom", custom.labels = TopicNames) plot(prep, "party", method = 'difference', cov.value1 = "SVP", topics = 1:11, cov.value2 = "GRUENE", xlim = c(-0.045,0.045), ylim = c(1,11), main = "GRUENE-descriptions verglichen mit SVP-descriptions", xlab = "In GRUENE-Follower Profilen häufigere Themen <= | => In SVP-Follower Profilen häufigere Themen ", labeltype = "custom", custom.labels = TopicNames) ############################################################################################ #------------------------------------------------------------------------------------------- #WORDCLOUDS #------------------------------------------------------------------------------------------- ############################################################################################ library(RXKCD) library(wordcloud) library(RColorBrewer) library(grDevices) # Words which do not appear in the documents greater than 1: 8702 # A sample: sample(corpus$words.removed, 10) # 283 rows containing single Words removed length(corpus$docs.removed) # Word frequencies words <- docs %>% unnest_tokens(word, txt) %>% anti_join(stop_words) words_stop <- docs %>% unnest_tokens(word, txt) # Frequency Plots #----------------------------------------- # only single words: Problem not really reppresentative distinct_df <- words %>% group_by(party, word) %>% filter(!word %in% c("politics", "president", "councillor", "council", "journalist", "communication", "politics", "political", "news", "media")) df_plot_single <- distinct_df %>% group_by(party) %>% dplyr::count(word, sort = TRUE) %>% mutate(word = reorder(word, n))%>% mutate(total = sum(n)) df_plot_single$perc <- df_plot_single$n/df_plot_single$total result <- df_plot_single %>% group_by(party) %>% mutate(rank = rank(-perc, ties.method = "random")) %>% filter(rank <=40) #----------------------------------------- # All words: problem often same kategories df_plot1 <- words %>% group_by(party) %>% dplyr::count(word, sort = TRUE) %>% mutate(word = reorder(word, n))%>% mutate(total = sum(n)) df_plot1$perc <- df_plot1$n/df_plot1$total result <- df_plot1 %>% group_by(party) %>% mutate(rank = rank(-perc, ties.method = "random")) %>% filter(rank <=50) #---------------------------------------------------- #Top 50 Wörter allg. pal <- c("#1EA1F3","#FFAC1E", "#F65D23") words1 <- words words1$party <- NULL df_plot2 <- words1 %>% dplyr::count(word, sort = TRUE) %>% mutate(word = reorder(word, n))%>% mutate(total = sum(n)) result_alle <- df_plot2 %>% mutate(rank = rank(-n, ties.method = "random")) %>% filter(rank <=10) %>% ggplot(aes(word, n)) + geom_bar(stat = "identity", fill = "#1EA1F3") + ylab("") + xlab("")+ coord_flip() + theme_classic() + theme(legend.position=c(0.9, 0.15), axis.line.y=element_blank(), axis.line.x = element_blank(), axis.ticks.y=element_line(), axis.text.y = element_text(size=12, face='bold'), axis.text.x = element_text(size=12, face='bold', colour="darkgray"), axis.title.x=element_text(""), axis.ticks.x=element_blank(), panel.grid.major.x = element_line(colour="lightgray"), strip.text=element_text(face='bold'), strip.background=element_rect(fill='#eeeeee', color=NA), panel.spacing.x = unit(0.25, 'in'), panel.spacing.y = unit(0.25, 'in') ) result_alle #----------------------------------------- pal <- c("#1EA1F3","#FFAC1E", "#F65D23") png("wordcloud1.png", width=1000,height=850, type = c("windows")) par(mfrow=c(1,3)) wordcloud(result$word[result$party=="SVP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal,fixed.asp=T) wordcloud(result$word[result$party=="SP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal,fixed.asp=T) wordcloud(result$word[result$party=="FDP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal,fixed.asp=T) dev.off() png("wordcloud2.png", width=1000,height=850, type = c("windows")) par(mfrow=c(1,3)) wordcloud(result$word[result$party=="CVP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal) wordcloud(result$word[result$party=="GRUENE"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal) wordcloud(result$word[result$party=="GLP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal) dev.off() png("wordcloud3.png", width=1000,height=850, type = c("windows")) par(mfrow=c(1,3)) wordcloud(result$word[result$party=="BDP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal) wordcloud(result$word[result$party=="EVP"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal) wordcloud(result$word[result$party=="LEGA"],result$rank^-1, scale=c(4,1), random.order=F, rot.per=.3, colors=pal) dev.off() par(mfrow=c(1,1)) table(df_stem$party) png("wordcloud.png", width=1280,height=800) dev.off() # Find best followers #----------------------------------------- # POLITIK words_POLITIK <- words %>% filter(word %in% c("president", "zurich", "council", "canton")) %>% group_by(users) words_POLITIK$party <- NULL words_POLITIK$translated <- NULL words_POLITIK$id <- NULL words_POLITIK$txt1 <- NULL deduped.data <- unique( words_POLITIK[ , 1:2 ]) sort(table(deduped.data$users),decreasing=TRUE) #LIVESTYLE words_POLITIK <- words %>% filter(word %in% c("interest", "music", "sport", "life", "like", "culture", "travel")) %>% group_by(users) words_POLITIK$party <- NULL words_POLITIK$translated <- NULL words_POLITIK$id <- NULL words_POLITIK$txt1 <- NULL deduped.data <- unique( words_POLITIK[ , 1:2 ]) sort(table(deduped.data$users),decreasing=TRUE) #VERBÄnde & ORG words_POLITIK <- words %>% filter(word %in% c("manag", "member", "director", "univers", "project", "teacher", "lawyer", "secretari", "campaig")) %>% group_by(users) words_POLITIK$party <- NULL words_POLITIK$translated <- NULL words_POLITIK$id <- NULL words_POLITIK$txt1 <- NULL deduped.data <- unique( words_POLITIK[ , 1:2 ]) sort(table(deduped.data$users),decreasing=TRUE)