#######################################################
# Johanna Burger, University of Zurich                #
# 13-754-262                                          #
# DDJ 2018, 26.05.2018                                #
# Text analysis for blog post                         #
#######################################################

# NOTE(review): clearing the workspace and hard-coding the working directory
# make the script machine-specific; both are kept because every relative path
# below depends on them.
rm(list = ls(all.names = TRUE))

# ---- Load packages ----
library(XML)
library(methods)
library(dplyr)
library(httr)
library(bitops)
library(foreign)
library(tidyr)
library(ggridges)
library(ggplot2)
library(haven)
library(plotly)
library(lattice)
library(reshape2)
library(tm)
library(stringr)
library(ngram)
library(stm)

setwd("C://Users//J//Desktop//")

# ---- Open data ----
df <- read.csv("1995-2017_corr_new.csv", stringsAsFactors = FALSE,
               encoding = "UTF-8")

# ---- Collect XML variable names ----
# Builds one row per XML file: folder, file, the names found in the parsed
# document (padded with NA) and the number of names.

# Number of spare name columns allowed beyond what the first file needs.
extra_columns <- 30

all_dirs <- list.dirs(path = ".", full.names = FALSE)
d_names <- all_dirs[all_dirs != ""]

varname_rows <- list()
num_columns <- 0

for (i in seq_along(d_names)) {
  if (i == 1) {
    writeLines("progress..")
  }
  writeLines(paste0("folder ", i, " of ", length(d_names)))

  f_names <- list.files(d_names[i])
  for (j in seq_along(f_names)) {
    doc <- xmlParse(paste0(d_names[i], "/", f_names[j]))
    xml_list <- xmlToList(doc)

    # Names of the children of every top-level element, in document order.
    var_names <- unlist(lapply(xml_list, names), use.names = FALSE)
    row_entries <- c(d_names[i], f_names[j], var_names)

    # The first file that is actually processed fixes the table width.
    if (length(varname_rows) == 0) {
      num_columns <- length(row_entries) + extra_columns
    }

    if (length(row_entries) > num_columns) {
      stop(paste0("There are more than ", extra_columns,
                  " variables more than in the first xml file. ",
                  "This is first the case in..",
                  "\n - directory: '", d_names[i], "'",
                  "\n - file: '", f_names[j], "'"))
    }

    varname_rows[[length(varname_rows) + 1]] <- c(
      row_entries,
      rep(NA, num_columns - length(row_entries)),
      length(row_entries) - 2
    )
  }

  if (i == length(d_names)) {
    writeLines("task finished.")
  }
}

if (length(varname_rows) > 0) {
  # Bind once instead of growing the data frame inside the loop.
  tab_varnames <- as.data.frame(do.call(rbind, varname_rows),
                                stringsAsFactors = FALSE)
  colnames(tab_varnames) <- c(
    "folder", "file",
    sprintf("name_var_%02d", seq_len(num_columns - 2)),
    "num_variables"
  )
} else {
  tab_varnames <- data.frame()
}

# NOTE(review): this writes the variable-name table to the same file that was
# read into `df` above, so a second run would load this table instead of the
# speech data. Confirm the intended output file name.
write.csv(tab_varnames, file = "1995-2017_corr_new.csv", row.names = FALSE)

# ---- One-off: recode parties ----
unique(df$party)

# Raw party label -> harmonised party; anything not listed becomes "andere".
party_map <- c(
  "FDP-Liberale" = "FDP",
  "FDP"          = "FDP",
  "glp"          = "GLP",
  "GLP"          = "GLP",
  "BDP"          = "BDP",
  "GPS"          = "GPS",
  "SP"           = "SP",
  "SVP"          = "SVP",
  "svsp-ow"      = "CVP",
  "CSP"          = "CVP",
  "CSPO"         = "CVP",
  "CVP"          = "CVP"
)
df$party_new <- unname(party_map[as.character(df$party)])
df$party_new[is.na(df$party_new)] <- "andere"

# Speech counts per harmonised party (needs party_new, so it runs after the
# recode).
df$agg_parties <- 1
df_aggre_parties <- aggregate(agg_parties ~ party_new, df, sum)

# ---- Clean session IDs ----
# Copy the session ID and blank out the listed IDs.
df$sessionID_new <- df$sessionID
excluded_sessions <- c("3", "4", "5", "6", "7", "8", "9", "10", "11",
                       "49", "59")
df$sessionID_new[df$sessionID %in% excluded_sessions] <- NA

# ---- Filter by language ----
# A speech is assigned to a language when it contains that language's word
# for "tax"; one speech can end up in several subsets.
df.filtered_d <- df[grepl("[Ss]teuer", df$speech), ]
df.filtered_fr <- df[grepl("[Ii]mpôt", df$speech), ]
df.filtered_it <- df[grepl("[Ii]mposta", df$speech), ]
# ---- Build the German corpus ----
textcorpus_d <- VCorpus(
  VectorSource(df.filtered_d$speech),
  readerControl = list(encoding = "UTF-8", language = "german")
)

# ---- Clean the German texts ----
textcorpus_d <- tm_map(textcorpus_d, content_transformer(tolower))
textcorpus_d <- tm_map(textcorpus_d, removeNumbers)
textcorpus_d <- tm_map(textcorpus_d, removeWords, stopwords("german"))
textcorpus_d <- tm_map(textcorpus_d, removePunctuation)
textcorpus_d <- tm_map(textcorpus_d, stripWhitespace)

if (length(textcorpus_d) > 0) {
  print(textcorpus_d[[1]]$content)
}

# Flatten every document to a single line, collapse whitespace runs and trim
# both ends; keep the result in the corpus and as a plain character vector.
speech_processed <- character(length(textcorpus_d))
for (i in seq_along(textcorpus_d)) {
  flat <- gsub("\\s+", " ",
               paste(textcorpus_d[[i]]$content, collapse = "\n"))
  flat <- gsub("^\\s+|\\s+$", "", flat)
  textcorpus_d[[i]]$content <- flat
  speech_processed[i] <- flat
}

# The corpus was built from df.filtered_d, so the processed text belongs to
# that frame (one entry per row, same order).
df.filtered_d$speech_processed <- speech_processed

# NOTE(review): the file name contains blanks and a trailing space; kept
# unchanged, confirm this is intended.
save(textcorpus_d, file = ".//corpus _ preprocessing.RData ")

# ---- German: language code and number of "steuer" matches ----
df.filtered_d$lang <- 1
df.filtered_d$count <- str_count(df.filtered_d$speech_processed, "steuer")

# Later sections refer to the German subset as `df_filtered_d`.
df_filtered_d <- df.filtered_d

# ---- French: language code and number of "impôt" matches ----
# No cleaned text exists for French/Italian, so matches are counted on the
# lower-cased raw speech.
df.filtered_fr$lang <- 2
df.filtered_fr$count <- str_count(tolower(df.filtered_fr$speech), "impôt")

# ---- Italian: language code and number of "imposta" matches ----
df.filtered_it$lang <- 3
df.filtered_it$count <- str_count(tolower(df.filtered_it$speech), "imposta")

# ---- Combine the language subsets for plotting ----
# rbind() needs identical columns; only the German frame carries
# speech_processed, so bind the columns that all three frames share.
lang_frames <- list(df.filtered_d, df.filtered_fr, df.filtered_it)
shared_cols <- Reduce(intersect, lapply(lang_frames, names))
df.bind <- do.call(
  rbind,
  lapply(lang_frames, function(frame) frame[, shared_cols, drop = FALSE])
)

# ---- Aggregate by language ----
df.bind$taxyes <- 1
df_aggre_lang <- aggregate(taxyes ~ sessionID_new + lang, df.bind, sum)

# ---- Make session numbers consecutive ----
session <- unique(df_filtered_d$sessionID)
abc <- NA
# `session` holds the unique session IDs of the German subset at this point.
# Turn it into a lookup table: sorted session ID -> running number sessionPR.
session <- data.frame(sessionID = sort(session, decreasing = FALSE))
session$sessionPR <- seq_len(nrow(session))
session <- session[, c("sessionPR", "sessionID")]

df_filtered_d_new <- merge(df_filtered_d, session, by = "sessionID")
if (interactive()) {
  View(df_filtered_d_new)
}

# Attach the running session number to the per-language counts.
# NOTE(review): the lookup is derived from the German speeches only, so
# language/session rows whose session has no German speech are dropped here.
df_aggre_lang_new <- merge(df_aggre_lang, session,
                           by.x = "sessionID_new", by.y = "sessionID")
df_aggre_lang_new$lang_wort <- as.factor(df_aggre_lang_new$lang)

# ---- Plot: national languages (ggplot) ----
session_axis <- scale_x_continuous(
  breaks = c(0, 20, 40, 60, 80, 96),
  labels = c("WiS 1995", "HeS 1999", "SoS 2003", "HeS 2007", "WiS 2011",
             "WiS 2017")
)

plot_lang <- ggplot(
  data = df_aggre_lang_new,
  aes(x = sessionPR, y = taxyes, group = lang_wort, colour = lang_wort)
) +
  geom_line(size = 0.8) +
  ggtitle("Parlamentsreden nach Sprachen") +
  labs(y = "Parlamentsreden (N)", x = "Sessionen") +
  scale_colour_discrete(
    name = "Sprachen",
    breaks = c("1", "2", "3"),
    labels = c("Deutsch", "Französisch", "Italienisch")
  ) +
  theme(panel.background = element_rect(fill = "#E6E6E6", colour = "white",
                                        size = 4)) +
  theme(plot.title = element_text(hjust = 0.5, size = 20, face = "bold")) +
  session_axis +
  theme(
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )
plot_lang

# ---- Aggregate by party ----
df_filtered_d_new$taxyes <- 1
df_aggre_party <- aggregate(taxyes ~ sessionPR + party_new,
                            df_filtered_d_new, sum)

# ---- Plot by party ----
# Keep only the parties that are not in the excluded list.
excluded_parties <- c("andere", "GLP", "BDP", "GPS")
df_aggre_party_small <- df_aggre_party[
  !(df_aggre_party$party_new %in% excluded_parties), ,
  drop = FALSE
]

# Re-level for a readable legend order (filled in below).
df_aggre_party_small$party_new_level <- NA
# Legend order of the parties: FDP, CVP, SP, SVP.
party_levels <- c(FDP = 1, CVP = 2, SP = 3, SVP = 4)
df_aggre_party_small$party_new_level <- as.factor(
  unname(party_levels[as.character(df_aggre_party_small$party_new)])
)

plot_parties <- ggplot(
  data = df_aggre_party_small,
  aes(x = sessionPR, y = taxyes, group = party_new_level,
      color = party_new_level)
) +
  geom_line(size = 0.7) +
  ggtitle("Parlamentsreden nach Parteien") +
  labs(y = "Parlamentsreden (N)", x = "Sessionen") +
  labs(colour = "Parteien") +
  theme(panel.background = element_rect(fill = "#FAFAFA", colour = "white",
                                        size = 4)) +
  theme(plot.title = element_text(hjust = 0.5, size = 20, face = "bold")) +
  scale_colour_discrete(
    name = "Parteien",
    breaks = c("1", "2", "3", "4"),
    labels = c("FDP", "CVP", "SP", "SVP")
  ) +
  scale_x_continuous(
    breaks = c(0, 20, 40, 60, 80, 96),
    labels = c("WiS 1995", "HeS 1999", "SoS 2003", "HeS 2007", "WiS 2011",
               "WiS 2017")
  ) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14, face = "bold")
  )
plot_parties

# ---- Word frequencies within parties ----

# Split text into a vector of words.
#
# textEntry: character vector; its elements are joined with a blank first.
# Returns the non-empty pieces obtained by splitting on "\\W".
WordVector <- function(textEntry) {
  joined <- paste(textEntry, collapse = " ")
  pieces <- unlist(strsplit(joined, "\\W"))
  pieces[pieces != ""]
}

# Processed text of every corpus document as a plain character vector.
text_array <- character(length(textcorpus_d))
for (i in seq_along(textcorpus_d)) {
  text_array[i] <- paste(textcorpus_d[[i]]$content, collapse = " ")
}

# Keyword in context.
#
# keyword: regular expression matched against each word of `text`.
# context: number of words to keep on each side of a hit.
# text:    character vector, split into words with WordVector().
# Returns a data frame with one row per hit and the character columns
# position, left, keyword, right, or the string " KEYWORD NOT FOUND " when
# no word matches.
kwic <- function(keyword, context, text) {
  words <- WordVector(text)
  hits <- grep(keyword, words, value = FALSE)
  if (length(hits) == 0) {
    return(" KEYWORD NOT FOUND ")
  }

  n_words <- length(words)
  # Words from..to joined by blanks; empty when the window is empty, which
  # happens for a hit on the first or last word.
  window_text <- function(from, to) {
    if (from > to) {
      return("")
    }
    paste(words[from:to], collapse = " ")
  }

  left <- vapply(
    hits,
    function(h) window_text(max(1, h - context), h - 1),
    character(1)
  )
  right <- vapply(
    hits,
    function(h) window_text(h + 1, min(n_words, h + context)),
    character(1)
  )

  data.frame(
    position = as.character(hits),
    left = left,
    keyword = words[hits],
    right = right,
    stringsAsFactors = FALSE
  )
}

# Context words (20 on each side) around every "steuer" hit, one string per
# speech; speeches without a hit keep NA.
df_filtered_d$text_for_freq <- NA_character_
for (i in seq_len(nrow(df_filtered_d))) {
  kwics <- kwic("steuer", 20, df_filtered_d$speech_processed[i])
  if (is.data.frame(kwics)) {
    df_filtered_d$text_for_freq[i] <- paste(kwics$left, kwics$right,
                                            collapse = " ")
  }
}

# ---- Subsets by party ----
sub_SVP <- subset(df_filtered_d, party_new == "SVP")
sub_SP <- subset(df_filtered_d, party_new == "SP")
sub_BDP <- subset(df_filtered_d, party_new == "BDP")

# ---- Word frequency counts per party ----

# Term frequencies over a set of texts, most frequent first.
# Entries that are NA (speeches without a keyword hit) are skipped.
term_frequencies <- function(texts) {
  docs <- Corpus(
    VectorSource(texts[!is.na(texts)]),
    readerControl = list(encoding = "UTF-8", language = "german")
  )
  counts <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))),
                 decreasing = TRUE)
  data.frame(word = names(counts), freq = counts)
}

# svp
d1 <- term_frequencies(sub_SVP$text_for_freq)
head(d1, 30)
# Hand-picked top words and their counts.
tab_svp_freq <- data.frame(
  "freq" = c(807, 689, 614, 602, 589, 525, 435, 426, 406, 402),
  "word" = c("Franken", "Bundesrat", "Prozent", "mehr", "Schweiz", "müssen",
             "Frage", "heute", "Artikel", "Millionen")
)

# sp
d2 <- term_frequencies(sub_SP$text_for_freq)
head(d2, 30)
tab_sp_freq <- data.frame(
  "freq" = c(689, 685, 512, 502, 497, 465, 390, 386, 373, 371),
  "word" = c("Bundesrat", "Franken", "mehr", "Schweiz", "Herr", "heute",
             "Kommission", "Artikel", "Antrag", "Prozent")
)

# bdp
d3 <- term_frequencies(sub_BDP$text_for_freq)
head(d3, 30)
tab_bdp_freq <- data.frame(
  "freq" = c(110, 102, 101, 95, 86, 66, 57, 57, 56, 53, 53),
  "word" = c("Schweiz", "Franken", "Bundesrat", "heute", "Frage", "mehr",
             "Prozent", "schon", "bereits", "immer", "ganz")
)

# ---- Plot the frequencies ----

# Horizontal bar chart of a word/freq table, bars ordered by frequency.
# Both bar layers of the original chart are kept so the rendering is the same.
plot_word_freq <- function(freq_table, bar_fill, plot_title) {
  ggplot(freq_table, aes(reorder(word, freq), freq)) +
    geom_bar(position = "dodge", stat = "identity") +
    coord_flip() +
    geom_bar(colour = "black", fill = bar_fill, width = .8,
             stat = "identity") +
    ggtitle(plot_title) +
    labs(y = "Anzahl Nennungen (N) ", x = "") +
    theme(plot.title = element_text(hjust = 0.5, size = 20, face = "bold")) +
    theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 16,
                                     face = "bold")) +
    theme(axis.text = element_text(size = 14),
          axis.title = element_text(size = 14))
}

# svp
plot_svp <- plot_word_freq(tab_svp_freq, "#04B404",
                           "Meistgenannte Wörter im Steuerdiskurs in der SVP")
plot_svp

# sp
plot_sp <- plot_word_freq(tab_sp_freq, "red",
                          "Meistgenannte Wörter im Steuerdiskurs in der SP")
plot_sp

# bdp
plot_bdp <- plot_word_freq(tab_bdp_freq, "#ffde00",
                           "Meistgenannte Wörter im Steuerdiskurs in der BDP")
plot_bdp

# ---- Save ----
save(textcorpus_d, file = ".//textcorpus_d_filtered.RData")
write.csv(df_filtered_d, file = "df_filtered_d.csv", fileEncoding = "UTF-8")
write.csv(sub_SVP, file = "sub_SVP.csv", fileEncoding = "UTF-8")
write.csv(sub_SP, file = "sub_SP.csv", fileEncoding = "UTF-8")
write.csv(sub_BDP, file = "sub_BDP.csv", fileEncoding = "UTF-8")