########################################################
# Sedef Bicer, University of Zurich
# DDJ FS_2018
# Text analyses
########################################################
#
# Pipeline: read the speeches CSV, keep the German FDP speeches, build a tm
# corpus, preprocess it (lowercase, punctuation, numbers, stopwords, stemming,
# whitespace), build a document-term matrix, find the terms associated with
# "islam" and compute an unnormalised TF-IDF table (terms x documents).

####### Preset
# NOTE(review): wiping the workspace and hard-coding the working directory
# make the script non-portable; kept so existing behaviour is unchanged.
rm(list = ls(all = TRUE)) # clear working space
setwd("/Users/**/Datenjournalismus_FS18") # define working directory
options(stringsAsFactors = FALSE) # no automatic string -> factor conversion
set.seed(123) # ensure reproducibility
# libraries to load / install
libs <- c("tm", "stringr", "RTextTools", "stm", "ggplot2")
#install.packages(libs) # only once!

##### Corpus import and preprocessing
# load necessary packages
library(tm)
library(stringr)
library(ngram)
library(ggplot2)

# path to the original documents
dir <- "./DDJ-Texte/speeches1995-2017_lang_datum.csv"

# load all speeches held in parliament
all <- read.csv(dir, stringsAsFactors = FALSE, encoding = "UTF-8")

# show an overview of the data
head(all)
str(all)

# convert the date column (datum) to class Date
all$datum <- as.Date(all$datum)

# Special case FDP: the party changed its name to "FDP-Liberale". The label
# has to be normalised BEFORE subsetting; doing it afterwards (as before) is a
# no-op because the renamed rows have already been filtered out.
# %in% is used instead of == so that NA party values are simply not matched.
all$party[all$party %in% "FDP-Liberale"] <- "FDP"

# subset the data (for example only one party or from a specific date)
all <- subset(all, party == "FDP" & sprache == "german")

# Fail early with a clear message instead of obscure errors further down.
if (nrow(all) == 0) {
  stop("No speeches left after subsetting (party == 'FDP', sprache == 'german').",
       call. = FALSE)
}

textcorpus <- VCorpus(VectorSource(all$speech),
                      readerControl = list(language = "german"))

# inspect corpus
summary(textcorpus)

#############
# check and add meta data, e.g. author
textcorpus[[1]]$meta

# Row i of `all` corresponds to document i of the corpus.
for (i in seq_along(textcorpus)) {
  meta(textcorpus[[i]], "Party") <- all$party[i]
  meta(textcorpus[[i]], "typ") <- "speech"
  meta(textcorpus[[i]], "date") <- all$datum[i]
  meta(textcorpus[[i]], "ID") <- all$id[i]
  Encoding(textcorpus[[i]]$content) <- "UTF-8"
}

# Spot-check the meta data of documents 2 and 480; the indices are capped at
# the corpus size so the check cannot fail on a smaller subset.
for (i in unique(pmin(c(2L, 480L), length(textcorpus)))) {
  print(textcorpus[[i]]$meta)
}

## create document names for the dtm
# Each name contains the party, the type, the date and the id,
# e.g. "FDP_speech_<date>_id<id>".
DocNames <- paste0(all$party, "_speech_", as.character(all$datum),
                   "_id", as.character(all$id))

# look at the contents
textcorpus[[1]]$content

# list the transformations tm offers
getTransformations()

# lowercase transformation
textcorpus <- tm_map(textcorpus, content_transformer(tolower))
textcorpus[[1]]$content

# remove punctuation
textcorpus <- tm_map(textcorpus, removePunctuation)
textcorpus[[1]]$content

# remove numbers
textcorpus <- tm_map(textcorpus, removeNumbers)
textcorpus[[1]]$content

# remove stopwords
stopwords("german") # stop list for German
textcorpus <- tm_map(textcorpus, removeWords,
                     c(stopwords("german"), "dass", "dadurch"))
textcorpus[[1]]$content

# stem words
textcorpus <- tm_map(textcorpus, stemDocument, language = "german")
textcorpus[[1]]$content

# remove unnecessary whitespace: join the content into one string, collapse
# runs of whitespace to a single blank and trim both ends
for (i in seq_along(textcorpus)) {
  tmp <- gsub("\\s+", " ", paste(textcorpus[[i]]$content, collapse = "\n"))
  textcorpus[[i]]$content <- gsub("^\\s+|\\s+$", "", tmp)
}
textcorpus[[1]]$content

# save the preprocessed corpus
save(textcorpus, file = "./DDJ-Texte/corpus_preprocessing.RData")

##### Feature generation
# load the preprocessed corpus
load("./DDJ-Texte/corpus_preprocessing.RData")

# Generate the document-feature matrix, keeping terms of at least three
# characters. `wordLengths` is the control option tm's termFreq() documents;
# the former `minWordLength` is not a recognised option.
dtm <- DocumentTermMatrix(textcorpus,
                          control = list(wordLengths = c(3, Inf)))

# inspect the feature matrix
dtm
inspect(dtm[, 100:105])

# name the documents with a label that contains the party, the date and the id
dtm$dimnames[[1]] <- DocNames

##### Dictionary building
# terms associated with the term "islam" (correlation >= 0.30)
assotsIslam <- findAssocs(dtm, "islam", 0.30)
strAssotsIslam <- names(assotsIslam[[1]])

### ---------------------
# Term frequency / inverse document frequency (TF-IDF)
tfidf <- DocumentTermMatrix(
  textcorpus,
  control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
)
# This matrix is rebuilt from the corpus, so it does not inherit the document
# names given to `dtm`. Without them the columns of the transposed table
# cannot be selected by party prefix ("^FDP") later on.
tfidf$dimnames[[1]] <- DocNames
tfidf <- as.data.frame(as.matrix(tfidf))
# transpose: rows = terms, columns = documents
# (data.frame() makes the column names syntactic, e.g. "-" becomes ".")
tfidf <- data.frame(t(tfidf))

## wordcloud
# Aggregate the TF-IDF weights of the islam-associated terms over all FDP
# documents and plot them, then plot the association strengths themselves.
# Relies on `tfidf` (terms x documents), `strAssotsIslam` and `assotsIslam`
# created earlier in the script.

# columns (documents) whose name starts with the party label
fdp_medienmitteilung <- colnames(tfidf)[grepl("^FDP", colnames(tfidf))]
# drop = FALSE keeps a data frame even if only one column / row is selected,
# so rowSums() below always gets a two-dimensional object
tfidf_fdp <- tfidf[, fdp_medienmitteilung, drop = FALSE]

# keep only the terms that are associated with "islam"
tfidf_fdp <- tfidf_fdp[rownames(tfidf_fdp) %in% strAssotsIslam, , drop = FALSE]

# total weight per term, positive weights only, largest first
tfidf_fdp <- rowSums(tfidf_fdp)
tfidf_fdp <- tfidf_fdp[tfidf_fdp > 0]
tfidf_fdp <- tfidf_fdp[order(tfidf_fdp, decreasing = TRUE)]
# head() instead of [1:50]: no NA padding when fewer than 50 terms remain
tfidf_fdp <- head(tfidf_fdp, 50)

# load library
library(wordcloud)

# plot wordcloud for the most frequent words associated with the term "islam"
# (not used in the end)
if (length(tfidf_fdp) > 0) {
  # with ordered.colors = TRUE there must be exactly one colour per word
  wordcloud(names(tfidf_fdp), tfidf_fdp,
            max.words = Inf, random.order = FALSE, ordered.colors = TRUE,
            colors = terrain.colors(length(tfidf_fdp), alpha = 1))
} else {
  warning("No islam-associated terms with positive TF-IDF weight found; ",
          "skipping the TF-IDF wordcloud.", call. = FALSE)
}

# plot wordcloud for the terms most strongly associated with the term "islam"
wordsAssotsIslam <- unlist(assotsIslam)
namesWordsAssotsIslam <- names(assotsIslam$islam)
if (length(namesWordsAssotsIslam) > 0) {
  wordcloud(namesWordsAssotsIslam, wordsAssotsIslam,
            scale = c(3, 0.75), max.words = 100,
            random.order = FALSE, ordered.colors = TRUE,
            colors = terrain.colors(length(wordsAssotsIslam), alpha = 1))
} else {
  warning("No terms associated with 'islam' found; ",
          "skipping the association wordcloud.", call. = FALSE)
}