########################################################
# Sedef Bicer, University of Zurich
# DDJ FS_2018
# Text analyses (Text-Analysen)
#########################################################
### This code is exemplary for all media releases.
### Only the terms of the party have to be changed.
###
### Pipeline: import the party's media releases as a tm corpus, attach
### metadata parsed from the file names, preprocess the texts, build a
### document-term matrix, count Islam-related dictionary terms per release
### date, save the per-date table, and (optionally) draw a word cloud.

####### Preset
rm(list = ls(all.names = TRUE)) # clear working space
# NOTE(review): machine-specific path; all relative paths below depend on it.
setwd("/Users/**/Datenjournalismus_FS18") # define working directory
options(stringsAsFactors = FALSE) # disallow automatic conversion of strings to factors
set.seed(123) # ensure reproducibility
libs <- c("tm", "stringr", "RTextTools", "stm", "ggplot2") # define libraries to load / install
#install.packages(libs) # only once!

##### Corpus import and preprocessing
# load necessary packages
library(tm)
library(stringr)
library(ngram)
library(ggplot2)

# define path to the original documents
dir <- "./DDJ-Texte/Texte_UTF-8_FDP/" #### the labelling issue!!!
textcorpus <- VCorpus(
  DirSource(dir, encoding = "UTF-8"),
  readerControl = list(language = "german")
)

# inspect corpus
summary(textcorpus)

#############
# check and add meta data, e.g. author
textcorpus[[1]]$meta

# NOTE(review): assumes list.files() returns the files in the same order in
# which DirSource() read them, so docnames[i] belongs to textcorpus[[i]].
docnames <- list.files(dir, pattern = ".txt")
for (i in seq_along(textcorpus)) {
  # File names are split on "_"; fields 1, 2, 3 and 5 are stored as metadata.
  name_parts <- unlist(strsplit(docnames[i], "_"))
  meta(textcorpus[[i]], "Party") <- name_parts[1]
  meta(textcorpus[[i]], "typ") <- name_parts[2]
  meta(textcorpus[[i]], "date") <- name_parts[3]
  meta(textcorpus[[i]], "ID") <- name_parts[5]
  Encoding(textcorpus[[i]]$content) <- "UTF-8"
}
textcorpus[[2]]$meta
# inspect the last document (index is derived from the corpus size so the
# script also works for parties with a different number of releases)
textcorpus[[length(textcorpus)]]$meta

# look at the contents
textcorpus[[1]]$content

# Get information what tm can do
getTransformations()

# lowercase transformation
textcorpus <- tm_map(textcorpus, content_transformer(tolower))
textcorpus[[1]]$content

# remove punctuations
textcorpus <- tm_map(textcorpus, removePunctuation)
textcorpus[[1]]$content

# remove numbers
textcorpus <- tm_map(textcorpus, removeNumbers)
textcorpus[[1]]$content

# remove stopwords
stopwords("german") # stoplist for german
textcorpus <- tm_map(
  textcorpus, removeWords,
  c(stopwords("german"), "dass", "dadurch")
)
textcorpus[[1]]$content

# stem words
textcorpus <- tm_map(textcorpus, stemDocument, language = "german")
textcorpus[[1]]$content

# remove unnecessary whitespace
for (i in seq_along(textcorpus)) {
  # join all lines of the document, squeeze whitespace runs to one blank,
  # then strip leading/trailing whitespace
  tmp <- gsub("\\s+", " ", paste(textcorpus[[i]]$content, collapse = "\n"))
  textcorpus[[i]]$content <- gsub("^\\s+|\\s+$", "", tmp)
}
textcorpus[[1]]$content

# save your textcorpus
save(textcorpus, file = "./DDJ-Texte/corpus_preprocessing.RData")

##### Feature generation
# load your textcorpus
load("./DDJ-Texte/corpus_preprocessing.RData")

# generate document feature matrix
# (terms with fewer than 3 characters are dropped; `wordLengths` is the
# control option tm understands, `minWordLength` is not)
dtm <- DocumentTermMatrix(textcorpus, control = list(wordLengths = c(3, Inf)))

# inspect your feature matrix
dtm
inspect(dtm[, 100:105])

##### Dictionary building
# associated terms with the term islam
assotsIslam <- findAssocs(dtm, "islam", 0.4)
strAssotsIslam <- names(assotsIslam[[1]])

## create data frame for easier handling
df <- as.data.frame(as.matrix(dtm)) # DocumentTermMatrix as data.frame
# transpose so rows are terms and columns are document names.
# data.frame() sanitises the column names (e.g. "-" becomes "."), which the
# date matching further below relies on.
df <- data.frame(t(df))

### islam Dictionary
## define all words which are related with Islam and are used to this topic
# load the words from the dictionary from source macmillan (translated by me)
islamDict <- read.csv(
  file = "islamDict_UTF-8.csv", header = TRUE, encoding = "UTF-8"
)
islamDict <- as.vector(islamDict$Islam.related.words)
islamDict <- tolower(islamDict)

# other words containing islamic relevant words:
# each vector holds the terms (from the loaded files) containing the stem
wordsContainingIslam <- rownames(df)[grepl("islam", rownames(df))]
wordsContainingMuslim <- rownames(df)[grepl("muslim", rownames(df))]
wordsContainingMinarett <- rownames(df)[grepl("minaret", rownames(df))]
wordsContainingBurka <- rownames(df)[grepl("burka", rownames(df))]
wordsContainingDschihad <- rownames(df)[grepl("dschihad", rownames(df))]

# put all words together
islamDict <- c(
  islamDict, wordsContainingIslam, wordsContainingMuslim,
  wordsContainingMinarett, wordsContainingBurka, wordsContainingDschihad
)
islamDict <- sort(unique(islamDict))

# load corpus
#load(file = "./DDJ-Texte/corpus_preprocessing.RData")

# subset the data frame, only use terms which are related to islam
# (as in the dictionary); drop = FALSE keeps a data frame even when the
# corpus holds a single document
dfSubsetIslam <- df[rownames(df) %in% islamDict, , drop = FALSE]

# all distinct dates on which a media release was published
# NOTE(review): the code below reads the result through allDates$V1 and
# allDates$V1[[i]][1], i.e. it treats V1 as a list-like column - confirm.
allDates <- unique(matrix(meta(textcorpus, "date")))
allDates <- as.data.frame(allDates)

# initialize an empty (all-NA) df for the for loop, one row per date
df_sum_date <- data.frame(
  matrix(
    vector(), length(allDates$V1), 3,
    dimnames = list(c(), c("nUsed", "nMedienmitteilungen", "dateString"))
  ),
  stringsAsFactors = FALSE
)

# loop to create a df which counts the words which are related to the islam
# (according to generated dictionary)
for (i in seq_along(allDates$V1)) {
  # column names carry the date with "." instead of "-"
  dateDot <- gsub("-", ".", allDates$V1[i])
  date_medienmitteilung <-
    colnames(dfSubsetIslam)[grepl(dateDot, colnames(dfSubsetIslam))]
  ## using a dataset with only words from the IslamDictionary
  df_date <- subset(dfSubsetIslam, select = date_medienmitteilung)

  # total number of dictionary terms over all releases of this date
  df_sum_date$nUsed[i] <- sum(as.matrix(df_date))
  df_sum_date$nMedienmitteilungen[i] <- ncol(df_date)
  df_sum_date$dateString[i] <- as.character(allDates$V1[[i]][1])

  ## print to see what happens
  # print(dateDot)
  # print(date_medienmitteilung)
  # print(head(df_date))
  # print(sum(rowSums(df_date)))
  # print("---------------------")
  # print(df_sum_date$nUsed[i])
  # print("---------------------")
}

# shows if at least one term related to islam was used at a certain date
df_sum_date$Islammeldung <- df_sum_date$nUsed > 0
df_sum_date$partei <- "FDP" # specify party
# convert date to type Date
df_sum_date$dateString <- as.Date(unlist(df_sum_date$dateString))

# number of islam-related terms used for all dates incl. NA
sum(df_sum_date$nUsed)

# exclude all observations whose date is NA
df_sum_date <- df_sum_date[!is.na(df_sum_date$dateString), ]
# reduce df_sum_date to only rows where an islamic term is used
df_sum_date <- df_sum_date[df_sum_date$Islammeldung, ]

# save data.frame with the name of the party
# (so we can combine the data.frames of different parties)
df_sum_date_fdp <- df_sum_date
save(df_sum_date_fdp, file = "fdp_media.Rda")

### example for wordclouds (the most frequent used words which the party
### associates with the term "islam")
## not used in the end because of too few data

# Term frequency/Inverse document frequency (TF-IDF)
tfidf <- DocumentTermMatrix(
  textcorpus,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = FALSE)
  )
)
tfidf <- as.data.frame(as.matrix(tfidf))
tfidf <- data.frame(t(tfidf))

## wordcloud
#
fdp_medienmitteilung <- colnames(tfidf)[grepl("^FDP", colnames(tfidf))]
tfidf_fdp <- subset(tfidf, select = fdp_medienmitteilung)
# subset the tfidf (only the words which the party associates with islam);
# drop = FALSE keeps a data frame so rowSums() also works for one document
tfidf_fdp <- tfidf_fdp[rownames(tfidf_fdp) %in% strAssotsIslam, , drop = FALSE]
tfidf_fdp <- rowSums(tfidf_fdp)
tfidf_fdp <- tfidf_fdp[tfidf_fdp > 0]
tfidf_fdp <- tfidf_fdp[order(tfidf_fdp, decreasing = TRUE)]
# keep at most the 50 highest-weighted terms (no NA padding when fewer exist)
tfidf_fdp <- head(tfidf_fdp, 50)

# load library
library(wordcloud)

# plot wordcloud; with ordered.colors = TRUE one colour per word is needed,
# so the palette size follows the number of terms actually available
if (length(tfidf_fdp) > 0) {
  wordcloud(
    names(tfidf_fdp), tfidf_fdp,
    max.words = Inf, random.order = FALSE, ordered.colors = TRUE,
    colors = terrain.colors(length(tfidf_fdp), alpha = 1)
  )
}