########################################################
# Sedef Bicer, University of Zurich
# DDJ FS_2018
# Text analyses
########################################################
#
# Purpose: count, per debate date, how often Islam-related terms occur in the
# parliamentary speeches of one party, and save the result as a data frame.
#
# This script is a template for all parties/periods. Only the values in the
# "Subset configuration" section (party labels, period) and the name of the
# saved object at the very end have to be changed.

####### Preset ----
# NOTE(review): rm(list = ls()) and setwd() tie the script to one session /
# machine. They are kept because every relative path below depends on the
# working directory set here.
rm(list = ls(all = TRUE))                  # clear working space
setwd("/Users/**/Datenjournalismus_FS18")  # define working directory
options(stringsAsFactors = FALSE)          # never convert strings to factors
set.seed(123)                              # ensure reproducibility

# libraries to install (once); "ngram" added because it is loaded below
libs <- c("tm", "stringr", "RTextTools", "stm", "ggplot2", "ngram")
# install.packages(libs) # only once!

##### Corpus import and preprocessing ----
library(tm)
library(stringr)
library(ngram)
library(ggplot2)

# path to the original documents (all speeches held in parliament)
dir <- "./DDJ-Texte/speeches1995-2017_lang_datum.csv"
all <- read.csv(dir, stringsAsFactors = FALSE, encoding = "UTF-8")

# overview of the data
head(all)
str(all)

# store the date column (datum) as class Date
all$datum <- as.Date(all$datum)

### Subset configuration ----
# The full data set is too large to be converted to a dense data frame at
# once, so every party is processed in several periods. Periods are half-open
# intervals [period_from, period_to):
#   FDP1: 2000-01-01 .. 2005-01-01  (years 2000-2004)
#   FDP2: 2005-01-01 .. 2010-01-01  (years 2005-2009)
#   FDP3: 2010-01-01 .. 2015-01-01  (years 2010-2014)
#   FDP4: 2015-01-01 .. 2020-01-01  (years 2015-2019)
period_from <- as.Date("2015-01-01")
period_to <- as.Date("2020-01-01")

# Special case FDP: the party was renamed ("FDP-Liberale"), so both labels
# have to be selected and are then unified to one label.
party_label <- "FDP"
party_aliases <- c("FDP", "FDP-Liberale")

# Only German speeches of the chosen party within the chosen period.
# `%in%` is used instead of `party == a | party == b`: `&` binds tighter than
# `|`, so the un-parenthesised form applied the date filter only to the first
# label and the language filter only to the second one.
subFromAll <- subset(
  all,
  datum >= period_from & datum < period_to &
    party %in% party_aliases & sprache == "german"
)
subFromAll$party[subFromAll$party %in% party_aliases] <- party_label

# everything below indexes document 1, so stop early on an empty selection
stopifnot(nrow(subFromAll) > 0)

### Build the corpus ----
textcorpus <- VCorpus(
  VectorSource(subFromAll$speech),
  readerControl = list(language = "german")
)

# inspect corpus
summary(textcorpus)

# meta data before our additions
textcorpus[[1]]$meta

# add meta data (party, type, date, id) and mark the text as UTF-8
for (i in seq_along(textcorpus)) {
  meta(textcorpus[[i]], "Party") <- subFromAll$party[i]
  meta(textcorpus[[i]], "typ") <- "speech"
  meta(textcorpus[[i]], "date") <- subFromAll$datum[i]
  meta(textcorpus[[i]], "ID") <- subFromAll$id[i]
  Encoding(textcorpus[[i]]$content) <- "UTF-8"
}

# check the meta data of the first and the last document
textcorpus[[1]]$meta
textcorpus[[length(textcorpus)]]$meta

# Document names for the DTM: party, type, date and id of every speech.
# Element i belongs to row i of subFromAll and to document i of the corpus.
DocNames <- paste0(
  subFromAll$party, "_speech_", as.character(subFromAll$datum),
  "_id", as.character(subFromAll$id)
)

### Preprocessing ----
# look at the contents
textcorpus[[1]]$content

# transformations offered by tm
getTransformations()

# lowercase transformation
textcorpus <- tm_map(textcorpus, content_transformer(tolower))
textcorpus[[1]]$content

# remove punctuation
textcorpus <- tm_map(textcorpus, removePunctuation)
textcorpus[[1]]$content

# remove numbers
textcorpus <- tm_map(textcorpus, removeNumbers)
textcorpus[[1]]$content

# remove stopwords (German stop list plus two additional words)
stopwords("german")
textcorpus <- tm_map(
  textcorpus, removeWords, c(stopwords("german"), "dass", "dadurch")
)
textcorpus[[1]]$content

# stem words
textcorpus <- tm_map(textcorpus, stemDocument, language = "german")
textcorpus[[1]]$content

# Collapse every document into one string, squeeze runs of whitespace into a
# single blank and trim leading/trailing whitespace.
squeeze_whitespace <- function(x) {
  collapsed <- gsub("\\s+", " ", paste(x, collapse = "\n"))
  gsub("^\\s+|\\s+$", "", collapsed)
}
textcorpus <- tm_map(textcorpus, content_transformer(squeeze_whitespace))
textcorpus[[1]]$content

# save the preprocessed corpus
save(textcorpus, file = "./DDJ-Texte/corpus_preprocessing.RData")

##### Feature generation ----
# load the preprocessed corpus
load("./DDJ-Texte/corpus_preprocessing.RData")

# Document-term matrix; keep terms with at least 3 characters.
# (`minWordLength` is not a control option of current tm and is ignored;
# `wordLengths` is the documented option.)
dtm <- DocumentTermMatrix(
  textcorpus,
  control = list(wordLengths = c(3, Inf))
)

# inspect the feature matrix
dtm
if (nTerms(dtm) >= 105) {
  inspect(dtm[, 100:105])
}

# name the documents with party, date and id
stopifnot(length(DocNames) == nDocs(dtm))
dtm$dimnames[[1]] <- DocNames

##### Dictionary building ----
# Dense data frame with terms as rows and documents as columns.
# as.data.frame() on the transposed matrix keeps the document names unchanged
# and does not add any extra column.
df <- as.data.frame(t(as.matrix(dtm)))

### Islam dictionary
# Words related to Islam, loaded from a dictionary file (source: Macmillan,
# translated by the author).
islamDict <- read.csv(
  file = "islamDict_UTF-8.csv", header = TRUE, encoding = "UTF-8"
)
islamDict <- tolower(as.vector(islamDict$Islam.related.words))

# NOTE(review): the corpus terms are stemmed, the dictionary words are not.
# Unless the CSV already contains stems, inflected dictionary entries may not
# match any corpus term -- confirm against the dictionary file.

# additionally use every corpus term that contains one of these fragments
topic_fragments <- c("islam", "muslim", "minaret", "burka", "dschihad")
corpus_terms <- rownames(df)
wordsContainingTopic <- corpus_terms[
  grepl(paste(topic_fragments, collapse = "|"), corpus_terms)
]

# put all words together
islamDict <- sort(unique(c(islamDict, wordsContainingTopic)))

# keep only the terms that are part of the dictionary
# (drop = FALSE keeps a data frame even if there is only one document)
dfSubsetIslam <- df[rownames(df) %in% islamDict, , drop = FALSE]

##### Aggregate per date ----
# Column i of dfSubsetIslam is document i, i.e. row i of subFromAll, so the
# dates can be taken directly from subFromAll instead of being parsed back
# out of the document names.
doc_dates <- subFromAll$datum
stopifnot(length(doc_dates) == ncol(dfSubsetIslam))

# number of dictionary terms used in every single speech
islam_terms_per_doc <- colSums(dfSubsetIslam)

# all dates on which a speech was held, in order of first appearance
allDates <- unique(doc_dates[!is.na(doc_dates)])
date_group <- factor(as.character(doc_dates), levels = as.character(allDates))

df_sum_date <- data.frame(
  # dictionary terms used on that date
  nUsed = as.numeric(tapply(islam_terms_per_doc, date_group, sum)),
  # number of speeches held on that date
  nParlamentsreden = as.integer(table(date_group)),
  dateString = allDates,
  stringsAsFactors = FALSE
)

# TRUE if at least one Islam-related term was used on that date
df_sum_date$Islammeldung <- df_sum_date$nUsed > 0
df_sum_date$partei <- rep(party_label, nrow(df_sum_date))

# total number of Islam-related terms over all dates
sum(df_sum_date$nUsed)

# drop rows without a date and keep only dates with an Islam-related term
df_sum_date <- df_sum_date[!is.na(df_sum_date$dateString), ]
df_sum_date <- df_sum_date[df_sum_date$Islammeldung, ]

##### Save result ----
# Assign and save ONLY the object that belongs to the period configured
# above; comment the others.
# df_sum_date_fdp1 <- df_sum_date
# save(df_sum_date_fdp1, file = "fdp1_speech.Rda")
# df_sum_date_fdp2 <- df_sum_date
# save(df_sum_date_fdp2, file = "fdp2_speech.Rda")
# df_sum_date_fdp3 <- df_sum_date
# save(df_sum_date_fdp3, file = "fdp3_speech.Rda")
df_sum_date_fdp4 <- df_sum_date
save(df_sum_date_fdp4, file = "fdp4_speech.Rda")