########################################################
# Sedef Bicer, University of Zurich
# DDJ FS_2018
# Text-Analysen
#########################################################

### This script serves as a template for the media releases of every party. Only the party-specific values (input directory, party label, names of the saved objects/files) have to be changed.

####### Preset
# NOTE(review): rm() and setwd() change the state of the calling session. They
# are kept because the relative paths used further down in this script are
# resolved against this working directory.
rm(list = ls(all.names = TRUE)) # clear working space (including hidden objects)
setwd("/Users/**/Datenjournalismus_FS18") # define working directory
options(stringsAsFactors = FALSE) # disallow automatic conversion of strings to factors
set.seed(123) # ensure reproducibility

# Packages to install once before the first run. The list now also contains
# "ngram" and "wordcloud", which this script attaches via library() but which
# were missing from the install list.
libs <- c("tm", "stringr", "ngram", "RTextTools", "stm", "ggplot2", "wordcloud")
#install.packages(libs) # only once!


##### Corpus import and preprocessing

# load necessary packages
library(tm)
library(stringr)
library(ngram)
library(ggplot2)

# folder that holds the original documents
dir <- "./DDJ-Texte/Texte_UTF-8_FDP/"


#### the labelling issue!!!
# read every file of that folder as UTF-8 into a volatile corpus whose
# documents are tagged with the language "german"
textcorpus <- VCorpus(
  x = DirSource(directory = dir, encoding = "UTF-8"),
  readerControl = list(language = "german")
)

# print an overview of the imported corpus
summary(textcorpus)

#############
# check and add meta data, e.g. author
textcorpus[[1]]$meta

# Take the file names from the corpus itself instead of listing the directory a
# second time: the former list.files(dir, pattern = ".txt") used an unanchored
# regex (the dot matches any character) and could differ from the files that
# DirSource() actually read, which would attach metadata to the wrong document.
# tm's plain-text reader stores the basename of the source file as document id.
docnames <- unname(unlist(lapply(
  textcorpus,
  function(doc) basename(meta(doc, "id"))
)))

for (i in seq_along(textcorpus)) {
  # split the underscore-separated file name once per document
  name_parts <- unlist(strsplit(docnames[i], "_"))
  meta(textcorpus[[i]], "Party") <- name_parts[1]
  meta(textcorpus[[i]], "typ") <- name_parts[2]
  meta(textcorpus[[i]], "date") <- name_parts[3]
  # NOTE(review): the ID is read from piece 5 (piece 4 is skipped) -- confirm
  # against the file naming scheme
  meta(textcorpus[[i]], "ID") <- name_parts[5]
  # mark the document text as UTF-8
  Encoding(textcorpus[[i]]$content) <- "UTF-8"
}

# spot-check the metadata of the second and of the last document
# (the last index is derived from the corpus instead of a hard-coded 480)
textcorpus[[2]]$meta
textcorpus[[length(textcorpus)]]$meta



# show the text of the first document before any cleaning
textcorpus[[1]]$content

# list the transformations tm ships with
getTransformations()


# step 1: convert all text to lower case
textcorpus <- tm_map(x = textcorpus, FUN = content_transformer(tolower))
textcorpus[[1]]$content

# step 2: strip punctuation characters
textcorpus <- tm_map(x = textcorpus, FUN = removePunctuation)
textcorpus[[1]]$content

# step 3: strip digits
textcorpus <- tm_map(x = textcorpus, FUN = removeNumbers)
textcorpus[[1]]$content

# step 4: drop stop words (tm's German list plus two additional words)
stopwords("german") # print the German stop list
textcorpus <- tm_map(
  x = textcorpus,
  FUN = removeWords,
  c(stopwords("german"), "dass", "dadurch")
)
textcorpus[[1]]$content

# step 5: reduce the remaining words to their stems (German stemmer)
textcorpus <- tm_map(x = textcorpus, FUN = stemDocument, language = "german")
textcorpus[[1]]$content

# remove unnecessary whitespace
# Every document is collapsed into one single string: its lines are joined,
# each run of whitespace becomes one blank, and leading/trailing whitespace is
# cut off. Done with tm_map() like the other cleaning steps, so an empty corpus
# no longer fails on the index sequence 1:0.
textcorpus <- tm_map(textcorpus, content_transformer(function(txt) {
  collapsed <- gsub("\\s+", " ", paste(txt, collapse = "\n"))
  gsub("^\\s+|\\s+$", "", collapsed)
}))
textcorpus[[1]]$content

# write the preprocessed corpus (object `textcorpus`) to disk
save(list = "textcorpus", file = "./DDJ-Texte/corpus_preprocessing.RData")


##### Feature generation

# restore the object `textcorpus` from that file
load(file = "./DDJ-Texte/corpus_preprocessing.RData")



# generate document feature matrix
# `minWordLength` is not a control option that tm's term counting recognises,
# so it was silently ignored. The documented option is `wordLengths`; the range
# c(3, Inf) keeps terms with at least three characters.
dtm <- DocumentTermMatrix(textcorpus, control = list(wordLengths = c(3, Inf)))


# inspect your feature matrix
dtm
# show the columns (terms) 100 to 105 as an example
inspect(dtm[, 100:105])


##### Dictionary building

# terms whose correlation with the term "islam" is at least 0.4
assotsIslam <- findAssocs(dtm, "islam", 0.4)
strAssotsIslam <- names(assotsIslam[[1]])


## create data frame for easier handling
# Transpose the document-term matrix so that rows are terms and columns are
# document names. data.frame() sanitises the column names (check.names), e.g.
# "-" in a file name becomes ".".
# The former `optional = F` was dropped: data.frame() has no such argument, so
# it created an extra logical column called "optional".
df <- data.frame(t(as.matrix(dtm)))


### islam Dictionary

## define all words which are related to Islam and are used for this topic
# load the words of the dictionary (source: macmillan, translated by the author);
# the file is declared as UTF-8 (the `encoding` argument was left empty before)
islamDict <- read.csv(file = "islamDict_UTF-8.csv", header = TRUE, encoding = "UTF-8")
islamDict <- tolower(as.vector(islamDict$Islam.related.words))
# NOTE(review): the corpus terms were stemmed during preprocessing, these
# dictionary words are used as they are in the file -- confirm that the file
# already contains stems, otherwise many entries cannot match.

# further terms of the corpus (row names of df) that contain an Islam-related
# word fragment
wordsContainingIslam <- grep("islam", rownames(df), value = TRUE)
wordsContainingMuslim <- grep("muslim", rownames(df), value = TRUE)
wordsContainingMinarett <- grep("minaret", rownames(df), value = TRUE)
wordsContainingBurka <- grep("burka", rownames(df), value = TRUE)
wordsContainingDschihad <- grep("dschihad", rownames(df), value = TRUE)

# put all words together, without duplicates and in alphabetical order
islamDict <- c(
  islamDict,
  wordsContainingIslam,
  wordsContainingMuslim,
  wordsContainingMinarett,
  wordsContainingBurka,
  wordsContainingDschihad
)
islamDict <- sort(unique(islamDict))

# (optional) reload the corpus
#load(file = "./DDJ-Texte/corpus_preprocessing.RData")


# keep only those rows of df whose term is listed in the Islam dictionary
dfSubsetIslam <- df[is.element(rownames(df), islamDict), ]

# distinct values of the document metadata "date", stored in column V1
allDates <- as.data.frame(unique(matrix(meta(textcorpus, "date"))))

# result frame with one row per distinct date and three still empty columns
df_sum_date <- data.frame(
  matrix(
    logical(0),
    nrow = length(allDates$V1),
    ncol = 3,
    dimnames = list(NULL, c("nUsed", "nMedienmitteilungen", "dateString"))
  ),
  stringsAsFactors = FALSE
)


# For every distinct date: count how often dictionary terms occur in the
# documents of that date and how many documents belong to that date.
# Documents are assigned to a date via their column name in dfSubsetIslam.
for (i in seq_along(allDates$V1)) {
  date_value <- as.character(allDates$V1[[i]][1])

  if (length(date_value) != 1 || is.na(date_value)) {
    # No usable date: select no document. Previously an NA date was coerced to
    # the text "NA" and used as a search pattern.
    date_value <- NA_character_
    date_medienmitteilung <- character(0)
  } else {
    # the column names carry "." where the date carries "-"
    dateDot <- gsub("-", ".", date_value, fixed = TRUE)
    # literal match, so the dots are not treated as regex wildcards
    date_medienmitteilung <- grep(dateDot, colnames(dfSubsetIslam),
                                  value = TRUE, fixed = TRUE)
  }

  # columns of this date only; drop = FALSE keeps a data frame for 0 or 1 column
  df_date <- dfSubsetIslam[, date_medienmitteilung, drop = FALSE]

  df_sum_date$nUsed[i] <- sum(as.matrix(df_date))
  df_sum_date$nMedienmitteilungen[i] <- ncol(df_date)
  df_sum_date$dateString[i] <- date_value

  ## print to see what happens
  # print(date_medienmitteilung)
  # print(head(df_date))
  # print("---------------------")
  # print(df_sum_date$nUsed[i])
  # print("---------------------")
}


# TRUE for every date on which at least one Islam-related term was counted
df_sum_date$Islammeldung <- df_sum_date$nUsed > 0
# party label of this run
df_sum_date$partei <- "FDP"

# turn the date strings into objects of class Date
df_sum_date$dateString <- as.Date(unlist(df_sum_date$dateString))

# total number of Islam-related terms over all dates, printed before any row
# is removed (rows with an NA date are still included here)
sum(df_sum_date$nUsed)

# keep only rows that have a date and on which an Islam-related term was used
df_sum_date <- df_sum_date[
  !is.na(df_sum_date$dateString) & df_sum_date$Islammeldung,
]

# copy under a party-specific name so the frames of several parties can be
# combined later, then write it to disk
df_sum_date_fdp <- df_sum_date

save(list = "df_sum_date_fdp", file = "fdp_media.Rda")




### example for wordclouds (the most frequently used words which the party associates with the term "islam")
## not used in the end because there was too little data

# Term frequency / inverse document frequency (TF-IDF), not normalised;
# afterwards transposed so that rows are terms and columns are documents
tfidf <- DocumentTermMatrix(
  textcorpus,
  control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
)
tfidf <- data.frame(t(as.matrix(tfidf)))


## wordcloud
# documents whose (column) name starts with "FDP"
fdp_medienmitteilung <- grep("^FDP", colnames(tfidf), value = TRUE)
# drop = FALSE keeps a data frame even if only one document/term is left,
# which rowSums() below requires
tfidf_fdp <- tfidf[, fdp_medienmitteilung, drop = FALSE]
# only the terms that are associated with "islam" (strAssotsIslam)
tfidf_fdp <- tfidf_fdp[rownames(tfidf_fdp) %in% strAssotsIslam, , drop = FALSE]
tfidf_fdp <- rowSums(tfidf_fdp)
tfidf_fdp <- tfidf_fdp[tfidf_fdp > 0]
tfidf_fdp <- tfidf_fdp[order(tfidf_fdp, decreasing = TRUE)]
# at most the 50 highest-weighted terms; head() does not pad with NA when
# fewer than 50 terms are available (tfidf_fdp[1:50] did)
tfidf_fdp <- head(tfidf_fdp, 50)

# load library
library(wordcloud)

# plot wordcloud
# With ordered.colors = TRUE the colour vector has to be as long as the word
# vector, so the palette size follows the number of terms.
if (length(tfidf_fdp) > 0) {
  wordcloud(names(tfidf_fdp), tfidf_fdp, max.words = Inf, random.order = FALSE,
            ordered.colors = TRUE,
            colors = terrain.colors(length(tfidf_fdp), alpha = 1))
} else {
  message("No terms associated with 'islam' found; word cloud skipped.")
}