########################################################
# Sedef Bicer, University of Zurich
# DDJ FS_2018
# Text-Analysen
#########################################################

####### Preset
# Session setup: wipe the workspace, fix the working directory, and set global
# options and the RNG seed.
# NOTE(review): rm() and setwd() tie the script to one machine and session.
# Both are kept because the relative "./DDJ-Texte/..." paths used further down
# resolve against this working directory.
rm(list = ls(all.names = TRUE)) # clear working space (argument spelled out, no partial matching)
setwd("/Users/**/Datenjournalismus_FS18") # define working directory
options(stringsAsFactors = FALSE) # disallow automatic conversion of strings to factors
set.seed(123) # ensure reproducibility

# Packages to install once. The list includes every package this script loads
# with library(), i.e. also "ngram" and "wordcloud".
libs <- c("tm", "stringr", "ngram", "RTextTools", "stm", "ggplot2", "wordcloud")
#install.packages(libs) # only once!


##### Corpus import and preprocessing

# load necessary packages
library(tm)
library(stringr)
library(ngram)
library(ggplot2)


# define path to the original documents
dir <- "./DDJ-Texte/speeches1995-2017_lang_datum.csv"

# load all speeches held in parliament
all <- read.csv(dir, stringsAsFactors = FALSE, encoding = "UTF-8")
# show an overview of the data
head(all)
str(all)

# convert the date column (datum) to class Date
all$datum <- as.Date(all$datum)

# Harmonise the party label BEFORE filtering: the FDP changed its name to
# "FDP-die Liberalen", so rows labelled "FDP-Liberale" belong to the same party.
# Recoding after the filter would be a no-op, because those rows would already
# have been dropped. %in% keeps the index free of NA for missing party values.
all$party[all$party %in% "FDP-Liberale"] <- "FDP"

# subset the data (for example only one party or from a specific date)
all <- subset(all, party == "FDP" & sprache == "german")


#### the labelling issue!!!
# Build a volatile corpus with one document per row of `all`, taking the text
# from the `speech` column.
textcorpus <- VCorpus(VectorSource(all$speech), readerControl = list(language = "german"))


# inspect corpus
summary(textcorpus)

#############
# check and add meta data, e.g. author
textcorpus[[1]]$meta
# seq_along() instead of 1:length(): the latter iterates over c(1, 0) when the
# corpus is empty.
for (i in seq_along(textcorpus)) {
  meta(textcorpus[[i]], "Party") <- all$party[i]
  meta(textcorpus[[i]], "typ") <- "speech"
  meta(textcorpus[[i]], "date") <- all$datum[i]
  meta(textcorpus[[i]], "ID") <- all$id[i]
  # mark the text as UTF-8 so later string operations treat it consistently
  Encoding(textcorpus[[i]]$content) <- "UTF-8"
}
# Spot-check the metadata of two further documents. The indices are capped at
# the corpus size so that a small subset does not fail with "subscript out of
# bounds".
textcorpus[[min(2L, length(textcorpus))]]$meta
textcorpus[[min(480L, length(textcorpus))]]$meta

## create document names for the dtm
# Each name combines the party, the document type ("speech"), the date and the
# id of the row: "<party>_speech_<datum>_id<id>".
# paste0() is vectorised, so no index loop or preallocation is needed. The
# explicit empty case avoids fabricating a single bogus name when `all` has no
# rows (paste0() treats zero-length arguments as "").
if (nrow(all) > 0) {
  DocNames <- paste0(all$party, "_speech_", as.character(all$datum), "_id", as.character(all$id))
} else {
  DocNames <- character(0)
}

# Text of the first document before any cleaning
content(textcorpus[[1]])

# List the transformations that tm ships with
getTransformations()


# Step 1: convert everything to lower case
textcorpus <- tm_map(x = textcorpus, FUN = content_transformer(tolower))
content(textcorpus[[1]])

# Step 2: strip punctuation
textcorpus <- tm_map(x = textcorpus, FUN = removePunctuation)
content(textcorpus[[1]])

# Step 3: strip digits
textcorpus <- tm_map(x = textcorpus, FUN = removeNumbers)
content(textcorpus[[1]])

# Step 4: drop stop words (tm's German list plus two extra words)
stopwords(kind = "german") # print the German stop list
textcorpus <- tm_map(
  x = textcorpus,
  FUN = removeWords,
  words = c(stopwords(kind = "german"), "dass", "dadurch")
)
content(textcorpus[[1]])

# Step 5: reduce the remaining words to their stems
textcorpus <- tm_map(x = textcorpus, FUN = stemDocument, language = "german")
content(textcorpus[[1]])

# remove unnecessary whitespace
# For every document: join its lines into one string, collapse each run of
# whitespace into a single space, and trim leading/trailing whitespace.
# tm_map() with content_transformer() replaces the former 1:length() index
# loop, which iterated over c(1, 0) on an empty corpus.
textcorpus <- tm_map(textcorpus, content_transformer(function(x) {
  collapsed <- gsub("\\s+", " ", paste(x, collapse = "\n"))
  gsub("^\\s+|\\s+$", "", collapsed)
}))
textcorpus[[1]]$content

# Persist the preprocessed corpus so the cleaning steps need not be rerun
save(list = "textcorpus", file = "./DDJ-Texte/corpus_preprocessing.RData")


##### Feature generation

# Restore the preprocessed corpus (recreates the object `textcorpus`)
load(file = "./DDJ-Texte/corpus_preprocessing.RData")



# generate document feature matrix
# The token-length filter is spelled `wordLengths = c(min, max)` in tm's
# termFreq() control list. The former `minWordLength` entry is not a recognised
# option and was silently ignored.
dtm <- DocumentTermMatrix(textcorpus, control = list(wordLengths = c(3, Inf)))


# inspect your feature matrix
dtm
inspect(dtm[, 100:105])

# name the documents with a term that contains the party, the date and the id
# Guard against stale or mismatching names: one name per document (row).
stopifnot(length(DocNames) == nrow(dtm))
dtm$dimnames[[1]] <- DocNames



##### Dictionary building


# Terms in the dtm whose correlation with the term "islam" is at least 0.30
assotsIslam <- findAssocs(x = dtm, terms = "islam", corlimit = 0.30)
# Keep only the associated terms themselves (the names of the first list entry)
strAssotsIslam <- names(assotsIslam[[1L]])


### ---------------------

# Term frequency/Inverse document frequency (TF-IDF)
# Build a second document-term matrix with unnormalised TF-IDF weights.
tfidf <- DocumentTermMatrix(textcorpus, control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
# Label the documents with DocNames. This matrix is rebuilt from the corpus, so
# it does not inherit the names assigned to `dtm`; without them the later
# "^FDP" column filter finds no documents.
stopifnot(length(DocNames) == nrow(tfidf))
tfidf$dimnames[[1]] <- DocNames
# Convert to a data frame with terms in rows and documents in columns.
# data.frame() makes the column names syntactic (e.g. "-" in dates becomes ".").
tfidf <- as.data.frame(as.matrix(tfidf))
tfidf <- data.frame(t(tfidf))


## wordcloud
# Select the document columns whose name starts with "FDP"
fdp_medienmitteilung <- colnames(tfidf)[grepl("^FDP", colnames(tfidf))]
tfidf_fdp <- subset(tfidf, select = fdp_medienmitteilung)
# subset the tfidf (only the words which the party associates with islam);
# drop = FALSE keeps a data frame even if only one document column is left,
# which rowSums() requires
tfidf_fdp <- tfidf_fdp[(rownames(tfidf_fdp) %in% strAssotsIslam), , drop = FALSE]
# total weight per term over all selected documents, highest first
tfidf_fdp <- rowSums(tfidf_fdp)
tfidf_fdp <- tfidf_fdp[tfidf_fdp > 0]
tfidf_fdp <- tfidf_fdp[order(tfidf_fdp, decreasing = TRUE)]
# keep at most 50 terms; head() does not pad with NA when fewer are available
tfidf_fdp <- head(tfidf_fdp, 50)

# load library
library(wordcloud)

# plot wordcloud for the most frequent words associated with the term "islam" (not used in the end)
# With ordered.colors = TRUE the colour vector must have one entry per word,
# so its length follows the number of terms actually kept.
if (length(tfidf_fdp) > 0) {
  wordcloud(names(tfidf_fdp), tfidf_fdp, max.words = Inf, random.order = FALSE,
            ordered.colors = TRUE, colors = terrain.colors(length(tfidf_fdp), alpha = 1))
} else {
  message("No terms associated with 'islam' have a positive TF-IDF weight; skipping word cloud.")
}

# Word cloud of the terms most strongly associated with "islam":
# word size follows the association value returned by findAssocs()
wordsAssotsIslam <- unlist(assotsIslam)
namesWordsAssotsIslam <- names(assotsIslam[["islam"]])

wordcloud(
  words = namesWordsAssotsIslam,
  freq = wordsAssotsIslam,
  scale = c(3, 0.75),
  max.words = 100,
  random.order = FALSE,
  ordered.colors = TRUE,
  # one colour per word, as required by ordered.colors = TRUE
  colors = terrain.colors(n = length(wordsAssotsIslam), alpha = 1)
)