########################################################
# Sedef Bicer, University of Zurich
# DDJ FS_2018
# Text-Analysen
#########################################################

### This code serves as a template for all speeches. Only the party-specific terms have to be changed.


####### Preset
# NOTE(review): rm() and setwd() change the state of the calling session. They
# are kept because the relative paths used further down rely on this working
# directory.
rm(list = ls(all.names = TRUE)) # clear working space (argument name spelled out)
setwd("/Users/**/Datenjournalismus_FS18") # define working directory
options(stringsAsFactors = FALSE) # disallow automatic conversion of strings to factors
set.seed(123) # ensure reproducibility

# Packages to install once before the first run. "ngram" is listed because the
# script loads it with library(ngram) in the next section.
libs <- c("tm", "stringr", "ngram", "RTextTools", "stm", "ggplot2")
#install.packages(libs) # only once!


##### Corpus import and preprocessing

# load necessary packages
library(tm)
library(stringr)
library(ngram)
library(ggplot2)


# location of the CSV file that holds the original speeches
dir <- "./DDJ-Texte/speeches1995-2017_lang_datum.csv"

# read every parliamentary speech; character columns stay character
all <- read.csv(
  file = dir,
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# quick look at the first rows and at the column types
head(all)
str(all)

# store the speech date (datum) as class Date instead of character
all$datum <- as.Date(all$datum)
### subset the data (for example only one party or a specific period)

# The data of one party cannot be turned into a single data frame further down
# because it becomes too large, so every party is processed in several periods.
# Activate exactly one period (start inclusive, end exclusive):
## FDP1: 2000 till 2004
#period_start <- as.Date("2000-01-01"); period_end <- as.Date("2005-01-01")
## FDP2: 2005 till 2009
#period_start <- as.Date("2005-01-01"); period_end <- as.Date("2010-01-01")
## FDP3: 2010 till 2014
#period_start <- as.Date("2010-01-01"); period_end <- as.Date("2015-01-01")
## FDP4: 2015 till 2019
period_start <- as.Date("2015-01-01")
period_end <- as.Date("2020-01-01")

# the FDP shows up under two labels because the party changed its name
party_labels <- c("FDP", "FDP-Liberale")

# All conditions must hold at the same time. The former expression
#   a & b & party == "FDP" | party == "FDP-Liberale" & sprache == "german"
# was evaluated as (a & b & FDP) | (FDP-Liberale & german) because `&` binds
# tighter than `|`: "FDP" rows were not restricted to german and
# "FDP-Liberale" rows were not restricted to the period.
subFromAll <- subset(
  all,
  datum >= period_start & datum < period_end &
    party %in% party_labels & sprache == "german"
)

# special case FDP: merge the renamed party label into "FDP"
subFromAll$party[subFromAll$party == "FDP-Liberale"] <- "FDP"

#### the labelling issue!!!
# NOTE(review): documents and their meta data are matched purely by position
# (document i <-> row i of subFromAll); presumably this is the "labelling
# issue" the heading refers to - confirm.
textcorpus <- VCorpus(VectorSource(subFromAll$speech), readerControl = list(language = "german"))

# inspect corpus
summary(textcorpus)

# the positional matching below only works if there is one document per row
stopifnot(length(textcorpus) == nrow(subFromAll))

#############
# check and add meta data, e.g. author
# adding meta data party, typ, date and id, make sure the encoding is UTF-8

# meta data of one document before the update (skipped for small subsets)
if (length(textcorpus) >= 4) print(textcorpus[[4]]$meta)

# seq_along() yields an empty sequence for an empty corpus, whereas
# 1:length() would run with the indices 1 and 0
for (i in seq_along(textcorpus)) {
  meta(textcorpus[[i]], "Party") <- subFromAll$party[i]
  meta(textcorpus[[i]], "typ") <- "speech"
  meta(textcorpus[[i]], "date") <- subFromAll$datum[i]
  meta(textcorpus[[i]], "ID") <- subFromAll$id[i]
  Encoding(textcorpus[[i]]$content) <- "UTF-8"
}

# spot check of the updated meta data; indices that do not exist in a small
# subset are skipped instead of stopping with "subscript out of bounds"
for (k in intersect(c(2L, 480L), seq_along(textcorpus))) {
  print(textcorpus[[k]]$meta)
}

## create document names for the dtm
# One name per row of subFromAll, built from the party, the literal "_speech_",
# the date and the id, e.g. "FDP_speech_2015-03-02_id123".
# paste0() is vectorised, so no index loop is needed; an empty subset now gives
# an empty name vector instead of a bogus "NA_speech_NA_idNA" entry.
DocNames <- paste0(
  subFromAll$party,
  "_speech_",
  as.character(subFromAll$datum),
  "_id",
  as.character(subFromAll$id)
)

# look at the contents
textcorpus[[1]]$content

# Get information what tm can do
getTransformations()


# lowercase transformation
textcorpus <- tm_map(textcorpus, content_transformer(tolower))
textcorpus[[1]]$content

# remove punctuations
textcorpus <- tm_map(textcorpus, removePunctuation)
textcorpus[[1]]$content

# remove numbers
textcorpus <- tm_map(textcorpus, removeNumbers)
textcorpus[[1]]$content

# remove stopwords
stopwords("german") # stoplist for german
textcorpus <- tm_map(textcorpus, removeWords, c(stopwords("german"), "dass", "dadurch"))
textcorpus[[1]]$content

# stem words
textcorpus <- tm_map(textcorpus, stemDocument, language = "german")
textcorpus[[1]]$content

# remove unnecessary whitespace
# Each document is collapsed into one string, every run of whitespace becomes a
# single blank and leading/trailing blanks are cut off. Running this through
# tm_map() like the other steps avoids the 1:length() index loop, which breaks
# on an empty corpus.
squeezeWhitespace <- content_transformer(function(x) {
  oneLine <- gsub("\\s+", " ", paste(x, collapse = "\n"))
  gsub("^\\s+|\\s+$", "", oneLine)
})
textcorpus <- tm_map(textcorpus, squeezeWhitespace)
textcorpus[[1]]$content

# save your textcorpus
save(textcorpus, file = "./DDJ-Texte/corpus_preprocessing.RData")


##### Feature generation

# load your textcorpus
load("./DDJ-Texte/corpus_preprocessing.RData")


# generate document feature matrix
# Only terms with at least three characters are kept. The bound is passed as
# `wordLengths`; the former `minWordLength` entry is not among the control
# options that tm documents for termFreq().
dtm <- DocumentTermMatrix(textcorpus, control = list(wordLengths = c(3, Inf)))


# inspect your feature matrix
dtm
# show the terms 100 to 105, limited to the columns that actually exist
termCols <- intersect(100:105, seq_len(ncol(dtm)))
if (length(termCols) > 0) inspect(dtm[, termCols])

# name the documents with a term that contains the party the date and the id
# (one name per document is required, otherwise the matrix would be mislabelled)
stopifnot(length(DocNames) == nrow(dtm))
dtm$dimnames[[1]] <- DocNames


##### Dictionary building


## convert data as a data frame

# Rows are terms, columns are documents. check.names = TRUE (the data.frame()
# default) turns the document names into syntactic names, so the "-" inside
# the dates becomes ".". The former call passed `optional = F` to data.frame(),
# which has no such argument and therefore added a column called "optional".
df <- data.frame(t(as.matrix(dtm)), check.names = TRUE)


### Islam dictionary

## define all words which are related to Islam and are used for this topic
# load the words of the dictionary (source: macmillan, translated by the author)
islamDict <- read.csv(file = "islamDict_UTF-8.csv", header = TRUE, encoding = "UTF-8")
islamDict <- as.vector(islamDict$Islam.related.words)
islamDict <- tolower(islamDict)

# The corpus terms were stripped of punctuation and numbers and then stemmed,
# so the dictionary entries get the same treatment. The untreated entries are
# kept as well, i.e. the dictionary only grows.
dictEntries <- islamDict[!is.na(islamDict) & nzchar(islamDict)]
islamDictStemmed <- stemDocument(
  removeNumbers(removePunctuation(dictEntries)),
  language = "german"
)

# other corpus terms that contain one of the key stems
corpusTerms <- rownames(df)
wordsContainingIslam <- grep("islam", corpusTerms, value = TRUE, fixed = TRUE)
wordsContainingMuslim <- grep("muslim", corpusTerms, value = TRUE, fixed = TRUE)
wordsContainingMinarett <- grep("minaret", corpusTerms, value = TRUE, fixed = TRUE)
wordsContainingBurka <- grep("burka", corpusTerms, value = TRUE, fixed = TRUE)
wordsContainingDschihad <- grep("dschihad", corpusTerms, value = TRUE, fixed = TRUE)

# put all words together
islamDict <- c(
  islamDict, islamDictStemmed,
  wordsContainingIslam, wordsContainingMuslim, wordsContainingMinarett,
  wordsContainingBurka, wordsContainingDschihad
)
islamDict <- sort(unique(islamDict))

# load corpus
#load(file = "./DDJ-Texte/corpus_preprocessing.RData")


# subset the dataframe, only use terms which are related to islam (as in the
# dictionary); drop = FALSE keeps a data frame even if only one column is left
dfSubsetIslam <- df[rownames(df) %in% islamDict, , drop = FALSE]

# number of dictionary terms used in every single document (= column)
docTotals <- colSums(dfSubsetIslam)

# Recover the date from the document names ("<party>_speech_<date>_id<id>").
# The separator inside the date may be "." (after make.names) or "-". Anchoring
# on "_speech_" and "_id" keeps digits of the id from being read as a date,
# which the former unanchored pattern with "." wildcards could not rule out.
# Names that do not fit the scheme stay unchanged and match no date.
docDate <- sub(
  "^.*_speech_([0-9]{4})[.-]([0-9]{2})[.-]([0-9]{2})_id.*$",
  "\\1-\\2-\\3",
  names(docTotals)
)

# vector with all dates (datum) a speech was made; missing dates cannot be
# assigned to any document and are left out
allDates <- unique(subFromAll$datum)
allDates <- allDates[!is.na(allDates)]
dateKeys <- as.character(allDates)

# one row per date:
#   nUsed            - number of dictionary terms used on that date
#   nParlamentsreden - number of speeches held on that date
#   dateString       - the date as character ("YYYY-MM-DD")
df_sum_date <- data.frame(
  nUsed = vapply(
    dateKeys, function(d) sum(docTotals[docDate == d]),
    numeric(1), USE.NAMES = FALSE
  ),
  nParlamentsreden = vapply(
    dateKeys, function(d) sum(docDate == d),
    integer(1), USE.NAMES = FALSE
  ),
  dateString = dateKeys,
  stringsAsFactors = FALSE
)

# short look at the result (replaces the per-date debugging output)
head(df_sum_date)


# TRUE for every date on which at least one Islam-related term was used
df_sum_date$Islammeldung <- 0 < df_sum_date$nUsed
# party this result belongs to
df_sum_date$partei <- "FDP"

# turn the date strings into class Date
df_sum_date$dateString <- as.Date(unlist(df_sum_date$dateString))

# total number of Islam-related terms over all dates (rows without date included)
sum(df_sum_date$nUsed)

# first drop the rows without a date ...
hasDate <- !is.na(df_sum_date$dateString)
df_sum_date <- df_sum_date[hasDate, ]

# ... then keep only the dates on which an Islam-related term was used
df_sum_date <- df_sum_date[df_sum_date$Islammeldung, ]


# the same result is stored under the name of every period; only the object of
# the period that was actually processed is written to disk below
df_sum_date_fdp4 <- df_sum_date_fdp3 <- df_sum_date_fdp2 <- df_sum_date_fdp1 <- df_sum_date


## save dataframe (choose the right one!! -> comment the others!)
#save(df_sum_date_fdp1, file = "fdp1_speech.Rda")
#save(df_sum_date_fdp2, file = "fdp2_speech.Rda")
#save(df_sum_date_fdp3, file = "fdp3_speech.Rda")
save(df_sum_date_fdp4, file = "fdp4_speech.Rda")