############################################################################################################
# WEB SCRAPER: MEDIENMITTEILUNGEN (MEDIA RELEASES), FDP-Die Liberalen, SWITZERLAND, HISTORICAL
############################################################################################################

# Libraries (plyr is loaded before dplyr so that dplyr's verbs are not masked)
library(httr)
library(RCurl)
library(rvest)
library(plyr)
library(dplyr)
library(Rcrawler)
library(data.table)
library(readr)
library(tidyr)
library(stringr)
library(foreign)

setwd("C:/Users/Sedef/Desktop/Datenjournalismus_FS18/DDJ-Texte/Texte_UTF-8")

############################################################################################################
# Webpage of interest
url <- "http://www.fdp.ch/aktuell/medienmitteilungen/"
myOpts <- curlOptions(connecttimeout = 250)

############################################################################################################
# Preparation: collect the links to all media releases

Links <- data.frame()       # data frame with all links to media releases
Linkspages <- data.frame()  # data frame with all links to paginated archive pages

archivlink <- c(2010:2017)

for (a in archivlink) {
  tryCatch({
    pg <- read_html(paste0(url, "medienmitteilungen-", a, "/"))

    # Extract all release links on the first archive page of the year
    pgsub <- pg %>% html_nodes(xpath = '//h2/a')
    linkspg <- html_attr(pgsub, "href")   # extracts all links left in pgsub
    tmp0 <- as.data.frame(linkspg)        # transforms the links into a data frame
    Links <- rbind(Links, tmp0)

    # Extract the links to the remaining pages of the year's archive (pagination)
    pgadd <- pg %>% html_nodes(xpath = '//div[21]/ul/li/a')
    linkadd <- html_attr(pgadd, "href")
    tmp01 <- as.data.frame(linkadd)
    Linkspages <- rbind(Linkspages, tmp01)

    Sys.sleep(2)
  }, error = function(e) {
    cat("ERROR:", conditionMessage(e), " Error on page:", a, "\n")
  })
}

Linkspages <- data.frame(lapply(Linkspages, as.character), stringsAsFactors = FALSE)

# Walk through the pagination pages and collect the remaining release links
# (the RCurl-style ssl_verifypeer argument is carried over from the original script;
# current xml2/rvest versions may ignore it)
for (p in Linkspages$linkadd) {
  tryCatch({
    pg <- read_html(paste0("http://www.fdp.ch/", p), ssl_verifypeer = 0L)
    pgsub <- pg %>% html_nodes(xpath = '//h2/a')
    linkspg <- html_attr(pgsub, "href")
    tmp02 <- as.data.frame(linkspg)
    Links <- rbind(Links, tmp02)
    Sys.sleep(2)
  }, error = function(e) {
    cat("ERROR:", conditionMessage(e), " Error on page:", p, "\n")
  })
}

Links_failsafe <- Links  # backup copy before cleaning

# Remove all links that do not lead to a media release
Links <- Links[!duplicated(Links$linkspg), ]
Links <- as.data.frame(Links)
Links <- Links[grep("medienmitteilungen", Links$Links), ]
Links <- as.data.frame(Links)
Links <- filter(Links, Links != "")
Links <- data.frame(lapply(Links, as.character), stringsAsFactors = FALSE)
Links$Links <- gsub("https://www.fdp.ch/", "", Links$Links)

############################################################################################################
# Actual scraper: downloads each media release and stores it as a UTF-8 text file
# tagged as Medienmitteilung | Datum | FDP

j <- 1
status <- NA  # HTTP status of the last request, used in the error handler below
Sys.setlocale("LC_TIME", "English")  # %B is locale-dependent; use "German" if the site prints German month names

for (i in Links$Links) {
  tryCatch({
    # Fetch the page once via httr to record its HTTP status code for error reporting
    repos <- GET(url = paste0("http://www.fdp.ch/", i))
    status <- status_code(repos)

    pg2 <- read_html(paste0("http://www.fdp.ch/", i), .opts = myOpts, ssl_verifypeer = 0L)

    Titel <- pg2 %>% html_nodes(xpath = '//h1') %>% html_text()
    Titel <- gsub("\r?\n|\r", " ", Titel[1])
    Titel <- trimws(Titel, which = "both")

    Datum <- pg2 %>% html_nodes(xpath = '//time') %>% html_text()
    Datum <- Datum[1]
    Datum <- gsub("\r?\n|\r|\t", " ", Datum)
    Datum <- trimws(Datum, which = "both")
    Datum <- as.Date(Datum, "%d. %B %Y")

    Text <- pg2 %>% html_nodes(xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "article", " " ))]') %>% html_text()
    Text <- gsub("\r?\n|\r|\t", " ", Text[1])  # removes all line breaks and tabs
    Text <- trimws(Text, which = "both")

    Akteur <- "FDP-Die Liberalen"
    Kuerzel <- "FDP"
    Quelle <- "https://www.fdp.ch/aktuell/medienmitteilungen/"
    tmp <- data.frame(Datum, Titel, Akteur, Kuerzel, Text, Quelle)  # per-release metadata (not persisted by this loop; see sketch below)

    mytime <- Datum
    myfile <- file.path(getwd(), paste0("FDP_Medienmitteilung_", mytime, "_ID_", j, ".txt"))
    fileConn <- file(myfile, encoding = "UTF-8")
    writeLines(Text, fileConn)
    close(fileConn)

    j <- j + 1
    Sys.sleep(2)
  }, error = function(e) {
    cat("ERROR:", conditionMessage(e), " HTTP status:", status, " on page:", i, "\n")
  })
}