############################################################################################################
# WEB SCRAPER: MEDIENMITTEILUNGEN (MEDIA RELEASES), FDP-Die Liberalen, SWITZERLAND, HISTORICAL
############################################################################################################

# Libraries (plyr is loaded before dplyr so that dplyr's verbs are not masked)
library(httr)
library(RCurl)
library(rvest)
library(plyr)
library(dplyr)
library(Rcrawler)
library(data.table)
library(readr)
library(tidyr)
library(stringr)
library(foreign)

setwd("C:/Users/Sedef/Desktop/Datenjournalismus_FS18/DDJ-Texte/Texte_UTF-8")

############################################################################################################
# Webpage of interest
url <- "http://www.fdp.ch/aktuell/medienmitteilungen/"
myOpts <- curlOptions(connecttimeout = 250)

############################################################################################################
# Preparation: collect the links to all media releases

Links <- data.frame()       # data frame with all links to media releases
Linkspages <- data.frame()  # data frame with all links to paginated archive pages

archivlink <- c(2010:2017)

for (a in archivlink) {
  tryCatch({
    pg <- read_html(paste0(url, "medienmitteilungen-", a, "/"))

    # Extract all release links on the first archive page of the year
    pgsub <- pg %>% html_nodes(xpath = '//h2/a')
    linkspg <- html_attr(pgsub, "href")   # extracts all links left in pgsub
    tmp0 <- as.data.frame(linkspg)        # transforms the links into a data frame
    Links <- rbind(Links, tmp0)

    # Extract the links to the remaining pages of the year's archive (pagination)
    pgadd <- pg %>% html_nodes(xpath = '//div[21]/ul/li/a')
    linkadd <- html_attr(pgadd, "href")
    tmp01 <- as.data.frame(linkadd)
    Linkspages <- rbind(Linkspages, tmp01)

    Sys.sleep(2)
  }, error = function(e) {
    cat("ERROR:", conditionMessage(e), " Error on page:", a, "\n")
  })
}

Linkspages <- data.frame(lapply(Linkspages, as.character), stringsAsFactors = FALSE)

# Walk through the pagination pages and collect the remaining release links
# (the RCurl-style ssl_verifypeer argument is carried over from the original script;
# current xml2/rvest versions may ignore it)
for (p in Linkspages$linkadd) {
  tryCatch({
    pg <- read_html(paste0("http://www.fdp.ch/", p), ssl_verifypeer = 0L)
    pgsub <- pg %>% html_nodes(xpath = '//h2/a')
    linkspg <- html_attr(pgsub, "href")
    tmp02 <- as.data.frame(linkspg)
    Links <- rbind(Links, tmp02)
    Sys.sleep(2)
  }, error = function(e) {
    cat("ERROR:", conditionMessage(e), " Error on page:", p, "\n")
  })
}

Links_failsafe <- Links  # backup copy before cleaning

# Remove all links that do not lead to a media release
Links <- Links[!duplicated(Links$linkspg), ]
Links <- as.data.frame(Links)
Links <- Links[grep("medienmitteilungen", Links$Links), ]
Links <- as.data.frame(Links)
Links <- filter(Links, Links != "")
Links <- data.frame(lapply(Links, as.character), stringsAsFactors = FALSE)
Links$Links <- gsub("https://www.fdp.ch/", "", Links$Links)

############################################################################################################
# Actual scraper: downloads each media release and stores it as a UTF-8 text file
# tagged as Medienmitteilung | Datum | FDP

j <- 1
status <- NA  # HTTP status of the last request, used in the error handler below
Sys.setlocale("LC_TIME", "English")  # %B is locale-dependent; use "German" if the site prints German month names

for (i in Links$Links) {
  tryCatch({
    # Fetch the page once via httr to record its HTTP status code for error reporting
    repos <- GET(url = paste0("http://www.fdp.ch/", i))
    status <- status_code(repos)

    pg2 <- read_html(paste0("http://www.fdp.ch/", i), .opts = myOpts, ssl_verifypeer = 0L)

    Titel <- pg2 %>% html_nodes(xpath = '//h1') %>% html_text()
    Titel <- gsub("\r?\n|\r", " ", Titel[1])
    Titel <- trimws(Titel, which = "both")

    Datum <- pg2 %>% html_nodes(xpath = '//time') %>% html_text()
    Datum <- Datum[1]
    Datum <- gsub("\r?\n|\r|\t", " ", Datum)
    Datum <- trimws(Datum, which = "both")
    Datum <- as.Date(Datum, "%d. %B %Y")

    Text <- pg2 %>% html_nodes(xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "article", " " ))]') %>% html_text()
    Text <- gsub("\r?\n|\r|\t", " ", Text[1])  # removes all line breaks and tabs
    Text <- trimws(Text, which = "both")

    Akteur <- "FDP-Die Liberalen"
    Kuerzel <- "FDP"
    Quelle <- "https://www.fdp.ch/aktuell/medienmitteilungen/"
    tmp <- data.frame(Datum, Titel, Akteur, Kuerzel, Text, Quelle)  # per-release metadata (not persisted by this loop; see sketch below)

    mytime <- Datum
    myfile <- file.path(getwd(), paste0("FDP_Medienmitteilung_", mytime, "_ID_", j, ".txt"))
    fileConn <- file(myfile, encoding = "UTF-8")
    writeLines(Text, fileConn)
    close(fileConn)

    j <- j + 1
    Sys.sleep(2)
  }, error = function(e) {
    cat("ERROR:", conditionMessage(e), " HTTP status:", status, " on page:", i, "\n")
  })
}