#load packages
library("swissparl")
## Warning: package 'swissparl' was built under R version 4.0.5
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("dplyr")

# 1. Datensätze herunterladen

# Download the SubjectBusiness table (German rows only). It provides the link
# between a subject (IdSubject) and the business it belongs to.
sb.dt <- get_data(table = "SubjectBusiness", Language = "DE")
# Download the transcripts (264'982 objects). The sessions are requested in
# small batches because of loading issues with larger requests.
#
# Each list element holds the session IDs of one batch. A session ID must
# appear in exactly one batch; otherwise its transcripts are downloaded twice
# and end up duplicated in the combined data frame.
ts_sessions <- list(
  as.character(4601:4610),
  as.character(4611:4616),
  as.character(4617:4620),
  as.character(4701:4705),
  as.character(4706:4710),
  as.character(4711:4714),
  as.character(4715:4718),
  as.character(4801:4805),
  as.character(4806:4810),
  as.character(4811:4815),
  as.character(4816:4820),
  as.character(4901:4905),
  as.character(4906:4910),
  as.character(4911:4915),
  as.character(4916:4920),
  as.character(5001:5005),
  as.character(5006:5010),
  as.character(5011:5015),
  as.character(5016:5019),
  as.character(5101:5105),
  as.character(5106:5109)
)
# Fail before the (slow) download starts if a session ID was listed twice.
stopifnot(anyDuplicated(unlist(ts_sessions)) == 0L)

# One request per batch; the result is an unnamed list of data frames in the
# same order as ts_sessions.
ts_batches <- lapply(
  ts_sessions,
  function(id_session) {
    get_data(table = "Transcript", IdSession = id_session)
  }
)

# combine the transcript batches
ts.dt <- do.call(rbind, ts_batches)
# Persist the combined transcripts: a CSV without row names, a second CSV
# that also writes the row names as its first column, and an RDS copy.
write.csv(x = ts.dt, file = "swissparltext.csv", row.names = FALSE)
write.csv(x = ts.dt, file = "swissparltextId.csv", row.names = TRUE)
saveRDS(object = ts.dt, file = "swissparltext.rds")
# get data on MPs (Person table, German rows only)
ps.dt <- get_data(table = "Person", Language = "DE")
# get data on the party of each MP (MemberParty table, German rows only)
pm.dt <- get_data(table = "MemberParty", Language = "DE")

# 2. Datensätze zusammenfügen

# Subject side: keep only the columns that identify a subject and its business.
sb_clean.dt <- sb.dt %>%
  select(IdSubject, BusinessNumber, BusinessShortNumber, Title, SortOrder)

# Transcript side: keep only the rows whose Language is "DE".
ts_clean.dt <- filter(ts.dt, Language == "DE")

# Attach the transcripts to their subject. As a left join, subjects without a
# matching transcript are kept.
text.dt <- sb_clean.dt %>%
  left_join(ts_clean.dt, by = "IdSubject")

# Speaker side: gender from the Person table, party from the MemberParty table.
ps_clean.dt <- ps.dt %>%
  select(PersonNumber, GenderAsString)
pm_clean.dt <- pm.dt %>%
  select(PersonNumber, PartyNumber, PartyName, PartyAbbreviation)

# NOTE(review): if MemberParty can hold several rows per PersonNumber, this
# join and the final one below multiply rows -- confirm against the data.
speaker.dt <- ps_clean.dt %>%
  left_join(pm_clean.dt, by = "PersonNumber")

# Full data set: every subject/transcript row with its speaker attributes.
swissparldata.dt <- text.dt %>%
  left_join(speaker.dt, by = "PersonNumber")

# 3. Datensatz speichern

Da es sich um sehr viele Daten handelt und das Laden mit `get_data()` mehrere Stunden dauert, speichere ich den Datensatz ab und beginne ein zweites Dokument.

# Store the merged data set as CSV (row names written as the first column)
# and as RDS.
write.csv(x = swissparldata.dt, file = "swissparldata.csv", row.names = TRUE)
saveRDS(object = swissparldata.dt, file = "swissparldata.rds")

`