#Kommentiertes R-Skript
#Titel: Den Stadtzürchern vergeht die Lust am Heiraten – unabhängig von der Nationalität
#Sedef Biçer (sedef.bicer@uzh.ch)
#Forschungsseminar: Politischer Datenjournalismus (HS 2017)
#Dozierende: Prof. Dr. Fabrizio Gilardi, Dr. Bruno Wüest, Alexandra Kohler
#Abgabedatum: 17.12.2017


###########
## Setup ##
###########

#load Packages
library(ggplot2)
library(ggridges)
library(texreg)

# Point the session at the shared data folder; the relative file name used
# by read.csv() below is resolved against it.
setwd("//pwifiles.uzh.ch/datenjournalismus/Daten")

# Read the population file and show its first rows.
bevoelkerung <- read.csv(file = "bevoelkerung.csv")
head(bevoelkerung)

# Filter values for the small-multiples data set: the nationalities and the
# five-year age bands that are kept.
subNation <- c(
  "Italien", "Türkei", "Schweiz", "Deutschland",
  "Portugal", "Spanien", "Serbien, Montenegro, Kosovo"
)
subAlter <- c("20-24", "25-29", "30-34", "35-39", "40-44")

# Civil-status values occurring in the data, without "Unbekannt".
# factor() is applied last so that a dropped value does not linger as a level.
subZivilstand <- unique(bevoelkerung$Ziv2Lang)
subZivilstand <- factor(subZivilstand[subZivilstand != "Unbekannt"])

# Keep only rows that match all three filters.
BEV <- subset(
  bevoelkerung,
  Ziv2Lang %in% subZivilstand &
    AlterV05Kurz %in% subAlter &
    NationHistLang %in% subNation
)
head(BEV)

#######################################################
## Build dataset for ggplot introduction graph: 20-44## 
#######################################################

# Derive a coarse age-group column (AlterKatNeu) for the introduction graph.
# Rows whose five-year band falls in neither group keep NA.
bevoelkerung$AlterKatNeu <- NA
Zwischen20und44 <- c("20-24", "25-29", "30-34", "35-39", "40-44")
Zwischen45und74 <- c("45-49", "50-54", "55-59", "60-64", "65-69", "70-74")
# All bands above 44; belongs to the earlier "> 44" grouping and is kept only
# so that the name stays defined.
Aelter44 <- c("45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95-99", "100 u. älter")

bevoelkerung$AlterKatNeu[bevoelkerung$AlterV05Kurz %in% Zwischen20und44] <- "20-44"
# Use the 45-74 bands here: matching against Aelter44 would also put everyone
# aged 75 and older under the "45-74" label.
bevoelkerung$AlterKatNeu[bevoelkerung$AlterV05Kurz %in% Zwischen45und74] <- "45-74"

# Label sets for the two grouping variants; only Aelter19bis74 matches the
# labels assigned above.
Aelter19 <- c("20-44", "> 44")
Aelter19bis74 <- c("20-44", "45-74")

# Records in one of the two age groups with a known civil status.
BEV_Intro <- subset(bevoelkerung, AlterKatNeu %in% Aelter19bis74 & Ziv2Lang %in% subZivilstand)
head(BEV_Intro)

# Flag every record as Swiss ("Schweizer") or foreign ("Ausländer") based on
# NationHistLang; a missing nationality yields NA.
bevoelkerung$Auslaender <- ifelse(
  bevoelkerung$NationHistLang == "Schweiz",
  "Schweizer",
  "Ausländer"
)

# Distinct flag values, in order of first appearance in the data.
Auslaender <- unique(bevoelkerung$Auslaender)

# Records aged 20-44 with a known civil status.
BEV_Intro2 <- subset(
  bevoelkerung,
  Ziv2Lang %in% subZivilstand & AlterV05Kurz %in% Zwischen20und44
)
head(BEV_Intro2)



# Dimensions of the result table: civil-status categories (without
# "Unbekannt") and the years present in BEV.
Zivilstand <- subZivilstand
Jahr <- sort(unique(BEV$StichtagDatJahr))

# ----------------------------------------------
# Age groups 20-44 versus 45-74
# One row per combination of year, civil status and age group.
BEV_Intro_Table <- expand.grid(
  Jahr = Jahr,
  Zivilstand = Zivilstand,
  Alter = Aelter19bis74
)
head(BEV_Intro_Table, 50)
nrow(BEV_Intro_Table)

BEV_Intro_Table$Anteil_Zivilstand <- NA

# Fill in, for each cell, the percentage of records of that year and age
# group that have civil status k. Progress is echoed to the console.
for (k in Zivilstand) {
  print(k)
  for (l in Jahr) {
    print(l)
    for (m in Aelter19bis74) {
      print(m)
      subBEV_Intro <- subset(BEV_Intro, StichtagDatJahr == l & AlterKatNeu == m)
      anteil <- 100 * mean(subBEV_Intro$Ziv2Lang == k)
      print(anteil)
      # Inside with(), Zivilstand/Jahr/Alter are the table columns; k, l, m
      # are the loop variables.
      BEV_Intro_Table$Anteil_Zivilstand[
        with(BEV_Intro_Table, Zivilstand == k & Jahr == l & Alter == m)
      ] <- anteil
    }
  }
}

head(BEV_Intro_Table, 100)

# Render the civil-status time series per age group into Zeitreihe-1.png
# (6 x 4 inches at 300 ppi) in the submission folder.
setwd("C:/Users/Sedef/Desktop/Datenjournalismus/Abgabe")
ppi <- 300
png(file = "Zeitreihe-1.png", width = 6 * ppi, height = 4 * ppi, res = ppi)
# print() is explicit so the plot is also drawn when the script is run via
# source(), where top-level values are not auto-printed.
# NOTE(review): the title names only ages 20-44 although the facets show both
# age groups - confirm the intended wording.
print(
  ggplot(data = subset(BEV_Intro_Table, Zivilstand %in% c("Ledig", "Verheiratet", "Geschieden"))) +
    aes(x = Jahr, y = Anteil_Zivilstand, color = Zivilstand) +
    geom_point() +
    geom_smooth(se = FALSE) +
    labs(title = "Zivilstand der Stadtzürcher im Alter von 20 - 44 Jahren", subtitle = "(Anteil verheiratet, ledig und geschieden, 1993 - 2016)", y = "Prozent") +
    ylim(0, 100) +
    scale_x_continuous(breaks = c(1995, 2000, 2005, 2010, 2015)) +
    # One facet row per age group. The global `Alter` is not defined yet at
    # this point of the script, so the row count comes from Aelter19bis74.
    facet_wrap(~Alter, nrow = length(Aelter19bis74))
)
dev.off()
#-------------------------------------------------------------

# Swiss versus foreign residents
# One row per combination of year, civil status and nationality flag.
BEV_Intro2_Table <- expand.grid(
  Jahr = Jahr,
  Zivilstand = Zivilstand,
  Nationalität = Auslaender
)
head(BEV_Intro2_Table, 50)
nrow(BEV_Intro2_Table)

BEV_Intro2_Table$Anteil_Zivilstand <- NA

# Fill in, for each cell, the percentage of records of that year and
# nationality flag that have civil status k. Progress is echoed to the console.
for (k in Zivilstand) {
  print(k)
  for (l in Jahr) {
    print(l)
    for (m in Auslaender) {
      print(m)
      # Within subset(), `Auslaender` is the column of BEV_Intro2, not the
      # global vector being looped over.
      subBEV_Intro2 <- subset(BEV_Intro2, StichtagDatJahr == l & Auslaender == m)
      anteil <- 100 * mean(subBEV_Intro2$Ziv2Lang == k)
      print(anteil)
      # Inside with(), Zivilstand/Jahr/Nationalität are the table columns.
      BEV_Intro2_Table$Anteil_Zivilstand[
        with(BEV_Intro2_Table, Zivilstand == k & Jahr == l & Nationalität == m)
      ] <- anteil
    }
  }
}

head(BEV_Intro2_Table, 100)

# Render the civil-status time series for Swiss and foreign residents into
# Zeitreihe-2.png (7 x 4 inches at 300 ppi) in the submission folder.
setwd("C:/Users/Sedef/Desktop/Datenjournalismus/Abgabe")
ppi <- 300
png(file = "Zeitreihe-2.png", width = 7 * ppi, height = 4 * ppi, res = ppi)
# print() is explicit so the plot is also drawn when the script is run via
# source(), where top-level values are not auto-printed.
print(
  ggplot(data = subset(BEV_Intro2_Table, Zivilstand %in% c("Ledig", "Verheiratet", "Geschieden"))) +
    aes(x = Jahr, y = Anteil_Zivilstand, color = Zivilstand) +
    geom_point() +
    geom_smooth(se = FALSE) +
    labs(title = "Zivilstand der Stadtzürcher im Alter von 20 - 44 Jahren", subtitle = "(Anteil verheiratet, ledig und geschieden unterteilt in Ausländer und Schweizer, 1993 - 2016)", y = "Prozent") +
    ylim(0, 100) +
    scale_x_continuous(breaks = c(1995, 2000, 2005, 2010, 2015)) +
    # One facet row each for the two nationality flags.
    facet_wrap(~Nationalität, nrow = 2)
)
dev.off()
#-------------------------------------------------------------


#############################################
## Build dataset for ggplot small multiples## 
#############################################

# Dimensions of the small-multiples table, taken from the filtered data.
Nation <- sort(unique(BEV$NationHistLang))

Alter <- sort(unique(BEV$AlterV05Kurz))

Zivilstand <- sort(unique(BEV$Ziv2Lang))

Jahr <- sort(unique(BEV$StichtagDatJahr))

# One row per combination of nationality, year, age band and civil status.
BEV_Nation <- expand.grid(Nationalitaet = Nation, Jahr = Jahr, Alter = Alter, Zivilstand = Zivilstand)
head(BEV_Nation, 300)
nrow(BEV_Nation)

BEV_Nation$Anteil_Zivilstand <- NA

# Fill in, for each cell, the percentage of records of that nationality, age
# band and year that have civil status k. Nationality and age band are echoed
# as progress output.
for (i in Nation) {
  print(i)
  for (j in Alter) {
    print(j)
    for (l in Jahr) {
      # The subset depends only on nationality, age band and year, so it is
      # built once here and reused for every civil status.
      subBEV <- subset(BEV, NationHistLang == i & AlterV05Kurz == j & StichtagDatJahr == l)
      for (k in Zivilstand) {
        BEV_Nation$Anteil_Zivilstand[BEV_Nation$Nationalitaet == i & BEV_Nation$Alter == j & BEV_Nation$Zivilstand == k & BEV_Nation$Jahr == l] <- 100 * mean(subBEV$Ziv2Lang == k)
      }
    }
  }
}

head(BEV_Nation, 100)
# Nationalities and age bands shown in the small-multiples figure
# ("Deutschland" is part of BEV_Nation but is not plotted).
NationInSmallMultiple <- c("Italien", "Türkei", "Schweiz", "Portugal", "Spanien", "Serbien, Montenegro, Kosovo")
AlterInSmallMultiple <- c("20-24", "25-29", "30-34", "35-39", "40-44")

# Render the figure into smallMulitples.png (18 x 18 inches at 300 ppi) in
# the current working directory.
# NOTE(review): the file name is spelled "smallMulitples" - left unchanged in
# case other material refers to it.
ppi <- 300
png(file = "smallMulitples.png", width = 18 * ppi, height = 18 * ppi, res = ppi)
# print() is explicit so the plot is also drawn when the script is run via
# source(), where top-level values are not auto-printed.
print(
  ggplot(data = subset(BEV_Nation, Zivilstand %in% c("Ledig", "Verheiratet", "Geschieden") & Nationalitaet %in% NationInSmallMultiple & Alter %in% AlterInSmallMultiple)) +
    aes(x = Jahr, y = Anteil_Zivilstand, color = Zivilstand) +
    geom_point() +
    geom_smooth(se = FALSE) +
    # One facet row per plotted nationality; the row count follows the plotted
    # selection rather than every nationality contained in BEV.
    facet_wrap(~ Nationalitaet + Alter, nrow = length(NationInSmallMultiple)) +
    labs(title = "Zivilstand nach Alterskategorie und Nationalität", subtitle = "Anteil verheiratet, ledig und geschieden, 1993 - 2016", y = "Prozent") +
    ylim(0, 100) +
    theme(
      text = element_text(size = 18),
      axis.text = element_text(size = 15),
      axis.title = element_text(size = 20, face = "bold")
    )
)
dev.off()


# Look-up 1: share (in %) of one civil status in one year, for Swiss or
# foreign residents aged 20-44.
# NOTE: Jahr and Zivilstand are overwritten with single values here, so the
# table-building code above cannot be re-run afterwards without first
# recreating them.
Jahr <- 1993 # year to query
Zivilstand <- "Verheiratet" # civil status to query
NationInAua <- "Ausländer" # or "Schweizer"
BEV_Intro2_Table[
  BEV_Intro2_Table$Jahr == Jahr &
    BEV_Intro2_Table$Zivilstand == Zivilstand &
    BEV_Intro2_Table$Nationalität == NationInAua,
  "Anteil_Zivilstand"
] # in %

# Look-up 2: share (in %) of one civil status in one year for a given
# nationality and five-year age band.
Jahr <- 2016 # year to query
Zivilstand <- "Verheiratet" # civil status to query
Nation <- "Schweiz"
Alter <- "30-34"
BEV_Nation[
  BEV_Nation$Jahr == Jahr &
    BEV_Nation$Zivilstand == Zivilstand &
    BEV_Nation$Nationalitaet == Nation &
    BEV_Nation$Alter == Alter,
  "Anteil_Zivilstand"
]