############################################################ # Project: DDJ Blogbeitrag # Autohr: Mael Kubli # Mail: mael.kubli@uzh.ch ############################################################ #Libraries library(data.table) #Data Load library(foreign) #Data Load library(sqldf) #Data Load with SQL (For Big Data ONLY) library(dplyr) #Data Manipulation & Transformation library(gdata) #Data Manipulation & Transformation library(sjmisc) #Data Manipulation & Transformation library(tidyr) #Data Manipulation library(car) #Data Transformation library(glm2) #Data Processing (Fitting Generalized Linear Models) library(lme4) #Data Processing (Mixed-Effects Models) library(MASS) #Data Analysis (Statistics) library(Hmisc) #Data Analysis (Statistics) library(arm) #Data Analysis (Statistics) library(textreg) #Data Analysis (Statistics) library(gtools) #Variuos Programming Tools library(ggplot2) #Data Visualization library(ggridges) #Data Visualization (Ridgeline Plots for ggplot2) library(viridis) library(gganimate) #devtools::install_github("dgrtwo/gganimate") !!!! library(ggsci) #Data Visualization library(scales) #Data Visualization (Extras) library(cowplot) #Data Visualization (Extras) library(ggExtra) #Data Visualization (Extras) library(sjPlot) #Data Visualization (Hirarchic Models) library(stargazer) #Well Formatted Tables library(plyr) library(forcats) library(rvest) #For rename variables (gtools library auch nötig) library(WDI) #World Development Indicators (API for Scraping) library(countrycode)#Can deal with different countrynames library(rCurl) #Webscraping library(httr) #Webscraping library(RSelenium) #Webscraping library(XML) #Webscraping library(httr) #Webscraping library(lubridate) #Webscraping ############################################################ #Setting Directory #PC #setwd("D:/CloudStation/Datensaetze/Data-Stadt-ZH") #MAC setwd("~/CloudStation/Datensaetze/Data-Stadt-ZH") ############################################################ #Data Import df <- fread("DDJ-Data-Full.csv") ############################################################ #Data Transformation #Delete Columns which are of no Interest df <- df[, c(1, 3:5, 7:14, 16, 19, 24, 29, 32:35)] #Summarize PersNums which occur more than once in the data while deleting several columns df <- df[ ,countid:=.N, by= c("PersNum", "StichtagDatJahr", "SexCd")] df <- df %>% group_by(PersNum, StichtagDatJahr, AlterV05Kurz, SexCd, Ziv2Lang, AnzahlKinder, HHtypLang, GebLandHistLang, NationHistLang, KreisLang, QuarLang, GebNum, AnzWhgGeb, AnzWezuWir, AnzZuzuWir, AnzUmzuWir, countid) %>% summarise_at(.vars = vars(AnzZimmer_mean,WohnFlaeche_mean, AnzZimmer_sum, WohnFlaeche_sum), .funs = c(mean="mean")) df <- rename(df, c("AnzZimmer_mean_mean"="AnzZimmer_mean", "WohnFlaeche_mean_mean"="WohnFlaeche_mean", "AnzZimmer_sum_mean"="AnzZimmer_sum", "WohnFlaeche_sum_mean"="WohnFlaeche_sum")) fwrite(df, file = "DDJ-Full.csv") df <- fread("DDJ-Full.csv", encoding = 'UTF-8') #Create New Variable with Wohnflaeche per person in average per building df <- df[,PersperBuilding:=.N,by= c("GebNum", "StichtagDatJahr")] df$WohFlaechePers <- ((df$WohnFlaeche_sum)/df$PersperBuilding) fwrite(df, file = "DDJ-Full.csv") ######################################################################### #Create Panel Data for Blog (2008-2016) ######################################################################### df <- fread("DDJ-Full.csv", encoding = 'UTF-8') dfp <- df[!(StichtagDatJahr < 2007)] ## make sure that your PersNum and Year variables are unique check.unique <-as.data.frame(table(dfp$PersNum, dfp$StichtagDatJahr)) ## PersNum Years missing in your dataset (might be because they did not live in Zurich then) check.unique[check.unique$Freq==0,] ## PersNum years more than once in your dataset check.unique[check.unique$Freq>1,] ## Delete Duplicates dfp <- dfp[!duplicated(dfp),] #Order Data by PersNum for future subsetting dfp <- dfp[order(dfp$PersNum),] #Create subset of PersNum that occured not in the year 2007 dfp <- dfp[unsplit(table(dfp$PersNum), dfp$PersNum) < 10, ] dfp <- dfp[!(StichtagDatJahr < 2008)] #Delete all Rows wehre the WohnFlaeche is not known #This is done here because if it is done earlier there would be no people left from before 2008 dfp <- dfp[!(is.na(dfp$WohnFlaeche_mean)),] #Delete all Builduings with no Flat dfp <- dfp[!(AnzWhgGeb == 0)] #Delete all Persons with a living space under 4m^2 dfp <- dfp[!(WohFlaechePers <= 8)] #Order Data by PersNum for future subsetting dfp <- dfp[order(dfp$PersNum),] #Create subset of PersNums that only occured over all years from 2008 to 2016 dfp <- dfp[,if(.N==9).SD,by=PersNum] #Round WohnFlaeche_mean / AnzZimmer_mean / WohFlaechePers_mean dfp <-dfp %>% mutate_at(.vars = vars(AnzZimmer_mean, WohnFlaeche_mean, WohFlaechePers), .funs = round,2) fwrite(dfp, file = "DDJ-Panel-neu.csv") ######################################################################### #Creating and changing Variables ######################################################################### dfp <- fread("DDJ-Panel-neu.csv", encoding = 'UTF-8') #Nation VAriable out of NationHistLang dfp$Nation <- car::recode(dfp$NationHistLang, "'Schweiz'= 1;'Indien'= 2; 'Europa'= 3; 'Deutschland'= 4; 'Amerika'= 5; 'Sri Lanka'= 2 ; 'Italien'= 4; 'Spanien'= 4; 'Asien'= 2; 'Österreich'= 4; 'Portugal'= 4; 'Serbien, Montenegro, Kosovo'= 6; 'Afrika'= 7; 'Grossbritannien'= 4; 'Unzuteilbar'= 8; 'Türkei'= 3; 'USA'= 5; 'Niederlande'= 4; 'Brasilien'= 5; 'Frankreich'= 4; 'Ozeanien'= 9; 'Griechenland'= 4; 'Serbien'= 6; 'Bosnien und Herzegowina'= 6; 'Mazedonien' = 6; 'Kroatien'= 6", as.factor.result=TRUE) dfp$Nation <- car::recode(dfp$Nation, "1 = 'Schweiz'; 2 = 'Asien'; 3 = 'Europa'; 4 = 'Westeuropa'; 5 = 'Amerika'; 6 = 'Balkanstaaten'; 7 = 'Afrika'; 8 = 'Unzuteilbar'; 9 = 'Ozeanien'") #Correct mistake in Dataset dfp$NationHistLang <- car::recode(dfp$NationHistLang, "'Serbien'='Serbien, Montenegro, Kosovo'") #Clear Data from GebLand Unzuteilbar dfp <- dfp[dfp$NationHistLang!="Unzuteilbar",] #Group Age in 5 classes from 0 to 4 dfp$AgeGroup <- car::recode(dfp$AlterV05Kurz, "'0-4'=0; '5-9'=0; '10-14'=0; '15-19'=0; '20-24'=1; '25-29'=1; '30-34'=1; '35-39'=1; '40-44'=2; '45-49'=2; '50-54'=2; '55-59'=2; '60-64'=3; '65-69'=3; '70-74'=3; '75-79'=3; '80-84'=4; '85-89'=4; '90-94'=4; '95-99'=4;") #Sex (0=m 1=f) dfp$SexCd <- car::recode(dfp$SexCd, "1=0; 2=1") #Set Switzerland (Schweiz) as reference for later Regression dfp <- within(dfp, Nation <- relevel(Nation, ref = "Schweiz")) dfp$AlterV05Kurz <- NULL dfp$HHtypLang <- NULL ######################################################################### #Graphics for Blog ######################################################################### #1) RidgePlot Wohnungsflaeche 2008-2016 Nach Region und Nationalität results <- "D:/CloudStation/Datensaetze/Data-Stadt-ZH/results/" Year <- unique(dfp$StichtagDatJahr) for (i in seq_along(Year)){ plot <- ggplot(data = subset(dfp, WohnFlaeche_mean %in% seq(0,300) & is.na(Nation) == FALSE & StichtagDatJahr == Year[i])) + aes(x=WohnFlaeche_mean, y=reorder(Nation, -(WohnFlaeche_mean)), fill = factor(..quantile.., labels = c("0-25 %", "25-50 %", "50-75%", "75-100 %"))) + geom_density_ridges_gradient(calc_ecdf = TRUE, quantiles = 4, rel_min_height = 0.015 ) + scale_fill_viridis(discrete = TRUE, name = "Quartile") + labs( x="Quadratmeter" ~(m^2), y="") + ggtitle(paste('Wohnungsflächenverteilung in der Stadt Zürich von ', Year[i], sep = '')) scale_x_continuous(limits = c(0, 300)) + theme_bw() ggsave(plot, file = paste(results, 'Ridge_Plot_Wohnungsflaeche_Reg_', Year[i], ".pdf", sep = ''), scale = 2, width = 8, height = 8) ggsave(plot, file = paste(results, 'Ridge_Plot_Wohnungsflaeche_Reg_', Year[i], ".png", sep = ''), scale = 2, width = 4, height = 4, dpi = 300) } for (i in seq_along(Year)){ plot <- ggplot(data = subset(dfp, WohnFlaeche_mean %in% seq(0,300) & is.na(NationHistLang) == FALSE & StichtagDatJahr == Year[i])) + aes(x=WohnFlaeche_mean, y=reorder(NationHistLang, -(WohnFlaeche_mean)), fill = factor(..quantile.., labels = c("0-25 %", "25-50 %", "50-75%", "75-100 %"))) + geom_density_ridges_gradient(calc_ecdf = TRUE, quantiles = 4, rel_min_height = 0.015 ) + scale_fill_viridis(discrete = TRUE, name = "Quartile") + labs( x="Quadratmeter" ~(m^2), y="") + ggtitle(paste('Wohnungsflächenverteilung in der Stadt Zürich von ', Year[i], sep = '')) scale_x_continuous(limits = c(0, 300)) + theme_bw() ggsave(plot, file = paste(results, 'Ridge_Plot_Wohnungsflaeche_Nat_', Year[i], ".pdf", sep = ''), scale = 2, width = 8, height = 8) ggsave(plot, file = paste(results, 'Ridge_Plot_Wohnungsflaeche_Nat_', Year[i], ".png", sep = ''), scale = 2, width = 4, height = 4, dpi = 300) } #################################################################################### #Animated Ridgepolts (2008-2016) #Only Works on Mac at the moment for me dfp$year_f <- factor(dfp$StichtagDatJahr) plot <- ggplot(data = subset(dfp, WohnFlaeche_mean %in% seq(0,250) & is.na(NationHistLang) == FALSE ), aes(x=WohnFlaeche_mean, y=reorder(NationHistLang, +(WohnFlaeche_mean)), fill = factor(..quantile.., labels = c("0-25 %", "25-50 %", "50-75%", "75-100 %")), frame = year_f)) + geom_density_ridges_gradient(calc_ecdf = TRUE, quantiles = 4, rel_min_height = 0.015 ) + scale_fill_viridis(discrete = TRUE, name = "Quartile") + labs( x="Quadratmeter" ~(m^2), y="") + scale_x_continuous(limits = c(0, 250)) + theme_bw() gganimate(plot, interval = 1.5, title_frame = TRUE, "output1.gif", ani.width=800, ani.height=600) gganimate(plot, interval = 1.5, title_frame = TRUE, "output1big.gif", ani.width=1200, ani.height=900) plot2 <- ggplot(data = subset(dfp, WohnFlaeche_mean %in% seq(0,250) & is.na(Nation) == FALSE ), aes(x=WohnFlaeche_mean, y=reorder(Nation, +(WohnFlaeche_mean)), fill = factor(..quantile.., labels = c("0-25 %", "25-50 %", "50-75%", "75-100 %")), frame = year_f)) + geom_density_ridges_gradient(calc_ecdf = TRUE, quantiles = 4, rel_min_height = 0.015 ) + scale_fill_viridis(discrete = TRUE, name = "Quartile") + labs( x="Quadratmeter" ~(m^2), y="") + scale_x_continuous(limits = c(0, 250)) + theme_bw() gganimate(plot2, interval = 1.5, title_frame = TRUE, "output2.gif", ani.width=800, ani.height=600) gganimate(plot2, interval = 1.5, title_frame = TRUE, "output2big.gif", ani.width=1200, ani.height=900) plot12 <- ggplot(data = subset(dfp, WohFlaechePers %in% seq(0,200) & is.na(NationHistLang) == FALSE ), aes(x=WohFlaechePers, y=reorder(NationHistLang, +(WohFlaechePers)), fill = factor(..quantile.., labels = c("0-25 %", "25-50 %", "50-75%", "75-100 %")), frame = year_f)) + geom_density_ridges_gradient(calc_ecdf = TRUE, quantiles = 4, rel_min_height = 0.015 ) + scale_fill_viridis(discrete = TRUE, name = "Quartile") + labs( x="Quadratmeter" ~(m^2), y="") + scale_x_continuous(limits = c(0, 200)) + theme_bw() gganimate(plot2, interval = 1.5, title_frame = TRUE, "output12.gif", ani.width=800, ani.height=600) gganimate(plot2, interval = 1.5, title_frame = TRUE, "output12big.gif", ani.width=1200, ani.height=900) plot22 <- ggplot(data = subset(dfp, WohFlaechePers %in% seq(0,200) & is.na(Nation) == FALSE ), aes(x=WohFlaechePers, y=reorder(Nation, +(WohFlaechePers)), fill = factor(..quantile.., labels = c("0-25 %", "25-50 %", "50-75%", "75-100 %")), frame = year_f)) + geom_density_ridges_gradient(calc_ecdf = TRUE, quantiles = 4, rel_min_height = 0.015 ) + scale_fill_viridis(discrete = TRUE, name = "Quartile") + labs( x="Quadratmeter" ~(m^2), y="") + scale_x_continuous(limits = c(0, 200)) + theme_bw() gganimate(plot22, interval = 1.5, title_frame = TRUE, "output22.gif", ani.width=800, ani.height=600) gganimate(plot22, interval = 1.5, title_frame = TRUE, "output22big.gif", ani.width=1200, ani.height=900) ########################################################################## #2) Ridge Plot for 2008 & 2012 & 2016 ggplot() + aes(y=reorder(NationHistLang, +(WohnFlaeche_mean))) + geom_density_ridges(data = subset(dfp, WohnFlaeche_mean %in% seq(0,250) & StichtagDatJahr == c("2008")), aes(x=WohnFlaeche_mean, fill = paste(StichtagDatJahr)), alpha = .2, color = "white", from = 0, to = 250) + geom_density_ridges(data = subset(dfp, WohnFlaeche_mean %in% seq(0,250) & StichtagDatJahr == c("2012")), aes(x=WohnFlaeche_mean, fill = paste(StichtagDatJahr)), alpha = .2, color = "white", from = 0, to = 250) + geom_density_ridges(data = subset(dfp, WohnFlaeche_mean %in% seq(0,250) & StichtagDatJahr == c("2016")), aes(x=WohnFlaeche_mean, fill = paste(StichtagDatJahr)), alpha = .2, color = "white", from = 0, to = 250) + labs(title = "Verteilung der Wohnungsfläche in der Stadt Zuerich", x="Quadratmeter" ~(m^2), y="") + #guides(fill = guide_legend(title = "Jahr")) + guides(fill=FALSE) + facet_wrap(~StichtagDatJahr) + scale_x_continuous(limits = c(0, 250)) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + ggsave("Ridge-Plot-bewegung.pdf", width=8) + ggsave("Ridge-Plot-bewegung.png", height = 3, width=6, dpi = 300) ########################################################################## #3) Veraenderung der Wohnungsflaeche pro Region ueber die Zeit ggplot(data = dfp, aes(y=WohnFlaeche_mean, x=StichtagDatJahr, color=Nation)) + geom_jitter(width=0.1, alpha=.05)+ #geom_line(stat="summary", fun.y="mean", color="black", size=2) + geom_smooth(method = "loess", se = FALSE, na.rm = FALSE) + scale_y_continuous(limits = c(0,200)) + labs(title = "Entwicklung der Wohnungsfläche von 2008 - 2016", x="", y="Wohnungsfläche in" ~(m^2)) + guides(color = guide_legend(title = "Region")) + theme_bw() + ggsave("Entwicklung-Flaeche.pdf", height = 8, width = 8) + ggsave("Entwicklung-Flaeche.png", height = 4, width=6, dpi = 300) #3.1 Veraenderung der Wohnungsflaeche pro Land ueber die Zeit ggplot(data = dfp, aes(y=WohnFlaeche_mean, x=StichtagDatJahr, color=NationHistLang)) + geom_jitter(width=0.1, alpha=.05)+ #geom_line(stat="summary", fun.y="mean", color="black", size=2) + geom_smooth(method = "loess", se = FALSE, na.rm = FALSE) + scale_y_continuous(limits = c(0,200)) + labs(title = "Entwicklung der Wohnungsfläche von 2008 - 2016", x="", y="Wohnungsfläche in" ~(m^2)) + guides(color = guide_legend(title = "Herkunftsland")) + facet_wrap(~ NationHistLang) + theme_bw() + ggsave("Entwicklung-Flaeche-Herkunftsland-facet.pdf", height = 8, width = 8) + ggsave("Entwicklung-Flaeche-Herkunftsland-facet.png", height = 4, width=4, dpi = 300) ########################################################################## #4) Veraenderung der Wohnungsflaeche pro Region ueber die Zeit ohne die Schweiz ggplot(data = subset(dfp, Nation != "Unzuteilbar" & Nation != "Schweiz"), aes(y=WohnFlaeche_mean, x=StichtagDatJahr, color=Nation)) + geom_jitter(width=0.1, alpha=.05)+ #geom_point(stat="summary", fun.y="mean", color="red", size=2) + geom_smooth(method = "loess", se = FALSE, na.rm = FALSE) + scale_y_continuous(limits = c(0,300)) + labs(title = "Entwicklung der Wohnungsfläche von 2008 - 2016", x="", y="Wohnungsfläche in" ~(m^2)) + guides(color = guide_legend(title = "Region")) + theme_bw() + ggsave("Entwicklung-Flaeche-ohne-ch.pdf", height = 8, width = 8) + ggsave("Entwicklung-Flaeche-ohne.ch.png", height = 4, width=4, dpi = 300) ########################################################################## #5) Veraenderung der Wohnungsflaeche pro Region ueber die Zeit Facet ggplot(data = subset(dfp, Nation != "Unzuteilbar"), aes(y=WohnFlaeche_mean, x=StichtagDatJahr, color=Nation)) + geom_jitter(width=0.1, alpha=.05)+ #geom_point(stat="summary", fun.y="mean", color="red", size=2) + geom_smooth(method = "loess", se = FALSE, na.rm = FALSE) + scale_y_continuous(limits = c(0,300)) + labs(title = "Entwicklung der Wohnungsfläche von 2008 - 2016", x="", y="Wohnunfsfläche in" ~(m^2)) + guides(color = guide_legend(title = "Region")) + facet_wrap(~ Nation) + theme_bw() + ggsave("Entwicklung-Flaeche-Facet.pdf", height = 8, width = 8) + ggsave("Entwicklung-Flaeche-Facet.png", height = 4, width=4, dpi = 300) ######################################################################### #Regression ######################################################################### #Linear Regressions for Wohnungsflaeche per Person dfp$StichtagDatJahr_K <- (dfp$StichtagDatJahr-2008) mod1 <- lm(WohnFlaeche_mean ~ Nation + AgeGroup + AnzahlKinder + SexCd, data=dfp ) mod2 <- lm(WohnFlaeche_mean ~ Nation + StichtagDatJahr_K + AgeGroup + AnzahlKinder + SexCd, data=dfp ) mod3 <- lm(WohnFlaeche_mean ~ Nation + StichtagDatJahr_K + AgeGroup + AnzahlKinder, data=dfp) stargazer(mod1, mod2, mod3, type = "html", header=FALSE, style = "default", title="Determinanten für die Wohnungsflächen in der Stadt Zürich", covariate.labels = c("Afrika", "Amerika", "Asien", "Balkanstaaten", "Europa", "Ozeanien", "Westeuropa", "Jahr", "Altersgruppe", "Kinderanzahl", "Geschlecht (ref. = m)", "Abzisse"), dep.var.caption = c("Abhängige Variable:"), dep.var.labels = c("Wohnungsfläche"), single.row = T, out="DDJ-Regression-Output-1.html" ) ########################################################################## #Saeulendiagramm der Wohnungsflaeche data_summary <- function(data, varname, groupnames){ require(plyr) summary_func <- function(x, col){ c(mean = mean(x[[col]], na.rm=TRUE), sd = sd(x[[col]], na.rm=TRUE)) } data_sum<-ddply(data, groupnames, .fun=summary_func, varname) data_sum <- rename(data_sum, c("mean" = varname)) return(data_sum) } dfpsmall <- data_summary(dfp, varname = "WohnFlaeche_mean", groupnames = c("StichtagDatJahr", "Nation")) ggplot(dfpsmall, aes(x=Nation, y=WohnFlaeche_mean, fill=as.factor(StichtagDatJahr))) + geom_bar(position=position_dodge(), stat="identity") + labs(title="Wohnungsfäche pro Person von verschiednenen Nationalitätsgruppen ", y="Wohnungsfläche " ~(m^2), x="") + guides(fill=guide_legend("")) + theme(legend.position = "bottom", legend.justification = "center", legend.direction = "horizontal") + geom_text(aes(label=round(WohnFlaeche_mean, 0)), position=position_dodge(width=0.9), vjust=-0.25, size = 2) + scale_y_continuous(expand = c(0, 0)) ########################################################################### #Boxplot Nationality vs Wohnungsflaeche per Person ggplot(data = dfp, aes(x=reorder(NationHistLang, -WohFlaechePers), y=WohFlaechePers, color = Nation)) + geom_boxplot(width = .75) + labs(title="Wohnfläche pro Person verschiedner Nationalitäten ", x="", y="Wohnfläche pro Person " ~(m^2)) + guides(color = guide_legend(title = "Region")) + theme(axis.text.x = element_text(color="black", size=10, angle=45, hjust = 1)) + theme(panel.border = element_rect(colour = "black"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + scale_y_continuous(expand = c(0, 0), limits = c(0, 150)) ########################################################################### #Anzahl Wohnungen in der Satdt über die Jahre woh <- fread("wohnung.csv") Wohnungen <- setDT(woh)[, .N, keyby=StichtagDatJahr] Wohnungen #Anzahl Gebäude in der Satdt über die Jahre geb <- fread("gebaeude.csv") Gebaeude <- setDT(geb)[, .N, keyby=StichtagDatJahr] Gebaeude