Libraries

library(quanteda)
## Package version: 2.0.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(formattable)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(parallel)
library(regexSelect)
library(ggplot2)
library(ggthemes)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
# Colour palette (hex codes) referenced by the plots in this script ----

# Gender: women / men
frau <- "#70b7b6"
mann <- "#c8b446"

# Left / centre / right placement of a party
links  <- "#ffc65c"
mitte  <- "#f08f43"
rechts <- "#e06552"

# Age group: old / young
alt  <- "#472d30"
jung <- "#bf8891"

Daten einlesen & Vorbereiten

# Load the tweet data set and prepare it for the analysis ----
# NOTE(review): absolute path, presumably only valid on the author's machine.
setwd("/Users/annameisser/Desktop/DATA/AAA")

load("NR_SR_Jan_Okt_2019.RData")

# Keep rows whose language is "de" and that are flagged "1" as Nationalrat
# or Staenderat candidate.
NR_SR_Jan_Okt_2019 <- NR_SR_Jan_Okt_2019 %>%
  filter(
    Lang == "de",
    Candidate.Nationalrat == "1" | Candidate.Staenderat == "1"
  )

# Remove "http" plus the graphical characters that follow it from the text.
NR_SR_Jan_Okt_2019$Text <- str_remove_all(
  NR_SR_Jan_Okt_2019$Text,
  "http[:graph:]+"
)

# Age group derived from the numeric age: 17-38 -> "jung", 39-100 -> "alt".
NR_SR_Jan_Okt_2019$Alter <- car::recode(
  as.numeric(NR_SR_Jan_Okt_2019$Age),
  "17:38 = 'jung'; 39:100 = 'alt'"
)

# Corpus built from the Text column; the other columns come along with it.
corpus_SP <- corpus(NR_SR_Jan_Okt_2019, text_field = "Text")
# corpus_SP

Anzahl Tweets Frauen / Männer im Datensatz

### Total number of tweets by women / men in the data set, used as the
### denominator when frequencies are compared across genders.

NR_klein <- NR_SR_Jan_Okt_2019 %>% select(c("Gender"))

library(reshape)
library(reshape2)

# Empty strings and the literal text "NA" are recoded to "u".
# NOTE(review): a real missing value (NA) is not matched by the quoted 'NA';
# confirm whether that is intended.
NR_klein$Gender <- car::recode(NR_klein$Gender, "'' = 'u'; 'NA'='u'")

# Count the rows per gender directly instead of adding a column of ones to a
# filtered copy and summing it.
NR_klein_f <- sum(NR_klein$Gender == "f", na.rm = TRUE)
NR_klein_m <- sum(NR_klein$Gender == "m", na.rm = TRUE)

cases <- c(NR_klein_f, NR_klein_m)
group <- c("f", "m")

# Built with named columns. The previous as.data.frame(cases, group) passed
# `group` positionally as `row.names`, which only worked by accident.
tweet_cases <- data.frame(
  cases = cases,
  group = group,
  stringsAsFactors = FALSE
)

Anzahl Parteien im Datensatz

###### Total number of tweets per party, used as the denominator when
###### frequencies are compared across parties.

NR_party <- NR_SR_Jan_Okt_2019 %>% select(c("Party_Short"))

# Empty strings and the literal text "NA" are recoded to "u".
NR_party$Party_Short <- car::recode(NR_party$Party_Short, "'' = 'u'; 'NA'='u'")

# Number of rows in NR_party whose Party_Short equals `label`.
# Replaces the repeated filter / add-ones / sum stanza used for every party.
count_party <- function(label) {
  sum(NR_party$Party_Short == label, na.rm = TRUE)
}

NR_party_glp     <- count_party("glp")
NR_party_FDP     <- count_party("FDP")
NR_party_CVP     <- count_party("CVP")
NR_party_SP      <- count_party("SP")
NR_party_Grüne   <- count_party("Grüne")
NR_party_SVP     <- count_party("SVP")
NR_party_jglp    <- count_party("jglp")
NR_party_jf      <- count_party("jf")
NR_party_BDP     <- count_party("BDP")
NR_party_JUSO    <- count_party("JUSO")
NR_party_JCVP    <- count_party("JCVP")
NR_party_EVP     <- count_party("EVP")
NR_party_JSVP    <- count_party("JSVP")
NR_party_JG      <- count_party("JG")
NR_party_Piraten <- count_party("Piraten")
NR_party_JBDP    <- count_party("JBDP")
NR_party_jevp    <- count_party("jevp")
NR_party_up      <- count_party("up!")
NR_party_AL      <- count_party("AL ZH")

cases <- c(
  NR_party_glp, NR_party_FDP, NR_party_CVP, NR_party_SP, NR_party_Grüne,
  NR_party_SVP, NR_party_jglp, NR_party_jf, NR_party_BDP, NR_party_JUSO,
  NR_party_JCVP, NR_party_EVP, NR_party_JSVP, NR_party_JG, NR_party_Piraten,
  NR_party_JBDP, NR_party_jevp, NR_party_up, NR_party_AL
)
group <- c(
  "glp", "FDP", "CVP", "SP", "Grüne", "SVP", "jglp", "jf", "BDP", "JUSO",
  "JCVP", "EVP", "JSVP", "JG", "Piraten", "JBDP", "jevp", "up!", "AL ZH"
)

# Built with named columns. The previous as.data.frame(cases, group) passed
# `group` positionally as `row.names`.
tweet_cases2 <- data.frame(
  cases = cases,
  group = group,
  stringsAsFactors = FALSE
)

Anzahl Alt / Jung

### Number of tweets per age group (old: over 38, young: 38 or younger),
### used as the denominator when frequencies are compared across age groups.

NR_alter <- NR_SR_Jan_Okt_2019 %>% select(c("Age"))

# Compare ages as numbers. The previous version recoded missing ages to the
# string "u" and then evaluated `Age > 38`; on a character column that is a
# lexicographic comparison (so "u" counts as old and "100" as young).
# Values that cannot be parsed as a number become NA and are left out.
age_years <- suppressWarnings(as.numeric(as.character(NR_alter$Age)))

NR_alt <- sum(age_years > 38, na.rm = TRUE)
NR_jung <- sum(age_years <= 38, na.rm = TRUE)

cases <- c(NR_alt, NR_jung)
group <- c("alt", "jung")

# Built with named columns. The previous as.data.frame(cases, group) passed
# `group` positionally as `row.names`.
tweet_cases3 <- data.frame(
  cases = cases,
  group = group,
  stringsAsFactors = FALSE
)

1. Wie sieht geschlechterinklusive Sprache aus?

# Overview table of gender-inclusive writing: one row per variant (Art) with
# an example (Beispiel) and the value assigned to it in Form.
Art <- c(
  "Neutralisierung", "Doppelenennung", "Schrägstrich", "Klammerschreibweise",
  "Binnen-I", "Gendergap", "GenderX", "Genderstern"
)

Beispiel <- c(
  "die Studierenden", "Kollegen & Kolleginnen", "Kolleg/innen",
  "Kolleg(inn)en", "LehrerInnen", "Lehrer_Innen", "Lehrx", "Richter*innen"
)

Form <- factor(c(
  "Neutralisierung", "Feminisierung", "Feminisierung", "Feminisierung",
  "Feminisierung", "Genderinklusiv", "Genderinklusiv", "Genderinklusiv"
))

sprache <- as.data.frame(Art)
sprache[["Beispiel"]] <- Beispiel
sprache[["Form"]] <- Form

# Left-align all three columns.
tabelle <- formattable(
  sprache,
  align = c("l", "l", "l"),
  list(area(col = Form ~ normalize_bar("red")))
)

tabelle
Art Beispiel Form
Neutralisierung die Studierenden Neutralisierung
Doppelenennung Kollegen & Kolleginnen Feminisierung
Schrägstrich Kolleg/innen Feminisierung
Klammerschreibweise Kolleg(inn)en Feminisierung
Binnen-I LehrerInnen Feminisierung
Gendergap Lehrer_Innen Genderinklusiv
GenderX Lehrx Genderinklusiv
Genderstern Richter*innen Genderinklusiv

2. Datensatz anschauen: Wer twittert?

# Columns used for the "who tweets?" descriptives. Status_id is included
# because the per-tweet summaries of this table group by it; without it those
# group_by(..., Status_id) calls cannot find the column.
# NOTE(review): assumes the loaded data set has a Status_id column - confirm.
wer_twittert <- select(
  NR_SR_Jan_Okt_2019,
  Alter, Party_Short, Age, Gender, Source, Name, Status_id
)

#' Most frequent value of a vector
#'
#' @param v A vector.
#' @return The value of `v` that occurs most often; when several values are
#'   tied, the one that appears first in `v`.
getmode <- function(v) {
  distinct_values <- unique(v)
  occurrences <- tabulate(match(v, distinct_values))
  distinct_values[which.max(occurrences)]
}

Wer twittert Plots Gender

############################## Gender ##############################

# One row per Name / Gender combination; rows containing NA are dropped.
gender <- wer_twittert %>%
  group_by(Name, Gender) %>%
  select(Name) %>%
  summarise_all(sum) %>%
  na.omit()

geschlecht <- getmode(gender$Gender)
geschlecht_m <- filter(gender, Gender == "m")
geschlecht_f <- filter(gender, Gender == "f")

# Bar chart: number of candidates per gender.
WT_geschlecht <- ggplot() +
  geom_bar(
    data = gender,
    mapping = aes(x = Gender),
    fill = alpha(c(frau, mann), .9)
  ) +
  theme_tufte() +
  xlab("") +
  ylab("") +
  ggtitle("Anzahl Kandidat*innen") +
  theme(text = element_text(family = "Arial")) +
  scale_x_discrete(labels = c("Frauen", "Männer"))

# One row per Gender / Status_id combination.
gender2 <- wer_twittert %>%
  group_by(Gender, Status_id) %>%
  select(Status_id) %>%
  summarise_all(sum) %>%
  na.omit()

geschlecht_m2 <- filter(gender2, Gender == "m")
geschlecht_f2 <- filter(gender2, Gender == "f")

# Bar chart: number of tweets per gender.
WT_geschlecht2 <- ggplot() +
  geom_bar(
    data = gender2,
    mapping = aes(x = Gender),
    fill = alpha(c(frau, mann), .9)
  ) +
  ylab("") +
  theme_tufte() +
  xlab("") +
  ggtitle("Anzahl Tweets") +
  theme(text = element_text(family = "Arial")) +
  scale_x_discrete(labels = c("Frauen", "Männer"))

# Both charts side by side.
plot_grid(WT_geschlecht, WT_geschlecht2)

Wer twittert Alter

############################################################################
############################# Age ##########################################

wer_twittert$Age <- as.numeric(wer_twittert$Age)

# Mean and median age, computed on one row per Name / Age combination.
alter <- wer_twittert %>%
  group_by(Name, Age) %>%
  select(Name) %>%
  summarise_all(sum)
alter_mean <- mean(alter$Age, na.rm = TRUE)
alter_median <- median(alter$Age, na.rm = TRUE)

# `alter` is reused: now one row per Name / age-group combination.
alter <- wer_twittert %>%
  group_by(Name, Alter) %>%
  select(Name) %>%
  summarise_all(sum) %>%
  na.omit()

alter_alt <- filter(alter, Alter == "alt")
alter_jung <- filter(alter, Alter == "jung")

# Bar chart: number of candidates per age group.
WT_alter <- ggplot() +
  geom_bar(
    data = alter,
    mapping = aes(x = Alter),
    fill = alpha(c(alt, jung), .9)
  ) +
  ylab("") +
  theme_tufte() +
  xlab("") +
  ggtitle("Anzahl Kandidat*innen") +
  theme(text = element_text(family = "Arial")) +
  scale_x_discrete(labels = c("über 38", "unter 38"))

# One row per age-group / Status_id combination.
alter2 <- wer_twittert %>%
  group_by(Alter, Status_id) %>%
  select(Status_id) %>%
  summarise_all(sum) %>%
  na.omit()

alter_alt2 <- filter(alter2, Alter == "alt")
alter_jung2 <- filter(alter2, Alter == "jung")

# Bar chart: number of tweets per age group.
WT_alter2 <- ggplot() +
  geom_bar(
    data = alter2,
    mapping = aes(x = Alter),
    fill = alpha(c(alt, jung), .9)
  ) +
  ylab("") +
  theme_tufte() +
  xlab("") +
  ggtitle("Anzahl Tweets") +
  theme(text = element_text(family = "Arial")) +
  scale_x_discrete(labels = c("über 38", "unter 38"))

# Both charts side by side.
plot_grid(WT_alter, WT_alter2)

Wer twittert Partei

###############################################################################
######################### Party ###############################################

# One row per Name / Party_Short combination; rows containing NA are dropped.
partei <- wer_twittert %>%
  group_by(Name, Party_Short) %>%
  select(Name) %>%
  summarise_all(sum) %>%
  na.omit()

partei_mode <- getmode(partei$Party_Short)

# Order the factor levels by descending frequency so the bars are sorted.
partei <- within(
  partei,
  Party_Short <- factor(
    Party_Short,
    levels = names(sort(table(Party_Short), decreasing = TRUE))
  )
)

head(partei$Party_Short)
## [1] FDP     Piraten Piraten CVP     SP      Grüne  
## 39 Levels: glp FDP CVP SP Grüne SVP jglp jf BDP JUSO JCVP EVP JSVP ... PUM
partei <- filter(
  partei,
  Party_Short %in% c("glp", "FDP", "CVP", "SP", "Grüne", "SVP", "jglp", "jf", "BDP", "JUSO")
)

# Bar colours encode left / centre / right. They are given positionally and
# therefore have to follow the factor level order printed above.
# Removed from the previous version: the misspelled `bindwidth` argument, a
# trailing empty argument in geom_bar(), and a scale_fill_manual() call with
# a misplaced parenthesis that had no effect (no fill aesthetic is mapped).
WT_partei <- ggplot() +
  geom_bar(
    data = partei,
    mapping = aes(x = Party_Short),
    fill = c(mitte, rechts, mitte, links, links, rechts, mitte, rechts, rechts, links)
  ) +
  theme_tufte() +
  theme(axis.text.x = element_text(angle = 45)) +
  xlab("") +
  ylab("") +
  ggtitle("Anzahl Kandidat*innen") +
  theme(text = element_text(family = "Arial"))

# One row per Status_id / Party_Short combination.
partei2 <- wer_twittert %>%
  group_by(Status_id, Party_Short) %>%
  select(Status_id) %>%
  summarise_all(sum) %>%
  na.omit()

partei2 <- within(
  partei2,
  Party_Short <- factor(
    Party_Short,
    levels = names(sort(table(Party_Short), decreasing = TRUE))
  )
)

head(partei2$Party_Short)
## [1] glp  LOVB SP   JCVP SVP  jevp
## 39 Levels: CVP glp jglp up! FDP SVP SP Grüne EVP jf JSVP Piraten BDP ... PdA
partei2 <- filter(
  partei2,
  Party_Short %in% c("CVP", "glp", "jglp", "up!", "SVP", "FDP", "SP", "Grüne", "EVP", "jf")
)

# Same positional colour convention as above. The labs(fill = ...) and
# scale_color_manual() layers were dropped because neither fill nor colour is
# mapped to data, so they never produced a legend.
WT_partei2 <- ggplot() +
  geom_bar(
    data = partei2,
    mapping = aes(x = Party_Short),
    fill = c(mitte, mitte, mitte, mitte, rechts, rechts, links, links, mitte, rechts)
  ) +
  theme_tufte() +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Anzahl Tweets") +
  theme(text = element_text(family = "Arial")) +
  xlab("") +
  ylab("")

# Both charts side by side.
plot_grid(WT_partei, WT_partei2)

3. Reihenfolge

Für die Reihenfolge braucht es die ursprüngliche Wortreihenfolge: in context / tokens sentences

quanteda_options(threads = detectCores())

# Lower-cased tokens in their original order (word order matters here).
RF_tokens <- tokens(corpus_SP) %>% tokens_tolower()

# RF_dfm <- dfm(RF_tokens)

# Noun pairs (feminine form, masculine form) and the connectors that may join
# them. The dictionary entries are generated from these instead of being typed
# out one by one; the hand-written list contained the misspelled entry
# "kandidatin & kanditat", which could never match.
feminine_forms <- c(
  "kandidatin", "kandidatinnen",
  "wählerin", "wählerinnen",
  "bürgerin", "bürgerinnen"
)
masculine_forms <- c(
  "kandidat", "kandidaten",
  "wähler", "wähler",
  "bürger", "bürger"
)
pair_connectors <- c(" und ", " & ", "&", " u. ", " u ", " / ", "/")

# All phrases "<first><connector><second>" for element-wise pairs of `first`
# and `second`, returned as a plain character vector.
build_pair_phrases <- function(first, second, connectors = pair_connectors) {
  unlist(
    Map(function(a, b) paste0(a, connectors, b), first, second),
    use.names = FALSE
  )
}

# fm: feminine form named first; mf: masculine form named first.
dict_reihe <- dictionary(list(
  fm = build_pair_phrases(feminine_forms, masculine_forms),
  mf = build_pair_phrases(masculine_forms, feminine_forms)
))

Plot Frauen / Männer

# To be read relative to how many tweets come from women / men candidates.

# Replace tokens by their dictionary key (fm / mf) and count them.
dfm_reihe <- tokens_lookup(RF_tokens, dict_reihe) %>%
  dfm()

# reihe <- textstat_frequency(dfm_reihe)

# Frequency of each ordering per gender of the author.
reihe_GE <- textstat_frequency(dfm_reihe, groups = "Gender")

reihe_GE$group <- car::recode(reihe_GE$group, "'' = 'u'; 'NA'='u'")

# Attach the tweet totals per gender (column `cases`).
reihe_GE <- merge(tweet_cases, reihe_GE, by = "group")

# Absolute frequencies
plot_GE1 <- ggplot() +
  geom_col(
    position = "dodge",
    data = reihe_GE,
    mapping = aes(x = feature, y = frequency, fill = group)
  ) +
  scale_fill_manual(values = alpha(c(frau, mann), .9)) +
  theme_tufte() +
  ylab("") +
  xlab("") +
  scale_x_discrete(labels = c("Frau vor Mann", "Mann vor Frau")) +
  theme(text = element_text(family = "Arial"))

# Frequencies relative to the number of tweets of each gender
plot_GE2 <- ggplot() +
  geom_col(
    position = "dodge",
    data = reihe_GE,
    mapping = aes(x = feature, y = frequency / cases, fill = group)
  ) +
  scale_fill_manual(values = alpha(c(frau, mann), .9)) +
  theme_tufte() +
  scale_x_discrete(labels = c("Frau vor Mann", "Mann vor Frau")) +
  theme(text = element_text(family = "Arial", size = 14)) +
  labs(fill = "Geschlecht", title = "Reihenfolge der Nennung", x = "", y = "")

plot_GE2

2. Reihenfolge Häufigkeit im Vergleich

# Open question: is it better to normalise by `cases` (all tweets) or only by
# the tweets that mention one of the words?

quanteda_options(threads = detectCores())

HK_tokens <- tokens(
  corpus_SP,
  what = "word",
  remove_url = TRUE,
  remove_numbers = TRUE
) %>%
  tokens_tolower()

HK_dfm <- dfm(HK_tokens)

# Single mentions of the masculine / feminine forms.
# Fixed: "nationalratskdandidat" was misspelled and could never match.
dict_GE <- dictionary(list(
  mann = c(
    "wähler", "kandidat", "kandidaten", "bürger",
    "nationalratskandidat", "nationalratskandidaten"
  ),
  frau = c(
    "wählerin", "wählerinnen", "kandidatin", "kandidatinnen",
    "bürgerin", "bürgerinnen",
    "nationalratskandidatin", "nationalratskandidatinnen"
  )
))

dfm_GE2 <- dfm_lookup(HK_dfm, dict_GE)

#########################################################################
# Words that occur inside a pair are also counted as single mentions, so the
# pair counts are subtracted before both are shown in one chart.

reihe_GEtotal <- textstat_frequency(dfm_reihe, groups = "Gender")
abziehen_f <- reihe_GEtotal %>% filter(group == "f")
abziehen_f <- sum(abziehen_f$frequency)

abziehen_m <- reihe_GEtotal %>% filter(group == "m")
abziehen_m <- sum(abziehen_m$frequency)

abziehen_nicht <- 0

# Amount to subtract per feature AND author gender. Previously the table was
# keyed by feature only, so the pairs written by men were subtracted from the
# "mann" counts of both genders (and likewise for "frau"). The age-group
# version of this computation keys by feature and group as well.
feature <- c("mf", "mf", "fm", "fm", "mann", "mann", "frau", "frau")
abziehen1 <- c(
  abziehen_nicht, abziehen_nicht, abziehen_nicht, abziehen_nicht,
  abziehen_f, abziehen_m, abziehen_f, abziehen_m
)

abziehen <- data.frame(
  abziehen1 = abziehen1,
  feature = feature,
  group = c("f", "m", "f", "m", "f", "m", "f", "m"),
  stringsAsFactors = FALSE
)

#######################################################################

# Single mentions per gender, with the tweet totals attached.
reihe_GE2 <- textstat_frequency(dfm_GE2, groups = "Gender")

reihe_GE2 <- merge(tweet_cases, reihe_GE2, by = "group")

reihe_GE2 <- select(reihe_GE2, group, cases, feature, frequency)
reihe_GE3 <- select(reihe_GE, group, cases, feature, frequency)

# Stack single mentions and pair mentions, then attach the subtraction value.
reihe_GE3 <- rbind(reihe_GE2, reihe_GE3)

reihe_GE3 <- merge(reihe_GE3, abziehen, by = c("feature", "group"))

reihe_GE3$abziehen1 <- as.numeric(reihe_GE3$abziehen1)
reihe_GE3$frequency <- as.numeric(reihe_GE3$frequency)

# V3: frequency after removing the occurrences that belong to a pair.
reihe_GE3$V3 <- reihe_GE3$frequency - reihe_GE3$abziehen1

Plot im Vergleich zu Reihenfolge

# Copy for plotting with readable feature labels.
reihe_GEplot <- reihe_GE3
reihe_GEplot$feature <- car::recode(
  reihe_GEplot$feature,
  "'fm' = 'Kombination Frau - Mann'; 'mf' = 'Kombination Mann - Frau'; 'frau' = 'nur Frau'; 'mann' = 'nur Mann'"
)

# Horizontal dodged bars: adjusted frequency per tweet, split by gender.
plot_GE2 <- ggplot() +
  geom_col(
    position = "dodge",
    data = reihe_GEplot,
    mapping = aes(x = feature, y = V3 / cases, fill = group),
    width = .6
  ) +
  scale_fill_manual(values = alpha(c(frau, mann), .9)) +
  theme_tufte() +
  theme(text = element_text(family = "Arial", size = 22)) +
  labs(
    fill = "Geschlecht",
    x = "",
    y = "",
    title = "Reihenfolge der Nennung im Vergleich zu Einzelnennungen"
  ) +
  coord_flip()

plot_GE2

## Same analysis by age group

######################################################################### dfm reihe ##############################

# Group directly by the age-group docvar "Alter" (as done for dfm_GE2 below).
# The previous version grouped by the raw "Age" and recoded afterwards, which
# left one row per individual age, i.e. several rows per (group, feature).
# In the dodged bar chart those rows are drawn on top of each other.
reihe_alter_total <- textstat_frequency(dfm_reihe, groups = "Alter")

abziehen_alt <- reihe_alter_total %>% filter(group == "alt")
abziehen_alt <- sum(abziehen_alt$frequency)

abziehen_jung <- reihe_alter_total %>% filter(group == "jung")
abziehen_jung <- sum(abziehen_jung$frequency)

abziehen_nicht <- 0

# Amount to subtract per feature and age group: nothing for the pair features,
# the pair total of the same age group for the single-mention features.
feature <- c("mf", "mf", "fm", "fm", "mann", "mann", "frau", "frau")
abziehen1 <- c(
  abziehen_nicht, abziehen_nicht, abziehen_nicht, abziehen_nicht,
  abziehen_alt, abziehen_jung, abziehen_alt, abziehen_jung
)

# Built with data.frame(): as.data.frame(abziehen1, feature) used the
# duplicated feature names as row names.
abziehen <- data.frame(
  abziehen1 = abziehen1,
  feature = feature,
  group = c("alt", "jung", "alt", "jung", "alt", "jung", "alt", "jung"),
  stringsAsFactors = FALSE
)

reihe_alter_total <- select(reihe_alter_total, group, frequency, feature)

# reihe_alt1 <- merge(tweet_cases3, reihe_alter_total, by = "group")

####################################################################### dfm_GE2 ###########################

# Single mentions per age group.
reihe_alter <- textstat_frequency(dfm_GE2, groups = "Alter")

reihe_alter$feature <- as.factor(reihe_alter$feature)
reihe_alter$frequency <- as.numeric(reihe_alter$frequency)

reihe_alter2 <- reihe_alter %>% select(group, frequency, feature)

# reihe_alter3 <- merge(tweet_cases3, reihe_alter2, by = "group")

######################################## merging ###########################################

# Attach the tweet totals per age group (column `cases`); groups other than
# "alt" / "jung" drop out of the merge.
reihe_alter2 <- merge(reihe_alter2, tweet_cases3, by = "group")
reihe_alter_total <- merge(reihe_alter_total, tweet_cases3, by = "group")

reihe_alt5 <- rbind(reihe_alter2, reihe_alter_total)
reihe_alt5 <- merge(reihe_alt5, abziehen, by = c("feature", "group"))

reihe_alt5$abziehen1 <- as.numeric(reihe_alt5$abziehen1)
reihe_alt5$frequency <- as.numeric(reihe_alt5$frequency)

# V4: frequency after removing the occurrences that belong to a pair.
reihe_alt5$V4 <- reihe_alt5$frequency - reihe_alt5$abziehen1


#######################################################################

Plot im Vergleich zu Reihenfolge

# Copy for plotting with readable feature labels.
alt_plot <- reihe_alt5

alt_plot$feature <- car::recode(
  alt_plot$feature,
  "'fm' = 'Kombination Frau - Mann'; 'mf' = 'Kombination Mann - Frau'; 'frau' = 'nur Frau'; 'mann' = 'nur Mann'"
)

# Horizontal dodged bars: adjusted frequency per tweet, split by age group.
alt_plot1 <- ggplot() +
  geom_col(
    position = "dodge",
    data = alt_plot,
    mapping = aes(x = feature, y = V4 / cases, fill = group),
    width = .6
  ) +
  scale_fill_manual(values = alpha(c(alt, jung), .9)) +
  theme_tufte() +
  theme(text = element_text(family = "Arial", size = 22)) +
  labs(
    fill = "Alter",
    x = "",
    y = "",
    title = "Reihenfolge der Nennung im Vergleich zu Einzelnennungen"
  ) +
  coord_flip()

alt_plot1

3. Wortende

Create a dictionary

# Glob patterns for gender-inclusive word endings, grouped by spelling
# variant. Case is kept (tolower = FALSE) because several patterns depend on
# the capital "I".
inklusiv_patterns <- c(
  # underscore forms
  "*_innen", "*_Innen", "*_in", "*_In", "*_in",
  # slash forms
  "*'/inn'", "*'/In'", "*'/innen'", "'*/Innen'",
  "'/inn'", "'/In'", "'/innen'", "'/Innen",
  # capital-I and asterisk forms
  "*In", "*Innen", "*'*Innen'", "*'*innen'",
  "*'*In'", "*'*in'", "*!n", "*!innen",
  # hyphen forms
  "*-In", "*-Innen", "*-innen",
  "-In", "-Innen", "-innen"
)

dict_inklusiv_test <- dictionary(
  list(inklusiv = inklusiv_patterns),
  tolower = FALSE
)

Wichtig: kein Stemming, weil wir die Wortenden benötigen; ebenso benötigen wir die Punkte.

# Word tokens without numbers, URLs and German stop words; case is kept.
WE_tokens <- corpus_SP %>%
  tokens(what = "word", remove_numbers = TRUE, remove_url = TRUE) %>%
  tokens_remove(stopwords("de"))

WE_dfm_test <- dfm(WE_tokens, tolower = FALSE)

# Collapse every feature that matches a dictionary pattern into its key.
dfm_WE_test2 <- dfm_lookup(
  WE_dfm_test,
  dict_inklusiv_test,
  valuetype = "glob",
  case_insensitive = FALSE
)

# topfeatures(dfm_WE_test, n = 70)

Plots Geschlecht

############################################################

# Inclusive word endings per gender, with the tweet totals attached.
inklusive_GE <- textstat_frequency(dfm_WE_test2, groups = "Gender")
# inklusive <- filter(inklusive, feature != filter_list)
inklusive_GE2 <- merge(inklusive_GE, tweet_cases, by = "group")

# Bars: frequency per tweet for each gender.
INK_plot_GE <- ggplot() +
  geom_col(
    data = inklusive_GE2,
    mapping = aes(x = group, y = frequency / cases),
    fill = alpha(c(frau, mann), .9)
  ) +
  theme_tufte() +
  xlab("") +
  ylab("") +
  theme(text = element_text(family = "Arial", size = 22))

# INK_plot_GE

Plots Alter

####################################### Plot - by age group ##########################

# Inclusive word endings per age group, with the tweet totals attached.
inklusive_alter <- textstat_frequency(dfm_WE_test2, groups = "Alter")
inklusive_alter2 <- merge(inklusive_alter, tweet_cases3, by = "group")

# Bars: frequency per tweet for each age group.
INK_plot_alt <- ggplot() +
  geom_col(
    data = inklusive_alter2,
    mapping = aes(x = group, y = frequency / cases),
    fill = alpha(c(alt, jung), .9)
  ) +
  theme_tufte() +
  xlab("") +
  ylab("") +
  theme(text = element_text(family = "Arial", size = 22))

# Shared title drawn above the two panels.
title <- ggdraw() +
  draw_label("Geschlechtergerechte Wortenden", x = 0, hjust = -.2, vjust = 6.5) +
  theme(text = element_text(family = "Arial", size = 40))

plots <- plot_grid(INK_plot_GE, INK_plot_alt)

plot_grid(title, plots, ncol = 1)

Plots Parteien

####################################### Plot - by party ##########################
# Party colours (hex codes)
fdp <- "#3872B5"
al <- "#AD3434"
glp <- "#999900"
grün <- "#5D8132"
cvp <- "#B56100"
sp <- "#D0362E"
piraten <- "#B57300"
jevp <- "#996E00"
EVP <- "#996E00"
jglp <- "#999900"
JCVP <- "#B56100"

library(reshape)

# Inclusive word endings per party, with the tweet totals attached.
inklusive_partei <- textstat_frequency(dfm_WE_test2, groups = "Party_Short")

inklusive_partei2 <- merge(inklusive_partei, tweet_cases2, by = "group")

# Colour per party, looked up by party name. The previous version passed the
# colours positionally, which is only correct if merge() sorts the parties in
# exactly the assumed (locale-dependent) order and all ten parties occur.
party_fill <- c(
  "AL ZH" = al, "CVP" = cvp, "EVP" = EVP, "FDP" = fdp, "glp" = glp,
  "Grüne" = grün, "JCVP" = JCVP, "jevp" = jevp, "jglp" = jglp, "SP" = sp
)

inklusive_partei2 <- filter(inklusive_partei2, group %in% names(party_fill))

# Bars sorted by descending frequency per tweet.
INK_plot_partei <- ggplot() +
  geom_col(
    data = inklusive_partei2,
    mapping = aes(
      x = reorder(group, -(frequency / cases)),
      y = frequency / cases
    ),
    fill = unname(party_fill[as.character(inklusive_partei2$group)]),
    alpha = .9
  ) +
  theme_tufte() +
  theme(text = element_text(family = "Arial", size = 22)) +
  labs(title = "Geschlechtergerechte Wortenden Parteien", x = "", y = "")

INK_plot_partei

Welche inklusiven Formen werden gewählt?

# Keep only the features that match an inclusive pattern, to see which
# concrete forms are the most popular.
dfm_WE_test <- dfm_select(
  WE_dfm_test,
  dict_inklusiv_test,
  valuetype = "glob",
  case_insensitive = FALSE
)
inklusive <- textstat_frequency(dfm_WE_test)

# topfeatures(dfm_WE_test, 70)

# Features that match a pattern but are listed here for exclusion.
filter_list <- c("Opt-In", "@velo_in", "OptIn", "#LinkedIn", "Fly-In", "#OptIn", "@LinkedIn")

# Use %in% for set membership. `feature != filter_list` recycled the seven
# entries element-wise along the rows, so a row was only dropped when it
# happened to line up with the matching list position.
inklusive <- filter(inklusive, !feature %in% filter_list)

Overall geschlechterinklusive Sprache

# Pair mentions per candidate name.
reihe_einzeln <- textstat_frequency(dfm_reihe, groups = "Name") %>%
  select(group, frequency)

# Single mentions per candidate name.
GE_einzeln <- textstat_frequency(dfm_GE2, groups = "Name")

# Feminine forms: only names with more than two mentions.
GE_einzeln1 <- GE_einzeln %>%
  filter(feature == "frau") %>%
  filter(frequency > 2) %>%
  select(frequency, group)

# Masculine forms: restricted to this fixed list of names.
einzeln_namen <- c(
  "Ale Chioccarello", "Alime Kösecioğulları", "Andrea Degen Iseli, Dr. med.",
  "Andreas Bisig", "Andri Silberschmidt", "Balthasar Glättli",
  "Barbara Schaffner", "Beat Flach", "Beatrice Simon",
  "Chris Schmid", "Christa Markwalder", "Christian Keller \U0001f4cc",
  "Christian Wasserfallen", "Christina Bachmann-Roth", "Claudio Zanetti",
  "Cédric Wermuth", "Diana Gutjahr", "Doris Fiala",
  "Elias Meier", "Elisabeth Schneider-Schneiter", "Fabienne Brauchli",
  "Furkan Oguz", "Gerhard Pfister", "Hans-Jakob Boesch",
  "Jeanine Glarner", "Karin Fehr Thoma", "Lukas Reimann",
  "Lutz Fischer-Lamprecht \U0001f1ec\U0001f1f1", "Manuel C. Widmer", "Marc Oliver Bürgi",
  "Marc Schinzel", "Marianne Binder", "Mario Pasinelli \U0001f1e8\U0001f1ed",
  "Martin Brügger", "Maya Bally", "Maya Graf",
  "Maya Weber Hadorn", "Meret Schneider", "Min Li Marti",
  "Müller-Boder Nicole", "Nancy Holten | TV-Moderatorin", "Natascha Wey",
  "Nicola Forster", "Nicole Barandun", "Oliver Hunziker",
  "Oliver Imfeld", "Oliver Thommen", "Philipp Schönbächler",
  "Roger Stettler", "Ruedi Löffel", "Ruedi Noser",
  "Sandro Covo", "Silvio A. Fareri", "Stefan Schlegel",
  "Stefanie Heimgartner", "Susanne Brunner", "Synes Ernst",
  "Thomas Aeschi", "Victor Kadlubowski", "Walter Angst",
  "Yvonne Bürgin", "thomas hardegger"
)

GE_einzeln2 <- GE_einzeln %>%
  filter(feature == "mann") %>%
  filter(group %in% einzeln_namen) %>%
  select(frequency, group)

# Ratio of feminine to masculine mentions per name.
GE_einzeln_tot <- merge(GE_einzeln1, GE_einzeln2, by = "group")
GE_einzeln_tot$frequency <- GE_einzeln_tot$frequency.x / GE_einzeln_tot$frequency.y
GE_einzeln_tot <- select(GE_einzeln_tot, group, frequency)

# Inclusive word endings per name.
inklusive_einzeln <- textstat_frequency(dfm_WE_test2, groups = "Name") %>%
  select(group, frequency)

# Stack the three measures and sum them per name, highest first.
overall <- rbind(reihe_einzeln, GE_einzeln_tot, inklusive_einzeln)

overall$frequency <- as.numeric(overall$frequency)

overall1 <- overall %>%
  group_by(group) %>%
  select(frequency) %>%
  summarise_all(sum) %>%
  arrange(desc(frequency))

# Keep names with a total above 19.
overall2 <- filter(overall1, frequency > 19)

# Replace display names that contain emoji with plain names.
overall2$group <- car::recode(overall2$group, "'Christian Keller 📌' = 'Christian Keller'")
overall2$group <- car::recode(overall2$group, "'(((Kilian))) ☮ ✏️ ⚓ 🕹' = 'Kilian Brogli'")

Die Fleissigsten absolut

# Party colours (hex codes)
fdp <- "#3872B5"
al <- "#AD3434"
glp <- "#999900"
grün <- "#5D8132"
cvp <- "#B56100"
sp <- "#D0362E"
piraten <- "#B57300"
svp <- "#4F9141"

# Party of each row of overall2, assigned by position.
# NOTE(review): this hard-coded vector assumes overall2 has exactly ten rows
# in this order; confirm after any change to the data or the threshold.
overall2$Ausrichtung <- c("fdp", "al", "grün", "glp", "glp", "grün", "glp", "grün", "sp", "cvp")

# Named palette so that every party gets its own colour. With the previous
# unnamed vector of seven colours for six levels, the colours were assigned
# by position and "sp" was drawn in the `piraten` colour.
plot_overall <- ggplot() +
  geom_col(
    data = overall2,
    mapping = aes(
      x = reorder(group, -(frequency)),
      y = frequency,
      fill = Ausrichtung
    )
  ) +
  scale_fill_manual(values = c(
    "al" = al, "cvp" = cvp, "fdp" = fdp, "glp" = glp,
    "grün" = grün, "piraten" = piraten, "sp" = sp
  )) +
  theme_tufte() +
  theme(text = element_text(family = "Arial", size = 22)) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Die Fleissigsten", fill = "Partei", x = "", y = "")

plot_overall

Die Fleissigsten relativ

# Number of tweets per candidate name.
wer_twittert$tweets <- 1

einzeln_t <- wer_twittert %>%
  group_by(Name) %>%
  select(tweets) %>%
  summarise_all(sum)

# Pair mentions per name.
reihe_einzeln <- textstat_frequency(dfm_reihe, groups = "Name")
reihe_einzeln <- select(reihe_einzeln, group, frequency)

GE_einzeln_tot <- select(GE_einzeln_tot, group, frequency)

# Inclusive word endings per name.
ende_einzeln <- textstat_frequency(dfm_WE_test2, groups = "Name")
ende_einzeln <- select(ende_einzeln, group, frequency)

# Add the three measures per name; only names present in all three remain
# because merge() keeps matching rows only.
einzeln <- merge(reihe_einzeln, GE_einzeln_tot, by = c("group"))
einzeln$frequency <- einzeln$frequency.x + einzeln$frequency.y
einzeln <- select(einzeln, group, frequency)

einzeln <- merge(einzeln, ende_einzeln, by = c("group"))
einzeln$frequency <- einzeln$frequency.x + einzeln$frequency.y
einzeln <- select(einzeln, group, frequency)

einzeln <- einzeln %>%
  group_by(group) %>%
  select(frequency) %>%
  summarise_all(sum)

names(einzeln_t)[names(einzeln_t) == "Name"] <- "group"

einzeln2 <- merge(einzeln_t, einzeln, by = c("group"))

# v1: combined score relative to the number of tweets of that person.
einzeln2$v1 <- einzeln2$frequency / einzeln2$tweets

# NOTE(review): the cut-off and the party vector below are hard-coded for ten
# specific rows; confirm after any change to the data.
einzeln_p <- filter(einzeln2, v1 > 0.036170213)
einzeln_p$Partei <- c("cvp", "glp", "cvp", "svp", "fdp", "grün", "sp", "sp", "cvp", "al")
einzeln_p$group <- car::recode(einzeln_p$group, "'Andrea Degen Iseli, Dr. med.' = 'Andrea Degen Iseli'")

# Named palette: colours are matched to parties by name, so they stay correct
# even if a party is missing from the data (unnamed values are assigned by
# position).
einzeln_plot <- ggplot() +
  geom_col(
    data = einzeln_p,
    mapping = aes(x = reorder(group, -(v1)), y = v1, fill = Partei),
    width = .6
  ) +
  scale_fill_manual(values = c(
    "al" = al, "cvp" = cvp, "fdp" = fdp, "glp" = glp,
    "grün" = grün, "sp" = sp, "svp" = svp
  )) +
  theme_tufte() +
  theme(text = element_text(family = "Arial", size = 22)) +
  labs(x = "", y = "", title = "Wer am häufigsten geschlechtergerechte Sprache benutzt") +
  theme(axis.text.x = element_text(angle = 90))

einzeln_plot