#brain-clearing
rm(list=ls())
#load libraries
library(markdown) ##markdown document
library(knitr) ##create document
library(haven) ##import data
library(readr) ##import data
library(dplyr) ##data wrangling
library(tidyverse) ##different packages for data wrangling
library(tidyr) ##tidy data
library(naniar) ##tabyl function
library(ggplot2) ##visualization
library(ggthemes) ##visualization
library(janitor) ##tabyl
library(quanteda) ##textanalyse
library(MASS)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(lubridate)
library(caret)
library(stm)
library(data.table)
library(tm)
library(geometry)
library(tidytext)
library(wordcloud)
library(Rtsne)
library(rsvd)
#setting working directory automatically where the project is located
#theme importe
ski_frau <- read.csv("C:/Users/arnol/Desktop/UZH_Master/FS23/Forschungsseminar Politischer Datenjournalismus/Blog/blog_R/data/ski_frauen.csv")
ski_mann <- read.csv("C:/Users/arnol/Desktop/UZH_Master/FS23/Forschungsseminar Politischer Datenjournalismus/Blog/blog_R/data/ski_manner.csv")
\(\bullet\) Frauen für Frauen
\(\bullet\) Männer für Männer
#frauen
ski_frau <- mutate(ski_frau, Geschlecht = "Frauen")
janitor::tabyl(ski_frau$Geschlecht)
#männer
ski_mann <- mutate(ski_mann, Geschlecht = "Männer")
janitor::tabyl(ski_mann$Geschlecht)
ski_bind <- rbind(ski_mann, ski_frau)
Jahre
Jahre anstatt datum und nur ab 2003 (i.e. letzte 20 Jahre)
#datum in jahr
ski <- ski_bind %>%
mutate(date = as.Date(pubtime))
ski$jahr <- format(as.Date(ski$date, format = "%Y%m%d"), format = "%Y")
tabyl(ski$jahr)
#2003-2023
ski <- ski %>%
filter(jahr > 2002)
class(ski1$jahr) #character
ski$jahr <- as.numeric(ski$jahr)
Html-Code
html-Code entfernen
ski <- ski %>%
mutate(text = gsub("<.*?>", " ", content))
Mediengrösse
Ich habe mich entschieden die lokalen Zeitungen zu entfernen, da diese
einen Bias bezüglich Sportler aus ihrer Region enthalten könnten.
Um nach grösse regionale und überregionale Medien im Datensatz zu filtern habe ich an die im FOEG aufgelisteten Zeitungen orientiert: https://www.foeg.uzh.ch/de/jahrbuch-qualit%C3%A4t-der-medien/hauptbefunde.html
Zusätzlich habe ich den Schweizer Illustrierter auch in den Datensatz miteinbezogen.
#print(unique(ski$medium_name))
#tabyl(ski$medium_name)
ski1 <- ski %>%
filter(medium_name %in% c("Aargauer Zeitung / MLZ", "Basler Zeitung",
"Berner Zeitung", "Der Bund", "Neue Luzerner Zeitung", "Luzerner Zeitung",
"Neue Zürcher Zeitung", "St. Galler Tagblatt", "Tages-Anzeiger",
"Blick", "20 Minuten", "NZZ am Sonntag", "Schweiz am Sonntag / MLZ",
"SonntagsZeitung", "Die Weltwoche", "Die Wochenzeitung",
"Sonntagsblick", "nzz.ch", "tagesanzeiger.ch",
"luzernerzeitung.ch", "tagblatt.ch", "bazonline.ch",
"bernerzeitung.ch", "aargauerzeitung.ch", "blick.ch",
"20 minuten", "20 minuten online", "20 Minuten Online",
"srf.ch", "Schweizer Illustrierte"))
write.csv(ski1, file = "ski1.csv") #save file
#every article only once in the dataset
unique_ids <- unique(ski1$id)
# Find the duplicated IDs
duplicate_ids <- ski1 %>%
group_by(id) %>%
filter(n() > 1) %>%
distinct(id) %>%
ungroup()
# Remove observations with duplicated IDs from the dataset
ski1_unique <- ski1 %>%
filter(!(id %in% duplicate_ids$id))
class(ski1$Geschlecht) #character
tabyl(ski1$Geschlecht)
#plot
gender_colors <- c(pct.f = "#008081", pct.m = "#FD7601")
plot_attribute <- ski1 %>%
ggplot(aes(x = Geschlecht, fill = Geschlecht)) +
geom_bar() +
scale_fill_manual(values = gender_colors) +
theme_clean() +
theme(legend.position = "none") +
xlab("Geschlecht der Skifahrer") +
ylab("Anzahl Artikel") +
labs(title = "Es wird mehr über Skifahrer als über Skifahrerinnen publiziert",
caption = "Data: SWISSDOX@LiRI")
plot_Geschlecht
#barplot
barplot_haufigkeit <- ski1 %>%
ggplot(aes(x=jahr, fill = Geschlecht)) +
geom_bar(position = "dodge", stat = "count") +
theme_minimal()+
theme(axis.text.x=element_text(angle=45)) +
scale_fill_manual(values = gender_colors)+
scale_x_continuous(breaks = unique(ski1$jahr), labels = unique(ski1$jahr)) +
labs( title = "Nur im 2016 wurden mehr Artikel über Skifahrerinnen als über Skifahrer publiziert",
x = "Jahr",
y = "Anzahl Publikationen"
,caption = "Daten: SWISSDOX@LiRI")
barplot_haufigkeit
#lineplot
ski1_count <- ski1 %>%
group_by(jahr, Geschlecht) %>%
summarise(count = n()) %>%
ungroup()
lineplot_haufigkeit <- ski1_count %>%
ggplot(aes(x = jahr, y = count, color = Geschlecht, fill = Geschlecht)) +
geom_line(alpha = 0.5) +
geom_point(size = 4) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45)) +
scale_color_manual(values = gender_colors) +
scale_fill_manual(values = gender_colors) +
scale_x_continuous(breaks = unique(ski1$jahr), labels = unique(ski1$jahr)) +
labs(title = "Nur im 2016 wurden mehr Artikel über Skifahrerinnen als über Skifahrer publiziert",
x = "Jahr",
y = "Anzahl Publikationen",
caption = "Daten: SWISSDOX@LiRI")
lineplot_haufigkeit
ski1_percent <- ski1 %>%
group_by(jahr, Geschlecht) %>%
summarise(count = n()) %>%
ungroup()
ski1_percent <- ski1_percent %>%
group_by(jahr) %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
#barplot
barplot_haufigkeit_percent <- ski1_percent %>%
ggplot(aes(x = jahr, y = percent, fill = Geschlecht)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45)) +
scale_fill_manual(values = gender_colors) +
scale_x_continuous(breaks = unique(ski1_percent$jahr), labels = unique(ski1_percent$jahr)) +
labs(title = "Nur im 2016 wurden mehr Artikel über Skifahrerinnen als über Skifahrer publiziert",
x = "Jahr",
y = "Prozent",
caption = "Daten: SWISSDOX")
barplot_haufigkeit_percent
#lineplot
lineplot_haufigkeit_prozent <- ski1_percent %>%
ggplot(aes(x = jahr, y = percent, color = Geschlecht, fill = Geschlecht)) +
geom_line(alpha = 0.5) +
geom_point(size = 4) +
geom_hline(yintercept = 50, color = "gray", linetype = "dashed") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45),
panel.grid.minor.x = element_blank()) +
scale_color_manual(values = gender_colors) +
scale_fill_manual(values = gender_colors) +
scale_y_continuous(limits = c(0, 100)) +
scale_x_continuous(breaks = unique(ski1_percent$jahr), labels = unique(ski1_percent$jahr)) +
labs(title = "Nur in 2016 wurden mehr Artikel über Skifahrerinnen als über Skifahrer publiziert",
x = "Jahr",
y = "Prozent",
caption = "Daten: SWISSDOX@LiRI")
lineplot_haufigkeit_prozent
stackedbarplot_haufigkeit_percent <- ski1_percent %>%
ggplot(aes(x = jahr, y = percent, fill = Geschlecht)) +
geom_bar(stat = "identity", position = "stack") +
geom_hline(yintercept = 50, color = "black", linetype = "dashed") + #50% linie
theme_minimal() +
theme(axis.text.x = element_text(angle = 45)) +
scale_fill_manual(values = gender_colors) +
scale_x_continuous(breaks = unique(ski1$jahr), labels = unique(ski1$jahr)) +
labs(title = "Nur im 2016 wurden mehr Artikel über Skifahrerinnen als über Skifahrer publiziert",
x = "Jahr",
y = "Prozent",
caption = "Daten: SWISSDOX")
stackedbarplot_haufigkeit_percent
ski1_wide <- ski1_percent %>%
spread(Geschlecht, percent) %>%
group_by(jahr) %>%
summarise(Frauen = sum(Frauen, na.rm = TRUE),
Männer = sum(Männer, na.rm = TRUE),
Differenz = Männer - Frauen)
ski1_wide
ggplot(aes(x=jahr), data = ski1_wide)+
geom_line(aes(y= Differenz),color = "gray", label = "Gender Gap") +
geom_line(aes(y=Frauen), color = "red", label = "Skifahrerinnen") +
geom_line(aes(y=Männer), color = "blue", label = "Skifahrer")+
theme_minimal() +
labs(title = "Nur im 2016 wurden mehr Artikel über Skifahrerinnen als über Skifahrer publiziert",
x = "Jahr",
y = "Prozent",
caption = "Daten: SWISSDOX")
names_m <- c("didier", "cuche", "urs", "imboden", "didier", "défago", "didier", "defago", "daniel", "albrecht", "konrad", "hari", "bruno", "kernen", "tobias", "grünenfelder", "tobias", "gruenenfelder", "marc", "berthod", "marc", "gini", "stéphane", "de", "siebenthal", "stephane", "de", "siebenthal", "dimitri", "cuche", "silvan", "zurbriggen", "patrick", "küng", "patrick", "kueng", "justin", "murisier", "ami", "oreiller", "markus", "vogel", "christian", "spescha", "mauro", "caviezel", "reto", "schmidiger", "beat", "feuz", "vitus", "lüönd", "vitus", "lueoend", "thomas", "tumler", "luca", "aerni", "luca", "ärni", "sandro", "viletta", "urs", "kryenbühl", "urs", "kryenbuehl", "bernhard", "niederberger", "fernando", "schmed", "loïc", "meillard", "loic", "meillard", "gino", "caviezel", "marco", "odermatt", "ramon", "zenhäusern", "ramon", "zenhaeusern", "sandro", "simonet", "gilles", "roulin", "cédric", "noger", "cedric", "noger", "josua", "mettler", "marco", "reymond", "noel", "von", "grünigen", "noel", "von", "gruenigen", "reto", "mächler", "reto", "maechler", "ralph", "weber", "lars", "rösti", "lars", "roesti", "denis", "corthay", "lenz", "hächler", "lenz", "haechler")
names_f <- c("tamara", "müller", "tamara", "mueller", "sonja", "nef", "marlies", "oester", "marlies", "öster", "nadia", "styger", "tanja", "pieren", "fabienne", "suter", "erika", "dicht", "fränzi", "aufdenblatten", "fraenzi", "aufdenblatten", "franziska", "christine", "aufdenblatten", "franziska", "aufdenblatten", "lilian", "kummer", "eliane", "volken", "miriam", "gmür", "miriam", "gmuer", "marianne", "abderhalden", "tina", "weirather", "christina", "weirather", "aita", "camastral", "andrea", "dettling", "sylviane", "berthod", "lara", "gut", "lara", "gut-behrami", "behrami", "pascale", "berthod", "aline", "bonjour", "martina", "schild", "rabea", "grand", "jessica", "pünchera", "jessica", "puenchera", "nadja", "jnglin-kamer", "nadja", "kamer", "marina", "nigg", "jasmine", "flury", "esther", "good", "margaux", "givel", "wendy", "holdener", "corinne", "suter", "dominique", "gisin", "dominique", "sabine", "gisin", "jasmina", "suter", "denise", "feierabend", "fabienne", "suter", "aline", "danioth", "rahel", "kopp", "mélanie", "meillard", "melanie", "meillard", "priska", "nufer", "nathalie", "gröbli", "nathalie", "groebli", "vivianne", "härri", "vivianne", "haerri", "camille", "rast", "luana", "flütsch", "luana", "fluetsch", "amélie", "klopfenstein", "amelie", "klopfenstein", "delia", "durrer", "noémie", "kolly", "noemie", "kolly", "delphine", "darbellay", "anuk", "brändli", "anuk", "braendli")
names <- c(names_f, names_m)
#weitere Stopwörter entfernen
weiterewörter <- c("sagt", "mehr", "dass",
"immer", "november",
"uhr", "luzern",
"usa", "jährige",
"federer", "ferner",
"schweizer", "schweiz",
"junioren", "engelberg",
"swiss", "gibt",
"luzern", "krien", "surse",
"emmenbrück", "willisau",
"österreich", "schweden",
"lüönd", "www", "tel",
"usa", "schon", "ganz", "mehr",
"schwyz", "sotchi",
"gibt","fussball",
"div", "class", "span", "href",
"ebikon", "ääätsch", "äär",
"aaretal", "aaa", "dzf",
"akw", "gesichtserkennung",
"blocher", "aabach", "aaa",
"aarefeldplatz", "aargauerstalden",
"kernen","nie","adelboden", "viel",
"schiffrin", "hirscher", "nie",
"simon", "sei", "kitzbühel", "wengen",
"lauberhorn", "beim", "weiss",
"zeit", "einfach", "miller", "maier",
"michael", "miller", "bild",
"wurde", "seit", "bern", "mai",
"ersten", "woche", "einfach",
"sursee", "schneider", "janka",
"jahr", "st-gallen", "gallen",
"vonn", "hotel", "jahren", "roger",
"zwei", "geht", "wäre",
"china", "prozent", "srf",
"krienz", "männer", "frauen",
"kriens", "krienseregg", "sui",
"ita", "italien", "aut", "nor", "fra",
"frankreich", "cologna", "giro",
"gemeinde", "toggenburg", "jahr", "jahren",
"jährlich", "oktober", "vonn", "lake", "louis",
"bern", "februar", "berner", "februar",
"dezember", "sonntag", "samstag", "srf",
"maze", "maria", "riesch", "zürich",
"lindsey", "grindelwald", "gallen",
"st-gallen", "pirmin", "girardelli",
"wäre", "viel", "geht", "sagt", "sagte",
"vielleicht", "yule", "kristofferson",
"henrik", "sagten", "gestern", "hätte",
"hoffmann", "ambrosi", "herman", "walchof",
"lässt", "jedoch", "dürfte", "brunner",
"sepp", "lässt", "marcel", "lehmann",
"janka", "carlo", "svindal", "hergiswil",
"emmenbrück", "shiffrin", "mikaela", "michelle",
"ammann", "deutschland", "wattwil",
"thun", "wicki", "stucki", "joel",
"schwinger", "keystone", "basel",
"reuter", "ag", "österreicher",
"streif", "gröden", "aksel", "raich",
"badia", "alta", "mayer", "bromio",
"pinturault", "beaver", "jansrud",
"creek", "norweger", "walchhofer",
"kristoffersen", "emmentaler",
"kilde", "bündner", "pd",
"chuenisbärgli", "bm", "kjetil",
"bode", "miller", "kriechmayr",
"walliser", "reichelt",
"wil", "emmenbrücke",
"neuenburger", "perren",
"nidwaldner", "kantonspolizei",
"kvitfjell", "bormio",
"chamonix", "auto", "franz",
"vierter", "fünfter", "lund",
"lauberhornrennen", "lauberhornabfahrt",
"alexis", "gardena","lauberhorn-abfahrt",
"ligety", "gmbh",
"hundschopf", "kapo", "eberharter",
"w", "matthias", "st.gallen", "dz",
"hintermann", "dominik", "kröll", "aleksander",
"ivica", "hannes", "neurenther", "büchel",
"tessinerin", "schweizerinnen", "schweizerin",
"schwyzerin", "amerikanerin", "goggia", "vlhova",
"österreichererin", "brignone", "may", "fenninger",
"cortina", "rebensburg", "st", "sofia", "italienerin")
# ski_words <- c("schnee", "rennen", "sport", "sportler", "kader", "ski",
# "saison", "team", "league", "weltcup", "kombination", "olympia",
# "lauf", "abfahrt", "olympischen", "spiel", "super")
additional_stopwords <- c(names, weiterewörter)
#additional_stopwords_ski <- c(names, weiterewörter, ski_words)
default_stopwords <- quanteda::stopwords("de")
all_stopwords <- c(default_stopwords, additional_stopwords)
#all_stopwords_ski <- c(default_stopwords, additional_stopwords_ski)
#corpus
ski1_corpus_stm <- textProcessor(ski1$text, metadata = as.data.frame(ski1),
lowercase = T, removestopwords = T, removenumbers = T,
removepunctuation = T, stem = T, language = 'de', verbose = T,
onlycharacter = T, striphtml = T,
customstopwords = all_stopwords)
#pseudo-unique corpus
# ski1_corpus_stm_unique <- textProcessor(ski1_unique$text, metadata = as.data.frame(ski1),
# lowercase = T, removestopwords = T, removenumbers = T,
# removepunctuation = T, stem = T, language = 'de', verbose = T,
# onlycharacter = T, striphtml = T,
# customstopwords = all_stopwords)
#corpus ohne ski wörter
# ski1_corpus_stm_ski <- textProcessor(ski1$text, metadata = as.data.frame(ski1),
# lowercase = T, removestopwords = T, removenumbers = T,
# removepunctuation = T, stem = T, language = 'de', verbose = T,
# onlycharacter = T, striphtml = T,
# customstopwords = all_stopwords_ski)
ski1_corpus_stm <- prepDocuments(documents = ski1_corpus_stm$documents,
vocab = ski1_corpus_stm$vocab,
meta = ski1_corpus_stm$meta,
verbose = T)
# ski1_corpus_stm_unique <- prepDocuments(documents = ski1_corpus_stm_unique$documents,
# vocab = ski1_corpus_stm_unique$vocab,
# meta = ski1_corpus_stm_unique$meta,
# verbose = T)
#ohne ski wörter
# ski1_corpus_stm_ski <- prepDocuments(documents = ski1_corpus_stm_ski$documents,
# vocab = ski1_corpus_stm_ski$vocab,
# meta = ski1_corpus_stm_ski$meta,
# verbose = T)
#speichern
saveRDS(ski1_corpus_stm, "ski1_corpus_stm.rds")
# saveRDS(ski1_corpus_stm_ski, "ski1_corpus_stm_ski.rds")
#einlesen
ski1_corpus_stm <- readRDS("ski1_corpus_stm.rds")
#plot Removed
plotRemoved(documents = ski1_corpus_stm$document, lower.thresh = seq(1, 200, by = 20))
#berechnen
ski1_corpus_stm_search <- searchK(ski1_corpus_stm$documents,ski1_corpus_stm$vocab,
K = seq(5, 55, by = 10), max.em.its = 100)
##Für 55 Topics: Model terminated before convergence reached
#speichern
# saveRDS(ski1_corpus_stm_search, "ski1_corpus_stm_search.rds")
#einlesen
#ski1_corpus_stm_search <- readRDS("ski1_corpus_stm_search.rds")
#plotten
plot(ski1_corpus_stm_search)
\(\bullet\) Held-out likelihood: Je
höher desto besser. Am höchsten bei 55 aber ab 25 sieht ok aus.
\(\bullet\) Residuals: Je tiefer desto
besser. Am tiefsten bei 55.
\(\bullet\) Semantic Coherence: Je
höher desto besser. 15, 25
\(\bullet\) Lower Bound: Je höher desto
besser: Am besten 55.
plot(unlist(ski1_corpus_stm_search$results$K), unlist(ski1_corpus_stm_search$results$exclus),
xlab = "K", ylab = "Exclusivity", type = "b")
\(\bullet\) Je höher desto besser weil trennschärfer aber semantisch nicht unbedingt am besten: 55. Aber ab 15 sieht schon gut aus.
\(\bullet\) mit K = 15 und K = 35 berechnet (mehr braucht wirklich zu lange)
#berechnen
ski1_stm_select <- selectModel(ski1_corpus_stm$documents,
ski1_corpus_stm$vocab,
K = 15, runs = 20)
#speichern
# saveRDS(ski1_stm_select, "ski1_stm_select.rds")
#einlesen
ski1_stm_select <- readRDS("ski1_stm_select.rds")
#plot
plotModels(ski1_stm_select,
legend.position = "bottomleft")
selected4 <- ski1_stm_select$runout[[4]]
#alternative berechnen
# ski1_stm_select35 <- selectModel(ski1_corpus_stm$documents,
# ski1_corpus_stm$vocab,
# K = 35, runs = 20)
#speichern
#saveRDS(ski1_stm_select35, "ski1_stm_select35.rds")
#einlesen
ski1_stm_select35 <- readRDS("ski1_stm_select35.rds")
#plot
plotModels(ski1_stm_select35,
legend.position = "bottomleft")
selected1 <- ski1_stm_select$runout[[1]] #hier 1 besser was nun?
#ski1_stm_select35 <- selectModel(ski1_corpus_stm$documents,
#ski1_corpus_stm$vocab,
#K = 55, runs = 20)
#geschlecht as factor
ski1_corpus_stm$meta$Geschlecht <- as.factor(ski1_corpus_stm$meta$Geschlecht)
#15 Topics
ski1_stm_15_covar <- stm(ski1_corpus_stm$documents,
ski1_corpus_stm$vocab,
K = 15,
data=ski1_corpus_stm$meta,
prevalence =~ Geschlecht)
ski1_stm_15_covar
#speichern
saveRDS(ski1_stm_15_covar, "ski1_stm_15_covar.rds")
#einlesen
ski1_stm_15_covar <- readRDS("ski1_stm_15_covar.rds")
#plot
plot.STM(ski1_stm_15_covar)
ski1_stm_35_covar <- stm(ski1_corpus_stm$documents,
ski1_corpus_stm$vocab,
K = 35,
data=ski1_corpus_stm$meta,
prevalence =~ Geschlecht)
ski1_stm_35_covar
labelTopics(ski1_stm_15_covar, topics = c(1:5), n = 5)
labelTopics(ski1_stm_35_covar, topics = c(1:35), n = 7)
15 Topics:
Topic 1 Top Words: Highest Prob: jahr, schweizer, feder, fussbal, sport FREX: div, class, span, href, kariem Lift: abagnal, abbekämen, abbruchreif, abendanzug, abendrob Score: feder, div, class, award, kambundji Topic 2 Top Words: Highest Prob: luzern, usa, ferner, krien, rund FREX: surse, emmenbrück, willisau, ebikon, hochdorf Lift: schenkon, ääätsch, äär, aaretal, ababel Score: emmenbrück, ebikon, ruswil, krien, luzern Topic 3 Top Words: Highest Prob: cuch, didier, abfahrt, schweizer, super FREX: miller, walchhof, paland, daron, järbyn Lift: abfahrtsberg, abfahrtsfünft, abfahrtsplätzen, abfahrtsszen, abfahrtszweiten Score: cuch, défago, miller, raich, didier Topic 4 Top Words: Highest Prob: jahr, schweiz, gibt, seit, beim FREX: dzf, blocher, akw, gesichtserkennung, playlist Lift: aaa, aabach, aarefeldplatz, aargauerstalden, aargauisch Score: dzf, kernen, tel, svp, skilift Topic 5 Top Words: Highest Prob: schweizer, olympia, olympischen, spiel, cologna FREX: nuit, jugendspiel, gekürten, beeli, francon Lift: aargauerbenaglio, aarwangenanschieb, abendspielen, abenteuerreis, abfahrtriesenslalomsup Score: cologna, ammann, snowboard, turin, sotschi
findThoughts(ski1_stm_35_covar,
texts = ski1_stm_35_covar$meta$text, topics = 26, n = 1)
plot.STM(ski1_stm_35_covar)
plot.STM(ski1_stm_35_covar, n = 7, topics = c(1,18,26), type = "labels")
#wordcloud
topic26_words <- labelTopics(ski1_stm_35_covar, topics = 26)
topic26_words_freq <- table(unlist(topic26_words))
wordcloud(names(topic26_words_freq), freq = topic26_words_freq, random.order = FALSE)
#to lower
ski1$text <- tolower(ski1$text)
#corpus erstellen
corpus_keyness <- corpus(ski1)
vornamen <- c("didier", "urs", "daniel", "konrad", "bruno", "tobias", "marc", "stéphane", "stephane", "dimitri", "silvan", "patrick", "justin", "ami", "markus", "christian", "mauro", "reto", "beat", "vitus", "thomas", "luca", "sandro", "bernhard", "fernando", "loïc", "loic", "gino", "marco", "ramon", "gilles", "cédric", "cedric", "josua", "marco", "noel", "reto", "ralph", "lars", "denis", "lenz","tamara","sonja", "marlies", "nadia", "tanja", "fabienne", "erika", "fränzi", "fraenzi", "franziska", "christine", "lilian", "eliane", "miriam", "marianne", "tina", "christina","aita", "andrea", "sylviane", "lara", "pascale", "aline", "martina", "rabea", "jessica", "nadja", "marina", "jasmine", "esther", "margaux", "wendy", "corinne", "dominique", "sabine", "jasmina", "denise", "rahel", "mélanie", "melanie", "priska", "nathalie", "vivianne", "vivianne", "haerri", "camille", "rast", "luana", "flütsch", "luana", "fluetsch", "amélie", "klopfenstein", "amelie", "klopfenstein", "delia", "durrer", "noémie", "noemie", "delphine", "anuk")
toks_keyness <- tokens(corpus_keyness,
lowercase = TRUE,
remove_symbols = TRUE,
remove_url = TRUE,
remove_separators = TRUE,
remove_punct = TRUE,
remove_numbers = TRUE,
verbose = TRUE)
#stopwörter
toks_keyness <- tokens_remove(toks_keyness, stopwords('de'), padding = TRUE)
toks_keyness <- tokens_remove(toks_keyness, pattern = vornamen, padding = TRUE)
#stemming
#toks_keyness <- tokens_wordstem(toks_keyness, language = "de")
#name männer
skifahrer <- c("cuche", "imboden", "défago", "albrecht", "hari", "kernen", "grünenfelder", "gini", "siebenthal", "zurbriggen", "küng", "murisier", "oreiller", "vogel", "spescha", "caviezel", "schmidiger", "feuz", "lüönd", "tumler", "aerni", "viletta", "kryenbühl", "niederberger", "schmed", "meillard", "caviezel", "odermatt", "zenhäusern", "simonet", "roulin", "noger", "mettler", "reymond", "grünigen", "mächler", "weber", "rösti", "corthay", "hächler")
#name frauen ohne gut
skifahrerinnen <- c("müller", "nef", "öster", "styger", "pieren", "suter", "dicht", "aufdenblatten", "kummer", "volken", "gmür", "abderhalden", "weirather", "camastral", "dettling", "gut-behrami", "behrami", "bonjour", "schild", "grand", "pünchera", "jnglin-kamer", "kamer", "nigg", "flury", "good", "givel", "holdener", "gisin", "suter", "feierabend", "danioth", "kopp", "meillard", "nufer", "gröbli", "härri", "rast", "flütsch", "klopfenstein", "durrer", "kolly", "darbellay", "brändli")
#männer
kwic_results_m <- kwic(toks_keyness, pattern = skifahrer, window = 15)
kwic_results_m$text <- paste(kwic_results_m$pre, kwic_results_m$post)
kwic_results_m$sex <- "Mann"
kwic_results_m.analysis <- kwic_results_m
#frauen
kwic_results_f <- kwic(toks_keyness, pattern = skifahrerinnen, window = 15)
kwic_results_f$text <- paste(kwic_results_f$pre, kwic_results_f$post)
kwic_results_f$sex <- "Frau"
kwic_results_f.analysis <- kwic_results_f
#datensatz beide
kwic_results_all <- rbind(kwic_results_m, kwic_results_f)
#daten speichern
#write.csv(kwic_results_all,"./Inputs/kwic_results_all.csv", row.names = FALSE)
#write.csv(kwic_results_f,"./Inputs/kwic_results_f.csv", row.names = FALSE)
#write.csv(kwic_results_m,"./Inputs/kwic_results_m.csv", row.names = FALSE)
#männer
corpus_kwic_results_m <- corpus(kwic_results_m)
german_stopwords <- stopwords("german")
keyness_tokens_m <- tokens(corpus_kwic_results_m, remove_punct = TRUE, remove_separators = T,
remove_numbers = T, remove_symbols = T)
keyness_tokens_m <- tokens_remove(keyness_tokens_m, pattern = stopwords("german"), padding = TRUE)
dfm_kw_m <- dfm(keyness_tokens_m)
tstat_freq <- textstat_frequency(dfm_kw_m, n = 100)
tstat_freq
#frauen
corpus_kwic_results_f <- corpus(kwic_results_f)
keyness_tokens_f <- tokens(corpus_kwic_results_f, remove_punct = TRUE, remove_separators = T,
remove_numbers = T, remove_symbols = T)
keyness_tokens_f <- tokens_remove(keyness_tokens_f, pattern = stopwords("german"), padding = TRUE)
dfm_kw_f <- dfm(keyness_tokens_f)
tstat_freq_f <- textstat_frequency(dfm_kw_f, n = 100)
tstat_freq_f
#corpus
ski1_corpus <- corpus(ski1)
#tokens
ski1_tokens <- tokens(ski1_corpus,
remove_punct = T,
remove_numbers = T,
remove_symbols = T)
ski1_tokens <- tokens_remove(ski1_tokens, additional_stopwords)
ski1_tokens <- tokens_wordstem(ski1_tokens, language = "de")
#dfm
ski1_dfm <- dfm(ski1_tokens)
ski1_dfm <- dfm_remove(ski1_dfm, german_stopwords)
#keyness
keyness_results_m <- dfm_group(ski1_dfm, Geschlecht) %>% textstat_keyness("Männer")
keyness_results_m
keyness_results_f <- dfm_group(ski1_dfm, Geschlecht) %>% textstat_keyness("Frauen")
keyness_results_f
#plot
textplot_keyness(keyness_results_m, n = 20, show_reference = FALSE, color = "lightgreen")
textplot_keyness(keyness_results_f, n = 20, show_reference = FALSE, color = "lightgreen")
#neu dd
kwic_results_all <- as.data.frame(kwic_results_all)
kwic_results_keyness <- kwic_results_all[c("sex", "text")]
#corpus
ski1_corpus <- corpus(kwic_results_keyness)
#tokens
ski1_tokens <- tokens(ski1_corpus,
remove_punct = T,
remove_numbers = T,
remove_symbols = T)
ski1_tokens <- tokens_remove(ski1_tokens, additional_stopwords)
#ski1_tokens <- tokens_wordstem(ski1_tokens, language = "de")
#dfm
ski1_dfm <- dfm(ski1_tokens)
ski1_dfm <- dfm_remove(ski1_dfm, german_stopwords)
#keyness
keyness_results_m <- dfm_group(ski1_dfm, sex) %>% textstat_keyness("Mann")
keyness_results_m
keyness_results_f <- dfm_group(ski1_dfm, sex) %>% textstat_keyness("Frau")
keyness_results_f
#plot
textplot_keyness(keyness_results_m, n = 20, show_reference = FALSE, color = "lightgreen")
textplot_keyness(keyness_results_f, n = 20, show_reference = FALSE, color = "lightgreen")
Resultate: immer noch keine brauchbare Resultate