quantitative-methods-hs22/report.rmd

316 lines
7.4 KiB
Plaintext
Raw Normal View History

2022-11-18 09:14:50 +01:00
---
title: "Quantitative Methods HS22"
author: "Marc Gauch"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_depth: 2
    toc_float: true
    number_sections: true
---
```{r setup, include=FALSE}
# Default for all following chunks: echo the chunk's source code in the report.
knitr::opts_chunk$set(echo = TRUE)
```
# Preparation
```{r, message=FALSE}
# Make sure the tidyverse is available, then attach it.
# requireNamespace() only checks whether the package can be loaded; the
# unconditional library() call afterwards fails with an error (instead of
# silently returning FALSE like require()) if the package is still missing.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
```
# Load Data
## Load from CSV
``` {r loadData}
# Import the survey export from the working directory (column-type message
# suppressed) and keep the result as a tibble.
litdata <- "DataLit_R.csv" %>%
  read_csv(show_col_types = FALSE) %>%
  as_tibble()
```
## First inspection of data {.tabset}
### Summary
``` {r}
# Per-column summary of the data as imported, before any cleaning.
summary(litdata)
```
### Glimpse
``` {r}
# Transposed overview of the imported data: one line per column.
glimpse(litdata)
```
### Print
``` {r}
# Standard print-out of the imported tibble.
print(litdata)
```
### Head
``` {r}
# First rows of the imported data.
head(litdata)
```
# Data cleaning
## Converting strings to numbers and *Keine Antwort* to *NaN*
``` {r}
# Recode the labelled answers in every column in a single pass:
#   "Stimme voll zu5"            -> 5
#   "Stimme überhaupt nicht zu1" -> 1
#   "Keine Antwort-"             -> NaN
# across(everything(), ...) replaces the superseded mutate_all(); the three
# replacements are applied in the same order as before, column by column.
# NOTE(review): replace() keeps the column's type where it can, so in a
# character column the new values are stored as the strings "5", "1" and
# "NaN" until the column is converted explicitly.
litdata <- litdata %>%
  mutate(across(everything(), function(answers) {
    answers <- replace(answers, answers == "Stimme voll zu5", 5)
    answers <- replace(answers, answers == "Stimme überhaupt nicht zu1", 1)
    replace(answers, answers == "Keine Antwort-", NaN)
  }))
```
## Make it numeric
The following code will **NOT** be run. The idea is to show a way to convert all columns automatically. It works, but some columns are NOT numeric.
```{r, eval=FALSE}
# Demonstration only: the chunk header sets eval=FALSE, so none of this runs
# when the report is rendered.
# All column names that exist in the data set
litdataColnames <- colnames(litdata)
# The columns that must not be converted
litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
# The columns that should be converted: everything not on the exclusion list
litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
# Show the computed selection
print(litdataColsToMakeNumeric)
# NOTE(review): this overwrites the selection computed above, so the loop
# below only touches "R1" — looks like a leftover from testing; confirm
# before ever enabling this chunk.
litdataColsToMakeNumeric <- c("R1")
# Convert each selected column to numeric in place
for (col in litdataColsToMakeNumeric) {
litdata[[col]] <- as.numeric(litdata[[col]])
}
```
First we rename all the columns
```{r}
# Replace the original variable codes with short section-based names.
# The mapping is one named character vector (name = new column name,
# value = original column name) handed to rename() through all_of().
# Regular code families are generated; irregular ones are listed one by one.
litdata <- litdata %>%
  rename(all_of(c(
    # A1..A9 <- W001..W009
    setNames(sprintf("W%03d", 1:9), paste0("A", 1:9)),
    # B1..B9 <- K001..K009
    setNames(sprintf("K%03d", 1:9), paste0("B", 1:9)),
    # C<q>_<i> <- TK00<q>_0<i> for q = 1..6 and i = 1..4
    setNames(
      sprintf("TK%03d_%02d", rep(1:6, each = 4), rep(1:4, times = 6)),
      sprintf("C%d_%d", rep(1:6, each = 4), rep(1:4, times = 6))
    ),
    # D1_1..D1_7 <- H001_001..H001_007
    setNames(sprintf("H001_%03d", 1:7), paste0("D1_", 1:7)),
    # Remaining D columns and E1
    D2 = "H002",
    D3 = "H003",
    D4 = "H004",
    D4_comment = "H004_other",
    D5 = "H005",
    D5_comment = "H005_other",
    D6 = "H006",
    D7 = "H007",
    D8 = "H008",
    E1 = "R1"
  )))
```
Then we change the datatype and fix the values
```{r}
# Convert the renamed columns to their analysis types in one mutate() call
# instead of one hand-written statement per column. The conversion
# expressions themselves are unchanged.
litdata <- litdata %>%
  mutate(
    # A1-A9, B1-B9, C1_1-C6_4 and D7 become numeric.
    across(
      all_of(c(
        paste0("A", 1:9),
        paste0("B", 1:9),
        sprintf("C%d_%d", rep(1:6, each = 4), rep(1:4, times = 6)),
        "D7"
      )),
      as.numeric
    ),
    # D1_1-D1_7: "Ja" -> TRUE, "Nicht Gewählt" -> FALSE; any other value is
    # handed to as.logical() as it is.
    across(
      all_of(paste0("D1_", 1:7)),
      ~ as.logical(
        ifelse(.x == "Ja", TRUE, ifelse(.x == "Nicht Gewählt", FALSE, .x))
      )
    ),
    # D2: "Ja" -> TRUE, "Nein" -> FALSE; any other value is handed to
    # as.logical() as it is.
    D2 = as.logical(
      ifelse(D2 == "Ja", TRUE, ifelse(D2 == "Nein", FALSE, D2))
    ),
    # D4, D5, D6 and D8 become factors. D6 can't be a number as there is a
    # "2010 or earlier" option.
    across(all_of(c("D4", "D5", "D6", "D8")), as.factor)
  )
# D3, D4_comment, D5_comment and E1 are free text and are left unchanged.
```
## Second inspection of data {.tabset}
### Summary
``` {r}
# Per-column summary after renaming and type conversion.
summary(litdata)
```
### Glimpse
``` {r}
# Transposed overview after renaming and type conversion: one line per column.
glimpse(litdata)
```
### Print
``` {r}
# Standard print-out of the tibble after renaming and type conversion.
print(litdata)
```
### Head
``` {r}
# First rows of the data after renaming and type conversion.
head(litdata)
```
# Selbststudium 1
*Berechnen Sie die Häufigkeiten für die Variablen W003, K003, H001_001, H005, H007 und H008.*
## Data {.tabset}
```{r}
# Print a frequency table for one column and return a bar chart of it.
#
# Arguments:
#   table  - data frame or tibble that contains the column.
#   column - name of the column to tabulate, given as a single string.
#
# Side effect:
#   Prints the distinct values with their count `n` and their share in
#   percent (at most 100 rows are shown for a tibble).
#
# Returns:
#   A ggplot object with one bar per distinct value (bar height = count) and
#   x-axis labels rotated by 45 degrees.
displayFunction1 <- function(table, column) {
  # Fail early with a readable message instead of a low-level selection error.
  if (!is.character(column) || length(column) != 1L ||
      !(column %in% names(table))) {
    stop("`column` must be the name of a single column of `table`.",
         call. = FALSE)
  }

  # Count each distinct value; the counted column is exposed as `value` so
  # the table and the plot always use the same column name.
  frequencies <- table %>%
    count(value = .data[[column]]) %>%
    mutate(percentage = prop.table(n) * 100)

  print(frequencies, n = 100)

  # geom_col() draws bars from precomputed heights; it is the direct
  # equivalent of geom_bar(stat = "identity").
  ggplot(frequencies, aes(x = value, y = n)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
```
### A3 (W003) {-}
```{r}
# Frequency table and bar chart for A3 (original variable W003).
displayFunction1(litdata, "A3")
```
### B3 (K003) {-}
```{r}
# Frequency table and bar chart for B3 (original variable K003).
displayFunction1(litdata, "B3")
```
### D1_1 (H001_001) {-}
```{r}
# Frequency table and bar chart for D1_1 (original variable H001_001).
displayFunction1(litdata, "D1_1")
```
### D5 (H005) {-}
```{r}
# Frequency table and bar chart for D5 (original variable H005).
displayFunction1(litdata, "D5")
```
### D7 (H007) {-}
```{r}
# Frequency table and bar chart for D7 (original variable H007).
displayFunction1(litdata, "D7")
```
Die Warnung resultiert daraus, dass es sehr viele *NA* gibt.
### D8 (H008) {-}
```{r}
# Frequency table and bar chart for D8 (original variable H008).
displayFunction1(litdata, "D8")
```