diff --git a/report.html b/report.html index 9b65058..a93971f 100644 --- a/report.html +++ b/report.html @@ -1494,11 +1494,11 @@ border-radius: 0px;
if (!require(tidyverse)){
+if (!require(tidyverse)) {
install.packages("tidyverse")
library(tidyverse)
}
-if (!require(moments)){
+if (!require(moments)) {
install.packages("moments")
library(moments)
}
@@ -1508,28 +1508,26 @@ if (!require(moments)){
2.1 Frequency Table
ordered from wish.com
-freq <- function(data){
- na_count = length(data[is.na(data)])
- valid_count = length(data)-na_count
+freq <- function(data) {
+ na_count <- length(data[is.na(data)])
+ valid_count <- length(data) - na_count
frequency <- table(data)
p <- prop.table(frequency)
- percent <- round(p*100, digits = 2)
+ percent <- round(p * 100, digits = 2)
frequency_sum <- cumsum(frequency)
hkum <- cumsum(p)
- percent_sum <- round(hkum*100, digits = 2)
+ percent_sum <- round(hkum * 100, digits = 2)
freq_table <- cbind(frequency, percent, frequency_sum, percent_sum)
- valid_percent <- round(valid_count / length(data)*100, digits = 2)
- na_percent <- round(na_count / length(data)*100, digits = 2)
-
-
+ valid_percent <- round(valid_count / length(data) * 100, digits = 2)
+ na_percent <- round(na_count / length(data) * 100, digits = 2)
+
+
print(freq_table)
- count <- c(valid_count, na_count, valid_count+na_count)
- percent <- c(valid_percent, na_percent, valid_percent+na_percent)
- totall <- c(valid_count+na_count, valid_percent+na_percent)
+ count <- c(valid_count, na_count, valid_count + na_count)
+ percent <- c(valid_percent, na_percent, valid_percent + na_percent)
df <- data.frame(count, percent, row.names = c("valid", "NA", "Total"))
print(df)
-
}
Source: https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret
and adapted
@@ -1537,9 +1535,9 @@ and adapted
2.2 Modus
getmode <- function(v) {
- uniqv <- unique(v)
- x <- tabulate(match(v, uniqv))
- uniqv[which(x==max(x))]
+ uniqv <- unique(v)
+ x <- tabulate(match(v, uniqv))
+ uniqv[which(x == max(x))]
}
@@ -1816,7 +1814,7 @@ inspection of data
4.1 Converting Strings to
numbers and Keine Antwort zu NaN
-litdata <- litdata %>%
+litdata <- litdata %>%
mutate_all(~ replace(., . == "Stimme voll zu5", 5)) %>%
mutate_all(~ replace(., . == "Stimme überhaupt nicht zu1", 1)) %>%
mutate_all(~ replace(., . == "Keine Antwort-", NaN))
@@ -1826,17 +1824,17 @@ numbers and Keine Antwort zu NaN
The following code will NOT be run. The Idea is to
show a way to automatically edit all columns. It works but some columns
are NOT numeric.
- # All colnames that exist
- litdataColnames <- colnames(litdata)
- # the ones we don't want to change
- litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
- # the colnames that should be changed
- litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
- print(litdataColsToMakeNumeric)
- litdataColsToMakeNumeric <- c("R1")
- for (col in litdataColsToMakeNumeric) {
- litdata[[col]] <- as.numeric(litdata[[col]])
- }
+# All colnames that exist
+litdataColnames <- colnames(litdata)
+# the ones we don't want to change
+litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
+# the colnames that should be changed
+litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
+print(litdataColsToMakeNumeric)
+litdataColsToMakeNumeric <- c("R1")
+for (col in litdataColsToMakeNumeric) {
+ litdata[[col]] <- as.numeric(litdata[[col]])
+}
First we rename all the columns
litdata <- litdata %>% rename(
"A1" = "W001",
@@ -1848,7 +1846,6 @@ are NOT numeric.
"A7" = "W007",
"A8" = "W008",
"A9" = "W009",
-
"B1" = "K001",
"B2" = "K002",
"B3" = "K003",
@@ -1858,37 +1855,30 @@ are NOT numeric.
"B7" = "K007",
"B8" = "K008",
"B9" = "K009",
-
"C1_1" = "TK001_01",
"C1_2" = "TK001_02",
"C1_3" = "TK001_03",
"C1_4" = "TK001_04",
-
"C2_1" = "TK002_01",
"C2_2" = "TK002_02",
"C2_3" = "TK002_03",
"C2_4" = "TK002_04",
-
"C3_1" = "TK003_01",
"C3_2" = "TK003_02",
"C3_3" = "TK003_03",
"C3_4" = "TK003_04",
-
"C4_1" = "TK004_01",
"C4_2" = "TK004_02",
"C4_3" = "TK004_03",
"C4_4" = "TK004_04",
-
"C5_1" = "TK005_01",
"C5_2" = "TK005_02",
"C5_3" = "TK005_03",
"C5_4" = "TK005_04",
-
"C6_1" = "TK006_01",
"C6_2" = "TK006_02",
"C6_3" = "TK006_03",
"C6_4" = "TK006_04",
-
"D1_1" = "H001_001",
"D1_2" = "H001_002",
"D1_3" = "H001_003",
@@ -1896,23 +1886,15 @@ are NOT numeric.
"D1_5" = "H001_005",
"D1_6" = "H001_006",
"D1_7" = "H001_007",
-
"D2" = "H002",
-
"D3" = "H003",
-
"D4" = "H004",
"D4_comment" = "H004_other",
-
"D5" = "H005",
"D5_comment" = "H005_other",
-
"D6" = "H006",
-
"D7" = "H007",
-
"D8" = "H008",
-
"E1" = "R1"
)
Then we change the datatype and fix the values
@@ -2278,16 +2260,18 @@ Data
tmp <- rename(tmp, value = all_of(column))
tmp <- tmp %>%
count(value) %>%
- mutate(percentage = prop.table(n)*100)
+ mutate(percentage = prop.table(n) * 100)
print(tmp, n = 100)
- ggplot(tmp,
- aes(x = value, y=n)) +
- geom_bar(stat = "identity") +
- theme(axis.text.x = element_text(angle = 45, hjust = 1))
+ ggplot(
+ tmp,
+ aes(x = value, y = n)
+ ) +
+ geom_bar(stat = "identity") +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
A3 (W003)
- displayFunction1(litdata, "A3")
+displayFunction1(litdata, "A3")
## # A tibble: 6 × 3
## value n percentage
## <dbl> <int> <dbl>
@@ -2302,7 +2286,7 @@ Data
B3 (K003)
- displayFunction1(litdata, "B3")
+displayFunction1(litdata, "B3")
## # A tibble: 7 × 3
## value n percentage
## <dbl> <int> <dbl>
@@ -2318,7 +2302,7 @@ Data
D1_1 (H001_001)
- displayFunction1(litdata, "D1_1")
+displayFunction1(litdata, "D1_1")
## # A tibble: 3 × 3
## value n percentage
## <lgl> <int> <dbl>
@@ -2329,7 +2313,7 @@ Data
D5 (H005)
- displayFunction1(litdata, "D5")
+displayFunction1(litdata, "D5")
## # A tibble: 18 × 3
## value n percentage
## <fct> <int> <dbl>
@@ -2355,7 +2339,7 @@ Data
D7 (H007)
- displayFunction1(litdata, "D7")
+displayFunction1(litdata, "D7")
## # A tibble: 24 × 3
## value n percentage
## <dbl> <int> <dbl>
@@ -2390,7 +2374,7 @@ gibt.
D8 (H008)
- displayFunction1(litdata, "D8")
+displayFunction1(litdata, "D8")
## # A tibble: 3 × 3
## value n percentage
## <fct> <int> <dbl>
@@ -2407,7 +2391,7 @@ gibt.
birthyears <- litdata$D7
# remove NAs
birthyears <- birthyears[!is.na(birthyears)]
-age <- 2021-birthyears
+age <- 2021 - birthyears
6.1 Frequency
freq(age)
@@ -2440,7 +2424,7 @@ age <- 2021-birthyears
## NA 0 0
## Total 106 100
# with NA
-freq(2021-litdata$D7)
+freq(2021 - litdata$D7)
## frequency percent frequency_sum percent_sum
## 19 1 0.94 1 0.94
## 20 2 1.89 3 2.83
@@ -2492,7 +2476,7 @@ Mittelwert
6.2.4 Spannweite
-max(age)-min(age)
+max(age) - min(age)
## [1] 31
@@ -2643,7 +2627,7 @@ sozialen Herkunft 5.6, er ist also genauso hoch wie in der
Gesamtschweiz. Was ist zur Lage dieses Wertes bezogen auf die Verteilung
in Graubünden zu sagen? Also: Wie viele Schüler in GR liegen mit ihrem
Wert darunter?
-pnorm(5.6, mean = 5.1, sd=2.0)
+pnorm(5.6, mean = 5.1, sd = 2.0)
## [1] 0.5987063
diff --git a/report.rmd b/report.rmd
index ab21172..2d04806 100644
--- a/report.rmd
+++ b/report.rmd
@@ -20,11 +20,11 @@ knitr::opts_chunk$set(echo = TRUE)
# Preparation
```{r, message=FALSE}
-if (!require(tidyverse)){
+if (!require(tidyverse)) {
install.packages("tidyverse")
library(tidyverse)
}
-if (!require(moments)){
+if (!require(moments)) {
install.packages("moments")
library(moments)
}
@@ -34,28 +34,26 @@ if (!require(moments)){
## Frequency Table ordered from wish.com
```{r}
-freq <- function(data){
- na_count = length(data[is.na(data)])
- valid_count = length(data)-na_count
+freq <- function(data) {
+ na_count <- length(data[is.na(data)])
+ valid_count <- length(data) - na_count
frequency <- table(data)
p <- prop.table(frequency)
- percent <- round(p*100, digits = 2)
+ percent <- round(p * 100, digits = 2)
frequency_sum <- cumsum(frequency)
hkum <- cumsum(p)
- percent_sum <- round(hkum*100, digits = 2)
+ percent_sum <- round(hkum * 100, digits = 2)
freq_table <- cbind(frequency, percent, frequency_sum, percent_sum)
- valid_percent <- round(valid_count / length(data)*100, digits = 2)
- na_percent <- round(na_count / length(data)*100, digits = 2)
-
-
+ valid_percent <- round(valid_count / length(data) * 100, digits = 2)
+ na_percent <- round(na_count / length(data) * 100, digits = 2)
+
+
print(freq_table)
- count <- c(valid_count, na_count, valid_count+na_count)
- percent <- c(valid_percent, na_percent, valid_percent+na_percent)
- totall <- c(valid_count+na_count, valid_percent+na_percent)
+ count <- c(valid_count, na_count, valid_count + na_count)
+ percent <- c(valid_percent, na_percent, valid_percent + na_percent)
df <- data.frame(count, percent, row.names = c("valid", "NA", "Total"))
print(df)
-
}
```
*Source: https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret and adapted*
@@ -63,9 +61,9 @@ freq <- function(data){
## Modus
```{r}
getmode <- function(v) {
- uniqv <- unique(v)
- x <- tabulate(match(v, uniqv))
- uniqv[which(x==max(x))]
+ uniqv <- unique(v)
+ x <- tabulate(match(v, uniqv))
+ uniqv[which(x == max(x))]
}
```
@@ -97,7 +95,7 @@ head(litdata)
# Data cleaning
## Converting Strings to numbers and *Keine Antwort* zu *NaN*
``` {r}
-litdata <- litdata %>%
+litdata <- litdata %>%
mutate_all(~ replace(., . == "Stimme voll zu5", 5)) %>%
mutate_all(~ replace(., . == "Stimme überhaupt nicht zu1", 1)) %>%
mutate_all(~ replace(., . == "Keine Antwort-", NaN))
@@ -106,17 +104,17 @@ litdata <- litdata %>%
## Make it numeric
The following code will **NOT** be run. The Idea is to show a way to automatically edit all columns. It works but some columns are NOT numeric.
```{r, eval=FALSE}
- # All colnames that exist
- litdataColnames <- colnames(litdata)
- # the ones we don't want to change
- litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
- # the colnames that should be changed
- litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
- print(litdataColsToMakeNumeric)
- litdataColsToMakeNumeric <- c("R1")
- for (col in litdataColsToMakeNumeric) {
- litdata[[col]] <- as.numeric(litdata[[col]])
- }
+# All colnames that exist
+litdataColnames <- colnames(litdata)
+# the ones we don't want to change
+litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
+# the colnames that should be changed
+litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
+print(litdataColsToMakeNumeric)
+litdataColsToMakeNumeric <- c("R1")
+for (col in litdataColsToMakeNumeric) {
+ litdata[[col]] <- as.numeric(litdata[[col]])
+}
```
First we rename all the columns
@@ -131,7 +129,6 @@ litdata <- litdata %>% rename(
"A7" = "W007",
"A8" = "W008",
"A9" = "W009",
-
"B1" = "K001",
"B2" = "K002",
"B3" = "K003",
@@ -141,37 +138,30 @@ litdata <- litdata %>% rename(
"B7" = "K007",
"B8" = "K008",
"B9" = "K009",
-
"C1_1" = "TK001_01",
"C1_2" = "TK001_02",
"C1_3" = "TK001_03",
"C1_4" = "TK001_04",
-
"C2_1" = "TK002_01",
"C2_2" = "TK002_02",
"C2_3" = "TK002_03",
"C2_4" = "TK002_04",
-
"C3_1" = "TK003_01",
"C3_2" = "TK003_02",
"C3_3" = "TK003_03",
"C3_4" = "TK003_04",
-
"C4_1" = "TK004_01",
"C4_2" = "TK004_02",
"C4_3" = "TK004_03",
"C4_4" = "TK004_04",
-
"C5_1" = "TK005_01",
"C5_2" = "TK005_02",
"C5_3" = "TK005_03",
"C5_4" = "TK005_04",
-
"C6_1" = "TK006_01",
"C6_2" = "TK006_02",
"C6_3" = "TK006_03",
"C6_4" = "TK006_04",
-
"D1_1" = "H001_001",
"D1_2" = "H001_002",
"D1_3" = "H001_003",
@@ -179,23 +169,15 @@ litdata <- litdata %>% rename(
"D1_5" = "H001_005",
"D1_6" = "H001_006",
"D1_7" = "H001_007",
-
"D2" = "H002",
-
"D3" = "H003",
-
"D4" = "H004",
"D4_comment" = "H004_other",
-
"D5" = "H005",
"D5_comment" = "H005_other",
-
"D6" = "H006",
-
"D7" = "H007",
-
"D8" = "H008",
-
"E1" = "R1"
)
```
@@ -289,7 +271,6 @@ litdata$D7 <- as.numeric(litdata$D7)
litdata$D8 <- as.factor(litdata$D8)
# skipping E1 because it's a free text
-
```
@@ -321,44 +302,46 @@ displayFunction1 <- function(table, column) {
tmp <- rename(tmp, value = all_of(column))
tmp <- tmp %>%
count(value) %>%
- mutate(percentage = prop.table(n)*100)
+ mutate(percentage = prop.table(n) * 100)
print(tmp, n = 100)
- ggplot(tmp,
- aes(x = value, y=n)) +
- geom_bar(stat = "identity") +
- theme(axis.text.x = element_text(angle = 45, hjust = 1))
+ ggplot(
+ tmp,
+ aes(x = value, y = n)
+ ) +
+ geom_bar(stat = "identity") +
+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
```
### A3 (W003) {-}
```{r}
- displayFunction1(litdata, "A3")
+displayFunction1(litdata, "A3")
```
### B3 (K003) {-}
```{r}
- displayFunction1(litdata, "B3")
+displayFunction1(litdata, "B3")
```
### D1_1 (H001_001) {-}
```{r}
- displayFunction1(litdata, "D1_1")
+displayFunction1(litdata, "D1_1")
```
### D5 (H005) {-}
```{r}
- displayFunction1(litdata, "D5")
+displayFunction1(litdata, "D5")
```
### D7 (H007) {-}
```{r}
- displayFunction1(litdata, "D7")
+displayFunction1(litdata, "D7")
```
Die Warnung resultiert daraus, dass es sehr viele *NA* gibt.
### D8 (H008) {-}
```{r}
- displayFunction1(litdata, "D8")
+displayFunction1(litdata, "D8")
```
# Selbststudium 2.1
@@ -368,14 +351,14 @@ We have the year 2021
birthyears <- litdata$D7
# remove NAs
birthyears <- birthyears[!is.na(birthyears)]
-age <- 2021-birthyears
+age <- 2021 - birthyears
```
## Frequency
```{r}
freq(age)
# with NA
-freq(2021-litdata$D7)
+freq(2021 - litdata$D7)
```
## Selbststudium 2
@@ -399,7 +382,7 @@ mean(age)
### Spannweite
```{r}
-max(age)-min(age)
+max(age) - min(age)
```
### Quartilsabstand
@@ -511,6 +494,6 @@ pnorm(5.1, mean = 5.6, sd = 1.8)
In einem Bündner Ort beträgt der Mittelwert auf der Skala zur sozialen Herkunft 5.6, er ist also genauso hoch wie in der Gesamtschweiz. Was ist zur Lage dieses Wertes bezogen auf die Verteilung in Graubünden zu sagen? Also: Wie viele Schüler in GR liegen mit ihrem Wert darunter?
```{r}
-pnorm(5.6, mean = 5.1, sd=2.0)
+pnorm(5.6, mean = 5.1, sd = 2.0)
```