From 51cd711da845a3ad64142fda21e5183eb46195de Mon Sep 17 00:00:00 2001
From: marcgauch <34353267+marcgauch@users.noreply.github.com>
Date: Fri, 18 Nov 2022 21:10:07 +0100
Subject: [PATCH] Linted

---
 report.html | 106 ++++++++++++++++++++++-----------------------------
 report.rmd  | 107 ++++++++++++++++++++++------------------------------
 2 files changed, 90 insertions(+), 123 deletions(-)

diff --git a/report.html b/report.html
index 9b65058..a93971f 100644
--- a/report.html
+++ b/report.html
@@ -1494,11 +1494,11 @@ border-radius: 0px;

1 Preparation

-
if (!require(tidyverse)){
+
if (!require(tidyverse)) {
   install.packages("tidyverse")
   library(tidyverse)
 }
-if (!require(moments)){
+if (!require(moments)) {
   install.packages("moments")
   library(moments)
 }
@@ -1508,28 +1508,26 @@ if (!require(moments)){

2.1 Frequency Table ordered from wish.com

-
freq <- function(data){
-  na_count = length(data[is.na(data)])
-  valid_count = length(data)-na_count
+
freq <- function(data) {
+  na_count <- length(data[is.na(data)])
+  valid_count <- length(data) - na_count
   frequency <- table(data)
   p <- prop.table(frequency)
-  percent <- round(p*100, digits = 2)
+  percent <- round(p * 100, digits = 2)
   frequency_sum <- cumsum(frequency)
   hkum <- cumsum(p)
-  percent_sum <- round(hkum*100, digits = 2)
+  percent_sum <- round(hkum * 100, digits = 2)
   freq_table <- cbind(frequency, percent, frequency_sum, percent_sum)
-  valid_percent <- round(valid_count / length(data)*100, digits = 2)
-  na_percent <- round(na_count / length(data)*100, digits = 2)
-  
-  
+  valid_percent <- round(valid_count / length(data) * 100, digits = 2)
+  na_percent <- round(na_count / length(data) * 100, digits = 2)
+
+
   print(freq_table)
 
-  count <- c(valid_count, na_count, valid_count+na_count)
-  percent <- c(valid_percent, na_percent, valid_percent+na_percent)
-  totall <- c(valid_count+na_count, valid_percent+na_percent)
+  count <- c(valid_count, na_count, valid_count + na_count)
+  percent <- c(valid_percent, na_percent, valid_percent + na_percent)
   df <- data.frame(count, percent, row.names = c("valid", "NA", "Total"))
   print(df)
-
 }

Source: https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret and adapted

@@ -1537,9 +1535,9 @@ and adapted

2.2 Modus

getmode <- function(v) {
-   uniqv <- unique(v)
-   x <- tabulate(match(v, uniqv))
-   uniqv[which(x==max(x))]
+  uniqv <- unique(v)
+  x <- tabulate(match(v, uniqv))
+  uniqv[which(x == max(x))]
 }
@@ -1816,7 +1814,7 @@ inspection of data

4.1 Converting Strings to numbers and Keine Antwort zu NaN

-
litdata <- litdata %>% 
+
litdata <- litdata %>%
   mutate_all(~ replace(., . == "Stimme voll zu5", 5)) %>%
   mutate_all(~ replace(., . == "Stimme überhaupt nicht zu1", 1)) %>%
   mutate_all(~ replace(., . == "Keine Antwort-", NaN))
@@ -1826,17 +1824,17 @@ numbers and Keine Antwort zu NaN

The following code will NOT be run. The Idea is to show a way to automatically edit all columns. It works but some columns are NOT numeric.

-
  # All colnames that exist
-  litdataColnames <- colnames(litdata)
-  # the ones we don't want to change
-  litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
-  # the colnames that should be changed
-  litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
-  print(litdataColsToMakeNumeric)
-  litdataColsToMakeNumeric <- c("R1")
-  for (col in litdataColsToMakeNumeric) {
-    litdata[[col]] <- as.numeric(litdata[[col]])
-  }
+
# All colnames that exist
+litdataColnames <- colnames(litdata)
+# the ones we don't want to change
+litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
+# the colnames that should be changed
+litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
+print(litdataColsToMakeNumeric)
+litdataColsToMakeNumeric <- c("R1")
+for (col in litdataColsToMakeNumeric) {
+  litdata[[col]] <- as.numeric(litdata[[col]])
+}

First we rename all the columns

litdata <- litdata %>% rename(
   "A1" = "W001",
@@ -1848,7 +1846,6 @@ are NOT numeric.
   "A7" = "W007",
   "A8" = "W008",
   "A9" = "W009",
-
   "B1" = "K001",
   "B2" = "K002",
   "B3" = "K003",
@@ -1858,37 +1855,30 @@ are NOT numeric.
   "B7" = "K007",
   "B8" = "K008",
   "B9" = "K009",
-
   "C1_1" = "TK001_01",
   "C1_2" = "TK001_02",
   "C1_3" = "TK001_03",
   "C1_4" = "TK001_04",
-
   "C2_1" = "TK002_01",
   "C2_2" = "TK002_02",
   "C2_3" = "TK002_03",
   "C2_4" = "TK002_04",
-
   "C3_1" = "TK003_01",
   "C3_2" = "TK003_02",
   "C3_3" = "TK003_03",
   "C3_4" = "TK003_04",
-
   "C4_1" = "TK004_01",
   "C4_2" = "TK004_02",
   "C4_3" = "TK004_03",
   "C4_4" = "TK004_04",
-
   "C5_1" = "TK005_01",
   "C5_2" = "TK005_02",
   "C5_3" = "TK005_03",
   "C5_4" = "TK005_04",
-
   "C6_1" = "TK006_01",
   "C6_2" = "TK006_02",
   "C6_3" = "TK006_03",
   "C6_4" = "TK006_04",
-
   "D1_1" = "H001_001",
   "D1_2" = "H001_002",
   "D1_3" = "H001_003",
@@ -1896,23 +1886,15 @@ are NOT numeric.
   "D1_5" = "H001_005",
   "D1_6" = "H001_006",
   "D1_7" = "H001_007",
-
   "D2" = "H002",
-
   "D3" = "H003",
-
   "D4" = "H004",
   "D4_comment" = "H004_other",
-
   "D5" = "H005",
   "D5_comment" = "H005_other",
-
   "D6" = "H006",
-
   "D7" = "H007",
-
   "D8" = "H008",
-
   "E1" = "R1"
 )

Then we change the datatype and fix the values

@@ -2278,16 +2260,18 @@ Data tmp <- rename(tmp, value = all_of(column)) tmp <- tmp %>% count(value) %>% - mutate(percentage = prop.table(n)*100) + mutate(percentage = prop.table(n) * 100) print(tmp, n = 100) - ggplot(tmp, - aes(x = value, y=n)) + - geom_bar(stat = "identity") + - theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggplot( + tmp, + aes(x = value, y = n) + ) + + geom_bar(stat = "identity") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) }

A3 (W003)

-
  displayFunction1(litdata, "A3")
+
displayFunction1(litdata, "A3")
## # A tibble: 6 × 3
 ##   value     n percentage
 ##   <dbl> <int>      <dbl>
@@ -2302,7 +2286,7 @@ Data
 

B3 (K003)

-
  displayFunction1(litdata, "B3")
+
displayFunction1(litdata, "B3")
## # A tibble: 7 × 3
 ##   value     n percentage
 ##   <dbl> <int>      <dbl>
@@ -2318,7 +2302,7 @@ Data
 

D1_1 (H001_001)

-
  displayFunction1(litdata, "D1_1")
+
displayFunction1(litdata, "D1_1")
## # A tibble: 3 × 3
 ##   value     n percentage
 ##   <lgl> <int>      <dbl>
@@ -2329,7 +2313,7 @@ Data
 

D5 (H005)

-
  displayFunction1(litdata, "D5")
+
displayFunction1(litdata, "D5")
## # A tibble: 18 × 3
 ##    value                                       n percentage
 ##    <fct>                                   <int>      <dbl>
@@ -2355,7 +2339,7 @@ Data
 

D7 (H007)

-
  displayFunction1(litdata, "D7")
+
displayFunction1(litdata, "D7")
## # A tibble: 24 × 3
 ##    value     n percentage
 ##    <dbl> <int>      <dbl>
@@ -2390,7 +2374,7 @@ gibt.

D8 (H008)

-
  displayFunction1(litdata, "D8")
+
displayFunction1(litdata, "D8")
## # A tibble: 3 × 3
 ##   value        n percentage
 ##   <fct>    <int>      <dbl>
@@ -2407,7 +2391,7 @@ gibt.

birthyears <- litdata$D7
 # remove NAs
 birthyears <- birthyears[!is.na(birthyears)]
-age <- 2021-birthyears
+age <- 2021 - birthyears

6.1 Frequency

freq(age)
@@ -2440,7 +2424,7 @@ age <- 2021-birthyears
## NA 0 0 ## Total 106 100
# with NA
-freq(2021-litdata$D7)
+freq(2021 - litdata$D7)
##    frequency percent frequency_sum percent_sum
 ## 19         1    0.94             1        0.94
 ## 20         2    1.89             3        2.83
@@ -2492,7 +2476,7 @@ Mittelwert
 

6.2.4 Spannweite

-
max(age)-min(age)
+
max(age) - min(age)
## [1] 31
@@ -2643,7 +2627,7 @@ sozialen Herkunft 5.6, er ist also genauso hoch wie in der Gesamtschweiz. Was ist zur Lage dieses Wertes bezogen auf die Verteilung in Graubünden zu sagen? Also: Wie viele Schüler in GR liegen mit ihrem Wert darunter?

-
pnorm(5.6, mean = 5.1, sd=2.0)
+
pnorm(5.6, mean = 5.1, sd = 2.0)
## [1] 0.5987063
diff --git a/report.rmd b/report.rmd index ab21172..2d04806 100644 --- a/report.rmd +++ b/report.rmd @@ -20,11 +20,11 @@ knitr::opts_chunk$set(echo = TRUE) # Preparation ```{r, message=FALSE} -if (!require(tidyverse)){ +if (!require(tidyverse)) { install.packages("tidyverse") library(tidyverse) } -if (!require(moments)){ +if (!require(moments)) { install.packages("moments") library(moments) } @@ -34,28 +34,26 @@ if (!require(moments)){ ## Frequency Table ordered from wish.com ```{r} -freq <- function(data){ - na_count = length(data[is.na(data)]) - valid_count = length(data)-na_count +freq <- function(data) { + na_count <- length(data[is.na(data)]) + valid_count <- length(data) - na_count frequency <- table(data) p <- prop.table(frequency) - percent <- round(p*100, digits = 2) + percent <- round(p * 100, digits = 2) frequency_sum <- cumsum(frequency) hkum <- cumsum(p) - percent_sum <- round(hkum*100, digits = 2) + percent_sum <- round(hkum * 100, digits = 2) freq_table <- cbind(frequency, percent, frequency_sum, percent_sum) - valid_percent <- round(valid_count / length(data)*100, digits = 2) - na_percent <- round(na_count / length(data)*100, digits = 2) - - + valid_percent <- round(valid_count / length(data) * 100, digits = 2) + na_percent <- round(na_count / length(data) * 100, digits = 2) + + print(freq_table) - count <- c(valid_count, na_count, valid_count+na_count) - percent <- c(valid_percent, na_percent, valid_percent+na_percent) - totall <- c(valid_count+na_count, valid_percent+na_percent) + count <- c(valid_count, na_count, valid_count + na_count) + percent <- c(valid_percent, na_percent, valid_percent + na_percent) df <- data.frame(count, percent, row.names = c("valid", "NA", "Total")) print(df) - } ``` *Source: https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret and adapted* @@ -63,9 +61,9 @@ freq <- function(data){ ## Modus ```{r} getmode <- function(v) { - uniqv <- unique(v) - x <- tabulate(match(v, 
uniqv)) - uniqv[which(x==max(x))] + uniqv <- unique(v) + x <- tabulate(match(v, uniqv)) + uniqv[which(x == max(x))] } ``` @@ -97,7 +95,7 @@ head(litdata) # Data cleaning ## Converting Strings to numbers and *Keine Antwort* zu *NaN* ``` {r} -litdata <- litdata %>% +litdata <- litdata %>% mutate_all(~ replace(., . == "Stimme voll zu5", 5)) %>% mutate_all(~ replace(., . == "Stimme überhaupt nicht zu1", 1)) %>% mutate_all(~ replace(., . == "Keine Antwort-", NaN)) @@ -106,17 +104,17 @@ litdata <- litdata %>% ## Make it numeric The following code will **NOT** be run. The Idea is to show a way to automatically edit all columns. It works but some columns are NOT numeric. ```{r, eval=FALSE} - # All colnames that exist - litdataColnames <- colnames(litdata) - # the ones we don't want to change - litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed") - # the colnames that should be changed - litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)] - print(litdataColsToMakeNumeric) - litdataColsToMakeNumeric <- c("R1") - for (col in litdataColsToMakeNumeric) { - litdata[[col]] <- as.numeric(litdata[[col]]) - } +# All colnames that exist +litdataColnames <- colnames(litdata) +# the ones we don't want to change +litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed") +# the colnames that should be changed +litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)] +print(litdataColsToMakeNumeric) +litdataColsToMakeNumeric <- c("R1") +for (col in litdataColsToMakeNumeric) { + litdata[[col]] <- as.numeric(litdata[[col]]) +} ``` First we rename all the columns @@ -131,7 +129,6 @@ litdata <- litdata %>% rename( "A7" = "W007", "A8" = "W008", "A9" = "W009", - "B1" = "K001", "B2" = "K002", "B3" = "K003", @@ -141,37 +138,30 @@ litdata <- litdata %>% rename( "B7" = "K007", "B8" = "K008", "B9" = "K009", - "C1_1" = "TK001_01", 
"C1_2" = "TK001_02", "C1_3" = "TK001_03", "C1_4" = "TK001_04", - "C2_1" = "TK002_01", "C2_2" = "TK002_02", "C2_3" = "TK002_03", "C2_4" = "TK002_04", - "C3_1" = "TK003_01", "C3_2" = "TK003_02", "C3_3" = "TK003_03", "C3_4" = "TK003_04", - "C4_1" = "TK004_01", "C4_2" = "TK004_02", "C4_3" = "TK004_03", "C4_4" = "TK004_04", - "C5_1" = "TK005_01", "C5_2" = "TK005_02", "C5_3" = "TK005_03", "C5_4" = "TK005_04", - "C6_1" = "TK006_01", "C6_2" = "TK006_02", "C6_3" = "TK006_03", "C6_4" = "TK006_04", - "D1_1" = "H001_001", "D1_2" = "H001_002", "D1_3" = "H001_003", @@ -179,23 +169,15 @@ litdata <- litdata %>% rename( "D1_5" = "H001_005", "D1_6" = "H001_006", "D1_7" = "H001_007", - "D2" = "H002", - "D3" = "H003", - "D4" = "H004", "D4_comment" = "H004_other", - "D5" = "H005", "D5_comment" = "H005_other", - "D6" = "H006", - "D7" = "H007", - "D8" = "H008", - "E1" = "R1" ) ``` @@ -289,7 +271,6 @@ litdata$D7 <- as.numeric(litdata$D7) litdata$D8 <- as.factor(litdata$D8) # skipping E1 because it's a free text - ``` @@ -321,44 +302,46 @@ displayFunction1 <- function(table, column) { tmp <- rename(tmp, value = all_of(column)) tmp <- tmp %>% count(value) %>% - mutate(percentage = prop.table(n)*100) + mutate(percentage = prop.table(n) * 100) print(tmp, n = 100) - ggplot(tmp, - aes(x = value, y=n)) + - geom_bar(stat = "identity") + - theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggplot( + tmp, + aes(x = value, y = n) + ) + + geom_bar(stat = "identity") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) } ``` ### A3 (W003) {-} ```{r} - displayFunction1(litdata, "A3") +displayFunction1(litdata, "A3") ``` ### B3 (K003) {-} ```{r} - displayFunction1(litdata, "B3") +displayFunction1(litdata, "B3") ``` ### D1_1 (H001_001) {-} ```{r} - displayFunction1(litdata, "D1_1") +displayFunction1(litdata, "D1_1") ``` ### D5 (H005) {-} ```{r} - displayFunction1(litdata, "D5") +displayFunction1(litdata, "D5") ``` ### D7 (H007) {-} ```{r} - displayFunction1(litdata, "D7") 
+displayFunction1(litdata, "D7") ``` Die Warnung resultiert daraus, dass es sehr viele *NA* gibt. ### D8 (H008) {-} ```{r} - displayFunction1(litdata, "D8") +displayFunction1(litdata, "D8") ``` # Selbststudium 2.1 @@ -368,14 +351,14 @@ We have the year 2021 birthyears <- litdata$D7 # remove NAs birthyears <- birthyears[!is.na(birthyears)] -age <- 2021-birthyears +age <- 2021 - birthyears ``` ## Frequency ```{r} freq(age) # with NA -freq(2021-litdata$D7) +freq(2021 - litdata$D7) ``` ## Selbststudium 2 @@ -399,7 +382,7 @@ mean(age) ### Spannweite ```{r} -max(age)-min(age) +max(age) - min(age) ``` ### Quartilsabstand @@ -511,6 +494,6 @@ pnorm(5.1, mean = 5.6, sd = 1.8) In einem Bündner Ort beträgt der Mittelwert auf der Skala zur sozialen Herkunft 5.6, er ist also genauso hoch wie in der Gesamtschweiz. Was ist zur Lage dieses Wertes bezogen auf die Verteilung in Graubünden zu sagen? Also: Wie viele Schüler in GR liegen mit ihrem Wert darunter? ```{r} -pnorm(5.6, mean = 5.1, sd=2.0) +pnorm(5.6, mean = 5.1, sd = 2.0) ```