master
marcgauch 2022-11-18 21:10:07 +01:00
parent afd7d1babf
commit 51cd711da8
2 changed files with 90 additions and 123 deletions

View File

@ -1494,11 +1494,11 @@ border-radius: 0px;
<div id="preparation" class="section level1" number="1">
<h1><span class="header-section-number">1</span> Preparation</h1>
<pre class="r"><code>if (!require(tidyverse)){
<pre class="r"><code>if (!require(tidyverse)) {
install.packages(&quot;tidyverse&quot;)
library(tidyverse)
}
if (!require(moments)){
if (!require(moments)) {
install.packages(&quot;moments&quot;)
library(moments)
}</code></pre>
@ -1508,28 +1508,26 @@ if (!require(moments)){
<div id="frequency-table-ordered-from-wish.com" class="section level2" number="2.1">
<h2><span class="header-section-number">2.1</span> Frequency Table
ordered from wish.com</h2>
<pre class="r"><code>freq &lt;- function(data){
na_count = length(data[is.na(data)])
valid_count = length(data)-na_count
<pre class="r"><code>freq &lt;- function(data) {
na_count &lt;- length(data[is.na(data)])
valid_count &lt;- length(data) - na_count
frequency &lt;- table(data)
p &lt;- prop.table(frequency)
percent &lt;- round(p*100, digits = 2)
percent &lt;- round(p * 100, digits = 2)
frequency_sum &lt;- cumsum(frequency)
hkum &lt;- cumsum(p)
percent_sum &lt;- round(hkum*100, digits = 2)
percent_sum &lt;- round(hkum * 100, digits = 2)
freq_table &lt;- cbind(frequency, percent, frequency_sum, percent_sum)
valid_percent &lt;- round(valid_count / length(data)*100, digits = 2)
na_percent &lt;- round(na_count / length(data)*100, digits = 2)
valid_percent &lt;- round(valid_count / length(data) * 100, digits = 2)
na_percent &lt;- round(na_count / length(data) * 100, digits = 2)
print(freq_table)
count &lt;- c(valid_count, na_count, valid_count+na_count)
percent &lt;- c(valid_percent, na_percent, valid_percent+na_percent)
totall &lt;- c(valid_count+na_count, valid_percent+na_percent)
count &lt;- c(valid_count, na_count, valid_count + na_count)
percent &lt;- c(valid_percent, na_percent, valid_percent + na_percent)
df &lt;- data.frame(count, percent, row.names = c(&quot;valid&quot;, &quot;NA&quot;, &quot;Total&quot;))
print(df)
}</code></pre>
<p><em>Source: adapted from <a href="https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret" class="uri">https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret</a></em></p>
@ -1537,9 +1535,9 @@ and adapted</em></p>
<div id="modus" class="section level2" number="2.2">
<h2><span class="header-section-number">2.2</span> Modus</h2>
<pre class="r"><code>getmode &lt;- function(v) {
uniqv &lt;- unique(v)
x &lt;- tabulate(match(v, uniqv))
uniqv[which(x==max(x))]
uniqv &lt;- unique(v)
x &lt;- tabulate(match(v, uniqv))
uniqv[which(x == max(x))]
}</code></pre>
</div>
</div>
@ -1816,7 +1814,7 @@ inspection of data</h2>
<div id="converting-strings-to-numbers-and-keine-antwort-zu-nan" class="section level2" number="4.1">
<h2><span class="header-section-number">4.1</span> Converting strings to
numbers and <em>Keine Antwort</em> to <em>NaN</em></h2>
<pre class="r"><code>litdata &lt;- litdata %&gt;%
<pre class="r"><code>litdata &lt;- litdata %&gt;%
mutate_all(~ replace(., . == &quot;Stimme voll zu5&quot;, 5)) %&gt;%
mutate_all(~ replace(., . == &quot;Stimme überhaupt nicht zu1&quot;, 1)) %&gt;%
mutate_all(~ replace(., . == &quot;Keine Antwort-&quot;, NaN))</code></pre>
@ -1826,17 +1824,17 @@ numbers and <em>Keine Antwort</em> zu <em>NaN</em></h2>
<p>The following code will <strong>NOT</strong> be run. The idea is to
show a way to automatically convert all columns to numeric. It works, but
some columns are NOT numeric.</p>
<pre class="r"><code> # All colnames that exist
litdataColnames &lt;- colnames(litdata)
# the ones we don&#39;t want to change
litdataNonNumericCols &lt;- c(&quot;submitdate&quot;, &quot;startlanguage&quot;, &quot;startdate&quot;, &quot;datestamp&quot;, &quot;lastpage&quot;, &quot;seed&quot;)
# the colnames that should be changed
litdataColsToMakeNumeric &lt;- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
print(litdataColsToMakeNumeric)
litdataColsToMakeNumeric &lt;- c(&quot;R1&quot;)
for (col in litdataColsToMakeNumeric) {
litdata[[col]] &lt;- as.numeric(litdata[[col]])
}</code></pre>
<pre class="r"><code># All colnames that exist
litdataColnames &lt;- colnames(litdata)
# the ones we don&#39;t want to change
litdataNonNumericCols &lt;- c(&quot;submitdate&quot;, &quot;startlanguage&quot;, &quot;startdate&quot;, &quot;datestamp&quot;, &quot;lastpage&quot;, &quot;seed&quot;)
# the colnames that should be changed
litdataColsToMakeNumeric &lt;- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
print(litdataColsToMakeNumeric)
litdataColsToMakeNumeric &lt;- c(&quot;R1&quot;)
for (col in litdataColsToMakeNumeric) {
litdata[[col]] &lt;- as.numeric(litdata[[col]])
}</code></pre>
<p>First we rename all the columns</p>
<pre class="r"><code>litdata &lt;- litdata %&gt;% rename(
&quot;A1&quot; = &quot;W001&quot;,
@ -1848,7 +1846,6 @@ are NOT numeric.</p>
&quot;A7&quot; = &quot;W007&quot;,
&quot;A8&quot; = &quot;W008&quot;,
&quot;A9&quot; = &quot;W009&quot;,
&quot;B1&quot; = &quot;K001&quot;,
&quot;B2&quot; = &quot;K002&quot;,
&quot;B3&quot; = &quot;K003&quot;,
@ -1858,37 +1855,30 @@ are NOT numeric.</p>
&quot;B7&quot; = &quot;K007&quot;,
&quot;B8&quot; = &quot;K008&quot;,
&quot;B9&quot; = &quot;K009&quot;,
&quot;C1_1&quot; = &quot;TK001_01&quot;,
&quot;C1_2&quot; = &quot;TK001_02&quot;,
&quot;C1_3&quot; = &quot;TK001_03&quot;,
&quot;C1_4&quot; = &quot;TK001_04&quot;,
&quot;C2_1&quot; = &quot;TK002_01&quot;,
&quot;C2_2&quot; = &quot;TK002_02&quot;,
&quot;C2_3&quot; = &quot;TK002_03&quot;,
&quot;C2_4&quot; = &quot;TK002_04&quot;,
&quot;C3_1&quot; = &quot;TK003_01&quot;,
&quot;C3_2&quot; = &quot;TK003_02&quot;,
&quot;C3_3&quot; = &quot;TK003_03&quot;,
&quot;C3_4&quot; = &quot;TK003_04&quot;,
&quot;C4_1&quot; = &quot;TK004_01&quot;,
&quot;C4_2&quot; = &quot;TK004_02&quot;,
&quot;C4_3&quot; = &quot;TK004_03&quot;,
&quot;C4_4&quot; = &quot;TK004_04&quot;,
&quot;C5_1&quot; = &quot;TK005_01&quot;,
&quot;C5_2&quot; = &quot;TK005_02&quot;,
&quot;C5_3&quot; = &quot;TK005_03&quot;,
&quot;C5_4&quot; = &quot;TK005_04&quot;,
&quot;C6_1&quot; = &quot;TK006_01&quot;,
&quot;C6_2&quot; = &quot;TK006_02&quot;,
&quot;C6_3&quot; = &quot;TK006_03&quot;,
&quot;C6_4&quot; = &quot;TK006_04&quot;,
&quot;D1_1&quot; = &quot;H001_001&quot;,
&quot;D1_2&quot; = &quot;H001_002&quot;,
&quot;D1_3&quot; = &quot;H001_003&quot;,
@ -1896,23 +1886,15 @@ are NOT numeric.</p>
&quot;D1_5&quot; = &quot;H001_005&quot;,
&quot;D1_6&quot; = &quot;H001_006&quot;,
&quot;D1_7&quot; = &quot;H001_007&quot;,
&quot;D2&quot; = &quot;H002&quot;,
&quot;D3&quot; = &quot;H003&quot;,
&quot;D4&quot; = &quot;H004&quot;,
&quot;D4_comment&quot; = &quot;H004_other&quot;,
&quot;D5&quot; = &quot;H005&quot;,
&quot;D5_comment&quot; = &quot;H005_other&quot;,
&quot;D6&quot; = &quot;H006&quot;,
&quot;D7&quot; = &quot;H007&quot;,
&quot;D8&quot; = &quot;H008&quot;,
&quot;E1&quot; = &quot;R1&quot;
)</code></pre>
<p>Then we change the datatype and fix the values</p>
@ -2278,16 +2260,18 @@ Data</h2>
tmp &lt;- rename(tmp, value = all_of(column))
tmp &lt;- tmp %&gt;%
count(value) %&gt;%
mutate(percentage = prop.table(n)*100)
mutate(percentage = prop.table(n) * 100)
print(tmp, n = 100)
ggplot(tmp,
aes(x = value, y=n)) +
geom_bar(stat = &quot;identity&quot;) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(
tmp,
aes(x = value, y = n)
) +
geom_bar(stat = &quot;identity&quot;) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}</code></pre>
<div id="a3-w003" class="section level3 unnumbered">
<h3 class="unnumbered">A3 (W003)</h3>
<pre class="r"><code> displayFunction1(litdata, &quot;A3&quot;)</code></pre>
<pre class="r"><code>displayFunction1(litdata, &quot;A3&quot;)</code></pre>
<pre><code>## # A tibble: 6 × 3
## value n percentage
## &lt;dbl&gt; &lt;int&gt; &lt;dbl&gt;
@ -2302,7 +2286,7 @@ Data</h2>
</div>
<div id="b3-k003" class="section level3 unnumbered">
<h3 class="unnumbered">B3 (K003)</h3>
<pre class="r"><code> displayFunction1(litdata, &quot;B3&quot;)</code></pre>
<pre class="r"><code>displayFunction1(litdata, &quot;B3&quot;)</code></pre>
<pre><code>## # A tibble: 7 × 3
## value n percentage
## &lt;dbl&gt; &lt;int&gt; &lt;dbl&gt;
@ -2318,7 +2302,7 @@ Data</h2>
</div>
<div id="d1_1-h001_001" class="section level3 unnumbered">
<h3 class="unnumbered">D1_1 (H001_001)</h3>
<pre class="r"><code> displayFunction1(litdata, &quot;D1_1&quot;)</code></pre>
<pre class="r"><code>displayFunction1(litdata, &quot;D1_1&quot;)</code></pre>
<pre><code>## # A tibble: 3 × 3
## value n percentage
## &lt;lgl&gt; &lt;int&gt; &lt;dbl&gt;
@ -2329,7 +2313,7 @@ Data</h2>
</div>
<div id="d5-h005" class="section level3 unnumbered">
<h3 class="unnumbered">D5 (H005)</h3>
<pre class="r"><code> displayFunction1(litdata, &quot;D5&quot;)</code></pre>
<pre class="r"><code>displayFunction1(litdata, &quot;D5&quot;)</code></pre>
<pre><code>## # A tibble: 18 × 3
## value n percentage
## &lt;fct&gt; &lt;int&gt; &lt;dbl&gt;
@ -2355,7 +2339,7 @@ Data</h2>
</div>
<div id="d7-h007" class="section level3 unnumbered">
<h3 class="unnumbered">D7 (H007)</h3>
<pre class="r"><code> displayFunction1(litdata, &quot;D7&quot;)</code></pre>
<pre class="r"><code>displayFunction1(litdata, &quot;D7&quot;)</code></pre>
<pre><code>## # A tibble: 24 × 3
## value n percentage
## &lt;dbl&gt; &lt;int&gt; &lt;dbl&gt;
@ -2390,7 +2374,7 @@ gibt.</p>
</div>
<div id="d8-h008" class="section level3 unnumbered">
<h3 class="unnumbered">D8 (H008)</h3>
<pre class="r"><code> displayFunction1(litdata, &quot;D8&quot;)</code></pre>
<pre class="r"><code>displayFunction1(litdata, &quot;D8&quot;)</code></pre>
<pre><code>## # A tibble: 3 × 3
## value n percentage
## &lt;fct&gt; &lt;int&gt; &lt;dbl&gt;
@ -2407,7 +2391,7 @@ gibt.</p>
<pre class="r"><code>birthyears &lt;- litdata$D7
# remove NAs
birthyears &lt;- birthyears[!is.na(birthyears)]
age &lt;- 2021-birthyears</code></pre>
age &lt;- 2021 - birthyears</code></pre>
<div id="frequency" class="section level2" number="6.1">
<h2><span class="header-section-number">6.1</span> Frequency</h2>
<pre class="r"><code>freq(age)</code></pre>
@ -2440,7 +2424,7 @@ age &lt;- 2021-birthyears</code></pre>
## NA 0 0
## Total 106 100</code></pre>
<pre class="r"><code># with NA
freq(2021-litdata$D7)</code></pre>
freq(2021 - litdata$D7)</code></pre>
<pre><code>## frequency percent frequency_sum percent_sum
## 19 1 0.94 1 0.94
## 20 2 1.89 3 2.83
@ -2492,7 +2476,7 @@ Mittelwert</h3>
</div>
<div id="spannweite" class="section level3" number="6.2.4">
<h3><span class="header-section-number">6.2.4</span> Spannweite</h3>
<pre class="r"><code>max(age)-min(age)</code></pre>
<pre class="r"><code>max(age) - min(age)</code></pre>
<pre><code>## [1] 31</code></pre>
</div>
<div id="quartilsabstand" class="section level3" number="6.2.5">
@ -2643,7 +2627,7 @@ sozialen Herkunft 5.6, er ist also genauso hoch wie in der
Gesamtschweiz. Was ist zur Lage dieses Wertes bezogen auf die Verteilung
in Graubünden zu sagen? Also: Wie viele Schüler in GR liegen mit ihrem
Wert darunter?</p>
<pre class="r"><code>pnorm(5.6, mean = 5.1, sd=2.0)</code></pre>
<pre class="r"><code>pnorm(5.6, mean = 5.1, sd = 2.0)</code></pre>
<pre><code>## [1] 0.5987063</code></pre>
</div>
</div>

View File

@ -20,11 +20,11 @@ knitr::opts_chunk$set(echo = TRUE)
# Preparation
```{r, message=FALSE}
if (!require(tidyverse)){
if (!require(tidyverse)) {
install.packages("tidyverse")
library(tidyverse)
}
if (!require(moments)){
if (!require(moments)) {
install.packages("moments")
library(moments)
}
@ -34,28 +34,26 @@ if (!require(moments)){
## Frequency Table ordered from wish.com
```{r}
freq <- function(data){
na_count = length(data[is.na(data)])
valid_count = length(data)-na_count
freq <- function(data) {
na_count <- length(data[is.na(data)])
valid_count <- length(data) - na_count
frequency <- table(data)
p <- prop.table(frequency)
percent <- round(p*100, digits = 2)
percent <- round(p * 100, digits = 2)
frequency_sum <- cumsum(frequency)
hkum <- cumsum(p)
percent_sum <- round(hkum*100, digits = 2)
percent_sum <- round(hkum * 100, digits = 2)
freq_table <- cbind(frequency, percent, frequency_sum, percent_sum)
valid_percent <- round(valid_count / length(data)*100, digits = 2)
na_percent <- round(na_count / length(data)*100, digits = 2)
valid_percent <- round(valid_count / length(data) * 100, digits = 2)
na_percent <- round(na_count / length(data) * 100, digits = 2)
print(freq_table)
count <- c(valid_count, na_count, valid_count+na_count)
percent <- c(valid_percent, na_percent, valid_percent+na_percent)
totall <- c(valid_count+na_count, valid_percent+na_percent)
count <- c(valid_count, na_count, valid_count + na_count)
percent <- c(valid_percent, na_percent, valid_percent + na_percent)
df <- data.frame(count, percent, row.names = c("valid", "NA", "Total"))
print(df)
}
```
*Source: adapted from <https://tellmi.psy.lmu.de/tutorials/deskriptive-statistiken-und-grafiken.html#haeufigkeiten-diskret>*
@ -63,9 +61,9 @@ freq <- function(data){
## Modus
```{r}
getmode <- function(v) {
uniqv <- unique(v)
x <- tabulate(match(v, uniqv))
uniqv[which(x==max(x))]
uniqv <- unique(v)
x <- tabulate(match(v, uniqv))
uniqv[which(x == max(x))]
}
```
@ -97,7 +95,7 @@ head(litdata)
# Data cleaning
## Converting strings to numbers and *Keine Antwort* to *NaN*
``` {r}
litdata <- litdata %>%
litdata <- litdata %>%
mutate_all(~ replace(., . == "Stimme voll zu5", 5)) %>%
mutate_all(~ replace(., . == "Stimme überhaupt nicht zu1", 1)) %>%
mutate_all(~ replace(., . == "Keine Antwort-", NaN))
@ -106,17 +104,17 @@ litdata <- litdata %>%
## Make it numeric
The following code will **NOT** be run. The idea is to show a way to automatically convert all columns to numeric. It works, but some columns are NOT numeric.
```{r, eval=FALSE}
# All colnames that exist
litdataColnames <- colnames(litdata)
# the ones we don't want to change
litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
# the colnames that should be changed
litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
print(litdataColsToMakeNumeric)
litdataColsToMakeNumeric <- c("R1")
for (col in litdataColsToMakeNumeric) {
litdata[[col]] <- as.numeric(litdata[[col]])
}
# All colnames that exist
litdataColnames <- colnames(litdata)
# the ones we don't want to change
litdataNonNumericCols <- c("submitdate", "startlanguage", "startdate", "datestamp", "lastpage", "seed")
# the colnames that should be changed
litdataColsToMakeNumeric <- litdataColnames[!(litdataColnames %in% litdataNonNumericCols)]
print(litdataColsToMakeNumeric)
litdataColsToMakeNumeric <- c("R1")
for (col in litdataColsToMakeNumeric) {
litdata[[col]] <- as.numeric(litdata[[col]])
}
```
First we rename all the columns
@ -131,7 +129,6 @@ litdata <- litdata %>% rename(
"A7" = "W007",
"A8" = "W008",
"A9" = "W009",
"B1" = "K001",
"B2" = "K002",
"B3" = "K003",
@ -141,37 +138,30 @@ litdata <- litdata %>% rename(
"B7" = "K007",
"B8" = "K008",
"B9" = "K009",
"C1_1" = "TK001_01",
"C1_2" = "TK001_02",
"C1_3" = "TK001_03",
"C1_4" = "TK001_04",
"C2_1" = "TK002_01",
"C2_2" = "TK002_02",
"C2_3" = "TK002_03",
"C2_4" = "TK002_04",
"C3_1" = "TK003_01",
"C3_2" = "TK003_02",
"C3_3" = "TK003_03",
"C3_4" = "TK003_04",
"C4_1" = "TK004_01",
"C4_2" = "TK004_02",
"C4_3" = "TK004_03",
"C4_4" = "TK004_04",
"C5_1" = "TK005_01",
"C5_2" = "TK005_02",
"C5_3" = "TK005_03",
"C5_4" = "TK005_04",
"C6_1" = "TK006_01",
"C6_2" = "TK006_02",
"C6_3" = "TK006_03",
"C6_4" = "TK006_04",
"D1_1" = "H001_001",
"D1_2" = "H001_002",
"D1_3" = "H001_003",
@ -179,23 +169,15 @@ litdata <- litdata %>% rename(
"D1_5" = "H001_005",
"D1_6" = "H001_006",
"D1_7" = "H001_007",
"D2" = "H002",
"D3" = "H003",
"D4" = "H004",
"D4_comment" = "H004_other",
"D5" = "H005",
"D5_comment" = "H005_other",
"D6" = "H006",
"D7" = "H007",
"D8" = "H008",
"E1" = "R1"
)
```
@ -289,7 +271,6 @@ litdata$D7 <- as.numeric(litdata$D7)
litdata$D8 <- as.factor(litdata$D8)
# skipping E1 because it is a free-text field
```
@ -321,44 +302,46 @@ displayFunction1 <- function(table, column) {
tmp <- rename(tmp, value = all_of(column))
tmp <- tmp %>%
count(value) %>%
mutate(percentage = prop.table(n)*100)
mutate(percentage = prop.table(n) * 100)
print(tmp, n = 100)
ggplot(tmp,
aes(x = value, y=n)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(
tmp,
aes(x = value, y = n)
) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
```
### A3 (W003) {-}
```{r}
displayFunction1(litdata, "A3")
displayFunction1(litdata, "A3")
```
### B3 (K003) {-}
```{r}
displayFunction1(litdata, "B3")
displayFunction1(litdata, "B3")
```
### D1_1 (H001_001) {-}
```{r}
displayFunction1(litdata, "D1_1")
displayFunction1(litdata, "D1_1")
```
### D5 (H005) {-}
```{r}
displayFunction1(litdata, "D5")
displayFunction1(litdata, "D5")
```
### D7 (H007) {-}
```{r}
displayFunction1(litdata, "D7")
displayFunction1(litdata, "D7")
```
Die Warnung resultiert daraus, dass es sehr viele *NA* gibt.
### D8 (H008) {-}
```{r}
displayFunction1(litdata, "D8")
displayFunction1(litdata, "D8")
```
# Selbststudium 2.1
@ -368,14 +351,14 @@ We have the year 2021
birthyears <- litdata$D7
# remove NAs
birthyears <- birthyears[!is.na(birthyears)]
age <- 2021-birthyears
age <- 2021 - birthyears
```
## Frequency
```{r}
freq(age)
# with NA
freq(2021-litdata$D7)
freq(2021 - litdata$D7)
```
## Selbststudium 2
@ -399,7 +382,7 @@ mean(age)
### Spannweite
```{r}
max(age)-min(age)
max(age) - min(age)
```
### Quartilsabstand
@ -511,6 +494,6 @@ pnorm(5.1, mean = 5.6, sd = 1.8)
In einem Bündner Ort beträgt der Mittelwert auf der Skala zur sozialen Herkunft 5.6, er ist also genauso hoch wie in der Gesamtschweiz. Was ist zur Lage dieses Wertes bezogen auf die Verteilung in Graubünden zu sagen? Also: Wie viele Schüler in GR liegen mit ihrem Wert darunter?
```{r}
pnorm(5.6, mean = 5.1, sd=2.0)
pnorm(5.6, mean = 5.1, sd = 2.0)
```