-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path08-Anova.html
881 lines (835 loc) · 82.8 KB
/
08-Anova.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
<!DOCTYPE html>
<html lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>0.1 Comparing several means | 08-Anova.knit</title>
<meta name="description" content="" />
<meta name="generator" content="bookdown 0.24 and GitBook 2.6.7" />
<meta property="og:title" content="0.1 Comparing several means | 08-Anova.knit" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="0.1 Comparing several means | 08-Anova.knit" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<script src="libs/header-attrs-2.10/header-attrs.js"></script>
<script src="libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/fuse.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<link href="libs/pagedtable-1.1/css/pagedtable.css" rel="stylesheet" />
<script src="libs/pagedtable-1.1/js/pagedtable.js"></script>
<link href="libs/anchor-sections-1.0.1/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0.1/anchor-sections.js"></script>
<style type="text/css">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><strong><a href="./">MRDA 2020</a></strong></li>
<li class="divider"></li>
<li class="chapter" data-level="0.1" data-path=""><a href="#comparing-several-means"><i class="fa fa-check"></i><b>0.1</b> Comparing several means</a>
<ul>
<li class="chapter" data-level="0.1.1" data-path=""><a href="#introduction"><i class="fa fa-check"></i><b>0.1.1</b> Introduction</a></li>
<li class="chapter" data-level="0.1.2" data-path=""><a href="#decomposing-variance"><i class="fa fa-check"></i><b>0.1.2</b> Decomposing variance</a></li>
<li class="chapter" data-level="0.1.3" data-path=""><a href="#one-way-anova"><i class="fa fa-check"></i><b>0.1.3</b> One-way ANOVA</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./"></a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<!--bookdown:title:end-->
<!--bookdown:title:start-->
<div id="comparing-several-means" class="section level2" number="0.1">
<h2><span class="header-section-number">0.1</span> Comparing several means</h2>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/9cGZcALfU5k" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<div class="infobox download">
<p><a href="./Code/07-anova.R">You can download the corresponding R-Code here</a></p>
</div>
<div id="introduction" class="section level3" number="0.1.1">
<h3><span class="header-section-number">0.1.1</span> Introduction</h3>
<p>In the previous section we learned how to compare two means using a t-test. The t-test has some limitations since it only lets you compare two means and you can only use it with one independent variable. However, often we would like to compare means from 3 or more groups. In addition, there may be instances in which you manipulate more than one independent variable. For these applications, ANOVA (<u>AN</u>alysis <u>O</u>f <u>VA</u>riance) can be used. Hence, to conduct ANOVA you need:</p>
<ul>
<li>A metric dependent variable (i.e., measured using an interval or ratio scale)</li>
<li>One or more non-metric (categorical) independent variables (also called factors)</li>
</ul>
<p>A <strong>treatment</strong> is a particular combination of factor levels, or categories. So-called <strong>one-way ANOVA</strong> is used when there is only one categorical variable (factor). In this case, a treatment is the same as a factor level. <strong>N-way ANOVA</strong> is used with two or more factors. Note that we are only going to talk about a single independent variable in the context of ANOVA on this website. If you have multiple independent variables please refer to the chapter on <strong>Regression</strong>.</p>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/cG0HAWqObJs" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<p>Let’s use an example to see how ANOVA works. Similar to the previous example, imagine that the music streaming service experiments with a recommender system and manipulates the intensity of personalized recommendations using three levels: ‘low’, ‘medium’, and ‘high’. The service randomly assigns 100 users to each condition and records the listening times in hours in the following week. As always, we load and inspect the data first:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>hours_abc <span class="ot"><-</span> <span class="fu">read.table</span>(<span class="st">"https://raw.githubusercontent.com/IMSMWU/MRDA2018/master/data/hours_abc.dat"</span>,</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a> <span class="at">sep =</span> <span class="st">"</span><span class="sc">\t</span><span class="st">"</span>, <span class="at">header =</span> <span class="cn">TRUE</span>) <span class="co">#read in data</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a>hours_abc<span class="sc">$</span>group <span class="ot"><-</span> <span class="fu">factor</span>(hours_abc<span class="sc">$</span>group, <span class="at">levels =</span> <span class="fu">c</span>(<span class="st">"A"</span>,</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a> <span class="st">"B"</span>, <span class="st">"C"</span>), <span class="at">labels =</span> <span class="fu">c</span>(<span class="st">"low"</span>, <span class="st">"medium"</span>, <span class="st">"high"</span>)) <span class="co">#convert grouping variable to factor</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(hours_abc) <span class="co">#inspect data</span></span></code></pre></div>
<pre><code>## 'data.frame': 300 obs. of 3 variables:
## $ hours: int 18 13 3 13 20 18 18 10 11 20 ...
## $ group: Factor w/ 3 levels "low","medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ index: int 1 2 3 4 5 6 7 8 9 10 ...</code></pre>
<p>The null hypothesis, typically, is that all means are equal (non-directional hypothesis). Hence, in our case:</p>
<p><span class="math display">\[H_0: \mu_1 = \mu_2 = \mu_3\]</span></p>
<p>The alternative hypothesis is simply that the means are not all equal, i.e.,</p>
<p><span class="math display">\[H_1: \textrm{Means are not all equal}\]</span></p>
<p>If you wanted to put this in mathematical notation, you could also write:</p>
<p><span class="math display">\[H_1: \exists {i,j}: {\mu_i \ne \mu_j} \]</span></p>
<p>To get a first impression if there are any differences in listening times across the experimental groups, we use the <code>describeBy(...)</code> function from the <code>psych</code> package:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(psych)</span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">describeBy</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group) <span class="co">#inspect data</span></span></code></pre></div>
<pre><code>##
## Descriptive statistics by group
## group: low
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 14.34 4.62 14 14.36 4.45 3 25 22 -0.03 -0.32 0.46
## ------------------------------------------------------------
## group: medium
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 24.7 5.81 25 24.79 5.93 12 42 30 0.05 0.2 0.58
## ------------------------------------------------------------
## group: high
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 34.99 6.42 34.5 35.02 6.67 17 50 33 -0.05 -0.23 0.64</code></pre>
<p>In addition, you should visualize the data using appropriate plots. Appropriate plots in this case would be a plot of means, including the 95% confidence interval around the mean, or a boxplot.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co">#Plot of mean</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(Rmisc)</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggplot2)</span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a>mean_data <span class="ot"><-</span> <span class="fu">summarySE</span>(hours_abc, <span class="at">measurevar=</span><span class="st">"hours"</span>, <span class="at">groupvars=</span><span class="fu">c</span>(<span class="st">"group"</span>))</span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(mean_data,<span class="fu">aes</span>(<span class="at">x =</span> group, <span class="at">y =</span> hours)) <span class="sc">+</span> </span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">position=</span><span class="fu">position_dodge</span>(<span class="dv">1</span>), <span class="at">colour=</span><span class="st">"black"</span>, <span class="at">fill =</span> <span class="st">"#CCCCCC"</span>, <span class="at">stat=</span><span class="st">"identity"</span>, <span class="at">width =</span> <span class="fl">0.65</span>) <span class="sc">+</span></span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_errorbar</span>(<span class="at">position=</span><span class="fu">position_dodge</span>(.<span class="dv">9</span>), <span class="at">width=</span>.<span class="dv">15</span>, <span class="fu">aes</span>(<span class="at">ymin=</span>hours<span class="sc">-</span>ci, <span class="at">ymax=</span>hours<span class="sc">+</span>ci)) <span class="sc">+</span></span>
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Group"</span>, <span class="at">y =</span> <span class="st">"Average number of hours"</span>, <span class="at">title =</span> <span class="st">"Average number of hours by group"</span>)<span class="sc">+</span></span>
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">hjust =</span> <span class="fl">0.5</span>,<span class="at">color =</span> <span class="st">"#666666"</span>)) </span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-6"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-6-1.png" alt="Plot of means" width="672" />
<p class="caption">
Figure 1: Plot of means
</p>
</div>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(hours_abc,<span class="fu">aes</span>(<span class="at">x =</span> group, <span class="at">y =</span> hours)) <span class="sc">+</span> </span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_boxplot</span>() <span class="sc">+</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_jitter</span>(<span class="at">colour=</span><span class="st">"red"</span>, <span class="at">alpha =</span> <span class="fl">0.1</span>) <span class="sc">+</span></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Group"</span>, <span class="at">y =</span> <span class="st">"Average number of hours"</span>, <span class="at">title =</span> <span class="st">"Average number of hours by group"</span>)<span class="sc">+</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">hjust =</span> <span class="fl">0.5</span>,<span class="at">color =</span> <span class="st">"#666666"</span>)) </span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-7"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-7-1.png" alt="Boxplot" width="672" />
<p class="caption">
Figure 2: Boxplot
</p>
</div>
<p>Note that ANOVA is an omnibus test, which means that we test for an overall difference between groups. Hence, the test will only tell you if the group means are different, but it won’t tell you exactly which groups are different from another.</p>
<p>So why don’t we then just conduct a series of t-tests for all combinations of groups (i.e., “low” vs. “medium”, “low” vs. “high”, “medium” vs. “high”)? The reason is that if we assume each test to be independent, then there is a 5% probability of falsely rejecting the null hypothesis (Type I error) for each test. In our case:</p>
<ul>
<li>“low” vs. “medium” (α = 0.05)</li>
<li>“low” vs. “high” (α = 0.05)</li>
<li>“medium” vs. “high” (α = 0.05)</li>
</ul>
<p>This means that the overall probability of making a Type I error is 1-(0.95<sup>3</sup>) = 0.143, since the probability of no Type I error is 0.95 for each of the three tests. Consequently, the Type I error probability would be 14.3%, which is above the conventional standard of 5%. This is also known as the family-wise or experiment-wise error.</p>
</div>
<div id="decomposing-variance" class="section level3" number="0.1.2">
<h3><span class="header-section-number">0.1.2</span> Decomposing variance</h3>
<p>The basic concept underlying ANOVA is the decomposition of the variance in the data. There are three variance components which we need to consider:</p>
<ul>
<li>We calculate how much variability there is overall between scores: <b>Total sum of squares (SS<sub>T</sub>)</b></li>
<li>We then calculate how much of this variability can be explained by the model we fit to the data (i.e., how much variability is due to the experimental manipulation): <b>Model sum of squares (SS<sub>M</sub>)</b></li>
<li>… and how much cannot be explained (i.e., how much variability is due to individual differences in performance): <b>Residual sum of squares (SS<sub>R</sub>)</b></li>
</ul>
<p>The following figure shows the different variance components using a generalized data matrix:</p>
<p style="text-align:center;">
<img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/sum_of_squares.JPG" alt="decomposing_variance"/>
</p>
<p>The total variation is determined by the variation between the categories (due to our experimental manipulation) and the within-category variation that is due to extraneous factors (e.g., unobserved factors such as the promotion of artists on a social network):</p>
<p><span class="math display">\[SS_T= SS_M+SS_R\]</span></p>
<p>To get a better feeling how this relates to our data set, we can look at the data in a slightly different way. Specifically, we can use the <code>dcast(...)</code> function from the <code>reshape2</code> package to convert the data to wide format:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(reshape2)</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="fu">dcast</span>(hours_abc, index <span class="sc">~</span> group, <span class="at">value.var =</span> <span class="st">"hours"</span>)</span></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["index"],"name":[1],"type":["int"],"align":["right"]},{"label":["low"],"name":[2],"type":["int"],"align":["right"]},{"label":["medium"],"name":[3],"type":["int"],"align":["right"]},{"label":["high"],"name":[4],"type":["int"],"align":["right"]}],"data":[{"1":"1","2":"18","3":"26","4":"34"},{"1":"2","2":"13","3":"32","4":"34"},{"1":"3","2":"3","3":"29","4":"32"},{"1":"4","2":"13","3":"31","4":"25"},{"1":"5","2":"20","3":"13","4":"35"},{"1":"6","2":"18","3":"31","4":"20"},{"1":"7","2":"18","3":"26","4":"30"},{"1":"8","2":"10","3":"21","4":"33"},{"1":"9","2":"11","3":"31","4":"31"},{"1":"10","2":"20","3":"24","4":"42"},{"1":"11","2":"23","3":"13","4":"40"},{"1":"12","2":"14","3":"21","4":"44"},{"1":"13","2":"25","3":"31","4":"39"},{"1":"14","2":"16","3":"21","4":"33"},{"1":"15","2":"14","3":"28","4":"27"},{"1":"16","2":"14","3":"39","4":"48"},{"1":"17","2":"22","3":"22","4":"32"},{"1":"18","2":"23","3":"27","4":"35"},{"1":"19","2":"20","3":"28","4":"32"},{"1":"20","2":"15","3":"19","4":"35"},{"1":"21","2":"19","3":"20","4":"44"},{"1":"22","2":"16","3":"20","4":"32"},{"1":"23","2":"15","3":"26","4":"39"},{"1":"24","2":"15","3":"26","4":"35"},{"1":"25","2":"6","3":"12","4":"42"},{"1":"26","2":"12","3":"25","4":"29"},{"1":"27","2":"12","3":"23","4":"45"},{"1":"28","2":"15","3":"24","4":"27"},{"1":"29","2":"11","3":"24","4":"35"},{"1":"30","2":"10","3":"22","4":"38"},{"1":"31","2":"14","3":"16","4":"44"},{"1":"32","2":"5","3":"14","4":"39"},{"1":"33","2":"13","3":"29","4":"38"},{"1":"34","2":"12","3":"32","4":"44"},{"1":"35","2":"22","3":"29","4":"17"},{"1":"36","2":"16","3":"20","4":"38"},{"1":"37","2":"7","3":"32","4":"34"},{"1":"38","2":"20","3":"34","4":"31"},{"1":"39","2":"13","3":"27","4":"29"},{"1":"40","2":"21","3":"26","4":"46"},{"1":"41","2":"13","3":"20","4":"36"},{"1":"42","2":"13","3":"23","4":"32"},{"1":"43","2":"12","3":"42","4":"35"},{"1":"44","2":"5","3":"19","4":"30"},{"1":"45","2":"17","3":"26","4":"27"},{"1":"46","2":"18","3":"32","4":"44"},{"1":"47","2":"12","3":"29","4":"40"},{"1":"48","2":"8","3":"22","4":"31"},{"1":"49","2":"5","3":"25","4":"21"},{"1":"50","2":"18","3":"24","4":"29"},{"1":"51","2":"18","3":"28","4":"29"},{"1":"52","2":"19","3":"38","4":"32"},{"1":"53","2":"12","3":"26","4":"33"},{"1":"54","2":"11","3":"16","4":"28"},{"1":"55","2":"13","3":"21","4":"44"},{"1":"56","2":"16","3":"27","4":"37"},{"1":"57","2":"5","3":"24","4":"33"},{"1":"58","2":"17","3":"28","4":"45"},{"1":"59","2":"9","3":"28","4":"41"},{"1":"60","2":"8","3":"24","4":"33"},{"1":"61","2":"11","3":"32","4":"31"},{"1":"62","2":"12","3":"25","4":"42"},{"1":"63","2":"13","3":"18","4":"31"},{"1":"64","2":"10","3":"29","4":"25"},{"1":"65","2":"15","3":"25","4":"43"},{"1":"66","2":"16","3":"20","4":"38"},{"1":"67","2":"22","3":"18","4":"40"},{"1":"68","2":"10","3":"28","4":"36"},{"1":"69","2":"16","3":"27","4":"25"},{"1":"70","2":"18","3":"32","4":"30"},{"1":"71","2":"12","3":"22","4":"36"},{"1":"72","2":"22","3":"21","4":"46"},{"1":"73","2":"13","3":"28","4":"33"},{"1":"74","2":"14","3":"30","4":"39"},{"1":"75","2":"10","3":"30","4":"40"},{"1":"76","2":"9","3":"12","4":"31"},{"1":"77","2":"9","3":"21","4":"39"},{"1":"78","2":"11","3":"23","4":"43"},{"1":"79","2":"17","3":"22","4":"36"},{"1":"80","2":"17","3":"17","4":"37"},{"1":"81","2":"16","3":"29","4":"46"},{"1":"82","2":"14","3":"29","4":"28"},{"1":"83","2":"16","3":"22","4":"30"},{"1":"84","2":"14","3":"12","4":"31"},{"1":"85","2":"19","3":"23","4":"32"},{"1":"86","2":"24","3":"32","4":"31"},{"1":"87","2":"17","3":"21","4":"29"},{"1":"88","2":"12","3":"18","4":"50"},{"1":"89","2":"12","3":"27","4":"37"},{"1":"90","2":"17","3":"31","4":"32"},{"1":"91","2":"14","3":"23","4":"41"},{"1":"92","2":"20","3":"21","4":"36"},{"1":"93","2":"10","3":"27","4":"33"},{"1":"94","2":"13","3":"17","4":"40"},{"1":"95","2":"12","3":"21","4":"25"},{"1":"96","2":"20","3":"22","4":"38"},{"1":"97","2":"9","3":"25","4":"31"},{"1":"98","2":"16","3":"28","4":"28"},{"1":"99","2":"12","3":"27","4":"39"},{"1":"100","2":"17","3":"19","4":"34"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>In this example, X<sub>1</sub> from the generalized data matrix above would refer to the factor level “low”, X<sub>2</sub> to the level “medium”, and X<sub>3</sub> to the level “high”. Y<sub>11</sub> refers to the first data point in the first row (i.e., “13”), Y<sub>12</sub> to the second data point in the first row (i.e., “21”), etc.. The grand mean (<span class="math inline">\(\overline{Y}\)</span>) and the category means (<span class="math inline">\(\overline{Y}_c\)</span>) can be easily computed:</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">mean</span>(hours_abc<span class="sc">$</span>hours) <span class="co">#grand mean</span></span></code></pre></div>
<pre><code>## [1] 24.67667</code></pre>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">by</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group, mean) <span class="co">#category mean</span></span></code></pre></div>
<pre><code>## hours_abc$group: low
## [1] 14.34
## ------------------------------------------------------------
## hours_abc$group: medium
## [1] 24.7
## ------------------------------------------------------------
## hours_abc$group: high
## [1] 34.99</code></pre>
<p>To see how each variance component can be derived, let’s look at the data again. The following graph shows the individual observations by experimental group:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-10"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-10-1.png" alt="Sum of Squares" width="672" />
<p class="caption">
Figure 3: Sum of Squares
</p>
</div>
<div id="total-sum-of-squares" class="section level4" number="0.1.2.1">
<h4><span class="header-section-number">0.1.2.1</span> Total sum of squares</h4>
<p>To compute the total variation in the data, we consider the difference between each observation and the grand mean. The grand mean is the mean over all observations in the data set. The vertical lines in the following plot measure how far each observation is away from the grand mean:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-11"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-11-1.png" alt="Total Sum of Squares" width="672" />
<p class="caption">
Figure 4: Total Sum of Squares
</p>
</div>
<p>The formal representation of the total sum of squares (SS<sub>T</sub>) is:</p>
<p><span class="math display">\[
SS_T= \sum_{i=1}^{N} (Y_i-\bar{Y})^2
\]</span></p>
<p>This means that we need to subtract the grand mean from each individual data point, square the difference, and sum up over all the squared differences. Thus, in our example, the total sum of squares can be calculated as:</p>
<p><span class="math display">\[
\begin{align}
SS_T =&(13−24.67)^2 + (14−24.67)^2 + … + (2−24.67)^2\\
&+(21−24.67)^2 + (18-24.67)^2 + … + (17−24.67)^2\\
&+(30−24.67)^2 + (37−24.67)^2 + … + (28−24.67)^2\\
&=30855.64
\end{align}
\]</span></p>
<p>You could also compute this in R using:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>SST <span class="ot"><-</span> <span class="fu">sum</span>((hours_abc<span class="sc">$</span>hours <span class="sc">-</span> <span class="fu">mean</span>(hours_abc<span class="sc">$</span>hours))<span class="sc">^</span><span class="dv">2</span>)</span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>SST</span></code></pre></div>
<pre><code>## [1] 30855.64</code></pre>
<p>For the subsequent analyses, it is important to understand the concept behind the <b>degrees of freedom</b> (df). Remember that in order to estimate a population value from a sample, we need to hold something in the population constant. In ANOVA, the df are generally one less than the number of values used to calculate the SS. For example, when we estimate the population mean from a sample, we assume that the sample mean is equal to the population mean. Then, in order to estimate the population mean from the sample, all but one scores are free to vary and the remaining score needs to be the value that keeps the population mean constant. Thus, the degrees of freedom of an estimate can also be thought of as the number of independent pieces of information that went into calculating the estimate. In our example, we used all 300 observations to calculate the sum of square, so the total degrees of freedom (df<sub>T</sub>) are:</p>
<p><span class="math display" id="eq:dfT">\[\begin{equation}
\begin{split}
df_T = N-1=300-1=299
\end{split}
\tag{1}
\end{equation}\]</span></p>
<div class="infobox_orange hint">
<p>Why do we subtract 1 from the number of items when computing the <strong>degrees of freedom</strong>? As mentioned above, the degrees of freedom refer to the number of values that are free to vary in a data set. To understand what this means, imagine that we try to estimate the mean hours of music listening in a population and that mean is 20 hours. We could take different samples from the population and we assume that the sample mean is equal to the population mean. Imagine, we only take three small samples of 3 students each: i) 19, 20, 21, ii) 18, 20, 22, iii) 15, 20, 25. Once you have chosen the first two values in each set, the third item cannot be chosen freely (i.e., it is fixed) because it needs to be the value that gets you to the population mean. Hence, only the first two values are ‘free to vary’. You can select 19 + 20 or 15 + 25, but once you have chosen the first two values, you must choose a particular value that will give you the population mean you are looking for (i.e., 20 hours). In this case, the degrees of freedom for each set of three numbers is two.</p>
</div>
</div>
<div id="model-sum-of-squares" class="section level4" number="0.1.2.2">
<h4><span class="header-section-number">0.1.2.2</span> Model sum of squares</h4>
<p>Now we know that there are 30855.64 units of total variation in our data. Next, we compute how much of the total variation can be explained by the differences between groups (i.e., our experimental manipulation). To compute the explained variation in the data, we consider the difference between the values predicted by our model for each observation (i.e., the group mean) and the grand mean. The group mean refers to the mean value within the experimental group. The vertical lines in the following plot measure how far the predicted value for each observation (i.e., the group mean) is away from the grand mean:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-13"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-13-1.png" alt="Model Sum of Squares" width="672" />
<p class="caption">
Figure 5: Model Sum of Squares
</p>
</div>
<p>The formal representation of the model sum of squares (SS<sub>M</sub>) is:</p>
<p><span class="math display">\[
SS_M= \sum_{j=1}^{c} n_j(\bar{Y}_j-\bar{Y})^2
\]</span></p>
<p>where c denotes the number of categories (experimental groups). This means that we need to subtract the grand mean from each group mean, square the difference, and sum up over all the squared differences. Thus, in our example, the model sum of squares can be calculated as:</p>
<p><span class="math display">\[
\begin{align}
SS_M &= 100*(15.47−24.67)^2 + 100*(24.88−24.67)^2 + 100*(33.66−24.67)^2 \\
&= 21321.21
\end{align}
\]</span></p>
<p>You could also compute this manually in R using:</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>SSM <span class="ot"><-</span> <span class="fu">sum</span>(<span class="dv">100</span> <span class="sc">*</span> (<span class="fu">by</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group,</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a> mean) <span class="sc">-</span> <span class="fu">mean</span>(hours_abc<span class="sc">$</span>hours))<span class="sc">^</span><span class="dv">2</span>)</span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>SSM</span></code></pre></div>
<pre><code>## [1] 21321.21</code></pre>
<p>In this case, we used the three group means to calculate the sum of squares, so the model degrees of freedom (df<sub>M</sub>) are:</p>
<p><span class="math display">\[
df_M= c-1=3-1=2
\]</span></p>
</div>
<div id="residual-sum-of-squares" class="section level4" number="0.1.2.3">
<h4><span class="header-section-number">0.1.2.3</span> Residual sum of squares</h4>
<p>Lastly, we calculate the amount of variation that cannot be explained by our model. In ANOVA, this is the sum of squared distances between what the model predicts for each data point (i.e., the group means) and the observed values. In other words, this refers to the amount of variation that is caused by extraneous factors, such as differences between product characteristics of the products in the different experimental groups. The vertical lines in the following plot measure how far each observation is away from the group mean:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-15"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-15-1.png" alt="Residual Sum of Squares" width="672" />
<p class="caption">
Figure 6: Residual Sum of Squares
</p>
</div>
<p>The formal representation of the residual sum of squares (SS<sub>R</sub>) is:</p>
<p><span class="math display">\[
SS_R= \sum_{j=1}^{c} \sum_{i=1}^{n} ({Y}_{ij}-\bar{Y}_{j})^2
\]</span></p>
<p>This means that we need to subtract the group mean from each individual observation, square the difference, and sum up over all the squared differences. Thus, in our example, the model sum of squares can be calculated as:</p>
<p><span class="math display">\[
\begin{align}
SS_R =& (13−14.34)^2 + (14−14.34)^2 + … + (2−14.34)^2 \\
+&(21−24.7)^2 + (18−24.7)^2 + … + (17−24.7)^2 \\
+& (30−34.99)^2 + (37−34.99)^2 + … + (28−34.99)^2 \\
=& 9534.43
\end{align}
\]</span></p>
<p>You could also compute this in R using:</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>SSR <span class="ot"><-</span> <span class="fu">sum</span>((hours_abc<span class="sc">$</span>hours <span class="sc">-</span> <span class="fu">rep</span>(<span class="fu">by</span>(hours_abc<span class="sc">$</span>hours,</span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a> hours_abc<span class="sc">$</span>group, mean), <span class="at">each =</span> <span class="dv">100</span>))<span class="sc">^</span><span class="dv">2</span>)</span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a>SSR</span></code></pre></div>
<pre><code>## [1] 9534.43</code></pre>
<p>In this case, we used the 10 values for each of the SS for each group, so the residual degrees of freedom (df<sub>R</sub>) are:</p>
<p><span class="math display">\[
\begin{align}
df_R=& (n_1-1)+(n_2-1)+(n_3-1) \\
=&(100-1)+(100-1)+(100-1)=297
\end{align}
\]</span></p>
</div>
<div id="effect-strength" class="section level4" number="0.1.2.4">
<h4><span class="header-section-number">0.1.2.4</span> Effect strength</h4>
<p>Once you have computed the different sum of squares, you can investigate the effect strength. <span class="math inline">\(\eta^2\)</span> is a measure of the variation in Y that is explained by X:</p>
<p><span class="math display">\[
\eta^2= \frac{SS_M}{SS_T}=\frac{21321.21}{30855.64}=0.69
\]</span></p>
<p>To compute this in R:</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>eta <span class="ot"><-</span> SSM<span class="sc">/</span>SST</span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>eta</span></code></pre></div>
<pre><code>## [1] 0.6909988</code></pre>
<p>The statistic can only take values between 0 and 1. It is equal to 0 when all the category means are equal, indicating that X has no effect on Y. In contrast, it has a value of 1 when there is no variability within each category of X but there is some variability between categories. You can think of it as the equivalent to the R-squared statistic in regression model since it also represents a measure of the share of explained variance.</p>
</div>
<div id="test-of-significance" class="section level4" number="0.1.2.5">
<h4><span class="header-section-number">0.1.2.5</span> Test of significance</h4>
<p>How can we determine whether the effect of X on Y is significant?</p>
<ul>
<li>First, we calculate the fit of the most basic model (i.e., the grand mean)</li>
<li>Then, we calculate the fit of the “best” model (i.e., the group means)</li>
<li>A good model should fit the data significantly better than the basic model</li>
<li>The F-statistic, or F-ratio, compares the amount of systematic variance in the data to the amount of unsystematic variance</li>
</ul>
<p>The F-statistic uses the ratio of mean square related to X (explained variation) and the mean square related to the error (unexplained variation):</p>
<p><span class="math display">\[\frac{SS_M}{SS_R}\]</span></p>
<p>However, since these are summed values, their magnitude is influenced by the number of scores that were summed. For example, to calculate SS<sub>M</sub> we only used the sum of 3 values (the group means), while we used 300 values to calculate SS<sub>T</sub> and SS<sub>R</sub>, respectively. Thus, we calculate the average sum of squares (“mean square”) to compare the average amount of systematic vs. unsystematic variation by dividing the SS values by the degrees of freedom associated with the respective statistic.</p>
<p>Mean square due to X:</p>
<p><span class="math display">\[
MS_M= \frac{SS_M}{df_M}=\frac{SS_M}{c-1}=\frac{21321.21}{(3-1)}
\]</span></p>
<p>Mean square due to error:</p>
<p><span class="math display">\[
MS_R= \frac{SS_R}{df_R}=\frac{SS_R}{N-c}=\frac{9534.43}{(300-3)}
\]</span></p>
<p>Now, we compare the amount of variability explained by the model (experiment), to the error in the model (variation due to extraneous variables). If the model explains more variability than it can’t explain, then the experimental manipulation has had a significant effect on the outcome (DV). The F-ratio can be derived as follows:</p>
<p><span class="math display">\[
F= \frac{MS_M}{MS_R}=\frac{\frac{SS_M}{c-1}}{\frac{SS_R}{N-c}}=\frac{\frac{21321.21}{(3-1)}}{\frac{9534.43}{(300-3)}}=332.08
\]</span></p>
<p>You can easily compute this in R:</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>f_ratio <span class="ot"><-</span> (SSM<span class="sc">/</span><span class="dv">2</span>)<span class="sc">/</span>(SSR<span class="sc">/</span><span class="dv">297</span>)</span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a>f_ratio</span></code></pre></div>
<pre><code>## [1] 332.0806</code></pre>
<p>Similar to the t-test, the outcome of the significance test will be one of the following:</p>
<ul>
<li>If the null hypothesis of equal category means is not rejected, then the independent variable does not have a significant effect on the dependent variable</li>
<li>If the null hypothesis is rejected, then the effect of the independent variable is significant</li>
</ul>
<p>To decide which one it is, we proceed as with the t-test. That is, we calculate the test statistic and compare it to the critical value for a given level of confidence. If the calculated test statistic is larger than the critical value, we can reject the null hypothesis of equal group means and conclude that the independent variable has a significant effect on our outcome. In this case, however, the test statistic follows a F distribution (instead of the t-distribution) with (m = c – 1) and (n = N – c) degrees of freedom. This means that the shape of the F-distribution depends on the degrees of freedom. In this case, the shape depends on the degrees of freedom associated with the numerator and denominator used to compute the F-ratio. The following figure shows the shape of the F-distribution for different degrees of freedom:</p>
<div class="figure">
<img src="fdistributions.png" alt="" />
<p class="caption">The F distribution</p>
</div>
<p>For 2 and 297 degrees of freedom, the critical value of F is 3.026 for α=0.05. As usual, you can either look up these values in a table or use the appropriate function in R:</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>f_crit <span class="ot"><-</span> <span class="fu">qf</span>(<span class="fl">0.95</span>, <span class="at">df1 =</span> <span class="dv">2</span>, <span class="at">df2 =</span> <span class="dv">297</span>) <span class="co">#critical value</span></span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a>f_crit</span></code></pre></div>
<pre><code>## [1] 3.026153</code></pre>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a>f_ratio <span class="sc">></span> f_crit <span class="co">#test if calculated test statistic is larger than critical value</span></span></code></pre></div>
<pre><code>## [1] TRUE</code></pre>
<p>The output tells us that the calculated test statistic exceeds the critical value. We can also show the test result visually:</p>
<div class="figure">
<img src="ftest.png" alt="" />
<p class="caption">Visual depiction of the test result</p>
</div>
<p>Thus, we conclude that because F<sub>CAL</sub> = 332.08 > F<sub>CR</sub> = 3.03, H<sub>0</sub> is rejected!</p>
<p>Now we can interpret our findings as follows: one or more of the differences between means are statistically significant.</p>
<div class="infobox_red caution">
<p>Remember: The ANOVA tests for an overall difference in means between the groups. It doesn’t tell us where the differences between groups lie, e.g., whether group “low” is different from “medium” or “high” is different from “medium” or “high” is different from “low”. To find out which group means exactly differ, we need to use post-hoc procedures, which are described below. However, when the ANOVA tells you that the there is no differences between the means, then you also shouldn’t proceed to conduct post-hoc tests. In other words, you should only proceed to conduct post-hoc tests when you found a significant overall effect in your ANOVA.</p>
</div>
<p>Finally, you should report your findings in an appropriate way. You could do this by saying: There was a significant effect of playlists and personalized recommendations on listening times, F(2,297) = 332.08, p < 0.05, <span class="math inline">\(\eta^2\)</span> = 0.69.</p>
<p>As usual, you don’t have to compute these statistics manually! Luckily, there is a function for ANOVA in R, which does the above calculations for you as we will see in the next section.</p>
</div>
</div>
<div id="one-way-anova" class="section level3" number="0.1.3">
<h3><span class="header-section-number">0.1.3</span> One-way ANOVA</h3>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/ElghkF6rwBQ" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<div id="basic-anova" class="section level4" number="0.1.3.1">
<h4><span class="header-section-number">0.1.3.1</span> Basic ANOVA</h4>
<p>As already indicated, one-way ANOVA is used when there is only one categorical variable (factor). Before conducting ANOVA, you need to check if the assumptions of the test are fulfilled. The assumptions of ANOVA are discussed in the following sections.</p>
<div id="independence-of-observations" class="section level5 unnumbered">
<h5>Independence of observations</h5>
<p>The observations in the groups should be independent. Because we randomly assigned the listeners to the experimental conditions, this assumption can be assumed to be met.</p>
</div>
<div id="distributional-assumptions" class="section level5 unnumbered">
<h5>Distributional assumptions</h5>
<p>ANOVA is relatively immune to violations to the normality assumption when sample sizes are large due to the Central Limit Theorem. However, if your sample is small (i.e., n < 30 per group) you may nevertheless want to check the normality of your data, e.g., by using the Shapiro-Wilk test or QQ-Plot. In our example, we have 100 observations in each group which is plenty but let’s create another example with only 10 observations in each group. In the latter case we cannot rely on the Central Limit Theorem and we should test the normality of our data. This can be done using the Shapiro-Wilk Test, which has the Null Hypothesis that the data is normally distributed. Hence, an insignificant test results means that the data can be assumed to be approximately normally distributed:</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">321</span>)</span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>hours_fewobs <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">hours =</span> <span class="fu">c</span>(<span class="fu">rnorm</span>(<span class="dv">10</span>, <span class="dv">20</span>,</span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a> <span class="dv">5</span>), <span class="fu">rnorm</span>(<span class="dv">10</span>, <span class="dv">40</span>, <span class="dv">5</span>), <span class="fu">rnorm</span>(<span class="dv">10</span>, <span class="dv">60</span>, <span class="dv">5</span>)), <span class="at">group =</span> <span class="fu">c</span>(<span class="fu">rep</span>(<span class="st">"low"</span>,</span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <span class="dv">10</span>), <span class="fu">rep</span>(<span class="st">"medium"</span>, <span class="dv">10</span>), <span class="fu">rep</span>(<span class="st">"high"</span>, <span class="dv">10</span>)))</span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a><span class="fu">by</span>(hours_fewobs<span class="sc">$</span>hours, hours_fewobs<span class="sc">$</span>group, shapiro.test)</span></code></pre></div>
<pre><code>## hours_fewobs$group: high
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.9595, p-value = 0.7801
##
## ------------------------------------------------------------
## hours_fewobs$group: low
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.91625, p-value = 0.3267
##
## ------------------------------------------------------------
## hours_fewobs$group: medium
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.91486, p-value = 0.3161</code></pre>
<p>Since the test result is insignificant for all groups, we can conclude that the data approximately follow a normal distribution.</p>
<p>We could also test the distributional assumptions visually using a Q-Q plot (i.e., quantile-quantile plot). This plot can be used to assess if a set of data plausibly came from some theoretical distribution such as the Normal distribution. Since this is just a visual check, it is somewhat subjective. But it may help us to judge if our assumption is plausible, and if not, which data points contribute to the violation. A Q-Q plot is a scatterplot created by plotting two sets of quantiles against one another. If both sets of quantiles came from the same distribution, we should see the points forming a line that’s roughly straight. In other words, Q-Q plots take your sample data, sort it in ascending order, and then plot them versus quantiles calculated from a theoretical distribution. Quantiles are often referred to as “percentiles” and refer to the points in your data below which a certain proportion of your data fall. Recall, for example, the standard Normal distribution with a mean of 0 and a standard deviation of 1. Since the 50th percentile (or 0.5 quantile) is 0, half the data lie below 0. The 95th percentile (or 0.95 quantile), is about 1.64, which means that 95 percent of the data lie below 1.64. The 97.5th quantile is about 1.96, which means that 97.5% of the data lie below 1.96. In the Q-Q plot, the number of quantiles is selected to match the size of your sample data.</p>
<p>To create the Q-Q plot for the normal distribution, you may use the <code>qqnorm()</code> function, which takes the data to be tested as an argument. Using the <code>qqline()</code> function subsequently on the data creates the line on which the data points should fall based on the theoretical quantiles. If the individual data points deviate a lot from this line, it means that the data is not likely to follow a normal distribution.</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="fu">qqnorm</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"low"</span>, ]<span class="sc">$</span>hours)</span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a><span class="fu">qqline</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"low"</span>, ]<span class="sc">$</span>hours)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-23-1"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-23-1.png" alt="Q-Q plot 1" width="672" />
<p class="caption">
Figure 7: Q-Q plot 1
</p>
</div>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="fu">qqnorm</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"medium"</span>,</span>
<span id="cb29-2"><a href="#cb29-2" aria-hidden="true" tabindex="-1"></a> ]<span class="sc">$</span>hours)</span>
<span id="cb29-3"><a href="#cb29-3" aria-hidden="true" tabindex="-1"></a><span class="fu">qqline</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"medium"</span>,</span>
<span id="cb29-4"><a href="#cb29-4" aria-hidden="true" tabindex="-1"></a> ]<span class="sc">$</span>hours)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-23-2"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-23-2.png" alt="Q-Q plot 2" width="672" />
<p class="caption">
Figure 8: Q-Q plot 2
</p>
</div>
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="fu">qqnorm</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"high"</span>, ]<span class="sc">$</span>hours)</span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a><span class="fu">qqline</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"high"</span>, ]<span class="sc">$</span>hours)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-23-3"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-23-3.png" alt="Q-Q plot 3" width="672" />
<p class="caption">
Figure 9: Q-Q plot 3
</p>
</div>
<p>The Q-Q plots suggest an approximately Normal distribution. If the assumption had been violated, you might consider transforming your data or resort to a non-parametric test.</p>
</div>
<div id="homogeneity-of-variance" class="section level5 unnumbered">
<h5>Homogeneity of variance</h5>
<p>Let’s return to our original data set with 100 observations in each group for the rest of the analysis.</p>
<p>You can test the homogeneity of variances in R using Levene’s test:</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(car)</span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a><span class="fu">leveneTest</span>(hours <span class="sc">~</span> group, <span class="at">data =</span> hours_abc, <span class="at">center =</span> mean)</span></code></pre></div>
<pre><code>## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 2 4.9678 0.007548 **
## 297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1</code></pre>
<p>The null hypothesis of the test is that the group variances are equal. Thus, if the test result is significant it means that the variances are not equal. If we cannot reject the null hypothesis (i.e., the group variances are not significantly different), we can proceed with the ANOVA as follows:</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>aov <span class="ot"><-</span> <span class="fu">aov</span>(hours <span class="sc">~</span> group, <span class="at">data =</span> hours_abc)</span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(aov)</span></code></pre></div>
<pre><code>## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 21321 10661 332.1 <2e-16 ***
## Residuals 297 9534 32
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1</code></pre>
<p>You can see that the p-value is smaller than 0.05. This means that, if there really was no difference between the population means (i.e., the Null hypothesis was true), the probability of the observed differences (or larger differences) is less than 5%.</p>
<p>To compute η<sup>2</sup> from the output, we can extract the relevant sum of squares as follows</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">"Sum Sq"</span>[<span class="dv">1</span>]<span class="sc">/</span>(<span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">"Sum Sq"</span>[<span class="dv">1</span>] <span class="sc">+</span></span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">"Sum Sq"</span>[<span class="dv">2</span>])</span></code></pre></div>
<pre><code>## [1] 0.6909988</code></pre>
<p>You can see that the results match the results from our manual computation above (<span class="math inline">\(\eta^2 =\)</span> 0.69).</p>
<p>The <code>aov()</code> function also automatically generates some plots that you can use to judge if the model assumptions are met. We will inspect two of the plots here.</p>
<p>We will use the first plot to inspect if the residual variances are equal across the experimental groups:</p>
<div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(aov, <span class="dv">1</span>)</span></code></pre></div>
<p><img src="08-Anova_files/figure-html/unnamed-chunk-27-1.png" width="672" /></p>
<p>Generally, the residual variance (i.e., the range of values on the y-axis) should be the same for different levels of our independent variable. The plot shows, that there are some slight differences. Notably, the range of residuals is higher in group “medium” than in group “high”. However, the differences are not that large and since the Levene’s test could not reject the Null of equal variances, we conclude that the variances are similar enough in this case.</p>
<p>The second plot can be used to test the assumption that the residuals are approximately normally distributed. We use a Q-Q plot to test this assumption:</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(aov, <span class="dv">2</span>)</span></code></pre></div>
<p><img src="08-Anova_files/figure-html/unnamed-chunk-28-1.png" width="672" /></p>
<p>The plot suggests that, the residuals are approximately normally distributed. We could also test this by extracting the residuals from the anova output using the <code>resid()</code> function and using the Shapiro-Wilk test:</p>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1" aria-hidden="true" tabindex="-1"></a><span class="fu">shapiro.test</span>(<span class="fu">resid</span>(aov))</span></code></pre></div>
<pre><code>##
## Shapiro-Wilk normality test
##
## data: resid(aov)
## W = 0.99723, p-value = 0.8925</code></pre>
<p>Confirming the impression from the Q-Q plot, we cannot reject the Null that the residuals are approximately normally distributed.</p>
<p>Note that if Levene’s test would have been significant (i.e., variances are not equal), we would have needed to either resort to non-parametric tests (see below), or compute the Welch’s F-ratio instead, which is correcting for unequal variances between the groups:</p>
<div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb41-1"><a href="#cb41-1" aria-hidden="true" tabindex="-1"></a><span class="co"># oneway.test(hours ~ group, hours_abc)</span></span></code></pre></div>
<p>You can see that the results are fairly similar, since the variances turned out to be fairly equal across groups.</p>
</div>
</div>
<div id="post-hoc-tests" class="section level4" number="0.1.3.2">
<h4><span class="header-section-number">0.1.3.2</span> Post-hoc tests</h4>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/wNwKx0TZ7fQ" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<p>Provided that significant differences were detected by the overall ANOVA you can find out which group means are different using post-hoc procedures. Post-hoc procedures are designed to conduct pairwise comparisons of all different combinations of the treatment groups by correcting the level of significance for each test such that the overall Type I error rate (α) across all comparisons remains at 0.05.</p>
<p>In other words, we rejected H<sub>0</sub>: μ<sub>low</sub>= μ<sub>medium</sub>= μ<sub>high</sub>, and now we would like to test:</p>
<p>Test1:</p>
<p><span class="math display">\[H_0: \mu_{low} = \mu_{medium}\]</span></p>
<p>Test2:</p>
<p><span class="math display">\[H_0: \mu_{low} = \mu_{high}\]</span></p>
<p>Test3:</p>
<p><span class="math display">\[H_0: \mu_{medium} = \mu_{high}\]</span></p>
<p>There are several post-hoc procedures available to choose from. In this tutorial, we will cover Bonferroni and Tukey’s HSD (“honest significant differences”). Both tests control for family-wise error. Bonferroni tends to have more power when the number of comparisons is small, whereas Tukey’s HSDs is better when testing large numbers of means.</p>
<div id="bonferroni" class="section level5" number="0.1.3.2.1">
<h5><span class="header-section-number">0.1.3.2.1</span> Bonferroni</h5>
<p>One of the most popular (and easiest) methods to correct for the family-wise error rate is to conduct the individual t-tests and divide α by the number of comparisons („k“):</p>
<p><span class="math display">\[
p_{CR}= \frac{\alpha}{k}
\]</span></p>
<p>In our example with three groups:</p>
<p><span class="math display">\[p_{CR}= \frac{0.05}{3}=0.017\]</span></p>
<p>Thus, the “corrected” critical p-value is now 0.017 instead of 0.05 (i.e., the critical t value is higher). This means that the test is more conservative to account for the family-wise error. Remember that, to reject the null hypothesis at a 5% significance level, we usually check if the p-value in our analysis is smaller than 0.05. The corrected p-value above requires us to obtain a p-value smaller than 0.017 in order to reject the null hypothesis at the 5% significance level, which means that the critical value of the test statistic is higher. You can implement the Bonferroni procedure in R using:</p>
<div class="sourceCode" id="cb42"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a>bonferroni <span class="ot"><-</span> <span class="fu">pairwise.t.test</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group,</span>
<span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a> <span class="at">data =</span> hours_abc, <span class="at">p.adjust.method =</span> <span class="st">"bonferroni"</span>)</span>
<span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a>bonferroni</span></code></pre></div>
<pre><code>##
## Pairwise comparisons using t tests with pooled SD
##
## data: hours_abc$hours and hours_abc$group
##
## low medium
## medium <2e-16 -
## high <2e-16 <2e-16
##
## P value adjustment method: bonferroni</code></pre>
<p>In the output, you will get the corrected p-values for the individual tests. This mean, to reject the null hypothesis, we require the p-value to be smaller than 0.05 again, since the reported p-values are already corrected for the family-wise error. In our example, we can reject H<sub>0</sub> of equal means for all three tests, since p < 0.05 for all combinations of groups.</p>
<p>Note the difference between the results from the post-hoc test compared to individual t-tests. For example, when we test the “medium” vs. “high” groups, the result from a t-test would be:</p>
<div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1" aria-hidden="true" tabindex="-1"></a>data_subset <span class="ot"><-</span> <span class="fu">subset</span>(hours_abc, group <span class="sc">!=</span> <span class="st">"low"</span>)</span>
<span id="cb44-2"><a href="#cb44-2" aria-hidden="true" tabindex="-1"></a>ttest <span class="ot"><-</span> <span class="fu">t.test</span>(hours <span class="sc">~</span> group, <span class="at">data =</span> data_subset,</span>
<span id="cb44-3"><a href="#cb44-3" aria-hidden="true" tabindex="-1"></a> <span class="at">var.equal =</span> <span class="cn">TRUE</span>)</span>
<span id="cb44-4"><a href="#cb44-4" aria-hidden="true" tabindex="-1"></a>ttest</span></code></pre></div>
<pre><code>##
## Two Sample t-test
##
## data: hours by group
## t = -11.884, df = 198, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group medium and group high is not equal to 0
## 95 percent confidence interval:
## -11.997471 -8.582529
## sample estimates:
## mean in group medium mean in group high
## 24.70 34.99</code></pre>
<p>Usually the p-value is lower in the t-test, reflecting the fact that the family-wise error is not corrected (i.e., the test is less conservative). In this case the p-value is extremely small in both cases and thus indistinguishable.</p>
</div>
<div id="tukeys-hsd" class="section level5" number="0.1.3.2.2">
<h5><span class="header-section-number">0.1.3.2.2</span> Tukey’s HSD</h5>
<p>Tukey’s HSD also compares all possible pairs of means (two-by-two combinations; i.e., like a t-test, except that it corrects for family-wise error rate).</p>
<p>Test statistic:</p>
<p><span class="math display" id="eq:tukey">\[\begin{equation}
\begin{split}
HSD= q\sqrt{\frac{MS_R}{n_c}}
\end{split}
\tag{2}
\end{equation}\]</span></p>
<p>where:</p>
<ul>
<li>q = value from studentized range table (see e.g., <a href="http://www.real-statistics.com/statistics-tables/studentized-range-q-table/" target="_blank">here</a>)</li>
<li>MS<sub>R</sub> = Mean Square Error from ANOVA</li>
<li>n<sub>c</sub> = number of observations per group</li>
<li>Decision: Reject H<sub>0</sub> if</li>
</ul>
<p><span class="math display">\[|\bar{Y}_i-\bar{Y}_j | > HSD\]</span></p>
<p>The value from the studentized range table can be obtained using the <code>qtukey()</code> function.</p>
<div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a>q <span class="ot"><-</span> <span class="fu">qtukey</span>(<span class="fl">0.95</span>, <span class="at">nm =</span> <span class="dv">3</span>, <span class="at">df =</span> <span class="dv">297</span>)</span>
<span id="cb46-2"><a href="#cb46-2" aria-hidden="true" tabindex="-1"></a>q</span></code></pre></div>
<pre><code>## [1] 3.331215</code></pre>
<p>Hence:</p>
<p><span class="math display">\[HSD= 3.33\sqrt{\frac{33.99}{100}}=1.94\]</span></p>
<p>Or, in R:</p>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a>hsd <span class="ot"><-</span> q <span class="sc">*</span> <span class="fu">sqrt</span>(<span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">"Mean Sq"</span>[<span class="dv">2</span>]<span class="sc">/</span><span class="dv">100</span>)</span>
<span id="cb48-2"><a href="#cb48-2" aria-hidden="true" tabindex="-1"></a>hsd</span></code></pre></div>
<pre><code>## [1] 1.887434</code></pre>
<p>Since all mean differences between groups are larger than 1.906, we can reject the null hypothesis for all individual tests, confirming the results from the Bonferroni test. To compute Tukey’s HSD, we can use the appropriate function from the <code>multcomp</code> package.</p>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(multcomp)</span>
<span id="cb50-2"><a href="#cb50-2" aria-hidden="true" tabindex="-1"></a>aov<span class="sc">$</span>model<span class="sc">$</span>group <span class="ot"><-</span> <span class="fu">as.factor</span>(aov<span class="sc">$</span>model<span class="sc">$</span>group)</span>
<span id="cb50-3"><a href="#cb50-3" aria-hidden="true" tabindex="-1"></a>tukeys <span class="ot"><-</span> <span class="fu">glht</span>(aov, <span class="at">linfct =</span> <span class="fu">mcp</span>(<span class="at">group =</span> <span class="st">"Tukey"</span>))</span>
<span id="cb50-4"><a href="#cb50-4" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(tukeys)</span></code></pre></div>
<pre><code>##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: aov(formula = hours ~ group, data = hours_abc)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## medium - low == 0 10.3600 0.8013 12.93 <2e-16 ***
## high - low == 0 20.6500 0.8013 25.77 <2e-16 ***
## high - medium == 0 10.2900 0.8013 12.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)</code></pre>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="fu">confint</span>(tukeys)</span></code></pre></div>
<pre><code>##
## Simultaneous Confidence Intervals
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: aov(formula = hours ~ group, data = hours_abc)
##
## Quantile = 2.3558
## 95% family-wise confidence level
##
##
## Linear Hypotheses:
## Estimate lwr upr
## medium - low == 0 10.3600 8.4723 12.2477
## high - low == 0 20.6500 18.7623 22.5377
## high - medium == 0 10.2900 8.4023 12.1777</code></pre>
<p>We may also plot the result for the mean differences incl. their confidence intervals:</p>
<div class="sourceCode" id="cb54"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(tukeys)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-36"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-36-1.png" alt="Tukey's HSD" width="672" />
<p class="caption">
Figure 10: Tukey’s HSD
</p>
</div>
<p>You can see that the CIs do not cross zero, which means that the true difference between group means is unlikely zero. It is sufficient to report the results in the way described above. However, you could also manually compute the differences between the groups and their confidence interval as follows:</p>
<div class="sourceCode" id="cb55"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb55-1"><a href="#cb55-1" aria-hidden="true" tabindex="-1"></a>mean1 <span class="ot"><-</span> <span class="fu">mean</span>(hours_abc[hours_abc<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"low"</span>, <span class="st">"hours"</span>]) <span class="co">#mean group 'low'</span></span>
<span id="cb55-2"><a href="#cb55-2" aria-hidden="true" tabindex="-1"></a>mean1</span></code></pre></div>
<pre><code>## [1] 14.34</code></pre>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb57-1"><a href="#cb57-1" aria-hidden="true" tabindex="-1"></a>mean2 <span class="ot"><-</span> <span class="fu">mean</span>(hours_abc[hours_abc<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"medium"</span>,</span>
<span id="cb57-2"><a href="#cb57-2" aria-hidden="true" tabindex="-1"></a> <span class="st">"hours"</span>]) <span class="co">#mean group 'medium'</span></span>
<span id="cb57-3"><a href="#cb57-3" aria-hidden="true" tabindex="-1"></a>mean2</span></code></pre></div>
<pre><code>## [1] 24.7</code></pre>
<div class="sourceCode" id="cb59"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb59-1"><a href="#cb59-1" aria-hidden="true" tabindex="-1"></a>mean3 <span class="ot"><-</span> <span class="fu">mean</span>(hours_abc[hours_abc<span class="sc">$</span>group <span class="sc">==</span> <span class="st">"high"</span>,</span>
<span id="cb59-2"><a href="#cb59-2" aria-hidden="true" tabindex="-1"></a> <span class="st">"hours"</span>]) <span class="co">#mean group 'high'</span></span>
<span id="cb59-3"><a href="#cb59-3" aria-hidden="true" tabindex="-1"></a>mean3</span></code></pre></div>
<pre><code>## [1] 34.99</code></pre>
<div class="sourceCode" id="cb61"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb61-1"><a href="#cb61-1" aria-hidden="true" tabindex="-1"></a><span class="co"># CI high vs. medium</span></span>
<span id="cb61-2"><a href="#cb61-2" aria-hidden="true" tabindex="-1"></a>mean_diff_high_med <span class="ot"><-</span> mean2 <span class="sc">-</span> mean1</span>
<span id="cb61-3"><a href="#cb61-3" aria-hidden="true" tabindex="-1"></a>mean_diff_high_med</span></code></pre></div>
<pre><code>## [1] 10.36</code></pre>
<div class="sourceCode" id="cb63"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a>ci_med_high_lower <span class="ot"><-</span> mean_diff_high_med <span class="sc">-</span> hsd</span>
<span id="cb63-2"><a href="#cb63-2" aria-hidden="true" tabindex="-1"></a>ci_med_high_upper <span class="ot"><-</span> mean_diff_high_med <span class="sc">+</span> hsd</span>
<span id="cb63-3"><a href="#cb63-3" aria-hidden="true" tabindex="-1"></a>ci_med_high_lower</span></code></pre></div>
<pre><code>## [1] 8.472566</code></pre>
<div class="sourceCode" id="cb65"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb65-1"><a href="#cb65-1" aria-hidden="true" tabindex="-1"></a>ci_med_high_upper</span></code></pre></div>
<pre><code>## [1] 12.24743</code></pre>
<div class="sourceCode" id="cb67"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb67-1"><a href="#cb67-1" aria-hidden="true" tabindex="-1"></a><span class="co"># CI high vs.low</span></span>
<span id="cb67-2"><a href="#cb67-2" aria-hidden="true" tabindex="-1"></a>mean_diff_high_low <span class="ot"><-</span> mean3 <span class="sc">-</span> mean1</span>
<span id="cb67-3"><a href="#cb67-3" aria-hidden="true" tabindex="-1"></a>mean_diff_high_low</span></code></pre></div>
<pre><code>## [1] 20.65</code></pre>
<div class="sourceCode" id="cb69"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb69-1"><a href="#cb69-1" aria-hidden="true" tabindex="-1"></a>ci_low_high_lower <span class="ot"><-</span> mean_diff_high_low <span class="sc">-</span> hsd</span>
<span id="cb69-2"><a href="#cb69-2" aria-hidden="true" tabindex="-1"></a>ci_low_high_upper <span class="ot"><-</span> mean_diff_high_low <span class="sc">+</span> hsd</span>
<span id="cb69-3"><a href="#cb69-3" aria-hidden="true" tabindex="-1"></a>ci_low_high_lower</span></code></pre></div>
<pre><code>## [1] 18.76257</code></pre>
<div class="sourceCode" id="cb71"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb71-1"><a href="#cb71-1" aria-hidden="true" tabindex="-1"></a>ci_low_high_upper</span></code></pre></div>
<pre><code>## [1] 22.53743</code></pre>
<div class="sourceCode" id="cb73"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb73-1"><a href="#cb73-1" aria-hidden="true" tabindex="-1"></a><span class="co"># CI medium vs.low</span></span>
<span id="cb73-2"><a href="#cb73-2" aria-hidden="true" tabindex="-1"></a>mean_diff_med_low <span class="ot"><-</span> mean3 <span class="sc">-</span> mean2</span>
<span id="cb73-3"><a href="#cb73-3" aria-hidden="true" tabindex="-1"></a>mean_diff_med_low</span></code></pre></div>
<pre><code>## [1] 10.29</code></pre>
<div class="sourceCode" id="cb75"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb75-1"><a href="#cb75-1" aria-hidden="true" tabindex="-1"></a>ci_low_med_lower <span class="ot"><-</span> mean_diff_med_low <span class="sc">-</span> hsd</span>
<span id="cb75-2"><a href="#cb75-2" aria-hidden="true" tabindex="-1"></a>ci_low_med_upper <span class="ot"><-</span> mean_diff_med_low <span class="sc">+</span> hsd</span>
<span id="cb75-3"><a href="#cb75-3" aria-hidden="true" tabindex="-1"></a>ci_low_med_lower</span></code></pre></div>
<pre><code>## [1] 8.402566</code></pre>
<div class="sourceCode" id="cb77"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb77-1"><a href="#cb77-1" aria-hidden="true" tabindex="-1"></a>ci_low_med_upper</span></code></pre></div>
<pre><code>## [1] 12.17743</code></pre>
<p>The results of a post-hoc test can be reported as follows:</p>
<p>The post-hoc tests based on Bonferroni and Tukey’s HSD revealed that users listened to music significantly more when the intensity of personalized recommendations was increased. This is true for “low” vs. “medium” intensity, as well as for “low” vs. “high” and “medium” vs. “high” intensity.</p>
<p>As with the t-test, you could also use the functions contained in the <code>ggstatsplot</code> package to combine a visual depiction of the data with the results of statistical tests. In the case of an ANOVA, the output would also include the pairwise comparisons.</p>
<div class="sourceCode" id="cb79"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb79-1"><a href="#cb79-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggstatsplot)</span>
<span id="cb79-2"><a href="#cb79-2" aria-hidden="true" tabindex="-1"></a><span class="fu">ggbetweenstats</span>(</span>
<span id="cb79-3"><a href="#cb79-3" aria-hidden="true" tabindex="-1"></a> <span class="at">data =</span> hours_abc,</span>
<span id="cb79-4"><a href="#cb79-4" aria-hidden="true" tabindex="-1"></a> <span class="at">x =</span> group,</span>
<span id="cb79-5"><a href="#cb79-5" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> hours,</span>
<span id="cb79-6"><a href="#cb79-6" aria-hidden="true" tabindex="-1"></a> <span class="at">plot.type =</span> <span class="st">"box"</span>,</span>
<span id="cb79-7"><a href="#cb79-7" aria-hidden="true" tabindex="-1"></a> <span class="at">pairwise.comparisons =</span> <span class="cn">TRUE</span>,</span>
<span id="cb79-8"><a href="#cb79-8" aria-hidden="true" tabindex="-1"></a> <span class="at">pairwise.annotation =</span> <span class="st">"p.value"</span>,</span>
<span id="cb79-9"><a href="#cb79-9" aria-hidden="true" tabindex="-1"></a> <span class="at">p.adjust.method =</span> <span class="st">"bonferroni"</span>,</span>
<span id="cb79-10"><a href="#cb79-10" aria-hidden="true" tabindex="-1"></a> <span class="at">effsize.type =</span> <span class="st">"partial_eta"</span>,</span>
<span id="cb79-11"><a href="#cb79-11" aria-hidden="true" tabindex="-1"></a> <span class="at">var.equal =</span> <span class="cn">FALSE</span>,</span>
<span id="cb79-12"><a href="#cb79-12" aria-hidden="true" tabindex="-1"></a> <span class="at">mean.plotting =</span> <span class="cn">TRUE</span>, <span class="co"># whether mean for each group is to be displayed</span></span>
<span id="cb79-13"><a href="#cb79-13" aria-hidden="true" tabindex="-1"></a> <span class="at">mean.ci =</span> <span class="cn">TRUE</span>, <span class="co"># whether to display confidence interval for means</span></span>
<span id="cb79-14"><a href="#cb79-14" aria-hidden="true" tabindex="-1"></a> <span class="at">mean.label.size =</span> <span class="fl">2.5</span>, <span class="co"># size of the label for mean</span></span>
<span id="cb79-15"><a href="#cb79-15" aria-hidden="true" tabindex="-1"></a> <span class="at">type =</span> <span class="st">"parametric"</span>, <span class="co"># which type of test is to be run</span></span>
<span id="cb79-16"><a href="#cb79-16" aria-hidden="true" tabindex="-1"></a> <span class="at">k =</span> <span class="dv">3</span>, <span class="co"># number of decimal places for statistical results</span></span>
<span id="cb79-17"><a href="#cb79-17" aria-hidden="true" tabindex="-1"></a> <span class="at">outlier.label.color =</span> <span class="st">"darkgreen"</span>, <span class="co"># changing the color for the text label</span></span>
<span id="cb79-18"><a href="#cb79-18" aria-hidden="true" tabindex="-1"></a> <span class="at">title =</span> <span class="st">"Comparison of listening times between groups"</span>,</span>
<span id="cb79-19"><a href="#cb79-19" aria-hidden="true" tabindex="-1"></a> <span class="at">xlab =</span> <span class="st">"Experimental group"</span>, <span class="co"># label for the x-axis variable</span></span>
<span id="cb79-20"><a href="#cb79-20" aria-hidden="true" tabindex="-1"></a> <span class="at">ylab =</span> <span class="st">"Listening time"</span>, <span class="co"># label for the y-axis variable</span></span>
<span id="cb79-21"><a href="#cb79-21" aria-hidden="true" tabindex="-1"></a> <span class="at">messages =</span> <span class="cn">FALSE</span>,</span>
<span id="cb79-22"><a href="#cb79-22" aria-hidden="true" tabindex="-1"></a> <span class="at">bf.message =</span> <span class="cn">FALSE</span></span>
<span id="cb79-23"><a href="#cb79-23" aria-hidden="true" tabindex="-1"></a>)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-38"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-38-1.png" alt="ANOVA using ggstatsplot" width="672" />
<p class="caption">
Figure 11: ANOVA using ggstatsplot
</p>
</div>
</div>
</div>
</div>
</div>
</section>
</div>
</div>
</div>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
"sharing": {
"github": false,
"facebook": true,
"twitter": true,
"linkedin": false,
"weibo": false,
"instapaper": false,
"vk": false,
"whatsapp": false,
"all": ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
},
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
"edit": {
"link": null,
"text": null
},
"history": {
"link": null,
"text": null
},
"view": {
"link": null,
"text": null
},
"download": null,
"search": false,
"toc": {
"collapse": "section"
}
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
var src = "true";
if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
if (location.protocol !== "file:")
if (/^https?:/.test(src))
src = src.replace(/^https?:/, '');
script.src = src;
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>