08-Anova.html

<!DOCTYPE html>
<html lang="" xml:lang="">
<head>

  <meta charset="utf-8" />
  <meta http-equiv="X-UA-Compatible" content="IE=edge" />
  <title>0.1 Comparing several means | 08-Anova.knit</title>
  <meta name="description" content="" />
  <meta name="generator" content="bookdown 0.24 and GitBook 2.6.7" />

  <meta property="og:title" content="0.1 Comparing several means | 08-Anova.knit" />
  <meta property="og:type" content="book" />
  
  
  <meta name="twitter:card" content="summary" />
  <meta name="twitter:title" content="0.1 Comparing several means | 08-Anova.knit" />
  
  
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta name="apple-mobile-web-app-capable" content="yes" />
  <meta name="apple-mobile-web-app-status-bar-style" content="black" />
  
  
<script src="libs/header-attrs-2.10/header-attrs.js"></script>
<script src="libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/fuse.js@6.4.6/dist/fuse.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />


<link href="libs/pagedtable-1.1/css/pagedtable.css" rel="stylesheet" />
<script src="libs/pagedtable-1.1/js/pagedtable.js"></script>
<link href="libs/anchor-sections-1.0.1/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0.1/anchor-sections.js"></script>


<style type="text/css">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>


</head>

<body>


  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">

    <div class="book-summary">
      <nav role="navigation">

<ul class="summary">
<li><strong><a href="./">MRDA 2020</a></strong></li>

<li class="divider"></li>
<li class="chapter" data-level="0.1" data-path=""><a href="#comparing-several-means"><i class="fa fa-check"></i><b>0.1</b> Comparing several means</a>
<ul>
<li class="chapter" data-level="0.1.1" data-path=""><a href="#introduction"><i class="fa fa-check"></i><b>0.1.1</b> Introduction</a></li>
<li class="chapter" data-level="0.1.2" data-path=""><a href="#decomposing-variance"><i class="fa fa-check"></i><b>0.1.2</b> Decomposing variance</a></li>
<li class="chapter" data-level="0.1.3" data-path=""><a href="#one-way-anova"><i class="fa fa-check"></i><b>0.1.3</b> One-way ANOVA</a></li>
</ul></li>
</ul>

      </nav>
    </div>

    <div class="book-body">
      <div class="body-inner">
        <div class="book-header" role="navigation">
          <h1>
            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./"></a>
          </h1>
        </div>

        <div class="page-wrapper" tabindex="-1" role="main">
          <div class="page-inner">

            <section class="normal" id="section-">
<!--bookdown:title:end-->
<!--bookdown:title:start-->
<div id="comparing-several-means" class="section level2" number="0.1">
<h2><span class="header-section-number">0.1</span> Comparing several means</h2>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/9cGZcALfU5k" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<div class="infobox download">
<p><a href="./Code/07-anova.R">You can download the corresponding R-Code here</a></p>
</div>
<div id="introduction" class="section level3" number="0.1.1">
<h3><span class="header-section-number">0.1.1</span> Introduction</h3>
<p>In the previous section we learned how to compare two means using a t-test. The t-test has some limitations since it only lets you compare two means and you can only use it with one independent variable. However, often we would like to compare means from 3 or more groups. In addition, there may be instances in which you manipulate more than one independent variable. For these applications, ANOVA (<u>AN</u>alysis <u>O</u>f <u>VA</u>riance) can be used. Hence, to conduct ANOVA you need:</p>
<ul>
<li>A metric dependent variable (i.e., measured using an interval or ratio scale)</li>
<li>One or more non-metric (categorical) independent variables (also called factors)</li>
</ul>
<p>A <strong>treatment</strong> is a particular combination of factor levels, or categories. So-called <strong>one-way ANOVA</strong> is used when there is only one categorical variable (factor). In this case, a treatment is the same as a factor level. <strong>N-way ANOVA</strong> is used with two or more factors. Note that we are only going to talk about a single independent variable in the context of ANOVA on this website. If you have multiple independent variables please refer to the chapter on <strong>Regression</strong>.</p>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/cG0HAWqObJs" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<p>Let’s use an example to see how ANOVA works. Similar to the previous example, imagine that the music streaming service experiments with a recommender system and manipulates the intensity of personalized recommendations using three levels: ‘low’, ‘medium’, and ‘high’. The service randomly assigns 100 users to each condition and records the listening times in hours in the following week. As always, we load and inspect the data first:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>hours_abc <span class="ot">&lt;-</span> <span class="fu">read.table</span>(<span class="st">&quot;https://raw.githubusercontent.com/IMSMWU/MRDA2018/master/data/hours_abc.dat&quot;</span>,</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a>    <span class="at">sep =</span> <span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>, <span class="at">header =</span> <span class="cn">TRUE</span>)  <span class="co">#read in data</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a>hours_abc<span class="sc">$</span>group <span class="ot">&lt;-</span> <span class="fu">factor</span>(hours_abc<span class="sc">$</span>group, <span class="at">levels =</span> <span class="fu">c</span>(<span class="st">&quot;A&quot;</span>,</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a>    <span class="st">&quot;B&quot;</span>, <span class="st">&quot;C&quot;</span>), <span class="at">labels =</span> <span class="fu">c</span>(<span class="st">&quot;low&quot;</span>, <span class="st">&quot;medium&quot;</span>, <span class="st">&quot;high&quot;</span>))  <span class="co">#convert grouping variable to factor</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(hours_abc)  <span class="co">#inspect data</span></span></code></pre></div>
<pre><code>## &#39;data.frame&#39;:    300 obs. of  3 variables:
##  $ hours: int  18 13 3 13 20 18 18 10 11 20 ...
##  $ group: Factor w/ 3 levels &quot;low&quot;,&quot;medium&quot;,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ index: int  1 2 3 4 5 6 7 8 9 10 ...</code></pre>
<p>The null hypothesis, typically, is that all means are equal (non-directional hypothesis). Hence, in our case:</p>
<p><span class="math display">\[H_0: \mu_1 = \mu_2 = \mu_3\]</span></p>
<p>The alternative hypothesis is simply that the means are not all equal, i.e.,</p>
<p><span class="math display">\[H_1: \textrm{Means are not all equal}\]</span></p>
<p>If you wanted to put this in mathematical notation, you could also write:</p>
<p><span class="math display">\[H_1: \exists {i,j}: {\mu_i \ne \mu_j} \]</span></p>
<p>To get a first impression if there are any differences in listening times across the experimental groups, we use the <code>describeBy(...)</code> function from the <code>psych</code> package:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(psych)</span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">describeBy</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group)  <span class="co">#inspect data</span></span></code></pre></div>
<pre><code>## 
##  Descriptive statistics by group 
## group: low
##    vars   n  mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 100 14.34 4.62     14   14.36 4.45   3  25    22 -0.03    -0.32 0.46
## ------------------------------------------------------------ 
## group: medium
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 100 24.7 5.81     25   24.79 5.93  12  42    30 0.05      0.2 0.58
## ------------------------------------------------------------ 
## group: high
##    vars   n  mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 100 34.99 6.42   34.5   35.02 6.67  17  50    33 -0.05    -0.23 0.64</code></pre>
<p>In addition, you should visualize the data using appropriate plots. Appropriate plots in this case would be a plot of means, including the 95% confidence interval around the mean, or a boxplot.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co">#Plot of mean</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(Rmisc)</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggplot2)</span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a>mean_data <span class="ot">&lt;-</span> <span class="fu">summarySE</span>(hours_abc, <span class="at">measurevar=</span><span class="st">&quot;hours&quot;</span>, <span class="at">groupvars=</span><span class="fu">c</span>(<span class="st">&quot;group&quot;</span>))</span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(mean_data,<span class="fu">aes</span>(<span class="at">x =</span> group, <span class="at">y =</span> hours)) <span class="sc">+</span> </span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_bar</span>(<span class="at">position=</span><span class="fu">position_dodge</span>(<span class="dv">1</span>), <span class="at">colour=</span><span class="st">&quot;black&quot;</span>, <span class="at">fill =</span> <span class="st">&quot;#CCCCCC&quot;</span>, <span class="at">stat=</span><span class="st">&quot;identity&quot;</span>, <span class="at">width =</span> <span class="fl">0.65</span>) <span class="sc">+</span></span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_errorbar</span>(<span class="at">position=</span><span class="fu">position_dodge</span>(.<span class="dv">9</span>), <span class="at">width=</span>.<span class="dv">15</span>, <span class="fu">aes</span>(<span class="at">ymin=</span>hours<span class="sc">-</span>ci, <span class="at">ymax=</span>hours<span class="sc">+</span>ci)) <span class="sc">+</span></span>
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">&quot;Group&quot;</span>, <span class="at">y =</span> <span class="st">&quot;Average number of hours&quot;</span>, <span class="at">title =</span> <span class="st">&quot;Average number of hours by group&quot;</span>)<span class="sc">+</span></span>
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">hjust =</span> <span class="fl">0.5</span>,<span class="at">color =</span> <span class="st">&quot;#666666&quot;</span>)) </span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-6"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-6-1.png" alt="Plot of means" width="672" />
<p class="caption">
Figure 1: Plot of means
</p>
</div>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(hours_abc,<span class="fu">aes</span>(<span class="at">x =</span> group, <span class="at">y =</span> hours)) <span class="sc">+</span> </span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_boxplot</span>() <span class="sc">+</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_jitter</span>(<span class="at">colour=</span><span class="st">&quot;red&quot;</span>, <span class="at">alpha =</span> <span class="fl">0.1</span>) <span class="sc">+</span></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">&quot;Group&quot;</span>, <span class="at">y =</span> <span class="st">&quot;Average number of hours&quot;</span>, <span class="at">title =</span> <span class="st">&quot;Average number of hours by group&quot;</span>)<span class="sc">+</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">hjust =</span> <span class="fl">0.5</span>,<span class="at">color =</span> <span class="st">&quot;#666666&quot;</span>)) </span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-7"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-7-1.png" alt="Boxplot" width="672" />
<p class="caption">
Figure 2: Boxplot
</p>
</div>
<p>Note that ANOVA is an omnibus test, which means that we test for an overall difference between groups. Hence, the test will only tell you if the group means are different, but it won’t tell you exactly which groups are different from another.</p>
<p>So why don’t we then just conduct a series of t-tests for all combinations of groups (i.e., “low” vs. “medium”, “low” vs. “high”, “medium” vs. “high”)? The reason is that if we assume each test to be independent, then there is a 5% probability of falsely rejecting the null hypothesis (Type I error) for each test. In our case:</p>
<ul>
<li>“low” vs. “medium” (α = 0.05)</li>
<li>“low” vs. “high” (α = 0.05)</li>
<li>“medium” vs. “high” (α = 0.05)</li>
</ul>
<p>This means that the overall probability of making a Type I error is 1-(0.95<sup>3</sup>) = 0.143, since the probability of no Type I error is 0.95 for each of the three tests. Consequently, the Type I error probability would be 14.3%, which is above the conventional standard of 5%. This is also known as the family-wise or experiment-wise error.</p>
</div>
<div id="decomposing-variance" class="section level3" number="0.1.2">
<h3><span class="header-section-number">0.1.2</span> Decomposing variance</h3>
<p>The basic concept underlying ANOVA is the decomposition of the variance in the data. There are three variance components which we need to consider:</p>
<ul>
<li>We calculate how much variability there is overall between scores: <b>Total sum of squares (SS<sub>T</sub>)</b></li>
<li>We then calculate how much of this variability can be explained by the model we fit to the data (i.e., how much variability is due to the experimental manipulation): <b>Model sum of squares (SS<sub>M</sub>)</b></li>
<li>… and how much cannot be explained (i.e., how much variability is due to individual differences in performance): <b>Residual sum of squares (SS<sub>R</sub>)</b></li>
</ul>
<p>The following figure shows the different variance components using a generalized data matrix:</p>
<p style="text-align:center;">
<img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/sum_of_squares.JPG" alt="decomposing_variance"/>
</p>
<p>The total variation is determined by the variation between the categories (due to our experimental manipulation) and the within-category variation that is due to extraneous factors (e.g., unobserved factors such as the promotion of artists on a social network):</p>
<p><span class="math display">\[SS_T= SS_M+SS_R\]</span></p>
<p>To get a better feeling how this relates to our data set, we can look at the data in a slightly different way. Specifically, we can use the <code>dcast(...)</code> function from the <code>reshape2</code> package to convert the data to wide format:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(reshape2)</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="fu">dcast</span>(hours_abc, index <span class="sc">~</span> group, <span class="at">value.var =</span> <span class="st">&quot;hours&quot;</span>)</span></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["index"],"name":[1],"type":["int"],"align":["right"]},{"label":["low"],"name":[2],"type":["int"],"align":["right"]},{"label":["medium"],"name":[3],"type":["int"],"align":["right"]},{"label":["high"],"name":[4],"type":["int"],"align":["right"]}],"data":[{"1":"1","2":"18","3":"26","4":"34"},{"1":"2","2":"13","3":"32","4":"34"},{"1":"3","2":"3","3":"29","4":"32"},{"1":"4","2":"13","3":"31","4":"25"},{"1":"5","2":"20","3":"13","4":"35"},{"1":"6","2":"18","3":"31","4":"20"},{"1":"7","2":"18","3":"26","4":"30"},{"1":"8","2":"10","3":"21","4":"33"},{"1":"9","2":"11","3":"31","4":"31"},{"1":"10","2":"20","3":"24","4":"42"},{"1":"11","2":"23","3":"13","4":"40"},{"1":"12","2":"14","3":"21","4":"44"},{"1":"13","2":"25","3":"31","4":"39"},{"1":"14","2":"16","3":"21","4":"33"},{"1":"15","2":"14","3":"28","4":"27"},{"1":"16","2":"14","3":"39","4":"48"},{"1":"17","2":"22","3":"22","4":"32"},{"1":"18","2":"23","3":"27","4":"35"},{"1":"19","2":"20","3":"28","4":"32"},{"1":"20","2":"15","3":"19","4":"35"},{"1":"21","2":"19","3":"20","4":"44"},{"1":"22","2":"16","3":"20","4":"32"},{"1":"23","2":"15","3":"26","4":"39"},{"1":"24","2":"15","3":"26","4":"35"},{"1":"25","2":"6","3":"12","4":"42"},{"1":"26","2":"12","3":"25","4":"29"},{"1":"27","2":"12","3":"23","4":"45"},{"1":"28","2":"15","3":"24","4":"27"},{"1":"29","2":"11","3":"24","4":"35"},{"1":"30","2":"10","3":"22","4":"38"},{"1":"31","2":"14","3":"16","4":"44"},{"1":"32","2":"5","3":"14","4":"39"},{"1":"33","2":"13","3":"29","4":"38"},{"1":"34","2":"12","3":"32","4":"44"},{"1":"35","2":"22","3":"29","4":"17"},{"1":"36","2":"16","3":"20","4":"38"},{"1":"37","2":"7","3":"32","4":"34"},{"1":"38","2":"20","3":"34","4":"31"},{"1":"39","2":"13","3":"27","4":"29"},{"1":"40","2":"21","3":"26","4":"46"},{"1":"41","2":"13","3":"20","4":"36"},{"1":"42","2":"13","3":"23","4":"32"},{"1":"43","2":"12","3":"42","4":"35"},{"1":"44","2":"5","3":"19","4":"30"},{"1":"45","2":"17","3":"26","4":"27"},{"1":"46","2":"18","3":"32","4":"44"},{"1":"47","2":"12","3":"29","4":"40"},{"1":"48","2":"8","3":"22","4":"31"},{"1":"49","2":"5","3":"25","4":"21"},{"1":"50","2":"18","3":"24","4":"29"},{"1":"51","2":"18","3":"28","4":"29"},{"1":"52","2":"19","3":"38","4":"32"},{"1":"53","2":"12","3":"26","4":"33"},{"1":"54","2":"11","3":"16","4":"28"},{"1":"55","2":"13","3":"21","4":"44"},{"1":"56","2":"16","3":"27","4":"37"},{"1":"57","2":"5","3":"24","4":"33"},{"1":"58","2":"17","3":"28","4":"45"},{"1":"59","2":"9","3":"28","4":"41"},{"1":"60","2":"8","3":"24","4":"33"},{"1":"61","2":"11","3":"32","4":"31"},{"1":"62","2":"12","3":"25","4":"42"},{"1":"63","2":"13","3":"18","4":"31"},{"1":"64","2":"10","3":"29","4":"25"},{"1":"65","2":"15","3":"25","4":"43"},{"1":"66","2":"16","3":"20","4":"38"},{"1":"67","2":"22","3":"18","4":"40"},{"1":"68","2":"10","3":"28","4":"36"},{"1":"69","2":"16","3":"27","4":"25"},{"1":"70","2":"18","3":"32","4":"30"},{"1":"71","2":"12","3":"22","4":"36"},{"1":"72","2":"22","3":"21","4":"46"},{"1":"73","2":"13","3":"28","4":"33"},{"1":"74","2":"14","3":"30","4":"39"},{"1":"75","2":"10","3":"30","4":"40"},{"1":"76","2":"9","3":"12","4":"31"},{"1":"77","2":"9","3":"21","4":"39"},{"1":"78","2":"11","3":"23","4":"43"},{"1":"79","2":"17","3":"22","4":"36"},{"1":"80","2":"17","3":"17","4":"37"},{"1":"81","2":"16","3":"29","4":"46"},{"1":"82","2":"14","3":"29","4":"28"},{"1":"83","2":"16","3":"22","4":"30"},{"1":"84","2":"14","3":"12","4":"31"},{"1":"85","2":"19","3":"23","4":"32"},{"1":"86","2":"24","3":"32","4":"31"},{"1":"87","2":"17","3":"21","4":"29"},{"1":"88","2":"12","3":"18","4":"50"},{"1":"89","2":"12","3":"27","4":"37"},{"1":"90","2":"17","3":"31","4":"32"},{"1":"91","2":"14","3":"23","4":"41"},{"1":"92","2":"20","3":"21","4":"36"},{"1":"93","2":"10","3":"27","4":"33"},{"1":"94","2":"13","3":"17","4":"40"},{"1":"95","2":"12","3":"21","4":"25"},{"1":"96","2":"20","3":"22","4":"38"},{"1":"97","2":"9","3":"25","4":"31"},{"1":"98","2":"16","3":"28","4":"28"},{"1":"99","2":"12","3":"27","4":"39"},{"1":"100","2":"17","3":"19","4":"34"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
  </script>
</div>
<p>In this example, X<sub>1</sub> from the generalized data matrix above would refer to the factor level “low”, X<sub>2</sub> to the level “medium”, and X<sub>3</sub> to the level “high”. Y<sub>11</sub> refers to the first data point in the first row (i.e., “13”), Y<sub>12</sub> to the second data point in the first row (i.e., “21”), etc.. The grand mean (<span class="math inline">\(\overline{Y}\)</span>) and the category means (<span class="math inline">\(\overline{Y}_c\)</span>) can be easily computed:</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">mean</span>(hours_abc<span class="sc">$</span>hours)  <span class="co">#grand mean</span></span></code></pre></div>
<pre><code>## [1] 24.67667</code></pre>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">by</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group, mean)  <span class="co">#category mean</span></span></code></pre></div>
<pre><code>## hours_abc$group: low
## [1] 14.34
## ------------------------------------------------------------ 
## hours_abc$group: medium
## [1] 24.7
## ------------------------------------------------------------ 
## hours_abc$group: high
## [1] 34.99</code></pre>
<p>To see how each variance component can be derived, let’s look at the data again. The following graph shows the individual observations by experimental group:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-10"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-10-1.png" alt="Sum of Squares" width="672" />
<p class="caption">
Figure 3: Sum of Squares
</p>
</div>
<div id="total-sum-of-squares" class="section level4" number="0.1.2.1">
<h4><span class="header-section-number">0.1.2.1</span> Total sum of squares</h4>
<p>To compute the total variation in the data, we consider the difference between each observation and the grand mean. The grand mean is the mean over all observations in the data set. The vertical lines in the following plot measure how far each observation is away from the grand mean:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-11"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-11-1.png" alt="Total Sum of Squares" width="672" />
<p class="caption">
Figure 4: Total Sum of Squares
</p>
</div>
<p>The formal representation of the total sum of squares (SS<sub>T</sub>) is:</p>
<p><span class="math display">\[
SS_T= \sum_{i=1}^{N} (Y_i-\bar{Y})^2
\]</span></p>
<p>This means that we need to subtract the grand mean from each individual data point, square the difference, and sum up over all the squared differences. Thus, in our example, the total sum of squares can be calculated as:</p>
<p><span class="math display">\[ 
\begin{align}
SS_T =&amp;(13−24.67)^2 + (14−24.67)^2 + … + (2−24.67)^2\\
      &amp;+(21−24.67)^2 + (18-24.67)^2 + … + (17−24.67)^2\\
      &amp;+(30−24.67)^2 + (37−24.67)^2 + … + (28−24.67)^2\\ 
      &amp;=30855.64
\end{align}
\]</span></p>
<p>You could also compute this in R using:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>SST <span class="ot">&lt;-</span> <span class="fu">sum</span>((hours_abc<span class="sc">$</span>hours <span class="sc">-</span> <span class="fu">mean</span>(hours_abc<span class="sc">$</span>hours))<span class="sc">^</span><span class="dv">2</span>)</span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>SST</span></code></pre></div>
<pre><code>## [1] 30855.64</code></pre>
<p>For the subsequent analyses, it is important to understand the concept behind the <b>degrees of freedom</b> (df). Remember that in order to estimate a population value from a sample, we need to hold something in the population constant. In ANOVA, the df are generally one less than the number of values used to calculate the SS. For example, when we estimate the population mean from a sample, we assume that the sample mean is equal to the population mean. Then, in order to estimate the population mean from the sample, all but one scores are free to vary and the remaining score needs to be the value that keeps the population mean constant. Thus, the degrees of freedom of an estimate can also be thought of as the number of independent pieces of information that went into calculating the estimate. In our example, we used all 300 observations to calculate the sum of square, so the total degrees of freedom (df<sub>T</sub>) are:</p>
<p><span class="math display" id="eq:dfT">\[\begin{equation} 
\begin{split}
df_T = N-1=300-1=299
\end{split}
\tag{1}
\end{equation}\]</span></p>
<div class="infobox_orange hint">
<p>Why do we subtract 1 from the number of items when computing the <strong>degrees of freedom</strong>? As mentioned above, the degrees of freedom refer to the number of values that are free to vary in a data set. To understand what this means, imagine that we try to estimate the mean hours of music listening in a population and that mean is 20 hours. We could take different samples from the population and we assume that the sample mean is equal to the population mean. Imagine, we only take three small samples of 3 students each: i) 19, 20, 21, ii) 18, 20, 22, iii) 15, 20, 25. Once you have chosen the first two values in each set, the third item cannot be chosen freely (i.e., it is fixed) because it needs to be the value that gets you to the population mean. Hence, only the first two values are ‘free to vary’. You can select 19 + 20 or 15 + 25, but once you have chosen the first two values, you must choose a particular value that will give you the population mean you are looking for (i.e., 20 hours). In this case, the degrees of freedom for each set of three numbers is two.</p>
</div>
</div>
<div id="model-sum-of-squares" class="section level4" number="0.1.2.2">
<h4><span class="header-section-number">0.1.2.2</span> Model sum of squares</h4>
<p>Now we know that there are 30855.64 units of total variation in our data. Next, we compute how much of the total variation can be explained by the differences between groups (i.e., our experimental manipulation). To compute the explained variation in the data, we consider the difference between the values predicted by our model for each observation (i.e., the group mean) and the grand mean. The group mean refers to the mean value within the experimental group. The vertical lines in the following plot measure how far the predicted value for each observation (i.e., the group mean) is away from the grand mean:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-13"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-13-1.png" alt="Model Sum of Squares" width="672" />
<p class="caption">
Figure 5: Model Sum of Squares
</p>
</div>
<p>The formal representation of the model sum of squares (SS<sub>M</sub>) is:</p>
<p><span class="math display">\[
SS_M= \sum_{j=1}^{c} n_j(\bar{Y}_j-\bar{Y})^2
\]</span></p>
<p>where c denotes the number of categories (experimental groups). This means that we need to subtract the grand mean from each group mean, square the difference, and sum up over all the squared differences. Thus, in our example, the model sum of squares can be calculated as:</p>
<p><span class="math display">\[ 
\begin{align}
SS_M &amp;= 100*(15.47−24.67)^2 + 100*(24.88−24.67)^2 + 100*(33.66−24.67)^2 \\
     &amp;= 21321.21
\end{align}
\]</span></p>
<p>You could also compute this manually in R using:</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>SSM <span class="ot">&lt;-</span> <span class="fu">sum</span>(<span class="dv">100</span> <span class="sc">*</span> (<span class="fu">by</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group,</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a>    mean) <span class="sc">-</span> <span class="fu">mean</span>(hours_abc<span class="sc">$</span>hours))<span class="sc">^</span><span class="dv">2</span>)</span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>SSM</span></code></pre></div>
<pre><code>## [1] 21321.21</code></pre>
<p>In this case, we used the three group means to calculate the sum of squares, so the model degrees of freedom (df<sub>M</sub>) are:</p>
<p><span class="math display">\[
df_M= c-1=3-1=2
\]</span></p>
</div>
<div id="residual-sum-of-squares" class="section level4" number="0.1.2.3">
<h4><span class="header-section-number">0.1.2.3</span> Residual sum of squares</h4>
<p>Lastly, we calculate the amount of variation that cannot be explained by our model. In ANOVA, this is the sum of squared distances between what the model predicts for each data point (i.e., the group means) and the observed values. In other words, this refers to the amount of variation that is caused by extraneous factors, such as differences between product characteristics of the products in the different experimental groups. The vertical lines in the following plot measure how far each observation is away from the group mean:</p>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-15"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-15-1.png" alt="Residual Sum of Squares" width="672" />
<p class="caption">
Figure 6: Residual Sum of Squares
</p>
</div>
<p>The formal representation of the residual sum of squares (SS<sub>R</sub>) is:</p>
<p><span class="math display">\[
SS_R= \sum_{j=1}^{c} \sum_{i=1}^{n} ({Y}_{ij}-\bar{Y}_{j})^2
\]</span></p>
<p>This means that we need to subtract the group mean from each individual observation, square the difference, and sum up over all the squared differences. Thus, in our example, the model sum of squares can be calculated as:</p>
<p><span class="math display">\[ 
\begin{align}
SS_R =&amp; (13−14.34)^2 + (14−14.34)^2 + … + (2−14.34)^2 \\
     +&amp;(21−24.7)^2 + (18−24.7)^2 + … + (17−24.7)^2 \\
     +&amp; (30−34.99)^2 + (37−34.99)^2 + … + (28−34.99)^2 \\
     =&amp;  9534.43
\end{align}
\]</span></p>
<p>You could also compute this in R using:</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>SSR <span class="ot">&lt;-</span> <span class="fu">sum</span>((hours_abc<span class="sc">$</span>hours <span class="sc">-</span> <span class="fu">rep</span>(<span class="fu">by</span>(hours_abc<span class="sc">$</span>hours,</span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a>    hours_abc<span class="sc">$</span>group, mean), <span class="at">each =</span> <span class="dv">100</span>))<span class="sc">^</span><span class="dv">2</span>)</span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a>SSR</span></code></pre></div>
<pre><code>## [1] 9534.43</code></pre>
<p>In this case, we used the 10 values for each of the SS for each group, so the residual degrees of freedom (df<sub>R</sub>) are:</p>
<p><span class="math display">\[
\begin{align}
df_R=&amp; (n_1-1)+(n_2-1)+(n_3-1) \\
    =&amp;(100-1)+(100-1)+(100-1)=297
\end{align}
\]</span></p>
</div>
<div id="effect-strength" class="section level4" number="0.1.2.4">
<h4><span class="header-section-number">0.1.2.4</span> Effect strength</h4>
<p>Once you have computed the different sum of squares, you can investigate the effect strength. <span class="math inline">\(\eta^2\)</span> is a measure of the variation in Y that is explained by X:</p>
<p><span class="math display">\[
\eta^2= \frac{SS_M}{SS_T}=\frac{21321.21}{30855.64}=0.69
\]</span></p>
<p>To compute this in R:</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>eta <span class="ot">&lt;-</span> SSM<span class="sc">/</span>SST</span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>eta</span></code></pre></div>
<pre><code>## [1] 0.6909988</code></pre>
<p>The statistic can only take values between 0 and 1. It is equal to 0 when all the category means are equal, indicating that X has no effect on Y. In contrast, it has a value of 1 when there is no variability within each category of X but there is some variability between categories. You can think of it as the equivalent to the R-squared statistic in regression model since it also represents a measure of the share of explained variance.</p>
</div>
<div id="test-of-significance" class="section level4" number="0.1.2.5">
<h4><span class="header-section-number">0.1.2.5</span> Test of significance</h4>
<p>How can we determine whether the effect of X on Y is significant?</p>
<ul>
<li>First, we calculate the fit of the most basic model (i.e., the grand mean)</li>
<li>Then, we calculate the fit of the “best” model (i.e., the group means)</li>
<li>A good model should fit the data significantly better than the basic model</li>
<li>The F-statistic, or F-ratio, compares the amount of systematic variance in the data to the amount of unsystematic variance</li>
</ul>
<p>The F-statistic uses the ratio of mean square related to X (explained variation) and the mean square related to the error (unexplained variation):</p>
<p><span class="math display">\[\frac{SS_M}{SS_R}\]</span></p>
<p>However, since these are summed values, their magnitude is influenced by the number of scores that were summed. For example, to calculate SS<sub>M</sub> we only used the sum of 3 values (the group means), while we used 300 values to calculate SS<sub>T</sub> and SS<sub>R</sub>, respectively. Thus, we calculate the average sum of squares (“mean square”) to compare the average amount of systematic vs. unsystematic variation by dividing the SS values by the degrees of freedom associated with the respective statistic.</p>
<p>Mean square due to X:</p>
<p><span class="math display">\[
MS_M= \frac{SS_M}{df_M}=\frac{SS_M}{c-1}=\frac{21321.21}{(3-1)}
\]</span></p>
<p>Mean square due to error:</p>
<p><span class="math display">\[
MS_R= \frac{SS_R}{df_R}=\frac{SS_R}{N-c}=\frac{9534.43}{(300-3)}
\]</span></p>
<p>Now, we compare the amount of variability explained by the model (experiment), to the error in the model (variation due to extraneous variables). If the model explains more variability than it can’t explain, then the experimental manipulation has had a significant effect on the outcome (DV). The F-ratio can be derived as follows:</p>
<p><span class="math display">\[
F= \frac{MS_M}{MS_R}=\frac{\frac{SS_M}{c-1}}{\frac{SS_R}{N-c}}=\frac{\frac{21321.21}{(3-1)}}{\frac{9534.43}{(300-3)}}=332.08
\]</span></p>
<p>You can easily compute this in R:</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>f_ratio <span class="ot">&lt;-</span> (SSM<span class="sc">/</span><span class="dv">2</span>)<span class="sc">/</span>(SSR<span class="sc">/</span><span class="dv">297</span>)</span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a>f_ratio</span></code></pre></div>
<pre><code>## [1] 332.0806</code></pre>
<p>Similar to the t-test, the outcome of the significance test will be one of the following:</p>
<ul>
<li>If the null hypothesis of equal category means is not rejected, then the independent variable does not have a significant effect on the dependent variable</li>
<li>If the null hypothesis is rejected, then the effect of the independent variable is significant</li>
</ul>
<p>To decide which one it is, we proceed as with the t-test. That is, we calculate the test statistic and compare it to the critical value for a given level of confidence. If the calculated test statistic is larger than the critical value, we can reject the null hypothesis of equal group means and conclude that the independent variable has a significant effect on our outcome. In this case, however, the test statistic follows a F distribution (instead of the t-distribution) with (m = c – 1) and (n = N – c) degrees of freedom. This means that the shape of the F-distribution depends on the degrees of freedom. In this case, the shape depends on the degrees of freedom associated with the numerator and denominator used to compute the F-ratio. The following figure shows the shape of the F-distribution for different degrees of freedom:</p>
<div class="figure">
<img src="fdistributions.png" alt="" />
<p class="caption">The F distribution</p>
</div>
<p>For 2 and 297 degrees of freedom, the critical value of F is 3.026 for α=0.05. As usual, you can either look up these values in a table or use the appropriate function in R:</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>f_crit <span class="ot">&lt;-</span> <span class="fu">qf</span>(<span class="fl">0.95</span>, <span class="at">df1 =</span> <span class="dv">2</span>, <span class="at">df2 =</span> <span class="dv">297</span>)  <span class="co">#critical value</span></span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a>f_crit</span></code></pre></div>
<pre><code>## [1] 3.026153</code></pre>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a>f_ratio <span class="sc">&gt;</span> f_crit  <span class="co">#test if calculated test statistic is larger than critical value</span></span></code></pre></div>
<pre><code>## [1] TRUE</code></pre>
<p>The output tells us that the calculated test statistic exceeds the critical value. We can also show the test result visually:</p>
<div class="figure">
<img src="ftest.png" alt="" />
<p class="caption">Visual depiction of the test result</p>
</div>
<p>Thus, we conclude that because F<sub>CAL</sub> = 332.08 &gt; F<sub>CR</sub> = 3.03, H<sub>0</sub> is rejected!</p>
<p>Now we can interpret our findings as follows: one or more of the differences between means are statistically significant.</p>
<div class="infobox_red caution">
<p>Remember: The ANOVA tests for an overall difference in means between the groups. It doesn’t tell us where the differences between groups lie, e.g., whether group “low” is different from “medium” or “high” is different from “medium” or “high” is different from “low”. To find out which group means exactly differ, we need to use post-hoc procedures, which are described below. However, when the ANOVA tells you that the there is no differences between the means, then you also shouldn’t proceed to conduct post-hoc tests. In other words, you should only proceed to conduct post-hoc tests when you found a significant overall effect in your ANOVA.</p>
</div>
<p>Finally, you should report your findings in an appropriate way. You could do this by saying: There was a significant effect of playlists and personalized recommendations on listening times, F(2,297) = 332.08, p &lt; 0.05, <span class="math inline">\(\eta^2\)</span> = 0.69.</p>
<p>As usual, you don’t have to compute these statistics manually! Luckily, there is a function for ANOVA in R, which does the above calculations for you as we will see in the next section.</p>
</div>
</div>
<div id="one-way-anova" class="section level3" number="0.1.3">
<h3><span class="header-section-number">0.1.3</span> One-way ANOVA</h3>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/ElghkF6rwBQ" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<div id="basic-anova" class="section level4" number="0.1.3.1">
<h4><span class="header-section-number">0.1.3.1</span> Basic ANOVA</h4>
<p>As already indicated, one-way ANOVA is used when there is only one categorical variable (factor). Before conducting ANOVA, you need to check if the assumptions of the test are fulfilled. The assumptions of ANOVA are discussed in the following sections.</p>
<div id="independence-of-observations" class="section level5 unnumbered">
<h5>Independence of observations</h5>
<p>The observations in the groups should be independent. Because we randomly assigned the listeners to the experimental conditions, this assumption can be assumed to be met.</p>
</div>
<div id="distributional-assumptions" class="section level5 unnumbered">
<h5>Distributional assumptions</h5>
<p>ANOVA is relatively immune to violations to the normality assumption when sample sizes are large due to the Central Limit Theorem. However, if your sample is small (i.e., n &lt; 30 per group) you may nevertheless want to check the normality of your data, e.g., by using the Shapiro-Wilk test or QQ-Plot. In our example, we have 100 observations in each group which is plenty but let’s create another example with only 10 observations in each group. In the latter case we cannot rely on the Central Limit Theorem and we should test the normality of our data. This can be done using the Shapiro-Wilk Test, which has the Null Hypothesis that the data is normally distributed. Hence, an insignificant test results means that the data can be assumed to be approximately normally distributed:</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">321</span>)</span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>hours_fewobs <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">hours =</span> <span class="fu">c</span>(<span class="fu">rnorm</span>(<span class="dv">10</span>, <span class="dv">20</span>,</span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a>    <span class="dv">5</span>), <span class="fu">rnorm</span>(<span class="dv">10</span>, <span class="dv">40</span>, <span class="dv">5</span>), <span class="fu">rnorm</span>(<span class="dv">10</span>, <span class="dv">60</span>, <span class="dv">5</span>)), <span class="at">group =</span> <span class="fu">c</span>(<span class="fu">rep</span>(<span class="st">&quot;low&quot;</span>,</span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a>    <span class="dv">10</span>), <span class="fu">rep</span>(<span class="st">&quot;medium&quot;</span>, <span class="dv">10</span>), <span class="fu">rep</span>(<span class="st">&quot;high&quot;</span>, <span class="dv">10</span>)))</span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a><span class="fu">by</span>(hours_fewobs<span class="sc">$</span>hours, hours_fewobs<span class="sc">$</span>group, shapiro.test)</span></code></pre></div>
<pre><code>## hours_fewobs$group: high
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.9595, p-value = 0.7801
## 
## ------------------------------------------------------------ 
## hours_fewobs$group: low
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.91625, p-value = 0.3267
## 
## ------------------------------------------------------------ 
## hours_fewobs$group: medium
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.91486, p-value = 0.3161</code></pre>
<p>Since the test result is insignificant for all groups, we can conclude that the data approximately follow a normal distribution.</p>
<p>We could also test the distributional assumptions visually using a Q-Q plot (i.e., quantile-quantile plot). This plot can be used to assess if a set of data plausibly came from some theoretical distribution such as the Normal distribution. Since this is just a visual check, it is somewhat subjective. But it may help us to judge if our assumption is plausible, and if not, which data points contribute to the violation. A Q-Q plot is a scatterplot created by plotting two sets of quantiles against one another. If both sets of quantiles came from the same distribution, we should see the points forming a line that’s roughly straight. In other words, Q-Q plots take your sample data, sort it in ascending order, and then plot them versus quantiles calculated from a theoretical distribution. Quantiles are often referred to as “percentiles” and refer to the points in your data below which a certain proportion of your data fall. Recall, for example, the standard Normal distribution with a mean of 0 and a standard deviation of 1. Since the 50th percentile (or 0.5 quantile) is 0, half the data lie below 0. The 95th percentile (or 0.95 quantile), is about 1.64, which means that 95 percent of the data lie below 1.64. The 97.5th quantile is about 1.96, which means that 97.5% of the data lie below 1.96. In the Q-Q plot, the number of quantiles is selected to match the size of your sample data.</p>
<p>To create the Q-Q plot for the normal distribution, you may use the <code>qqnorm()</code> function, which takes the data to be tested as an argument. Using the <code>qqline()</code> function subsequently on the data creates the line on which the data points should fall based on the theoretical quantiles. If the individual data points deviate a lot from this line, it means that the data is not likely to follow a normal distribution.</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="fu">qqnorm</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;low&quot;</span>, ]<span class="sc">$</span>hours)</span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a><span class="fu">qqline</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;low&quot;</span>, ]<span class="sc">$</span>hours)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-23-1"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-23-1.png" alt="Q-Q plot 1" width="672" />
<p class="caption">
Figure 7: Q-Q plot 1
</p>
</div>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="fu">qqnorm</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;medium&quot;</span>,</span>
<span id="cb29-2"><a href="#cb29-2" aria-hidden="true" tabindex="-1"></a>    ]<span class="sc">$</span>hours)</span>
<span id="cb29-3"><a href="#cb29-3" aria-hidden="true" tabindex="-1"></a><span class="fu">qqline</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;medium&quot;</span>,</span>
<span id="cb29-4"><a href="#cb29-4" aria-hidden="true" tabindex="-1"></a>    ]<span class="sc">$</span>hours)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-23-2"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-23-2.png" alt="Q-Q plot 2" width="672" />
<p class="caption">
Figure 8: Q-Q plot 2
</p>
</div>
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="fu">qqnorm</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;high&quot;</span>, ]<span class="sc">$</span>hours)</span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a><span class="fu">qqline</span>(hours_fewobs[hours_fewobs<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;high&quot;</span>, ]<span class="sc">$</span>hours)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-23-3"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-23-3.png" alt="Q-Q plot 3" width="672" />
<p class="caption">
Figure 9: Q-Q plot 3
</p>
</div>
<p>The Q-Q plots suggest an approximately Normal distribution. If the assumption had been violated, you might consider transforming your data or resort to a non-parametric test.</p>
</div>
<div id="homogeneity-of-variance" class="section level5 unnumbered">
<h5>Homogeneity of variance</h5>
<p>Let’s return to our original data set with 100 observations in each group for the rest of the analysis.</p>
<p>You can test the homogeneity of variances in R using Levene’s test:</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(car)</span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a><span class="fu">leveneTest</span>(hours <span class="sc">~</span> group, <span class="at">data =</span> hours_abc, <span class="at">center =</span> mean)</span></code></pre></div>
<pre><code>## Levene&#39;s Test for Homogeneity of Variance (center = mean)
##        Df F value   Pr(&gt;F)   
## group   2  4.9678 0.007548 **
##       297                    
## ---
## Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1</code></pre>
<p>The null hypothesis of the test is that the group variances are equal. Thus, if the test result is significant it means that the variances are not equal. If we cannot reject the null hypothesis (i.e., the group variances are not significantly different), we can proceed with the ANOVA as follows:</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>aov <span class="ot">&lt;-</span> <span class="fu">aov</span>(hours <span class="sc">~</span> group, <span class="at">data =</span> hours_abc)</span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(aov)</span></code></pre></div>
<pre><code>##              Df Sum Sq Mean Sq F value Pr(&gt;F)    
## group         2  21321   10661   332.1 &lt;2e-16 ***
## Residuals   297   9534      32                   
## ---
## Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1</code></pre>
<p>You can see that the p-value is smaller than 0.05. This means that, if there really was no difference between the population means (i.e., the Null hypothesis was true), the probability of the observed differences (or larger differences) is less than 5%.</p>
<p>To compute η<sup>2</sup> from the output, we can extract the relevant sum of squares as follows</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">&quot;Sum Sq&quot;</span>[<span class="dv">1</span>]<span class="sc">/</span>(<span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">&quot;Sum Sq&quot;</span>[<span class="dv">1</span>] <span class="sc">+</span></span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a>    <span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">&quot;Sum Sq&quot;</span>[<span class="dv">2</span>])</span></code></pre></div>
<pre><code>## [1] 0.6909988</code></pre>
<p>You can see that the results match the results from our manual computation above (<span class="math inline">\(\eta^2 =\)</span> 0.69).</p>
<p>The <code>aov()</code> function also automatically generates some plots that you can use to judge if the model assumptions are met. We will inspect two of the plots here.</p>
<p>We will use the first plot to inspect if the residual variances are equal across the experimental groups:</p>
<div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(aov, <span class="dv">1</span>)</span></code></pre></div>
<p><img src="08-Anova_files/figure-html/unnamed-chunk-27-1.png" width="672" /></p>
<p>Generally, the residual variance (i.e., the range of values on the y-axis) should be the same for different levels of our independent variable. The plot shows, that there are some slight differences. Notably, the range of residuals is higher in group “medium” than in group “high”. However, the differences are not that large and since the Levene’s test could not reject the Null of equal variances, we conclude that the variances are similar enough in this case.</p>
<p>The second plot can be used to test the assumption that the residuals are approximately normally distributed. We use a Q-Q plot to test this assumption:</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(aov, <span class="dv">2</span>)</span></code></pre></div>
<p><img src="08-Anova_files/figure-html/unnamed-chunk-28-1.png" width="672" /></p>
<p>The plot suggests that, the residuals are approximately normally distributed. We could also test this by extracting the residuals from the anova output using the <code>resid()</code> function and using the Shapiro-Wilk test:</p>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1" aria-hidden="true" tabindex="-1"></a><span class="fu">shapiro.test</span>(<span class="fu">resid</span>(aov))</span></code></pre></div>
<pre><code>## 
##  Shapiro-Wilk normality test
## 
## data:  resid(aov)
## W = 0.99723, p-value = 0.8925</code></pre>
<p>Confirming the impression from the Q-Q plot, we cannot reject the Null that the residuals are approximately normally distributed.</p>
<p>Note that if Levene’s test would have been significant (i.e., variances are not equal), we would have needed to either resort to non-parametric tests (see below), or compute the Welch’s F-ratio instead, which is correcting for unequal variances between the groups:</p>
<div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb41-1"><a href="#cb41-1" aria-hidden="true" tabindex="-1"></a><span class="co"># oneway.test(hours ~ group, hours_abc)</span></span></code></pre></div>
<p>You can see that the results are fairly similar, since the variances turned out to be fairly equal across groups.</p>
</div>
</div>
<div id="post-hoc-tests" class="section level4" number="0.1.3.2">
<h4><span class="header-section-number">0.1.3.2</span> Post-hoc tests</h4>
<br>
<div align="center">
<iframe width="560" height="315" src="https://www.youtube.com/embed/wNwKx0TZ7fQ" frameborder="0" allowfullscreen>
</iframe>
</div>
<p><br></p>
<p>Provided that significant differences were detected by the overall ANOVA you can find out which group means are different using post-hoc procedures. Post-hoc procedures are designed to conduct pairwise comparisons of all different combinations of the treatment groups by correcting the level of significance for each test such that the overall Type I error rate (α) across all comparisons remains at 0.05.</p>
<p>In other words, we rejected H<sub>0</sub>: μ<sub>low</sub>= μ<sub>medium</sub>= μ<sub>high</sub>, and now we would like to test:</p>
<p>Test1:</p>
<p><span class="math display">\[H_0: \mu_{low} = \mu_{medium}\]</span></p>
<p>Test2:</p>
<p><span class="math display">\[H_0: \mu_{low} = \mu_{high}\]</span></p>
<p>Test3:</p>
<p><span class="math display">\[H_0: \mu_{medium} = \mu_{high}\]</span></p>
<p>There are several post-hoc procedures available to choose from. In this tutorial, we will cover Bonferroni and Tukey’s HSD (“honest significant differences”). Both tests control for family-wise error. Bonferroni tends to have more power when the number of comparisons is small, whereas Tukey’s HSDs is better when testing large numbers of means.</p>
<div id="bonferroni" class="section level5" number="0.1.3.2.1">
<h5><span class="header-section-number">0.1.3.2.1</span> Bonferroni</h5>
<p>One of the most popular (and easiest) methods to correct for the family-wise error rate is to conduct the individual t-tests and divide α by the number of comparisons („k“):</p>
<p><span class="math display">\[
p_{CR}= \frac{\alpha}{k}
\]</span></p>
<p>In our example with three groups:</p>
<p><span class="math display">\[p_{CR}= \frac{0.05}{3}=0.017\]</span></p>
<p>Thus, the “corrected” critical p-value is now 0.017 instead of 0.05 (i.e., the critical t value is higher). This means that the test is more conservative to account for the family-wise error. Remember that, to reject the null hypothesis at a 5% significance level, we usually check if the p-value in our analysis is smaller than 0.05. The corrected p-value above requires us to obtain a p-value smaller than 0.017 in order to reject the null hypothesis at the 5% significance level, which means that the critical value of the test statistic is higher. You can implement the Bonferroni procedure in R using:</p>
<div class="sourceCode" id="cb42"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a>bonferroni <span class="ot">&lt;-</span> <span class="fu">pairwise.t.test</span>(hours_abc<span class="sc">$</span>hours, hours_abc<span class="sc">$</span>group,</span>
<span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a>    <span class="at">data =</span> hours_abc, <span class="at">p.adjust.method =</span> <span class="st">&quot;bonferroni&quot;</span>)</span>
<span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a>bonferroni</span></code></pre></div>
<pre><code>## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  hours_abc$hours and hours_abc$group 
## 
##        low    medium
## medium &lt;2e-16 -     
## high   &lt;2e-16 &lt;2e-16
## 
## P value adjustment method: bonferroni</code></pre>
<p>In the output, you will get the corrected p-values for the individual tests. This mean, to reject the null hypothesis, we require the p-value to be smaller than 0.05 again, since the reported p-values are already corrected for the family-wise error. In our example, we can reject H<sub>0</sub> of equal means for all three tests, since p &lt; 0.05 for all combinations of groups.</p>
<p>Note the difference between the results from the post-hoc test compared to individual t-tests. For example, when we test the “medium” vs. “high” groups, the result from a t-test would be:</p>
<div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1" aria-hidden="true" tabindex="-1"></a>data_subset <span class="ot">&lt;-</span> <span class="fu">subset</span>(hours_abc, group <span class="sc">!=</span> <span class="st">&quot;low&quot;</span>)</span>
<span id="cb44-2"><a href="#cb44-2" aria-hidden="true" tabindex="-1"></a>ttest <span class="ot">&lt;-</span> <span class="fu">t.test</span>(hours <span class="sc">~</span> group, <span class="at">data =</span> data_subset,</span>
<span id="cb44-3"><a href="#cb44-3" aria-hidden="true" tabindex="-1"></a>    <span class="at">var.equal =</span> <span class="cn">TRUE</span>)</span>
<span id="cb44-4"><a href="#cb44-4" aria-hidden="true" tabindex="-1"></a>ttest</span></code></pre></div>
<pre><code>## 
##  Two Sample t-test
## 
## data:  hours by group
## t = -11.884, df = 198, p-value &lt; 2.2e-16
## alternative hypothesis: true difference in means between group medium and group high is not equal to 0
## 95 percent confidence interval:
##  -11.997471  -8.582529
## sample estimates:
## mean in group medium   mean in group high 
##                24.70                34.99</code></pre>
<p>Usually the p-value is lower in the t-test, reflecting the fact that the family-wise error is not corrected (i.e., the test is less conservative). In this case the p-value is extremely small in both cases and thus indistinguishable.</p>
</div>
<div id="tukeys-hsd" class="section level5" number="0.1.3.2.2">
<h5><span class="header-section-number">0.1.3.2.2</span> Tukey’s HSD</h5>
<p>Tukey’s HSD also compares all possible pairs of means (two-by-two combinations; i.e., like a t-test, except that it corrects for family-wise error rate).</p>
<p>Test statistic:</p>
<p><span class="math display" id="eq:tukey">\[\begin{equation} 
\begin{split}
HSD= q\sqrt{\frac{MS_R}{n_c}}
\end{split}
\tag{2}
\end{equation}\]</span></p>
<p>where:</p>
<ul>
<li>q = value from studentized range table (see e.g., <a href="http://www.real-statistics.com/statistics-tables/studentized-range-q-table/" target="_blank">here</a>)</li>
<li>MS<sub>R</sub> = Mean Square Error from ANOVA</li>
<li>n<sub>c</sub> = number of observations per group</li>
<li>Decision: Reject H<sub>0</sub> if</li>
</ul>
<p><span class="math display">\[|\bar{Y}_i-\bar{Y}_j | &gt; HSD\]</span></p>
<p>The value from the studentized range table can be obtained using the <code>qtukey()</code> function.</p>
<div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a>q <span class="ot">&lt;-</span> <span class="fu">qtukey</span>(<span class="fl">0.95</span>, <span class="at">nm =</span> <span class="dv">3</span>, <span class="at">df =</span> <span class="dv">297</span>)</span>
<span id="cb46-2"><a href="#cb46-2" aria-hidden="true" tabindex="-1"></a>q</span></code></pre></div>
<pre><code>## [1] 3.331215</code></pre>
<p>Hence:</p>
<p><span class="math display">\[HSD= 3.33\sqrt{\frac{33.99}{100}}=1.94\]</span></p>
<p>Or, in R:</p>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a>hsd <span class="ot">&lt;-</span> q <span class="sc">*</span> <span class="fu">sqrt</span>(<span class="fu">summary</span>(aov)[[<span class="dv">1</span>]]<span class="sc">$</span><span class="st">&quot;Mean Sq&quot;</span>[<span class="dv">2</span>]<span class="sc">/</span><span class="dv">100</span>)</span>
<span id="cb48-2"><a href="#cb48-2" aria-hidden="true" tabindex="-1"></a>hsd</span></code></pre></div>
<pre><code>## [1] 1.887434</code></pre>
<p>Since all mean differences between groups are larger than 1.906, we can reject the null hypothesis for all individual tests, confirming the results from the Bonferroni test. To compute Tukey’s HSD, we can use the appropriate function from the <code>multcomp</code> package.</p>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(multcomp)</span>
<span id="cb50-2"><a href="#cb50-2" aria-hidden="true" tabindex="-1"></a>aov<span class="sc">$</span>model<span class="sc">$</span>group <span class="ot">&lt;-</span> <span class="fu">as.factor</span>(aov<span class="sc">$</span>model<span class="sc">$</span>group)</span>
<span id="cb50-3"><a href="#cb50-3" aria-hidden="true" tabindex="-1"></a>tukeys <span class="ot">&lt;-</span> <span class="fu">glht</span>(aov, <span class="at">linfct =</span> <span class="fu">mcp</span>(<span class="at">group =</span> <span class="st">&quot;Tukey&quot;</span>))</span>
<span id="cb50-4"><a href="#cb50-4" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(tukeys)</span></code></pre></div>
<pre><code>## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = hours ~ group, data = hours_abc)
## 
## Linear Hypotheses:
##                    Estimate Std. Error t value Pr(&gt;|t|)    
## medium - low == 0   10.3600     0.8013   12.93   &lt;2e-16 ***
## high - low == 0     20.6500     0.8013   25.77   &lt;2e-16 ***
## high - medium == 0  10.2900     0.8013   12.84   &lt;2e-16 ***
## ---
## Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1
## (Adjusted p values reported -- single-step method)</code></pre>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="fu">confint</span>(tukeys)</span></code></pre></div>
<pre><code>## 
##   Simultaneous Confidence Intervals
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = hours ~ group, data = hours_abc)
## 
## Quantile = 2.3558
## 95% family-wise confidence level
##  
## 
## Linear Hypotheses:
##                    Estimate lwr     upr    
## medium - low == 0  10.3600   8.4723 12.2477
## high - low == 0    20.6500  18.7623 22.5377
## high - medium == 0 10.2900   8.4023 12.1777</code></pre>
<p>We may also plot the result for the mean differences incl. their confidence intervals:</p>
<div class="sourceCode" id="cb54"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(tukeys)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-36"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-36-1.png" alt="Tukey's HSD" width="672" />
<p class="caption">
Figure 10: Tukey’s HSD
</p>
</div>
<p>You can see that the CIs do not cross zero, which means that the true difference between group means is unlikely zero. It is sufficient to report the results in the way described above. However, you could also manually compute the differences between the groups and their confidence interval as follows:</p>
<div class="sourceCode" id="cb55"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb55-1"><a href="#cb55-1" aria-hidden="true" tabindex="-1"></a>mean1 <span class="ot">&lt;-</span> <span class="fu">mean</span>(hours_abc[hours_abc<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;low&quot;</span>, <span class="st">&quot;hours&quot;</span>])  <span class="co">#mean group &#39;low&#39;</span></span>
<span id="cb55-2"><a href="#cb55-2" aria-hidden="true" tabindex="-1"></a>mean1</span></code></pre></div>
<pre><code>## [1] 14.34</code></pre>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb57-1"><a href="#cb57-1" aria-hidden="true" tabindex="-1"></a>mean2 <span class="ot">&lt;-</span> <span class="fu">mean</span>(hours_abc[hours_abc<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;medium&quot;</span>,</span>
<span id="cb57-2"><a href="#cb57-2" aria-hidden="true" tabindex="-1"></a>    <span class="st">&quot;hours&quot;</span>])  <span class="co">#mean group &#39;medium&#39;</span></span>
<span id="cb57-3"><a href="#cb57-3" aria-hidden="true" tabindex="-1"></a>mean2</span></code></pre></div>
<pre><code>## [1] 24.7</code></pre>
<div class="sourceCode" id="cb59"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb59-1"><a href="#cb59-1" aria-hidden="true" tabindex="-1"></a>mean3 <span class="ot">&lt;-</span> <span class="fu">mean</span>(hours_abc[hours_abc<span class="sc">$</span>group <span class="sc">==</span> <span class="st">&quot;high&quot;</span>,</span>
<span id="cb59-2"><a href="#cb59-2" aria-hidden="true" tabindex="-1"></a>    <span class="st">&quot;hours&quot;</span>])  <span class="co">#mean group &#39;high&#39;</span></span>
<span id="cb59-3"><a href="#cb59-3" aria-hidden="true" tabindex="-1"></a>mean3</span></code></pre></div>
<pre><code>## [1] 34.99</code></pre>
<div class="sourceCode" id="cb61"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb61-1"><a href="#cb61-1" aria-hidden="true" tabindex="-1"></a><span class="co"># CI high vs. medium</span></span>
<span id="cb61-2"><a href="#cb61-2" aria-hidden="true" tabindex="-1"></a>mean_diff_high_med <span class="ot">&lt;-</span> mean2 <span class="sc">-</span> mean1</span>
<span id="cb61-3"><a href="#cb61-3" aria-hidden="true" tabindex="-1"></a>mean_diff_high_med</span></code></pre></div>
<pre><code>## [1] 10.36</code></pre>
<div class="sourceCode" id="cb63"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a>ci_med_high_lower <span class="ot">&lt;-</span> mean_diff_high_med <span class="sc">-</span> hsd</span>
<span id="cb63-2"><a href="#cb63-2" aria-hidden="true" tabindex="-1"></a>ci_med_high_upper <span class="ot">&lt;-</span> mean_diff_high_med <span class="sc">+</span> hsd</span>
<span id="cb63-3"><a href="#cb63-3" aria-hidden="true" tabindex="-1"></a>ci_med_high_lower</span></code></pre></div>
<pre><code>## [1] 8.472566</code></pre>
<div class="sourceCode" id="cb65"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb65-1"><a href="#cb65-1" aria-hidden="true" tabindex="-1"></a>ci_med_high_upper</span></code></pre></div>
<pre><code>## [1] 12.24743</code></pre>
<div class="sourceCode" id="cb67"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb67-1"><a href="#cb67-1" aria-hidden="true" tabindex="-1"></a><span class="co"># CI high vs.low</span></span>
<span id="cb67-2"><a href="#cb67-2" aria-hidden="true" tabindex="-1"></a>mean_diff_high_low <span class="ot">&lt;-</span> mean3 <span class="sc">-</span> mean1</span>
<span id="cb67-3"><a href="#cb67-3" aria-hidden="true" tabindex="-1"></a>mean_diff_high_low</span></code></pre></div>
<pre><code>## [1] 20.65</code></pre>
<div class="sourceCode" id="cb69"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb69-1"><a href="#cb69-1" aria-hidden="true" tabindex="-1"></a>ci_low_high_lower <span class="ot">&lt;-</span> mean_diff_high_low <span class="sc">-</span> hsd</span>
<span id="cb69-2"><a href="#cb69-2" aria-hidden="true" tabindex="-1"></a>ci_low_high_upper <span class="ot">&lt;-</span> mean_diff_high_low <span class="sc">+</span> hsd</span>
<span id="cb69-3"><a href="#cb69-3" aria-hidden="true" tabindex="-1"></a>ci_low_high_lower</span></code></pre></div>
<pre><code>## [1] 18.76257</code></pre>
<div class="sourceCode" id="cb71"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb71-1"><a href="#cb71-1" aria-hidden="true" tabindex="-1"></a>ci_low_high_upper</span></code></pre></div>
<pre><code>## [1] 22.53743</code></pre>
<div class="sourceCode" id="cb73"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb73-1"><a href="#cb73-1" aria-hidden="true" tabindex="-1"></a><span class="co"># CI medium vs.low</span></span>
<span id="cb73-2"><a href="#cb73-2" aria-hidden="true" tabindex="-1"></a>mean_diff_med_low <span class="ot">&lt;-</span> mean3 <span class="sc">-</span> mean2</span>
<span id="cb73-3"><a href="#cb73-3" aria-hidden="true" tabindex="-1"></a>mean_diff_med_low</span></code></pre></div>
<pre><code>## [1] 10.29</code></pre>
<div class="sourceCode" id="cb75"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb75-1"><a href="#cb75-1" aria-hidden="true" tabindex="-1"></a>ci_low_med_lower <span class="ot">&lt;-</span> mean_diff_med_low <span class="sc">-</span> hsd</span>
<span id="cb75-2"><a href="#cb75-2" aria-hidden="true" tabindex="-1"></a>ci_low_med_upper <span class="ot">&lt;-</span> mean_diff_med_low <span class="sc">+</span> hsd</span>
<span id="cb75-3"><a href="#cb75-3" aria-hidden="true" tabindex="-1"></a>ci_low_med_lower</span></code></pre></div>
<pre><code>## [1] 8.402566</code></pre>
<div class="sourceCode" id="cb77"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb77-1"><a href="#cb77-1" aria-hidden="true" tabindex="-1"></a>ci_low_med_upper</span></code></pre></div>
<pre><code>## [1] 12.17743</code></pre>
<p>The results of a post-hoc test can be reported as follows:</p>
<p>The post-hoc tests based on Bonferroni and Tukey’s HSD revealed that users listened to music significantly more when the intensity of personalized recommendations was increased. This is true for “low” vs. “medium” intensity, as well as for “low” vs. “high” and “medium” vs. “high” intensity.</p>
<p>As with the t-test, you could also use the functions contained in the <code>ggstatsplot</code> package to combine a visual depiction of the data with the results of statistical tests. In the case of an ANOVA, the output would also include the pairwise comparisons.</p>
<div class="sourceCode" id="cb79"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb79-1"><a href="#cb79-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggstatsplot)</span>
<span id="cb79-2"><a href="#cb79-2" aria-hidden="true" tabindex="-1"></a><span class="fu">ggbetweenstats</span>(</span>
<span id="cb79-3"><a href="#cb79-3" aria-hidden="true" tabindex="-1"></a>  <span class="at">data =</span> hours_abc,</span>
<span id="cb79-4"><a href="#cb79-4" aria-hidden="true" tabindex="-1"></a>  <span class="at">x =</span> group,</span>
<span id="cb79-5"><a href="#cb79-5" aria-hidden="true" tabindex="-1"></a>  <span class="at">y =</span> hours,</span>
<span id="cb79-6"><a href="#cb79-6" aria-hidden="true" tabindex="-1"></a>  <span class="at">plot.type =</span> <span class="st">&quot;box&quot;</span>,</span>
<span id="cb79-7"><a href="#cb79-7" aria-hidden="true" tabindex="-1"></a>  <span class="at">pairwise.comparisons =</span> <span class="cn">TRUE</span>,</span>
<span id="cb79-8"><a href="#cb79-8" aria-hidden="true" tabindex="-1"></a>  <span class="at">pairwise.annotation =</span> <span class="st">&quot;p.value&quot;</span>,</span>
<span id="cb79-9"><a href="#cb79-9" aria-hidden="true" tabindex="-1"></a>  <span class="at">p.adjust.method =</span> <span class="st">&quot;bonferroni&quot;</span>,</span>
<span id="cb79-10"><a href="#cb79-10" aria-hidden="true" tabindex="-1"></a>  <span class="at">effsize.type =</span> <span class="st">&quot;partial_eta&quot;</span>,</span>
<span id="cb79-11"><a href="#cb79-11" aria-hidden="true" tabindex="-1"></a>  <span class="at">var.equal =</span> <span class="cn">FALSE</span>,</span>
<span id="cb79-12"><a href="#cb79-12" aria-hidden="true" tabindex="-1"></a>  <span class="at">mean.plotting =</span> <span class="cn">TRUE</span>, <span class="co"># whether mean for each group is to be displayed</span></span>
<span id="cb79-13"><a href="#cb79-13" aria-hidden="true" tabindex="-1"></a>  <span class="at">mean.ci =</span> <span class="cn">TRUE</span>, <span class="co"># whether to display confidence interval for means</span></span>
<span id="cb79-14"><a href="#cb79-14" aria-hidden="true" tabindex="-1"></a>  <span class="at">mean.label.size =</span> <span class="fl">2.5</span>, <span class="co"># size of the label for mean</span></span>
<span id="cb79-15"><a href="#cb79-15" aria-hidden="true" tabindex="-1"></a>  <span class="at">type =</span> <span class="st">&quot;parametric&quot;</span>, <span class="co"># which type of test is to be run</span></span>
<span id="cb79-16"><a href="#cb79-16" aria-hidden="true" tabindex="-1"></a>  <span class="at">k =</span> <span class="dv">3</span>, <span class="co"># number of decimal places for statistical results</span></span>
<span id="cb79-17"><a href="#cb79-17" aria-hidden="true" tabindex="-1"></a>  <span class="at">outlier.label.color =</span> <span class="st">&quot;darkgreen&quot;</span>, <span class="co"># changing the color for the text label</span></span>
<span id="cb79-18"><a href="#cb79-18" aria-hidden="true" tabindex="-1"></a>  <span class="at">title =</span> <span class="st">&quot;Comparison of listening times between groups&quot;</span>,</span>
<span id="cb79-19"><a href="#cb79-19" aria-hidden="true" tabindex="-1"></a>  <span class="at">xlab =</span> <span class="st">&quot;Experimental group&quot;</span>, <span class="co"># label for the x-axis variable</span></span>
<span id="cb79-20"><a href="#cb79-20" aria-hidden="true" tabindex="-1"></a>  <span class="at">ylab =</span> <span class="st">&quot;Listening time&quot;</span>, <span class="co"># label for the y-axis variable</span></span>
<span id="cb79-21"><a href="#cb79-21" aria-hidden="true" tabindex="-1"></a>  <span class="at">messages =</span> <span class="cn">FALSE</span>,</span>
<span id="cb79-22"><a href="#cb79-22" aria-hidden="true" tabindex="-1"></a>  <span class="at">bf.message =</span> <span class="cn">FALSE</span></span>
<span id="cb79-23"><a href="#cb79-23" aria-hidden="true" tabindex="-1"></a>)</span></code></pre></div>
<div class="figure" style="text-align: center"><span style="display:block;" id="fig:unnamed-chunk-38"></span>
<img src="08-Anova_files/figure-html/unnamed-chunk-38-1.png" alt="ANOVA using ggstatsplot" width="672" />
<p class="caption">
Figure 11: ANOVA using ggstatsplot
</p>
</div>
</div>
</div>
</div>
</div>
            </section>

          </div>
        </div>
      </div>


    </div>
  </div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
"sharing": {
"github": false,
"facebook": true,
"twitter": true,
"linkedin": false,
"weibo": false,
"instapaper": false,
"vk": false,
"whatsapp": false,
"all": ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
},
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
"edit": {
"link": null,
"text": null
},
"history": {
"link": null,
"text": null
},
"view": {
"link": null,
"text": null
},
"download": null,
"search": false,
"toc": {
"collapse": "section"
}
});
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    var src = "true";
    if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
    if (location.protocol !== "file:")
      if (/^https?:/.test(src))
        src = src.replace(/^https?:/, '');
    script.src = src;
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>
</body>

</html>