This repository has been archived by the owner on Dec 30, 2023. It is now read-only.
forked from MetuStat112/metustat112.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStat 112-Recitation 1.html
1005 lines (978 loc) · 40.9 KB
/
Stat 112-Recitation 1.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.0.38">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<title>Recitation 1</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<script src="Stat 112-Recitation 1_files/libs/clipboard/clipboard.min.js"></script>
<script src="Stat 112-Recitation 1_files/libs/quarto-html/quarto.js"></script>
<script src="Stat 112-Recitation 1_files/libs/quarto-html/popper.min.js"></script>
<script src="Stat 112-Recitation 1_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="Stat 112-Recitation 1_files/libs/quarto-html/anchor.min.js"></script>
<link href="Stat 112-Recitation 1_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="Stat 112-Recitation 1_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="Stat 112-Recitation 1_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="Stat 112-Recitation 1_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="Stat 112-Recitation 1_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
</head>
<body class="fullcontent">
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Recitation 1</h1>
</div>
<div class="quarto-title-meta">
</div>
</header>
<section id="introducing" class="level3">
<h3 class="anchored" data-anchor-id="introducing">Introducing</h3>
<p>The course GitHub page: <a href="https://github.com/MetuStat112" class="uri">https://github.com/MetuStat112</a></p>
<p>The course unofficial web-page: <a href="https://metustat112.github.io/" class="uri">https://metustat112.github.io/</a></p>
<table class="table">
<thead>
<tr class="header">
<th></th>
<th><p><a href="https://ozancanozdemir.github.io/">Personal Web Page</a></p>
<p><a href="https://twitter.com/OzancanOzdemir">Twitter</a></p>
<p><a href="https://github.com/ozancanozdemir">GitHub</a></p>
<p><strong>e-mail:</strong> <a href="mailto:[email protected]">[email protected]</a> / <a href="mailto:[email protected]">[email protected]</a></p>
<p><strong>Room No: 234</strong></p>
Office Hour: Wed. / Fri. 13.40-14.30</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Res. Assist. Ozancan Ozdemir</td>
<td></td>
</tr>
</tbody>
</table>
</section>
<section id="recitation-1" class="level3">
<h3 class="anchored" data-anchor-id="recitation-1">Recitation 1</h3>
<p><em>Statistics is the grammar of science. - Karl Pearson.</em></p>
<p><img src="images/pic1.png" class="img-fluid" width="280"></p>
<p>Data is a collection of information gathered through observations, measurements, research or analysis. One of the main characteristics of today’s world is that it constantly generates data. According to estimates, at least <strong>2.5 quintillion bytes</strong> of data are produced every day.</p>
<p><strong>Question : How to make such an amount of data valuable?</strong></p>
<p><strong>Answer: Statistics!</strong></p>
<section id="what-is-statistics" class="level4">
<h4 class="anchored" data-anchor-id="what-is-statistics">What is Statistics?</h4>
<p>There are several definitions of statistics in the literature.</p>
<p><strong>Definition 1:</strong></p>
<ul>
<li><p><strong>Formally:</strong> “Statistics” is the science of learning from data, and of measuring, controlling and communicating uncertainty. (American Statistical Association (ASA))</p></li>
<li><p><strong>Informally:</strong> Statistics is the art of learning from data.</p></li>
</ul>
<p>In statistics, we are interested in obtaining information about a total collection of elements, which we will refer to as the <strong>population.</strong> The population is often too large for us to examine each of its members. For instance, we might have all the residents of Ankara, or all the television sets produced in the last year by a particular manufacturer, or all the households in a given community. In such cases, we try to learn about the population by choosing and then examining a subgroup of its elements. This subgroup of a population is called a <strong>sample.</strong></p>
<p><strong>Definition 2:</strong> The total collection of all the elements that we are interested in is called a <strong>population</strong>. A subgroup of the population that will be studied in detail is called a <strong>sample.</strong></p>
<p>A quantity that describes the population is called a <strong>parameter</strong>. A quantity that describes the sample is called a <strong>statistic.</strong> It is usually impossible to have data for the entire population, and so a parameter, which defines a characteristic of the population and is a fixed number, remains an unknown number. Thus, a set of individuals or objects is collected or selected from the population by a defined procedure, and it is called a sample. Luckily, a sample can provide data from which we can calculate a quantity that supplies the best information about the unknown parameter, and this quantity is called a <strong>statistic</strong> or <strong>sample statistic.</strong></p>
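<p><img src="images/paste-CAFEB7BA.png" class="img-fluid"> Source: <a href="https://bookdown.org/mcbroom_j/Book/introduction-to-statistics.html#individuals-and-variables" class="uri">https://bookdown.org/mcbroom_j/Book/introduction-to-statistics.html#individuals-and-variables</a></p>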
</section>
<section id="example" class="level4">
<h4 class="anchored" data-anchor-id="example">Example</h4>
<table class="table">
<colgroup>
<col style="width: 46%">
<col style="width: 53%">
</colgroup>
<thead>
<tr class="header">
<th>Population</th>
<th>Sample</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>All residents of Turkey</td>
<td>All residents of Turkey who live above the poverty line</td>
</tr>
<tr class="even">
<td>All residents above the poverty line in Turkey</td>
<td>All residents who are millionaires</td>
</tr>
<tr class="odd">
<td>All employees in an office</td>
<td>All managers in the office</td>
</tr>
</tbody>
</table>
<p><strong>1)</strong> In the following exercises, determine whether the data set is a population or a sample. Explain your reasoning.</p>
<p><strong>a)</strong> The height of each player on a school’s basketball team.</p>
<p><strong>b)</strong> The amount of energy collected from every wind turbine on a wind farm.</p>
<p><strong>c)</strong> The cholesterol levels of 20 patients in a hospital with 100 patients</p>
<p><strong>d)</strong> The number of televisions in each household in Ankara</p>
<p><strong>e)</strong> The age of every third person entering a clothing store</p>
<p><strong>2)</strong> Identify the sample and the population in each of the following scenarios.</p>
<p><strong>a)</strong> In order to study the response times for emergency 112 calls in Ankara, fifty “robbery in progress” calls are selected randomly over a six-month period and the response times are recorded.</p>
<p><strong>b)</strong> In order to study a new medical charting system at Medico, a representative group of nurses is asked to use the charting system. Recording times and error rates are recorded for the group.</p>
<p><strong>c)</strong> Fifteen hundred individuals who watch The Office are selected and information concerning their education level, income level and so forth is recorded.</p>
<p>As stated above, data are the facts and figures collected, analyzed and summarized for presentation and interpretation. All data collected in a particular study are referred to as the <strong>data set.</strong> The following table represents a data set summarizing information about Galatasaray football players.</p>
<table class="table">
<thead>
<tr class="header">
<th>Name</th>
<th>Age</th>
<th>Market Value</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Muslera</td>
<td>36</td>
<td>1.800.000</td>
</tr>
<tr class="even">
<td>Boey</td>
<td>22</td>
<td>4.000.000</td>
</tr>
<tr class="odd">
<td>Nelsson</td>
<td>24</td>
<td>14.000.000</td>
</tr>
<tr class="even">
<td>Abdülkerim</td>
<td>28</td>
<td>4.000.000</td>
</tr>
<tr class="odd">
<td>Van Aanholt</td>
<td>32</td>
<td>4.000.000</td>
</tr>
<tr class="even">
<td>Torreira</td>
<td>27</td>
<td>17.000.000</td>
</tr>
<tr class="odd">
<td>Oliveira</td>
<td>30</td>
<td>6.500.000</td>
</tr>
<tr class="even">
<td>Kerem</td>
<td>24</td>
<td>15.000.000</td>
</tr>
<tr class="odd">
<td>Mertens</td>
<td>35</td>
<td>4.000.000</td>
</tr>
<tr class="even">
<td>Yunus</td>
<td>22</td>
<td>7.500.000</td>
</tr>
<tr class="odd">
<td>Icardi</td>
<td>30</td>
<td>22.000.000</td>
</tr>
</tbody>
</table>
<p><strong>Elements</strong> are the entities on which data are collected. For the data set given above, each player is an element.</p>
<p>A <strong>Variable</strong> is a characteristic of interest for the elements. The data set given above includes the following two variables.</p>
<ul>
<li><p>Age</p></li>
<li><p>Market Value</p></li>
</ul>
<p><strong>3)</strong> Please identify the element and the variable for the scenarios given below.</p>
<p><strong>a)</strong> In a sociological study involving 35 low-income households, the number of children per household was recorded for each household.</p>
<p><strong>b)</strong> The number of hours spent per week on paperwork was determined for 200 middle-level managers. The minimum was 0 hours and the maximum was 27 hours.</p>
</section>
<section id="data-types" class="level4">
<h4 class="anchored" data-anchor-id="data-types">Data Types</h4>
<p>There are two main branches of statistics.</p>
<ul>
<li><p><strong>Descriptive Statistics</strong> is primarily about summarizing a given data set through numerical summaries and graphs, and can be used for exploratory analysis to visualize the information contained in the data and suggest hypotheses etc.</p></li>
<li><p><strong>Inferential Statistics</strong> is concerned with methods for making conclusions about a population using information from a sample, and assessing the reliability of, and uncertainty in, these conclusions.</p></li>
</ul>
<p>Data types (or groups) are an important concept in statistics; they need to be understood in order to apply statistical analysis to your data correctly and therefore to draw correct conclusions about it.</p>
<p>In general, we can divide data into two attributes:</p>
<ul>
<li><p><strong>Quantitative data</strong> deals with numbers and things you can measure objectively. It is also known as a <strong>numerical variable</strong>; examples include height, width, and length.</p></li>
<li><p><strong>Qualitative data</strong> deals with characteristics and descriptors that can’t be easily measured, but can be observed subjectively. It is also known as a <strong>categorical variable</strong>; examples include smells, tastes, eye color, and gender.</p></li>
</ul>
<p>These two data attributes have <strong>subgroups.</strong></p>
</section>
<section id="quantitative-attribute" class="level4">
<h4 class="anchored" data-anchor-id="quantitative-attribute"><strong>Quantitative Attribute</strong></h4>
<ul>
<li><p><strong>Discrete Data:</strong> We speak of discrete data if its values are distinct and separate. In other words, we speak of discrete data if the data can only take on certain values. This type of data cannot be <strong>measured, but it can be counted.</strong> For example, <strong>Number of defective items in a box</strong>, <strong>Number of children in a household</strong>.</p></li>
<li><p><strong>Continuous Data:</strong> Continuous Data represents measurements and therefore their values cannot be counted but they can be measured. For example, <strong>temperature, height, or weight</strong>.</p></li>
<li><p><strong>Interval Data:</strong> Interval values represent ordered units that have the same difference. Therefore we speak of interval data when we have a variable that contains numeric values that are ordered and where we know the exact differences between the values. It can be either discrete or continuous.</p></li>
</ul>
<p>The problem with interval data is that zero has no real meaning. That’s why a lot of descriptive and inferential statistics cannot be applied. For example, <strong>Temperature</strong>.</p>
<ul>
<li><strong>Ratio Data:</strong> Ratio values are also ordered units that have the same difference. Ratio values are the same as interval values, with the difference that they do have an absolute zero. In other words, zero has its real meaning. For example, <strong>age, distance</strong>.</li>
</ul>
<p><strong>Qualitative Attribute</strong></p>
<ul>
<li><p><strong>Nominal Data:</strong> Nominal values represent discrete units and are used to label variables that have no quantitative value. Just think of them as labels. Note that nominal data has no order. For example, <strong>Gender: Male, Female</strong></p></li>
<li><p><strong>Ordinal Data:</strong> Ordinal values represent discrete and ordered units. As you would guess from its name, order is important. For example,</p></li>
<li><p>1- Totally disagree.</p></li>
<li><p>2- Disagree.</p></li>
<li><p>3- Neither agree nor disagree.</p></li>
<li><p>4- Agree.</p></li>
<li><p>5- Totally Agree.</p></li>
</ul>
<p>We can also categorize data sets into several types. For example,</p>
<ul>
<li><p><strong>Cross-Sectional:</strong> It is a collection of observations (behaviour) for multiple subjects (entities) at a single point in time.</p></li>
<li><p><strong>Time Series:</strong> It is a collection of observations (behaviour) for a single subject (entity) at different time intervals (generally equally spaced).</p></li>
<li><p><strong>Panel Data:</strong> It is also called cross-sectional time-series data, as it is a combination of the above-mentioned types, i.e., a collection of observations for multiple subjects at different time points.</p></li>
<li><p><strong>Circular Data.</strong></p></li>
</ul>
<p><strong>4)</strong> Determine whether the following variables are qualitative or quantitative.</p>
<p><strong>a)</strong> The color of automobiles involved in several severe accidents.</p>
<p><strong>b)</strong> The length of time required for rats to move through a maze</p>
<p><strong>c)</strong> The classification of police administrations as city, county or state</p>
<p><strong>d)</strong> The rating given to a pizza in a taste test as poor, good, or excellent</p>
<p><strong>e)</strong> The number of times subjects in a sociological research study have been married.</p>
<p><strong>5)</strong> Match the measurement with the data type.</p>
<table class="table">
<colgroup>
<col style="width: 89%">
<col style="width: 10%">
</colgroup>
<tbody>
<tr class="odd">
<td>The roster of a basketball team lists the jersey numbers for each of the players.</td>
<td>Nominal</td>
</tr>
<tr class="even">
<td>Grade of students (A, B, C, D and F)</td>
<td>Ratio</td>
</tr>
<tr class="odd">
<td>Colour of shirt</td>
<td>Interval</td>
</tr>
<tr class="even">
<td>Weight of students</td>
<td>Ordinal</td>
</tr>
</tbody>
</table>
<p><strong>6)</strong> Please determine whether the given statement is True or False.</p>
<ul>
<li><p>The movie list based on IMDB ratings is ordinal data. ( )</p></li>
<li><p>The heights of the students in this lab are interval data. ( )</p></li>
<li><p>Temperature of the body expressed in Kelvin is ratio data. ( )</p></li>
<li><p>The provinces that the students in this lab come from are nominal data. ( )</p></li>
</ul>
</section>
<section id="descriptive-analysis-of-categorical-data" class="level4">
<h4 class="anchored" data-anchor-id="descriptive-analysis-of-categorical-data">Descriptive Analysis of Categorical Data</h4>
<p>Descriptive analysis of the data includes both numerical and graphical representations of our findings. The method that you apply for descriptive analysis depends on the type of data that you are working with. This part introduces tabular and graphical methods commonly used to summarize categorical (qualitative) data.</p>
<p><strong>Summarizing Qualitative Data</strong></p>
<p><strong>One Categorical Variable</strong></p>
<p>When we are interested in only one categorical variable (nominal or ordinal), we can use <strong>frequency</strong>, <strong>relative frequency (proportion)</strong> or <strong>cumulative frequency</strong>. Since these statistics are represented in a table format, the result is called a <strong>frequency table</strong>, <strong>relative frequency (proportion) table</strong>, etc.</p>
<p><strong>Frequency Distribution:</strong> A frequency distribution is a tabular summary of data showing the number (frequency) of items in each of several non-overlapping classes. For example, the manufacturer list of 20 cars in KKM is given below.</p>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;">manufacturer</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">chevrolet</td>
</tr>
<tr class="even">
<td style="text-align: left;">ford</td>
</tr>
<tr class="odd">
<td style="text-align: left;">nissan</td>
</tr>
<tr class="even">
<td style="text-align: left;">honda</td>
</tr>
<tr class="odd">
<td style="text-align: left;">hyundai</td>
</tr>
<tr class="even">
<td style="text-align: left;">lincoln</td>
</tr>
<tr class="odd">
<td style="text-align: left;">land rover</td>
</tr>
<tr class="even">
<td style="text-align: left;">subaru</td>
</tr>
<tr class="odd">
<td style="text-align: left;">nissan</td>
</tr>
<tr class="even">
<td style="text-align: left;">land rover</td>
</tr>
<tr class="odd">
<td style="text-align: left;">ford</td>
</tr>
<tr class="even">
<td style="text-align: left;">honda</td>
</tr>
<tr class="odd">
<td style="text-align: left;">volkswagen</td>
</tr>
<tr class="even">
<td style="text-align: left;">ford</td>
</tr>
<tr class="odd">
<td style="text-align: left;">dodge</td>
</tr>
<tr class="even">
<td style="text-align: left;">ford</td>
</tr>
<tr class="odd">
<td style="text-align: left;">toyota</td>
</tr>
<tr class="even">
<td style="text-align: left;">hyundai</td>
</tr>
<tr class="odd">
<td style="text-align: left;">audi</td>
</tr>
<tr class="even">
<td style="text-align: left;">jeep</td>
</tr>
</tbody>
</table>
</div>
</div>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;">Var1</th>
<th style="text-align: right;">Freq</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">audi</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="even">
<td style="text-align: left;">chevrolet</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="odd">
<td style="text-align: left;">dodge</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="even">
<td style="text-align: left;">ford</td>
<td style="text-align: right;">4</td>
</tr>
<tr class="odd">
<td style="text-align: left;">honda</td>
<td style="text-align: right;">2</td>
</tr>
<tr class="even">
<td style="text-align: left;">hyundai</td>
<td style="text-align: right;">2</td>
</tr>
<tr class="odd">
<td style="text-align: left;">jeep</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="even">
<td style="text-align: left;">land rover</td>
<td style="text-align: right;">2</td>
</tr>
<tr class="odd">
<td style="text-align: left;">lincoln</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="even">
<td style="text-align: left;">nissan</td>
<td style="text-align: right;">2</td>
</tr>
<tr class="odd">
<td style="text-align: left;">subaru</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="even">
<td style="text-align: left;">toyota</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="odd">
<td style="text-align: left;">volkswagen</td>
<td style="text-align: right;">1</td>
</tr>
</tbody>
</table>
</div>
</div>
<p><strong>Comment:</strong> We see that Ford is the leader, while Honda, Hyundai, Land Rover and Nissan are tied for second.</p>
<p><strong>Relative Frequency:</strong> A frequency distribution shows the number (frequency) of items in each of several non-overlapping classes. We are often interested in the proportion, or percentage, of items in each class. The relative frequency of a class equals the fraction or proportion of items belonging to a class.</p>
<p><strong>Relative frequency makes your data more interpretable.</strong></p>
<p><span class="math display">\[
\text{Relative Frequency of a class} = \frac{\text{Frequency of class}}{n}
\]</span></p>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;">Var1</th>
<th style="text-align: right;">Freq</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">audi</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="even">
<td style="text-align: left;">chevrolet</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="odd">
<td style="text-align: left;">dodge</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="even">
<td style="text-align: left;">ford</td>
<td style="text-align: right;">0.20</td>
</tr>
<tr class="odd">
<td style="text-align: left;">honda</td>
<td style="text-align: right;">0.10</td>
</tr>
<tr class="even">
<td style="text-align: left;">hyundai</td>
<td style="text-align: right;">0.10</td>
</tr>
<tr class="odd">
<td style="text-align: left;">jeep</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="even">
<td style="text-align: left;">land rover</td>
<td style="text-align: right;">0.10</td>
</tr>
<tr class="odd">
<td style="text-align: left;">lincoln</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="even">
<td style="text-align: left;">nissan</td>
<td style="text-align: right;">0.10</td>
</tr>
<tr class="odd">
<td style="text-align: left;">subaru</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="even">
<td style="text-align: left;">toyota</td>
<td style="text-align: right;">0.05</td>
</tr>
<tr class="odd">
<td style="text-align: left;">volkswagen</td>
<td style="text-align: right;">0.05</td>
</tr>
</tbody>
</table>
</div>
</div>
<p>We can say that 20% of the cars in KKM are produced by Ford.</p>
<p><strong>7)</strong> Please consider the data set given below.</p>
<div class="cell">
<div class="cell-output cell-output-stderr">
<pre><code>Warning: package 'vcd' was built under R version 4.1.3</code></pre>
</div>
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: right;">ID</th>
<th style="text-align: left;">Treatment</th>
<th style="text-align: left;">Sex</th>
<th style="text-align: right;">Age</th>
<th style="text-align: left;">Improved</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: right;">57</td>
<td style="text-align: left;">Treated</td>
<td style="text-align: left;">Male</td>
<td style="text-align: right;">27</td>
<td style="text-align: left;">Some</td>
</tr>
<tr class="even">
<td style="text-align: right;">46</td>
<td style="text-align: left;">Treated</td>
<td style="text-align: left;">Male</td>
<td style="text-align: right;">29</td>
<td style="text-align: left;">None</td>
</tr>
<tr class="odd">
<td style="text-align: right;">77</td>
<td style="text-align: left;">Treated</td>
<td style="text-align: left;">Male</td>
<td style="text-align: right;">30</td>
<td style="text-align: left;">None</td>
</tr>
<tr class="even">
<td style="text-align: right;">17</td>
<td style="text-align: left;">Treated</td>
<td style="text-align: left;">Male</td>
<td style="text-align: right;">32</td>
<td style="text-align: left;">Marked</td>
</tr>
<tr class="odd">
<td style="text-align: right;">36</td>
<td style="text-align: left;">Treated</td>
<td style="text-align: left;">Male</td>
<td style="text-align: right;">46</td>
<td style="text-align: left;">Marked</td>
</tr>
<tr class="even">
<td style="text-align: right;">23</td>
<td style="text-align: left;">Treated</td>
<td style="text-align: left;">Male</td>
<td style="text-align: right;">58</td>
<td style="text-align: left;">Marked</td>
</tr>
</tbody>
</table>
</div>
</div>
<p>The variable descriptions are given below.</p>
<ul>
<li><p><strong>ID</strong>: patient ID.</p></li>
<li><p><strong>Treatment</strong>: indicating treatment (Placebo, Treated).</p></li>
<li><p><strong>Sex:</strong> indicating sex (Female, Male).</p></li>
<li><p><strong>Age:</strong> age of patient.</p></li>
<li><p><strong>Improved:</strong> ordered factor indicating treatment outcome (None, Some, Marked).</p></li>
</ul>
<p><strong>a)</strong> Interpret the given output below.</p>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;">Var1</th>
<th style="text-align: right;">Freq</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">Placebo</td>
<td style="text-align: right;">43</td>
</tr>
<tr class="even">
<td style="text-align: left;">Treated</td>
<td style="text-align: right;">41</td>
</tr>
</tbody>
</table>
</div>
</div>
<p><strong>b)</strong> Interpret the given output below.</p>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;">Var1</th>
<th style="text-align: right;">Freq</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">Placebo</td>
<td style="text-align: right;">0.5119048</td>
</tr>
<tr class="even">
<td style="text-align: left;">Treated</td>
<td style="text-align: right;">0.4880952</td>
</tr>
</tbody>
</table>
</div>
</div>
<p><strong>c)</strong> What is the problem with the output above? How to solve it?</p>
<p>In addition to the numerical representations, the visualization of categorical data can help researchers deliver the information in the most efficient way possible. To display a single categorical variable, you can use a</p>
<ul>
<li><p>bar plot</p></li>
<li><p>pie chart</p></li>
<li><p>doughnut chart</p></li>
</ul>
<p><strong>Bar Plot:</strong> A bar plot or bar chart is a graphical device for displaying qualitative data summarized in a frequency, relative frequency, or percentage frequency distribution. On one axis of the chart (usually the horizontal axis), we specify the labels for the classes (categories) of data. A frequency, relative frequency or percentage frequency scale can be used for the other axis of the chart (usually the vertical axis).</p>
<div class="cell">
<div class="cell-output cell-output-stderr">
<pre><code>Warning: package 'ggthemes' was built under R version 4.1.2</code></pre>
</div>
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-7-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p><strong>Comment:</strong> It is seen that Ford has the highest number of cars in the KKM garage. It is followed by Nissan, Land Rover, Hyundai and Honda.</p>
<p><strong>Pie Chart:</strong> It is another way of presenting the relative frequency and percentage frequency distribution of qualitative data. It may be a good option when your categorical variable has only a few levels (for example, Answer is your variable, and it has only yes and no), but it is not generally preferred in the data visualization world.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-8-1.png" class="img-fluid" alt="Pie chart of the participants' answers: 55% yes and 45% no" width="672"></p>
</div>
</div>
<p><strong>Comment:</strong> 55% of the participants say yes, while 45% of them say no.</p>
<p><strong>Doughnut Chart:</strong> A doughnut or donut chart is a ring divided into sectors that each represent a proportion of the whole. It is very similar to a pie chart and thus suffers from the same problem.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-9-1.png" class="img-fluid" alt="Example doughnut chart" width="672"></p>
</div>
</div>
<p><img src="images/dusunen-adam.jpg" class="img-fluid" alt=""></p>
<p>Luckily, it is possible to include more than one categorical variable in the analysis at the same time. If you are displaying two categorical variables, you construct a <strong>contingency table.</strong></p>
<p><strong>Contingency Table:</strong> It is a way of displaying the frequency distribution of two categorical variables. It is a powerful tool for comparing two categorical variables and, especially, for exploring the association between them. Although it is generally used for categorical variables, we are able to apply this approach to discrete, and even continuous, variables.</p>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;"></th>
<th style="text-align: right;">4</th>
<th style="text-align: right;">6</th>
<th style="text-align: right;">8</th>
<th style="text-align: right;">Sum</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">e</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">1</td>
<td style="text-align: right;">1</td>
</tr>
<tr class="even">
<td style="text-align: left;">p</td>
<td style="text-align: right;">1</td>
<td style="text-align: right;">2</td>
<td style="text-align: right;">1</td>
<td style="text-align: right;">4</td>
</tr>
<tr class="odd">
<td style="text-align: left;">r</td>
<td style="text-align: right;">6</td>
<td style="text-align: right;">3</td>
<td style="text-align: right;">6</td>
<td style="text-align: right;">15</td>
</tr>
<tr class="even">
<td style="text-align: left;">Sum</td>
<td style="text-align: right;">7</td>
<td style="text-align: right;">5</td>
<td style="text-align: right;">8</td>
<td style="text-align: right;">20</td>
</tr>
</tbody>
</table>
</div>
</div>
<p>This is a 3x3 contingency table where the rows represent the fuel type of the car and the columns denote the number of cylinders of the 20 cars in KKM garage.</p>
<p><strong>Comment:</strong> We can say that 30% of the cars (6 out of 20) are R fuel type with 8 cylinders. It can also be said that almost one out of three cars is R fuel type with 4 cylinders. Only 8 cylinder cars have all three fuel types, while 4 and 6 cylinder cars have only P and R fuel types.</p>
<p>It is also possible to include more than two categorical variables by displaying their frequencies in the same table; this is called an N-way table, where N represents the number of categorical variables in the table.</p>
<p>You can use the following graphs to illustrate two or more categorical variables visually.</p>
<ul>
<li><p>Clustered or Stacked Bar Chart</p></li>
<li><p>Spine Plot</p></li>
<li><p>Mosaic Plot</p></li>
</ul>
<p><strong>Clustered and Stacked Bar Plot:</strong> Clustered and Stacked barplot display a numeric value for several entities, organised in groups and subgroups.</p>
<p>Let’s visualize the table of fuel type by number of cylinders for the 20 cars in KKM garage using this chart.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-11-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-12-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p><strong>Mosaic Plot:</strong> A mosaic plot is a graphical display of the cell frequencies of a contingency table in which the areas of the boxes of the plot are proportional to the cell frequencies of the contingency table. This procedure can construct mosaic plots for up to four-way tables.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-13-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p><strong>Comment:</strong> The plot shows that most of the cars have automatic transmission. It is seen that the distribution of cylinders is almost equal among manual transmission cars, but the proportion of 8 cylinder cars is slightly greater than the other types among automatic transmission cars.</p>
<p><strong>Spine Plot:</strong> Spine plots are a generalization of stacked bar plots where not the heights but the widths of the bars correspond to the relative frequencies of x. It is a specific type of mosaic plot, but it can be used for <strong>only two variables</strong>. Also, one of the variables here can be either numerical or categorical, while the other has to be categorical.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-14-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p><strong>8)</strong> Consider the data set in Question 7. Please answer the following questions and interpret the outputs, if necessary.</p>
<p><strong>a)</strong> Please classify type of the variables in the data. (Categorical or Numerical, Nominal or Ordinal, Interval or Ratio)</p>
<table class="table">
<colgroup>
<col style="width: 15%">
<col style="width: 18%">
<col style="width: 15%">
<col style="width: 12%">
<col style="width: 12%">
<col style="width: 14%">
<col style="width: 10%">
</colgroup>
<thead>
<tr class="header">
<th>Var. Name</th>
<th>Categorical</th>
<th>Numerical</th>
<th>Nominal</th>
<th>Ordinal</th>
<th>Interval</th>
<th>Ratio</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Treatment</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="even">
<td>Sex</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="odd">
<td>Age</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="even">
<td>Improved</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
<p><strong>b)</strong> Can we calculate the mean of the age variable? Explain your reason.</p>
<p><strong>c)</strong> Assume that the Improved variable is represented by numbers.</p>
<ul>
<li><p>0: None</p></li>
<li><p>1: Some</p></li>
<li><p>2: Marked</p></li>
</ul>
<p>Which statistics can be used to measure the central tendency of this variable?</p>
<p><strong>d)</strong> Please interpret the following output.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-15-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p><strong>e)</strong> The sex column of the data has two attributes, “Female” and “Male”. Would you prefer a pie chart or a donut chart to represent the distribution of the sex column? If you say yes, would you use frequency or relative frequency? If you say no, explain why.</p>
<p><strong>f)</strong> Please interpret the following output.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="Stat-112-Recitation-1_files/figure-html/unnamed-chunk-16-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p><strong>g)</strong> Please interpret the following tables.</p>
<div class="cell">
<div class="cell-output cell-output-stdout">
<pre><code> None Some Marked
Placebo Female 19 7 6
Male 10 0 1
Treated Female 6 5 16
Male 7 2 5</code></pre>
</div>
</div>
<div class="cell">
<div class="cell-output cell-output-stdout">
<pre><code> None Some Marked
Placebo Female 0.23 0.08 0.07
Male 0.12 0.00 0.01
Treated Female 0.07 0.06 0.19
Male 0.08 0.02 0.06</code></pre>
</div>
</div>
<p><img src="images/hadi-oyle-bir-sey-yapalim-kafamiza-gore_897804.jpg" class="img-fluid" width="433"></p>
<p><strong>9)</strong> Please answer the following questions.</p>
<p><strong>9.1)</strong> What is the definition of statistics?</p>
<p>a) Science of collecting, organizing, analyzing, and interpreting data</p>
<p>b) Science of Data</p>
<p>c) Collection</p>
<p>d) Analyzing</p>
<p><strong>9.2)</strong> Descriptive Statistics is useful for</p>
<p>a) comparing the statistical events</p>
<p>b) comparing two different samples</p>
<p>c) describing the quantitative characteristics of variables</p>
<p><strong>9.3)</strong> A ___ is a small portion of the population used to gather data from.</p>
<p>a) Systematic Sampling Method</p>
<p>b) Sample</p>
<p>c) Population</p>
<p>d) Bias</p>
<p><strong>9.4)</strong> What would be the sample in this example?<br>
<em>Surveyors in a mall choose shoppers to ask about products they prefer.</em></p>
<p>a) the surveyors</p>
<p>b) all shoppers in the mall</p>
<p>c) the shoppers who were asked their preferences</p>
<p><strong>9.5)</strong> How old are you?</p>
<p>a) Categorical</p>
<p>b) Numerical</p>
<p><strong>9.6)</strong> In which month were you born?</p>
<p>a) Categorical</p>
<p>b) Numerical</p>
<p><strong>9.7)</strong> The number of orange Skittles in a bag.</p>
<p>a) Discrete</p>
<p>b) Continuous</p>
<p><strong>9.8)</strong> Which level of measurement is used to measure the size of different-sized M&amp;Ms?</p>
<p>a) Nominal</p>
<p>b) Ordinal</p>
<p>c) Interval</p>
<p>d) Ratio</p>
<p><strong>9.9)</strong> What level of measurement is used to measure shoe size?</p>
<p>a) Nominal</p>
<p>b) Ordinal</p>
<p>c) Interval</p>
<p>d) Ratio</p>
<p><strong>9.10)</strong> The chart given below gives the demographic and socioeconomic characteristics of adult smokers in upstate New York in 2006. Use it to determine if the following statements appear to be true. Answer yes or no.</p>
<p>a) A higher proportion of men than of women are current smokers.</p>
<p>b) The longer a person has been out of work, the more likely that person is a smoker.</p>
<p>c) The more education a person has, the more likely that person is to smoke.</p>
<p>d) Ethnicity does not appear to be related to smoking prevalence.</p>
<p><img src="images/g1.png" class="img-fluid" width="388"><img src="images/g2.png" class="img-fluid"></p>
</section>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
// Mirror a stylesheet element's colour mode onto <body>: when its data-mode
// attribute is "dark" the body gets quarto-dark (and loses quarto-light);
// for any other value the body gets quarto-light (and loses quarto-dark).
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const body = window.document.querySelector("body");
  const [classToAdd, classToDrop] = isDark
    ? ["quarto-dark", "quarto-light"]
    : ["quarto-light", "quarto-dark"];
  body.classList.add(classToAdd);
  body.classList.remove(classToDrop);
};
// Apply the body colour class based on the link#quarto-bootstrap element;
// does nothing when that element is not in the document.
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
};
// Sync the body colour class once at startup.
toggleBodyColorPrimary();
// Configure AnchorJS (placement 'right', icon taken from `icon`) and apply
// it to every element matching '.anchored'.
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = { placement: 'right', icon };
anchorJS.add('.anchored');
// Each '.code-copy-button' copies from the element immediately before it.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  target: (trigger) => trigger.previousElementSibling
});
// Tracks, per copy button, an in-progress "Copied!" flash: the title to
// restore afterwards and the pending timer that will restore it.
const copyFlashState = new WeakMap();
// After a successful copy: drop focus, flash the "checked" style and a
// "Copied!" title on the button for one second, then clear the selection.
clipboard.on('success', function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // If a flash is already running (a second copy within the one-second
  // window), reuse the title remembered by the first click instead of
  // re-reading it; otherwise "Copied!" would be captured as the resting
  // title and the button would stay stuck on it.
  const pending = copyFlashState.get(button);
  let restingTitle;
  if (pending) {
    clearTimeout(pending.timer);
    restingTitle = pending.title;
  } else {
    restingTitle = button.getAttribute("title");
  }
  // flash "checked"
  button.classList.add('code-copy-button-checked');
  button.setAttribute("title", "Copied!");
  const timer = setTimeout(function() {
    copyFlashState.delete(button);
    button.setAttribute("title", restingTitle);
    button.classList.remove('code-copy-button-checked');
  }, 1000);
  copyFlashState.set(button, { title: restingTitle, timer: timer });
  // clear code selection
  e.clearSelection();
});
// Create a tippy tooltip on `el` using the 'quarto' theme settings below.
//   el        - element the tooltip is attached to
//   contentFn - passed to tippy unchanged as its `content` option
function tippyHover(el, contentFn) {
  window.tippy(el, {
    allowHTML: true,
    content: contentFn,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the tooltip inside the parent of the element tippy passes in.
    appendTo: (target) => target.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start'
  });
}
// Footnote references: on hover, show the HTML of the element whose id is
// named by the reference's href fragment.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
  const ref = noterefs[i];
  tippyHover(ref, function() {
    // A missing href yields null; fall back to "" so the lookup below
    // simply finds nothing instead of throwing.
    let href = ref.getAttribute('href') || "";
    // Reduce an absolute URL to its fragment; when URL() rejects the value
    // (e.g. a bare "#id"), keep it unchanged.
    try { href = new URL(href).hash; } catch {}
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // The target may not exist in this document: show an empty tooltip
    // rather than failing on null.innerHTML.
    return note ? note.innerHTML : "";
  });
}
// Citation references: on hover, show the bibliography entries (elements
// with id "ref-<key>") for each key listed in the parent's data-cites.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (var i=0; i<bibliorefs.length; i++) {
  const ref = bibliorefs[i];
  const citeAttr = ref.parentNode.getAttribute('data-cites');
  // getAttribute returns null when the parent carries no data-cites;
  // skip this reference instead of throwing on null.split, which would
  // also stop the remaining references from being wired up.
  if (citeAttr === null) {
    continue;
  }
  const cites = citeAttr.split(' ');
  tippyHover(ref, function() {
    var popup = window.document.createElement('div');
    cites.forEach(function(cite) {
      var citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent');
      citeDiv.classList.add('csl-entry');
      var biblioDiv = window.document.getElementById('ref-' + cite);
      // Keys without a matching entry still contribute an empty div.
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    });
    return popup.innerHTML;
  });
}
});
</script>