-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathvisualization-gallery.Rmd
1646 lines (1326 loc) · 57.7 KB
/
visualization-gallery.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# 可视化之图库 {#chap-gallery}
```{r,echo=FALSE}
if (is.na(Sys.getenv("CI", NA))) {
# 准备 Noto 中英文字体
sysfonts::font_paths(new = "~/Library/Fonts/")
## 宋体
sysfonts::font_add(
family = "Noto Serif CJK SC",
regular = "NotoSerifCJKsc-Regular.otf",
bold = "NotoSerifCJKsc-Bold.otf"
)
## 黑体
sysfonts::font_add(
family = "Noto Sans CJK SC",
regular = "NotoSansCJKsc-Regular.otf",
bold = "NotoSansCJKsc-Bold.otf"
)
sysfonts::font_add(
family = "Noto Serif",
regular = "NotoSerif-Regular.ttf",
bold = "NotoSerif-Bold.ttf",
italic = "NotoSerif-Italic.ttf",
bolditalic = "NotoSerif-BoldItalic.ttf"
)
sysfonts::font_add(
family = "Noto Sans",
regular = "NotoSans-Regular.ttf",
bold = "NotoSans-Bold.ttf",
italic = "NotoSans-Italic.ttf",
bolditalic = "NotoSans-BoldItalic.ttf"
)
} else {
sysfonts::font_paths(new = c(
"/usr/share/fonts/opentype/noto/",
"/usr/share/fonts/truetype/noto/"
))
## 宋体
sysfonts::font_add(
family = "Noto Serif CJK SC",
regular = "NotoSerifCJK-Regular.ttc",
bold = "NotoSerifCJK-Bold.ttc"
)
## 黑体
sysfonts::font_add(
family = "Noto Sans CJK SC",
regular = "NotoSansCJK-Regular.ttc",
bold = "NotoSansCJK-Bold.ttc"
)
sysfonts::font_add(
family = "Noto Serif",
regular = "NotoSerif-Regular.ttf",
bold = "NotoSerif-Bold.ttf",
italic = "NotoSerif-Italic.ttf",
bolditalic = "NotoSerif-BoldItalic.ttf"
)
sysfonts::font_add(
family = "Noto Sans",
regular = "NotoSans-Regular.ttf",
bold = "NotoSans-Bold.ttf",
italic = "NotoSans-Italic.ttf",
bolditalic = "NotoSans-BoldItalic.ttf"
)
}
```
```{r}
library(ggplot2) # ggplot2 图形
library(patchwork) # 图形布局
library(magrittr) # 管道操作
library(ggrepel) # 文本注释
library(extrafont) # 加载外部字体 TTF
library(maps) # 地图数据
library(mapdata) # 地图数据
library(data.table) # 数据操作
library(KernSmooth) # 核平滑
library(ggnormalviolin) # 提琴图
library(ggbeeswarm) # 蜂群图
library(ggridges) # 岭线图
library(ggpubr) # 组合图
library(treemap) # 树状图
library(treemapify) # 树状图
library(ggquiver) # 向量场图
library(ggstream) # 水流图
library(timelineS) # 时间线
library(ggdendro) # 聚类图
library(ggfortify) # 统计分析结果可视化:主成分图
library(gganimate) # 动态图
```
## 饼图 {#sec-ggplot2-pie}
我对饼图是又爱又恨,爱的是它表示百分比的时候,往往让读者联想到蛋糕,份额这类根深蒂固的情景,从而让数字通俗易懂、深入人心,是一种很好的表达方式,恨的也是这一点,我用柱状图表达不香吗?人眼对角度的区分度远不如柱状图呢,特别是当两个类所占的份额比较接近的时候,所以很多时候,除了用饼图表达份额,还会在旁边标上百分比,从数据可视化的角度来说,如图 \@ref(fig:bod-pie) 所示,这是信息冗余!
```{r bod-pie, fig.asp=1, fig.width=5, fig.height=5, fig.cap="饼图"}
BOD %>% transform(., ratio = demand / sum(demand)) %>%
ggplot(., aes(x = "", y = demand, fill = reorder(Time, demand))) +
geom_bar(stat = "identity", show.legend = FALSE, color = "white") +
coord_polar(theta = "y") +
geom_text(aes(x = 1.6, label = paste0(round(ratio, digits = 4) * 100, "%")),
position = position_stack(vjust = 0.5), color = "black"
) +
geom_text(aes(x = 1.2, label = Time),
position = position_stack(vjust = 0.5), color = "black"
) +
theme_void(base_size = 14)
```
`plot_ly(type = "pie", ... )` 和添加图层 `add_pie()` 的效果是一样的
```{r diamond-pie, fig.cap="饼图", eval=knitr::is_html_output()}
dat = aggregate(carat ~ cut, data = diamonds, FUN = length)
plotly::plot_ly() %>%
plotly::add_pie(
data = dat, labels = ~cut, values = ~carat,
name = "简单饼图1", domain = list(row = 0, column = 0)
) %>%
plotly::add_pie(
data = dat, labels = ~cut, values = ~carat, hole = 0.6,
textposition = "inside", textinfo = "label+percent",
name = "简单饼图2", domain = list(row = 0, column = 1)
) %>%
plotly::layout(
title = "多图布局", showlegend = F,
grid = list(rows = 1, columns = 2),
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
) %>%
plotly::config(displayModeBar = FALSE)
```
设置参数 hole 可以绘制环形饼图,比如 hole = 0.6
## 地图 {#sec-ggplot2-map}
USArrests 数据集描述了1973年美国50个州每10万居民中因袭击、抢劫和强奸而逮捕的人,以及城市人口占比。这里的地图是指按照行政区划为边界的示意图,比如图 \@ref(fig:state-crimes)
```{r state-crimes, fig.cap="1975年美国各州犯罪事件", fig.width=8, fig.height=4}
library(maps)
crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests)
# 等价于 crimes %>% tidyr::pivot_longer(Murder:Rape)
vars <- lapply(names(crimes)[-1], function(j) {
data.frame(state = crimes$state, variable = j, value = crimes[[j]])
})
crimes_long <- do.call("rbind", vars)
states_map <- map_data("state")
ggplot(crimes, aes(map_id = state)) +
geom_map(aes(fill = Murder), map = states_map) +
expand_limits(x = states_map$long, y = states_map$lat) +
scale_fill_binned(type = "viridis") +
coord_map() +
theme_minimal()
```
先来看看中国及其周边,见图\@ref(fig:incorrect-map),这个地图的缺陷就是中国南海及九段线没有标记,台湾和中国大陆不是一种颜色标记,这里的地图数据来自 R 包 **maps** 和 **mapdata**,像这样的地图就不宜在国内正式刊物上出现。
```{r incorrect-map, fig.cap="中国及其周边", fig.width=8, fig.height=4}
library(maps)
library(mapdata)
east_asia <- map_data("worldHires",
region = c(
"Japan", "Taiwan", "China",
"North Korea", "South Korea"
)
)
ggplot(east_asia, aes(x = long, y = lat, group = group, fill = region)) +
geom_polygon(colour = "black") +
scale_fill_brewer(palette = "Set2") +
coord_map() +
theme_minimal()
```
绘制真正的地图需要考虑投影坐标系,观察角度、分辨率、政策法规等一系列因素,它是一种复杂的图形,如图 \@ref(fig:draw-map) 所示。
```{r draw-map,fig.cap="画地图的正确姿势",fig.width=4,fig.height=4,out.width="45%",fig.show='hold',fig.ncol=2,fig.subcap=c("墨卡托投影", "北极观察", "正交投影", "正交投影北极观察"),collapse=TRUE}
worldmap <- map_data("world")
# 默认 mercator 投影下的默认视角 c(90, 0, mean(range(x)))
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map(
xlim = c(-120, 40), ylim = c(30, 90)
)
# 换观察角度
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map(
xlim = c(-120, 40), ylim = c(30, 90),
orientation = c(90, 0, 0)
)
# 换投影坐标系
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map("ortho",
xlim = c(-120, 40), ylim = c(30, 90)
)
# 二者皆换
ggplot(worldmap, aes(long, lat, group = group)) +
geom_polygon(aes(fill = region), show.legend = FALSE) +
coord_map("ortho",
xlim = c(-120, 40), ylim = c(30, 90),
orientation = c(90, 0, 0)
)
```
## 热图 {#sec-ggplot2-heatmap}
<!-- [heatmap3](https://cran.r-project.org/package=heatmap3) 包提供兼容 Base R 的 heatmap() 函数 -->
Zuguang Gu 开发的 [ComplexHeatmap](https://github.com/jokergoo/ComplexHeatmap) 包实现复杂数据的可视化,用以发现关联数据集之间的模式。特别地,比如基因数据、生存数据等,更多应用见开发者的书籍 [ComplexHeatmap 完全手册](https://jokergoo.github.io/ComplexHeatmap-reference/book/) 。 R 包发布在 Bioconductor 上 <https://www.bioconductor.org/packages/ComplexHeatmap>。使用之前我要确保已经安装 **BiocManager** 包,这个包负责管理 Bioconductor 上所有的包,需要先安装它,然后安装 **ComplexHeatmap** 包 [@Gu_2016_heatmap]。
```{r, eval=!require("ComplexHeatmap")}
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")
```
## 散点图 {#ggplot2-scatter}
下面以 diamonds 数据集为例展示 ggplot2 的绘图过程,首先加载 diamonds 数据集,查看数据集的内容
```{r}
data(diamonds)
str(diamonds)
```
数值型变量 carat 作为 x 轴
```{r diamonds-axis}
#| fig.subcap=c("指定 x 轴","数值变量 price 作为纵轴","有序分类变量 cut 指定颜色","指定统一颜色"),
#| fig.cap="绘图过程",
#| out.width="35%",
#| fig.ncol=2,
#| fig.width=2
ggplot(diamonds, aes(x = carat))
ggplot(diamonds, aes(x = carat, y = price))
ggplot(diamonds, aes(x = carat, color = cut))
ggplot(diamonds, aes(x = carat), color = "steelblue")
```
图 \@ref(fig:diamonds-axis) 的基础上添加数据图层
```{r scatter,fig.cap="添加数据图层"}
sub_diamonds <- diamonds[sample(1:nrow(diamonds), 1000), ]
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point()
```
给散点图\@ref(fig:scatter)上色
```{r scatter-color-1,fig.cap="散点图配色"}
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point(color = "steelblue")
```
```{r scatter-scale-1,fig.cap="格式化坐标轴刻度标签"}
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point(color = "steelblue") +
scale_y_continuous(
labels = scales::unit_format(unit = "k", scale = 1e-3),
breaks = seq(0, 20000, 4000)
)
```
让另一变量 cut 作为颜色分类指标
```{r scatter-color-2,fig.cap="分类散点图"}
ggplot(sub_diamonds, aes(x = carat, y = price, color = cut)) +
geom_point()
```
当然还有一种类似的表示就是分组,默认情况下,ggplot2将所有观测点视为一组,以分类变量 cut 来分组
```{r scatter-group,fig.cap="分组"}
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point()
```
在图\@ref(fig:scatter-group) 上没有体现出来分组的意思,下面以 cut 分组线性回归为例
```{r group-lm,fig.cap="分组线性回归",fig.ncol=1}
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point() +
geom_smooth(method = "lm")
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "lm")
```
我们当然可以选择更加合适的拟合方式,如局部多项式平滑 `loess` 但是该方法不太适用观测值比较多的情况,因为它会占用比较多的内存,建议使用广义可加模型作平滑拟合
```{r,fig.cap="局部多项式平滑"}
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "loess")
```
```{r group-gam,fig.cap="数据分组应用广义可加平滑"}
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))
```
[ggfortify](https://github.com/sinhrks/ggfortify) 包支持更多的统计分析结果的可视化。
为了更好地区分开组别,我们在图\@ref(fig:group-gam)的基础上分面或者配色
```{r group-facet,fig.cap=c("分组分面","分组配色"),fig.ncol=1}
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
facet_grid(~cut)
ggplot(sub_diamonds, aes(x = carat, y = price, group = cut, color = cut)) +
geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))
```
在分类散点图的另一种表示方法就是分面图,以 cut 变量作为分面的依据
```{r scatter-facet,fig.cap="分面散点图"}
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point() +
facet_grid(~cut)
```
给图 \@ref(fig:scatter-facet) 上色
```{r scatter-facet-color-1,fig.cap="给分面散点图上色"}
ggplot(sub_diamonds, aes(x = carat, y = price)) +
geom_point(color = "steelblue") +
facet_grid(~cut)
```
在图\@ref(fig:scatter-facet-color-1)的基础上,给不同的类上不同的颜色
```{r scatter-facet-color-2,fig.cap="给不同的类上不同的颜色"}
ggplot(sub_diamonds, aes(x = carat, y = price, color = cut)) +
geom_point() +
facet_grid(~cut)
```
去掉图例,此时图例属于冗余信息了
```{r scatter-facet-color-3,fig.cap="去掉图例"}
ggplot(sub_diamonds, aes(x = carat, y = price, color = cut)) +
geom_point(show.legend = FALSE) +
facet_grid(~cut)
```
四块土地,所施肥料不同,肥力大小顺序 4 < 2 < 3 < 1 小麦产量随肥力的变化
```{r,fig.cap="多个图例"}
data(Wheat2, package = "nlme") # Wheat Yield Trials
library(colorspace)
ggplot(Wheat2, aes(longitude, latitude)) +
geom_point(aes(size = yield, colour = Block)) +
scale_color_discrete_sequential(palette = "Viridis") +
scale_x_continuous(breaks = seq(0, 30, 5)) +
scale_y_continuous(breaks = seq(0, 50, 10))
```
```{r category-ggplot,fig.cap="分类散点图"}
ggplot(mtcars, aes(x = hp, y = mpg, color = factor(am))) +
geom_point()
```
图层、分组、分面和散点图介绍完了,接下来就是其它统计图形,如箱线图,小提琴图和条形图
```{r,fig.cap="1948年至1960年航班乘客人数变化"}
dat <- as.data.frame(cbind(rep(1948 + seq(12), each = 12), rep(seq(12), 12), AirPassengers))
colnames(dat) <- c("year", "month", "passengers")
ggplot(data = dat, aes(x = as.factor(year), y = as.factor(month))) +
stat_sum(aes(size = passengers), colour = "lightblue") +
scale_size(range = c(1, 10), breaks = seq(100, 650, 50)) +
labs(x = "Year", y = "Month", colour = "Passengers") +
theme_minimal()
```
## 条形图 {#sec-ggplot2-barplot}
条形图特别适合分类变量的展示,我们这里展示钻石切割质量 cut 不同等级的数量,当然我们可以直接展示各类的数目,在图层 `geom_bar` 中指定 `stat="identity"`
```{r}
# 需要映射数据框的两个变量,相当于自己先计算了每类的数量
with(diamonds, table(cut))
cut_df <- as.data.frame(table(diamonds$cut))
ggplot(cut_df, aes(x = Var1, y = Freq)) + geom_bar(stat = "identity")
```
```{r diamonds-barplot-1,fig.cap="频数条形图"}
ggplot(diamonds, aes(x = cut)) + geom_bar()
```
还有另外三种表示方法
```{r}
ggplot(diamonds, aes(x = cut)) + geom_bar(stat = "count")
ggplot(diamonds, aes(x = cut, y = ..count..)) + geom_bar()
ggplot(diamonds, aes(x = cut, y = stat(count))) + geom_bar()
```
我们还可以在图 \@ref(fig:diamonds-barplot-1) 的基础上再添加一个分类变量钻石的纯净度 clarity,形成堆积条形图
```{r diamonds-barplot-2,fig.cap="堆积条形图"}
ggplot(diamonds, aes(x = cut, fill = clarity)) + geom_bar()
```
再添加一个分类变量钻石颜色 color 比较好的做法是分面
```{r diamonds-barplot-3,fig.cap="分面堆积条形图"}
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar() +
facet_grid(~cut)
```
实际上,绘制图\@ref(fig:diamonds-barplot-3)包含了对分类变量的分组计数过程,如下
```{r}
with(diamonds, table(cut, color))
```
还有一种堆积的方法是按比例,而不是按数量,如图\@ref(fig:diamonds-barplot-4)
```{r diamonds-barplot-4,fig.cap="比例堆积条形图"}
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar(position = "fill") +
facet_grid(~cut)
```
接下来就是复合条形图
```{r diamonds-barplot-5,fig.cap="复合条形图"}
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar(position = "dodge")
```
再添加一个分类变量,就是需要分面大法了,图 \@ref(fig:diamonds-barplot-5) 展示了三个分类变量,其实我们还可以再添加一个分类变量用作分面的列依据
```{r diamonds-barplot-6,fig.cap="分面复合条形图"}
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar(position = "dodge") +
facet_grid(rows = vars(cut))
```
图 \@ref(fig:diamonds-barplot-6) 展示的数据如下
```{r}
with(diamonds, table(color, clarity, cut))
```
```{r barplot-1,fig.cap="条形图的四种常见形态"}
# 漫谈条形图 https://cosx.org/2017/10/discussion-about-bar-graph
set.seed(2020)
dat <- data.frame(
age = rep(1:30, 2),
gender = rep(c("man", "woman"), each = 30),
num = sample(x = 1:100, size = 60, replace = T)
)
# 重叠
p1 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "identity", alpha = 0.5)
# 堆积
p2 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "stack")
# 双柱
p3 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "dodge")
# 百分比
p4 <- ggplot(data = dat, aes(x = age, y = num, fill = gender)) +
geom_col(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "%")
(p1 + p2) / (p3 + p4)
```
以数据集 diamonds 为例,按照纯净度 clarity 和切工 cut 分组统计钻石的数量,再按切工分组统计不同纯净度的钻石数量占比,如表 \@ref(tab:diamonds-table) 所示
```{r diamonds-table}
library(data.table)
diamonds <- as.data.table(diamonds)
dat <- diamonds[, .(cnt = .N), by = .(cut, clarity)] %>%
.[, pct := cnt / sum(cnt), by = .(cut)] %>%
.[, pct_pp := paste0(cnt, " (", scales::percent(pct, accuracy = 0.01), ")") ]
# 分组计数 with(diamonds, table(clarity, cut))
dcast(dat, formula = clarity ~ cut, value.var = "pct_pp") %>%
knitr::kable(align = "crrrrr", caption = "数值和比例组合呈现")
```
分别以堆积条形图和百分比堆积条形图展示,添加注释到条形图上,见 \@ref(fig:barplot-2)
```{r barplot-2,fig.cap="添加注释到条形图",fig.height=8,fig.width=8}
p1 = ggplot(data = dat, aes(x = cut, y = cnt, fill = clarity)) +
geom_col(position = "dodge") +
geom_text(aes(label = cnt), position = position_dodge(1), vjust = -0.5) +
geom_text(aes(label = scales::percent(pct, accuracy = 0.1)),
position = position_dodge(1), vjust = 1, hjust = 0.5
) +
scale_fill_brewer(palette = "Spectral") +
labs(fill = "clarity", y = "", x = "cut") +
theme_minimal() +
theme(legend.position = "top")
p2 = ggplot(data = dat, aes(y = cut, x = cnt, fill = clarity)) +
geom_col(position = "fill") +
geom_text(aes(label = cnt), position = position_fill(1), vjust = -0.5) +
geom_text(aes(label = scales::percent(pct, accuracy = 0.1)),
position = position_fill(1), vjust = 1, hjust = 0.5
) +
scale_fill_brewer(palette = "Spectral") +
scale_x_continuous(labels = scales::percent) +
labs(fill = "clarity", y = "", x = "cut") +
theme_minimal() +
theme(legend.position = "top")
p1 / p2
```
借助 plotly 制作相应的动态百分比堆积条形图
```{r barplot-3, eval=knitr::is_html_output(), fig.cap="百分比堆积条形图", warning=FALSE}
ggplot(data = diamonds, aes(x = cut, fill = clarity)) +
geom_bar(position = "dodge2") +
scale_fill_brewer(palette = "Spectral")
# 百分比堆积条形图
plotly::plot_ly(dat,
x = ~cut, color = ~clarity, y = ~pct,
colors = "Spectral", type = "bar",
text = ~ paste0(
cnt, "颗 <br>",
"占比:", scales::percent(pct, accuracy = 0.1), "<br>"
),
hoverinfo = "text"
) %>%
plotly::layout(
barmode = "stack",
yaxis = list(tickformat = ".0%")
) %>%
plotly::config(displayModeBar = FALSE)
# `type = "histogram"` 以 cut 和 clarity 分组计数
plotly::plot_ly(diamonds,
x = ~cut, color = ~clarity,
colors = "Spectral", type = "histogram"
) %>%
plotly::config(displayModeBar = FALSE)
# 堆积图
plotly::plot_ly(diamonds,
x = ~cut, color = ~clarity,
colors = "Spectral", type = "histogram"
) %>%
plotly::layout(
barmode = "stack",
yaxis = list(title = "cnt"),
legend = list(title = list(text = "clarity"))
) %>%
plotly::config(displayModeBar = FALSE)
```
## 直方图 {#ggplot2-histogram}
直方图用来查看连续变量的分布
```{r,fig.cap="钻石价格的分布"}
ggplot(diamonds, aes(price)) + geom_histogram(bins = 30)
```
堆积直方图
```{r,fig.cap="钻石价格随切割质量的分布"}
ggplot(diamonds, aes(x = price, fill = cut)) + geom_histogram(bins = 30)
```
基础 R 包与 Ggplot2 包绘制的直方图的对比,Base R 绘图速度快,代码更加稳定,Ggplot2 代码简洁,更美观
```{r base-vs-ggplot2-hist, fig.width=4,fig.height=3, out.width="45%", fig.ncol=2, fig.cap="直方图", fig.subcap=c("Base R 直方图","Ggplot2 直方图")}
par(mar = c(2.1, 2.1, 1.5, 0.5))
plot(c(50, 350), c(0, 10),
type = "n", font.main = 1,
xlab = "", ylab = "", frame.plot = FALSE, axes = FALSE,
# xlab = "hp", ylab = "Frequency",
main = paste("Histogram with Base R", paste(rep(" ", 60), collapse = ""))
)
axis(
side = 1, at = seq(50, 350, 50), labels = seq(50, 350, 50),
tick = FALSE, las = 1, padj = 0, mgp = c(3, 0.1, 0)
)
axis(
side = 2, at = seq(0, 10, 2), labels = seq(0, 10, 2),
# col = "white", 坐标轴的颜色
# col.ticks 刻度线的颜色
tick = FALSE, # 取消刻度线
las = 1, # 水平方向
hadj = 1, # 右侧对齐
mgp = c(3, 0.1, 0) # 纵轴边距线设置为 0.1
)
abline(h = seq(0, 10, 2), v = seq(50, 350, 50), col = "gray90", lty = "solid")
abline(h = seq(1, 9, 2), v = seq(75, 325, 50), col = "gray95", lty = "solid")
hist(mtcars$hp,
col = "#56B4E9", border = "white",
freq = TRUE, add = TRUE
# labels = TRUE, axes = TRUE, ylim = c(0, 10.5),
# xlab = "hp",main = "Histogram with Base R"
)
mtext("hp", 1, line = 1.0)
mtext("Frequency", 2, line = 1.0)
ggplot(mtcars) +
geom_histogram(aes(x = hp), fill = "#56B4E9", color = "white", breaks = seq(50, 350, 50)) +
scale_x_continuous(breaks = seq(50, 350, 50)) +
scale_y_continuous(breaks = seq(0, 12, 2)) +
labs(x = "hp", y = "Frequency", title = "Histogram with Ggplot2") +
theme_minimal(base_size = 12)
```
## 箱线图 {#ggplot2-boxplot}
以 PlantGrowth 数据集为例展示箱线图,在两组不同实验条件下,植物生长的情况,纵坐标是干燥植物的量,横坐标表示不同的实验条件。这是非常典型的适合用箱线图来表达数据的场合,Y 轴对应数值型变量,X 轴对应分类变量,在 R 语言中,分类变量的类型是 factor
```{r}
data("PlantGrowth")
str(PlantGrowth)
```
```{r PlantGrowth-boxplot}
ggplot(data = PlantGrowth, aes(x = group, y = weight)) + geom_boxplot()
```
PlantGrowth 数据量比较小,此时比较适合采用抖动散点图,抖动是为了避免点之间相互重叠,为了增加不同类别之间的识别性,我们可以用不同的点的形状或者不同的颜色来表示类别
```{r PlantGrowth-jitter}
ggplot(data = PlantGrowth, aes(x = group, y = weight, shape = group)) + geom_jitter()
ggplot(data = PlantGrowth, aes(x = group, y = weight, color = group)) + geom_jitter()
```
```{r,fig.asp=0.8}
boxplot(weight ~ group,
data = PlantGrowth,
ylab = "Dried weight of plants", col = "lightgray",
notch = FALSE, varwidth = TRUE
)
```
以钻石切割质量 cut 为分面依据,以钻石颜色类别 color 为 x 轴,钻石价格为 y 轴,绘制箱线图\@ref(fig:boxplot-facet-color)
```{r boxplot-facet-color,fig.cap="箱线图"}
ggplot(diamonds, aes(x = color, y = price, color = cut)) +
geom_boxplot(show.legend = FALSE) +
facet_grid(~cut)
```
我们当然还可以添加钻石的纯净度 clarity 作为分面依据,那么箱线图可以为图 \@ref(fig:boxplot-facet-color-clarity-1)
```{r boxplot-facet-color-clarity-1,fig.cap="复合分面箱线图"}
ggplot(diamonds, aes(x = color, y = price, color = cut)) +
geom_boxplot(show.legend = FALSE) +
facet_grid(clarity ~ cut)
```
经过观察,我们发现水平分类过多,考虑用切割质量 cut 替换钻石颜色 color 绘图,但是由于分类过细,图信息展示不简练,反而不好,如图 \@ref(fig:boxplot-facet-color-clarity-2)
```{r boxplot-facet-color-clarity-2,fig.cap="箱线图配色",fig.subcap=c("切割质量cut上色","钻石颜色配色"),fig.ncol=1}
ggplot(diamonds, aes(x = cut, y = price, color = cut)) +
geom_boxplot(show.legend = FALSE) +
facet_grid(clarity ~ color)
ggplot(diamonds, aes(x = cut, y = price, color = color)) +
geom_boxplot(show.legend = FALSE) +
facet_grid(clarity ~ color)
```
## 函数图 {#sec-ggplot2-function}
蝴蝶图的参数方程如下
\begin{align}
x &= \sin t \big(\mathrm e^{\cos t} - 2 \cos 4t + \sin^5(\frac{t}{12})\big) \\
y &= \cos t \big(\mathrm e^{\cos t} - 2 \cos 4t + \sin^5(\frac{t}{12})\big), t \in [- \pi, \pi]
\end{align}
## 密度图 {#sec-ggplot2-density}
```{r mpg-cyl-density,fig.cap="按汽缸数分组的城市里程"}
ggplot(mpg, aes(cty)) +
geom_density(aes(fill = factor(cyl)), alpha = 0.8) +
labs(
title = "Density plot",
subtitle = "City Mileage Grouped by Number of cylinders",
caption = "Source: mpg",
x = "City Mileage",
fill = "# Cylinders"
)
```
添加透明度,解决遮挡
```{r density,fig.cap=c("密度图","添加透明度的密度图"),fig.ncol=1}
ggplot(diamonds, aes(x = price, fill = cut)) + geom_density()
ggplot(diamonds, aes(x = price, fill = cut)) + geom_density(alpha = 0.5)
```
堆积密度图
```{r stack-density,fig.cap="堆积密度图"}
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_density(position = "stack")
```
条件密度估计
```{r,fig.cap="条件密度估计图"}
# You can use position="fill" to produce a conditional density estimate
ggplot(diamonds, aes(carat, stat(count), fill = cut)) +
geom_density(position = "fill")
```
岭线图是密度图的一种变体,可以防止密度曲线重叠在一起
```{r}
ggplot(diamonds) +
ggridges::geom_density_ridges(aes(x = price, y = color, fill = color))
```
二维的密度图又是一种延伸
```{r}
ggplot(diamonds, aes(x = carat, y = price)) +
geom_density_2d(aes(color = cut)) +
facet_grid(~cut)
```
`stat` 函数,特别是 nlevel 参数,在密度曲线之间填充我们又可以得到热力图
```{r}
ggplot(diamonds, aes(x = carat, y = price)) +
stat_density_2d(aes(fill = stat(nlevel)), geom = "polygon") +
facet_grid(. ~ cut)
```
`gemo_hex` 也是二维密度图的一种变体,特别适合数据量比较大的情形
```{r}
ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() +
scale_fill_viridis_c()
```
[heatmaps in ggplot2](https://themockup.blog/posts/2020-08-28-heatmaps-in-ggplot2/) 二维密度图
```{r density-2d,fig.cap="二维密度图",fig.width=4,fig.height=3,out.width="45%",fig.show='hold',fig.ncol=2,fig.subcap=c("默认调色板","viridis 调色板")}
ggplot(faithful, aes(x = eruptions, y = waiting)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon") +
xlim(1, 6) +
ylim(40, 100)
ggplot(faithful, aes(x = eruptions, y = waiting)) +
stat_density2d(aes(fill = stat(level)), geom = "polygon") +
scale_fill_viridis_c(option = "viridis") +
xlim(1, 6) +
ylim(40, 100)
```
::: {.rmdtip data-latex="{提示}"}
`MASS::kde2d()` 实现二维核密度估计,**ggplot2** 包提供了两种等价的绘图方式
1. `stat_density_2d()` 和 `..`
1. `stat_density2d()` 和 `stat()`
:::
```{r histogram,eval=knitr::is_html_output(),fig.cap="二维直方图/密度图/轮廓图"}
plotly::plot_ly(
data = faithful, x = ~eruptions,
y = ~waiting, type = "histogram2dcontour"
) %>%
plotly::config(displayModeBar = FALSE)
# plot_ly(faithful, x = ~waiting, y = ~eruptions) %>%
# add_histogram2d() %>%
# add_histogram2dcontour()
```
延伸一下,热力图
```{r, eval=knitr::is_html_output()}
library(KernSmooth)
den <- bkde2D(x = faithful, bandwidth = c(0.7, 7))
# 热力图
p1 <- plotly::plot_ly(x = den$x1, y = den$x2, z = den$fhat) %>%
plotly::config(displayModeBar = FALSE) %>%
plotly::add_heatmap()
# 等高线图
p2 <- plotly::plot_ly(x = den$x1, y = den$x2, z = den$fhat) %>%
plotly::config(displayModeBar = FALSE) %>%
plotly::add_contour()
htmltools::tagList(p1, p2)
```
## 提琴图 {#sec-ggplot2-violin}
2004 年 Daniel Adler 开发 [vioplot](https://github.com/TomKellyGenetics/vioplot) 包实现提琴图的绘制,它可能是最早实现此功能的 R 包,随后10余年没有更新却一直坚挺在 CRAN 上,非常难得,好在 Thomas Kelly 已经接手维护。另一款绘制提琴图的 R 包是 Peter Kampstra 开发的 [beanplot](https://cran.r-project.org/package=beanplot) [@beanplot_2008_jss],也存在很多年了,不过随着时间的变迁,比较现代的方式是 **ggplot2** 带来的 `geom_violin()` 扔掉了很多依赖,也是各种图形的汇集地,可以看作是最佳实践。提琴图比起箱线图优势在于呈现更多的分布信息,其次在于更加美观,但是就目前来说箱线图的受众比提琴图要多很多,毕竟前者是包含更多统计信息,如图\@ref(fig:boxplot-violin) 所示。
```{r boxplot-violin,fig.cap="几种不同的提琴图",fig.width=4,fig.height=4,out.width="45%",fig.show='hold',fig.ncol=2,fig.subcap=c("简单箱线图", "vioplot 绘制的提琴图", "ggplot2 绘制的提琴图", "beanplot 绘制的提琴图"),collapse=TRUE}
boxplot(count ~ spray, data = InsectSprays)
vioplot::vioplot(count ~ spray, data = InsectSprays, col = "lightgray")
ggplot(InsectSprays, aes(x = spray, y = count)) +
geom_violin(fill = "lightgray") +
theme_minimal()
beanplot::beanplot(count ~ spray, data = InsectSprays, col = "lightgray")
```
[ggnormalviolin](https://github.com/wjschne/ggnormalviolin) 包在给定均值和标准差的情况下,绘制正态分布的概率密度曲线,如图 \@ref(fig:normal-violin) 所示。
```{r normal-violin,fig.cap="正态分布的概率密度曲线",fig.width=6,fig.height=4}
library(ggnormalviolin)
with(
aggregate(
data = iris, Sepal.Length ~ Species,
FUN = function(x) c(dist_mean = mean(x), dist_sd = sd(x))
),
cbind.data.frame(Sepal.Length, Species)
) %>%
ggplot(aes(x = Species, mu = dist_mean, sigma = dist_sd, fill = Species)) +
geom_normalviolin() +
theme_minimal()
```
## 抖动图 {#ggplot2-jitter}
抖动图适合数据量比较小的情况
```{r}
ggplot(mpg, aes(x = class, y = hwy, color = class)) + geom_jitter()
```
抖不抖,还是抖一下
```{r}
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
geom_point(aes(fill = Species), size = 5, shape = 21, colour = "grey20") +
# geom_boxplot(outlier.colour = NA, fill = NA, colour = "grey20") +
labs(title = "Not Jittered")
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
geom_point(aes(fill = Species),
size = 5, shape = 21, colour = "grey20",
position = position_jitter(width = 0.2, height = 0.1)
) +
# geom_boxplot(outlier.colour = NA, fill = NA, colour = "grey20") +
labs(title = "Jittered")
```
在数据量比较大的时候,可以用箱线图、密度图、提琴图
```{r,fig.cap="抖动图的反例"}
ggplot(sub_diamonds, aes(x = cut, y = price)) + geom_jitter()
```
上色和分面都不好使的抖动图,因为区分度变小
```{r,fig.cap="根据钻石颜色上色",fig.asp=1}
ggplot(sub_diamonds, aes(x = color, y = price, color = color)) +
geom_jitter() +
facet_grid(clarity ~ cut)
```
箱线图此时不宜分的过细
```{r boxplot-facet-cut-clarity,fig.cap="箱线图",fig.asp=1}
ggplot(diamonds, aes(x = color, y = price, color = color)) +
geom_boxplot() +
facet_grid(cut ~ clarity)
```
所以这样更好,先按纯净度分面,再对比不同的颜色,钻石价格的差异
```{r boxplot-facet-clarity,fig.cap="钻石按纯净度分面",fig.asp=1}
ggplot(diamonds, aes(x = color, y = price, color = color)) +
geom_boxplot() +
facet_grid(~clarity)
```
最好只比较一个维度,不同颜色钻石的价格对比
```{r boxplot-color,fig.cap="不同颜色钻石的价格比较"}
ggplot(diamonds, aes(x = color, y = price, color = color)) +
geom_boxplot()
```
设置随机数种子,抖动图是可重复的。
```{r}
ggplot(iris, aes(x = Species, y = Sepal.Width, color = Species)) +
geom_boxplot(width = 0.65) +
geom_point(position = position_jitter(seed = 37, width = 0.25))
```
## 蜂群图 {#sec-ggplot2-beeswarm}
在样本点有限的情况下,用蜜蜂图代替普通的抖动图,可视化效果会好很多,如图 \@ref(fig:beeswarm) 所示。Erik Clarke 开发的 [ggbeeswarm](https://github.com/eclarke/ggbeeswarm) 包可以将随机抖动的散点图朝着比较规律的方向聚合,又不丢失数据本身的准确性。
```{r beeswarm,fig.cap="蜜蜂图可视化效果比抖动图好",fig.width=8,fig.height=4}
library(ggbeeswarm)
p1 <- ggplot(iris, aes(Species, Sepal.Length)) +
geom_jitter() +
theme_minimal()
p2 <- ggplot(iris, aes(Species, Sepal.Length)) +
geom_quasirandom() +
theme_minimal()
p1 + p2
```
## 玫瑰图 {#ggplot2-rose}
南丁格尔风玫瑰图[^nightingale-rose] 可以作为堆积条形图,分组条形图
```{r stack-to-rose,fig.cap="堆积条形图转风玫瑰图"}
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar()
ggplot(diamonds, aes(x = color, fill = clarity)) +
geom_bar() +
coord_polar()
```
```{r wind-rose,fig.cap="风玫瑰图"}
# 风玫瑰图 http://blog.csdn.net/Bone_ACE/article/details/47624987
set.seed(2018)
# 随机生成100次风向,并汇集到16个区间内
direction <- cut_interval(runif(100, 0, 360), n = 16)
# 随机生成100次风速,并划分成4种强度
mag <- cut_interval(rgamma(100, 15), 4)
dat <- data.frame(direction = direction, mag = mag)
# 将风向映射到X轴,频数映射到Y轴,风速大小映射到填充色,生成条形图后再转为极坐标形式即可
p <- ggplot(dat, aes(x = direction, y = ..count.., fill = mag))
p + geom_bar(colour = "white") +
coord_polar() +
theme(axis.ticks = element_blank(), axis.text.y = element_blank()) +
labs(x = "", y = "", fill = "Magnitude")
```
```{r}
p + geom_bar(position = "fill") +
coord_polar() +
theme(axis.ticks = element_blank(), axis.text.y = element_blank()) +
labs(x = "", y = "", fill = "Magnitude")
```
[^nightingale-rose]: https://mbostock.github.io/protovis/ex/crimea-rose-full.html
## 瓦片图 {#sec-ggplot2-tile}
```{r geom-tile,fig.cap="1949-1960年国际航线乘客数量的月度趋势",fig.showtext=TRUE,fig.width=8,fig.height=4}
p1 <- expand.grid(months = month.abb, years = 1949:1960) %>%
transform(num = as.vector(AirPassengers)) %>%
ggplot(aes(x = years, y = months, fill = num)) +
scale_fill_continuous(type = "viridis") +
geom_tile(color = "white", size = 0.4) +
scale_x_continuous(
expand = c(0.01, 0.01),
breaks = seq(1949, 1960, by = 1), labels = 1949:1960
) +
theme_minimal(base_size = 10.54, base_family = "Noto Serif CJK SC") +
theme(legend.position = "top") +
labs(x = "年", y = "月", fill = "人数")
p2 <- expand.grid(months = month.abb, years = 1949:1960) %>%
transform(num = as.vector(AirPassengers)) %>%
ggplot(aes(x = years, y = months, color = num)) +
geom_point(pch = 15, size = 8) +
scale_color_distiller(palette = "Spectral") +
scale_x_continuous(
expand = c(0.01, 0.01),
breaks = seq(1949, 1960, by = 1), labels = 1949:1960
) +
theme_minimal(base_size = 10.54, base_family = "Noto Serif CJK SC") +
theme(legend.position = "top") +
labs(x = "年", y = "月", color = "人数")
p1 + p2
```