From d4cf78685f6a6335ad4a3652d7464ef90101c9d5 Mon Sep 17 00:00:00 2001 From: Lokesh Mano Date: Thu, 31 Oct 2024 10:05:08 +0100 Subject: [PATCH] moved data formats to top --- .gitignore | 1 + slide_ggplot2.Rmd | 202 +++++++++++++++++++++------------------------- 2 files changed, 93 insertions(+), 110 deletions(-) diff --git a/.gitignore b/.gitignore index 3f2405b1..4c460046 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ data/.DS* libs/ .DS_Store Marcin/ +Lokesh/ \ No newline at end of file diff --git a/slide_ggplot2.Rmd b/slide_ggplot2.Rmd index fa885b3e..32a1f54d 100644 --- a/slide_ggplot2.Rmd +++ b/slide_ggplot2.Rmd @@ -296,20 +296,104 @@ name: data-format # Data • Format -* Transforming data into long or wide formats +-- + +- Wide format -```{r,comment=""} -iris %>% head(n=4) +```{r, echo=FALSE} +gc <- read.table("data/slide_ggplot2/counts_raw.txt", header = T, row.names = 1, sep = "\t") +kable(gc[c(1:6),c(1:4)]) %>% + kable_styling(bootstrap_options = "striped", full_width = F) %>% + row_spec(1:6, color = "orange") %>% + column_spec(1, color = "red") %>% + row_spec(0, bold = T, color = "blue") ``` -```{r,comment=""} -iris %>% tidyr::pivot_longer(!Species,names_to="variable",values_to="value") %>% - as.data.frame() %>% head(n=5) +-- + +* familiarity +* conveniency +* you see more data + +--- + +name: data-format-2 + +# Data • Format + +- Long format + +-- + + +```{r echo=FALSE} +md <- read.table("data/slide_ggplot2/metadata.csv", header = T, sep = ";") +samples <- colnames(gc[,c(1:4)]) +gc[c(1:6),c(1:4)] %>% + rownames_to_column(var = "Gene") %>% + gather(Sample_ID, count, -Gene) %>% + select(Sample_ID, everything()) %>% + head(6) %>% + kable() %>% + kable_styling("striped", full_width = F) %>% + column_spec(1, color = "blue") %>% + column_spec(2, color = "red")%>% + column_spec(3, color = "orange") ``` -??? +-- + + +```{r echo=FALSE} +md <- read.table("data/slide_ggplot2/metadata.csv", header = T, sep = ";") +samples <- colnames(gc[,c(1:4)]) +gc[c(1:6),c(1:4)] %>% + rownames_to_column(var = "Gene") %>% + gather(Sample_ID, count, -Gene) %>% + full_join(md[c(1:4),], by = "Sample_ID") %>% + select(Sample_ID, everything()) %>% + select(-c(Gene,count), c(Gene,count)) %>% + head(6) %>% + kable() %>% + kable_styling("striped", full_width = F) %>% + column_spec(1:5, color = "blue") %>% + column_spec(6, color = "red")%>% + column_spec(7, color = "orange") +``` + +--- + +name: data-format-3 + +# Data • Format + +- Long format + +```{r echo=FALSE} +md <- read.table("data/slide_ggplot2/metadata.csv", header = T, sep = ";") +samples <- colnames(gc[,c(1:4)]) +gc[c(1:6),c(1:4)] %>% + rownames_to_column(var = "Gene") %>% + gather(Sample_ID, count, -Gene) %>% + full_join(md[c(1:4),], by = "Sample_ID") %>% + select(Sample_ID, everything()) %>% + select(-c(Gene,count), c(Gene,count)) %>% + head(6) %>% + kable() %>% + kable_styling("striped", full_width = F) %>% + column_spec(1:5, color = "blue") %>% + column_spec(6, color = "red")%>% + column_spec(7, color = "orange") +``` -The data must be cleaned up and prepared for plotting. The data must be 'tidy'. Columns must be variables and rows must be observations. The data can then be in wide or long format depending on the variables to be plotted. +-- + +* easier to add data to the existing +* Most databases store and maintain in long-formats due to its efficiency +* R tools **like ggplot** require data in long format. +* Functions available to change between data-formats + * `melt()` from **reshape2** + * `gather()` from **tidyverse** --- name: geom @@ -956,108 +1040,6 @@ class: spaced * Numerous personal blogs, r-bloggers.com etc. ---- - -name: data - -## Data Formats - --- - -- Wide format - -```{r, echo=FALSE} -gc <- read.table("data/slide_ggplot2/counts_raw.txt", header = T, row.names = 1, sep = "\t") -kable(gc[c(1:6),c(1:4)]) %>% - kable_styling(bootstrap_options = "striped", full_width = F) %>% - row_spec(1:6, color = "orange") %>% - column_spec(1, color = "red") %>% - row_spec(0, bold = T, color = "blue") -``` - --- - -* familiarity -* conveniency -* you see more data - ---- - -name: data-2 - -## Data Formats - -- Long format - --- - - -```{r echo=FALSE} -md <- read.table("data/slide_ggplot2/metadata.csv", header = T, sep = ";") -samples <- colnames(gc[,c(1:4)]) -gc[c(1:6),c(1:4)] %>% - rownames_to_column(var = "Gene") %>% - gather(Sample_ID, count, -Gene) %>% - select(Sample_ID, everything()) %>% - head(6) %>% - kable() %>% - kable_styling("striped", full_width = F) %>% - column_spec(1, color = "blue") %>% - column_spec(2, color = "red")%>% - column_spec(3, color = "orange") -``` - --- - - -```{r echo=FALSE} -md <- read.table("data/slide_ggplot2/metadata.csv", header = T, sep = ";") -samples <- colnames(gc[,c(1:4)]) -gc[c(1:6),c(1:4)] %>% - rownames_to_column(var = "Gene") %>% - gather(Sample_ID, count, -Gene) %>% - full_join(md[c(1:4),], by = "Sample_ID") %>% - select(Sample_ID, everything()) %>% - select(-c(Gene,count), c(Gene,count)) %>% - head(6) %>% - kable() %>% - kable_styling("striped", full_width = F) %>% - column_spec(1:5, color = "blue") %>% - column_spec(6, color = "red")%>% - column_spec(7, color = "orange") -``` - ---- - -name: data-3 - -## Data Formats - -- Long format - -```{r echo=FALSE} -md <- read.table("data/slide_ggplot2/metadata.csv", header = T, sep = ";") -samples <- colnames(gc[,c(1:4)]) -gc[c(1:6),c(1:4)] %>% - rownames_to_column(var = "Gene") %>% - gather(Sample_ID, count, -Gene) %>% - full_join(md[c(1:4),], by = "Sample_ID") %>% - select(Sample_ID, everything()) %>% - select(-c(Gene,count), c(Gene,count)) %>% - head(6) %>% - kable() %>% - kable_styling("striped", full_width = F) %>% - column_spec(1:5, color = "blue") %>% - column_spec(6, color = "red")%>% - column_spec(7, color = "orange") -``` - --- - -* easier to add data to the existing -* Most databases store and maintain in long-formats due to its efficiency -* R tools **like ggplot** require data in long format. - ---