-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_deseq_dfs.R
109 lines (95 loc) · 3.98 KB
/
make_deseq_dfs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#this function takes the total rna tables produced by featureCounts, and gives a reasonable output data frame and
#metadata frame
make_deseq_dfs = function(total_table,
grep_pattern = "",
leave_out = "",
base_grep = "",
contrast_grep = ""){
if(grep_pattern == ""){
grep_pattern = paste0(colnames(total_table[,2:length(total_table)]),collapse = "|")
}
#grep pattern is being used to select small parts of this overall
total_table = as.data.table(total_table, keep.rownames = TRUE)
if('gene_name' %in% colnames(total_table)){
total_table$gene_name = NULL
}
if(leave_out == ""){
conv_df = as.data.frame(total_table[,round(.SD),
.SDcols = grep(grep_pattern,names(total_table))])
}else{
conv_df = as.data.frame(total_table[,round(.SD),
.SDcols = grep(grep_pattern,names(total_table))])
drop_col = paste0(leave_out_sample,".aligned.sorted.out.bam")
print(paste("Dropping column:",drop_col ))
conv_df = conv_df[ , !(names(conv_df) %in% drop_col)]
}
if("geneid" %in% names(total_table)){
rownames(conv_df) = total_table$geneid
}else if("Geneid" %in% names(total_table)){
rownames(conv_df) = total_table$Geneid
}else if("rn" %in% names(total_table)){
rownames(conv_df) = total_table$rn
}
else{
rownames(conv_df) = total_table$gene
}
coldata = as.data.table(names(conv_df))
if(base_grep == "" & contrast_grep == ""){
stop("Where are your greps gurl?")
}else if(base_grep != ""){
coldata[grep(base_grep,V1), cond := "base"]
}else if(contrast_grep != ""){
coldata[grep(contrast_grep,V1), cond := "contrast"]
}
coldata[is.na(cond), cond := "contrast"]
coldata = as.data.frame(coldata[,2:ncol(coldata)])
rownames(coldata) = names(conv_df)
morphed = list(conv_df,coldata)
names(morphed) = c("conv_df","coldata")
print(names(conv_df))
return(morphed)
}
# this function takes a deseq metadata df, the name you want to call the baseline and contrast conditions and returns the metadata table with
# a new column called 'comparison' which is a factor with baseline as the first level
rename_relevel_for_deseq = function(coldata, baseName = "", contrastName = ""){
coldata$comparison = factor(ifelse(coldata$cond == "base", baseName, contrastName), levels = c(baseName, contrastName))
return(coldata)
}
# this function filters a count table
filter_count_table = function(count_table){
keep <- rowSums(edgeR::cpm(count_table) > 0.5) >= 2
print("Filtered Genes by CPM greater than 0.5 in a least 2 samples")
print(table(keep))
return(count_table[keep, ])
}
# this function takes the big feature counts table and determines which samples are female
find_females = function(featureCountsTables, species = "human"){
if(species == "human"){
gene_table = fread("/Users/annaleigh/Documents/data/reference_genomes/gencode.v31.parsed_names.txt")
xist = "XIST"
}else if(species == "mouse"){
gene_table = fread("/Users/annaleigh/Documents/data/reference_genomes/gencode.vM22.parsed_names.txt")
xist = "Xist"
}else{
print("Species either 'human' or 'mouse")
return(1)
}
# check that the right species was chosen
if(sum(featureCountsTables$Geneid %in% gene_table$gene_id) < 10 ){
print("are you sure you chose the right species? Nothing found ")
return(1)
}
# --- find all the female samples
female_xist = featureCountsTables %>%
left_join(gene_table %>% dplyr::select(gene_id,gene_name) %>% unique(),by = c("Geneid" = "gene_id")) %>%
filter(gene_name == xist) %>%
select_if(is.numeric) %>% select_if( . > 100) %>% colnames()
return(female_xist)
}
# this function adds a metadata column for gender
add_gender = function(coldata, femalelist){
coldata = coldata %>% rownames_to_column("samp") %>%
mutate(sex = ifelse(samp %in% femalelist, "F", "M")) %>%
column_to_rownames('samp')
return(coldata)
}