-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathomm_surveyo.R
181 lines (145 loc) · 9.74 KB
/
omm_surveyo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env Rscript
## == surveyor - take a 3-point slope from variation in aggregated MultiQC reports
## + + + + + + + + would need to be very different to use rLib FastQC, but likely doable.
## + + + + + + + + relies on the "fastqc_seq_content_data" field in MQC output. Found circa line ~5720.
## +
## + jfg_init:0ct_23 :: any chance of an aul docopt?
## + jfg_cont:Dec_24 :: to AMUKHERJEE ; same as surveyo_0.2.
## +
## note: assumes libs stringr & reshape2 available
## params ------------------------------------------------------------
bash_args <- commandArgs(trailingOnly = TRUE)
html_path <- bash_args[1]
## test
# html_path <- "C:/Users/Jamie.FitzGerald.TEAGASCAD/Desktop/fhi__mm2630_fwd_multiqc_raw.html"
print( paste0( "+++++++ working on it :: ", html_path))
## always needs to be some value yr staying below. This one chosen based on appearances
varDiff <- 1.5
varRat <- 1.5
## lowest possible value for 5prime trim
fiveprime_limit <- 15
## lowest possible value for 3prime trim
threeprime_limit <- 140
## generic term to separate text for each sample
split_term <- '}}, \\"'
## catch here if get nothing, or too much!! handy for testing setup if nothing else
if( !file.exists(html_path) ){ stop(c(" < ! > the MultiQC Report.html file is a SHAM, and a FAKE, and.. \n\n.. and a SHAM. Check input and try again."))}else{print(c(" + + + wait two secs while Beast eat Chris?"))}
## wrangle -----------------------------------------------------------
# I dont know HTML well enough to consider premature EOL significant. plx mail if a bad
xh <- readLines( html_path, warn = FALSE)[ which( grepl( split_term, readLines( html_path, warn = FALSE))) ]
## belete info from negative controls
xhs_pre <- unlist(strsplit( xh, split_term, perl = TRUE) )
xhs <- xhs_pre[ !grepl( "Neg", xhs_pre, ignore.case = TRUE)]
## break samples down into rows per base-position, based on field match (& drop first mangled instance)
xhs_l1 <- lapply( xhs, function(aa){ # aa <- xhs[3]
strsplit(aa, '\\"\\d*\\"', perl = TRUE )[[1]][-1]
})
## break rows to df. lazily not keeping the individual sample names.
xhs_l2 <- as.data.frame( do.call( "rbind", lapply( xhs_l1,
function(aaa){ # aaa <- xhs_l1[65]
bbb <- unlist( aaa, recursive = TRUE )
do.call("rbind",
lapply( bbb, function(cccc){ # cccc <- bbb[65]
c(
"base" = stringr::str_pad( width = 3, pad = 0,
gsub("(\\d)\\.0", "\\1", gsub('"\\d*-(\\d*)"', '\\1',
gsub('.*"base": (.*?),.*', '\\1', cccc, perl = TRUE),
perl = TRUE), perl = TRUE)),
"A" = gsub('.*"a": (\\d*\\.\\d*),.*', '\\1', cccc, perl = TRUE),
"T" = gsub('.*"t": (\\d*\\.\\d*),.*', '\\1', cccc, perl = TRUE),
"G" = gsub('.*"g": (\\d*\\.\\d*).*', '\\1', cccc, perl = TRUE),
"C" = gsub('.*"c": (\\d*\\.\\d*).*', '\\1', cccc, perl = TRUE) )
}))
})))
## for processing
xhs_l2[,1] <- paste0("base_", stringr::str_pad( width = 3, pad = 0,
gsub( "(\\d)\\.0", "\\1", gsub( '"\\d*-(\\d*)"', '\\1', gsub('.*"base": (.*), "t.*', '\\1',
xhs_l2[,1],
perl = TRUE), perl = TRUE), perl = TRUE)))
colnames(xhs_l2) <- c("base", "A", "C", "T", "G")
## get clear drop in variation
agg_val <- aggregate( value ~ base, reshape2::melt(xhs_l2, id.vars = c("base")), FUN = var)
## process -----------------------------------------------------------
## JE: get the RATIO between points - might still be the way to go :: abs(agg_val[aa-1, "value"] / agg_val[aa, "value"]) < 1
je <- sapply( 1:nrow(agg_val), function(aa){
if( aa == nrow(agg_val)){ Inf }else{
agg_val[aa, "value"] / agg_val[aa+1, "value"]
}
})
## instead get differences within a threesome: aa-1, aa, aa+1
variance_ratio_bin <- sapply( 1:nrow(agg_val), function(aa){
abs(agg_val[aa-1, "value"] / agg_val[aa, "value"]) < varRat &&
abs(agg_val[aa, "value"] / agg_val[aa+1, "value"]) < varRat &&
abs(agg_val[aa-1, "value"] / agg_val[aa+1, "value"]) < varRat
})
## ratios perform well - and as expected on trimmed data. Stick?
variance_diff_bin <- sapply( 1:nrow(agg_val), function(aa){
abs(agg_val[aa-1, "value"] - agg_val[aa, "value"]) < varDiff &&
abs(agg_val[aa, "value"] - agg_val[aa+1, "value"]) < varDiff &&
abs(agg_val[aa-1, "value"] - agg_val[aa+1, "value"]) < varDiff
})
## need to get the longest sequential stretch - not beautiful but appears robust
seq_lengs <- do.call("rbind", sapply( 1:length(variance_ratio_bin), function(aa){ # aa <- 10
do.call( "rbind",
lapply( aa:length(variance_ratio_bin), function(aaa){ # aaa <- 38
if( any( is.na(c(variance_ratio_bin[ aa], variance_ratio_bin[aaa]))) ){
c("start" = aa, "end" = aaa, "length" = 0)
}else if( all( variance_ratio_bin[ aa : aaa]) ){
c("start" = aa, "end" = aaa, "length" = c(aaa-aa))
} })
)
})
)
# remember, these are the row numbers of agg_val - need base positions:
longest_seq <- seq_lengs[ which( seq_lengs[ , "length"] == max( seq_lengs[,"length"])) , ]
(slopes_out <- as.numeric( gsub("base_0*", "", c(agg_val[ longest_seq[1], 1], agg_val[ longest_seq[2], 1]) )) + c(0,-1)) # a slight caution
## diagnose / outputs -----------------------------------------------
## stout fellow
print( paste0("-+-+-+-"))
if( any( slopes_out > 152)){
warning(" < ! > reads >150bp detected - we ASSUME this is 150bp illumina, this script should NOT be used for amplicon data")
}else if( any(is.na(slopes_out)) ){
error( paste0( " < ! > some trimming points were NA, possibly because of bad data munging. Suggested trims: ", paste0(slopes_out, collapse = ", ")))
}
if( slopes_out[1] < fiveprime_limit ){
print( paste0(" + + + suggested 5prime trim is too short: ", slopes_out[1], "bp; increasing to ", fiveprime_limit, "bp."))
slopes_out[1] <- fiveprime_limit
}else if( slopes_out[2] > threeprime_limit ){
print( paste0(" + + + suggested 3prime trim is too short: ", slopes_out[2], "bp; increasing to ", threeprime_limit, "bp."))
slopes_out[2] <- threeprime_limit
}
if( all(sapply( slopes_out,function(aa){ is.numeric(aa) && !is.null(aa) && !is.na(aa)})) && slopes_out[1] < slopes_out[2] ){
print( paste0(" + + + 5prime trim is ", slopes_out[1], "; 3prime trim is ", slopes_out[2], "."))
print( paste0(" + + + (feel free to check the PNG output and decide otherwise)"))
}else{
stop(" < ! > 5prime/3prime trim points were in a strange (string/NA/Null/error) format: check wrangling of DF and subsequent munging") # ~~modelling~~
}
print( paste0("-+-+-+-"))
## write to small file : apologies if too basic for yr
try( write.table( x = slopes_out, file = gsub( ".html", "_MQC-trim-points.txt", html_path), sep = "\t", row.names = FALSE, col.names = FALSE), silent = FALSE)
## retrognostic diagram
try_plot <- function(){
png(gsub(".html", "_MQC-diagnostic.png", html_path), width = 900, height = 1200)
par(mfrow= c(3,2))
## plot some outputs....
plot(je, type = "b", main = "JE ratio of variance in seq comp")
plot( type = "b", agg_val[,2], main = "aggregated variance in seq comp", xlab = NA, xaxt ="n" )
axis(1,at=1:nrow(agg_val), labels=agg_val[,1], las=2)
plot(unlist(sapply( 1:nrow(agg_val), function(aa){ abs(agg_val[(aa-1), "value"] - agg_val[aa, "value"]) })), main = "difference between a/a+1/a-1")
points(unlist( sapply( 1:nrow(agg_val), function(aa){ abs(agg_val[aa, "value"] - agg_val[(aa+1), "value"]) })), col = "red")
points(unlist( sapply( 1:nrow(agg_val), function(aa){ abs(agg_val[(aa-1), "value"] - agg_val[(aa+1), "value"]) })), col = "blue")
plot(unlist(sapply( 1:nrow(agg_val), function(aa){ abs(agg_val[(aa-1), "value"] / agg_val[aa, "value"]) })), main = "ratio between a/a+1/a-1")
points(unlist( sapply( 1:nrow(agg_val), function(aa){ abs(agg_val[aa, "value"] / agg_val[(aa+1), "value"]) })), col = "red")
points(unlist( sapply( 1:nrow(agg_val), function(aa){ abs(agg_val[(aa-1), "value"] / agg_val[(aa+1), "value"]) })), col = "blue")
# primary
plot( unlist( sapply( 1:nrow(agg_val), function(aa){ agg_val[ aa, 2] - agg_val[ aa+1, 2] }) ), type = "b", xaxt = "n", xlab = "base position", ylab = "variance in base composition",
main = paste0("trim along dotted line, at 5prime: ", slopes_out[1], " and 3prime: ", slopes_out[2],
",\nbased on \"flat\" stretch of var RATIOs.\n(n.b.: script allowed a min 5prime trim of ", fiveprime_limit, "bp)")) # aa <- 10
axis(1,at=1:nrow(agg_val), labels= gsub("base_", "", agg_val[,1]), las=2)
abline(lm( 0~1), lty= 2)
segments( x0 = longest_seq[1], x1 = longest_seq[2],
y0 = 0, y1 = 0, lty= 2, col = "green", lwd = 4)
dev.off()
}
try( try_plot(), silent = FALSE )
slopes_out