add R script to plot benchmark results #92
Showing 1 changed file with 315 additions and 0 deletions.
@@ -0,0 +1,315 @@
rm(list = ls())
suppressPackageStartupMessages(require(argparser))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(tools))
suppressPackageStartupMessages(require(StructuralVariantAnnotation))
suppressPackageStartupMessages(require(dplyr))

# The palette with black:
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

make_percent <- function(x) {
  signif(x * 100, digits = 4)
}

infer_svtype <- function(gr)
{
  # Classify each breakpoint against its partner: different chromosomes -> TRA,
  # mostly inserted sequence -> INS, same strand -> INV, otherwise DEL or DUP
  # depending on breakend orientation.
  gr$svtype <-
    ifelse(
      seqnames(gr) != seqnames(partner(gr)),
      "TRA", # Using TRA instead of ITX or BP
      ifelse(
        gr$insLen >= abs(gr$svLen) * 0.7,
        "INS",
        ifelse(
          strand(gr) == strand(partner(gr)),
          "INV",
          ifelse(xor(
            start(gr) < start(partner(gr)), strand(gr) == "-"
          ), "DEL",
          "DUP")
        )
      )
    )
  return(gr)
}
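
# Illustrative usage sketch: infer_svtype expects a breakpoint GRanges with
# svLen/insLen metadata and a partner column, as produced by
# StructuralVariantAnnotation::breakpointRanges(). The VCF path below is a
# hypothetical placeholder.
#   gr <- breakpointRanges(VariantAnnotation::readVcf("calls.vcf"))
#   gr <- infer_svtype(gr)
#   table(gr$svtype)  # counts of TRA / INS / INV / DEL / DUP breakpoints
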
load_vcf <- function(vcf_file, svtype, caller)
{
  message(paste('Loading', svtype, 'calls for', caller))

  # Load VCF file
  vcf_gr <- VariantAnnotation::readVcf(vcf_file)

  # Keep only SVs that passed the filtering (PASS or .)
  vcf_gr <- vcf_gr[rowRanges(vcf_gr)$FILTER %in% c("PASS", ".")]

  if (caller == 'lumpy')
  {
    # Read evidence support as a proxy for QUAL
    support <- unlist(info(vcf_gr)$SU)
    fixed(vcf_gr)$QUAL <- support
  } else if (caller == 'delly')
  {
    # Split-read support plus paired-end read support as a proxy for QUAL
    sr_support <- info(vcf_gr)$SR
    sr_support[is.na(sr_support)] <- 0
    fixed(vcf_gr)$QUAL <- sr_support + info(vcf_gr)$PE
  }

  vcf_gr <- breakpointRanges(vcf_gr)
  vcf_gr <- infer_svtype(vcf_gr)

  # Select only one SV type
  vcf_gr <- vcf_gr[which(vcf_gr$svtype == svtype)]
  message(length(vcf_gr))
  # Select SVs >= 50 bp
  if (!svtype %in% c('TRA', 'INS'))
  {
    vcf_gr <- vcf_gr[abs(vcf_gr$svLen) >= 50]
  }
  # Select only chromosomes 1 to 22
  vcf_gr <- vcf_gr[seqnames(vcf_gr) %in% paste('chr', c(1:22), sep = '')]
  vcf_gr <- vcf_gr[vcf_gr$partner %in% names(vcf_gr)]

  return(vcf_gr)
}
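
# Usage sketch: load the DEL calls from one caller's VCF and check how many
# breakpoints survive the FILTER/size/chromosome selection above. The path and
# caller name are hypothetical placeholders.
#   del_gr <- load_vcf("manta.vcf", "DEL", "manta")
#   length(del_gr)  # number of retained DEL breakends on chr1-chr22
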

# Create a parser and add command-line arguments
p <- arg_parser("Plot benchmark results")
p <- add_argument(
  p,
  "-i",
  help = "path to the CSV file with SV callsets (VCF)",
  type = "character",
  default = '/Users/lsantuari/Documents/Projects/GTCG/sv-channels/sv-channels_manuscript/1KG_trios/results/manta_loocv/HG03992/callsets_HG03992.csv'
)
p <- add_argument(
  p,
  "-t",
  help = "path to the truth set",
  type = "character",
  default = '/Users/lsantuari/Documents/Projects/GTCG/sv-channels/sv-channels_manuscript/1KG_trios/results/truth_sets/HG03992.DEL.vcf'
)
p <- add_argument(
  p,
  "-s",
  help = "sample name",
  type = "character",
  default = 'HG03992'
)

p <- add_argument(
  p,
  "-sr",
  help = "path to split reads in BEDPE format",
  type = "character",
  default = '/Users/lsantuari/Documents/Processed/SPIDER/split_reads/NA12878/split_reads_real.bedpe'
)

p <- add_argument(
  p,
  "-b",
  help = "consider only split read positions?",
  type = "logical",
  default = FALSE
)

p <- add_argument(
  p,
  "-f",
  help = "output filename prefix (used for the .png and .csv results)",
  type = "character",
  default = '/Users/lsantuari/Documents/Projects/GTCG/sv-channels/sv-channels_manuscript/1KG_trios/results/manta_loocv/HG03992/HG03992'
)

# Parse the command-line arguments
argv <- parse_args(p)
svtype <- 'DEL'

# Load the callsets listed in the input CSV (caller name, path to a VCF or
# BEDPE callset; see the format sketch after this loop)
callsets <- read.csv(argv$i, header = FALSE)
sv_regions <- list()
for (i in 1:nrow(callsets))
{
  if (tools::file_ext(callsets[i, 2]) == 'vcf')
  {
    sv_regions[[callsets[i, 1]]] <- load_vcf(callsets[i, 2], 'DEL', callsets[i, 1])
  } else if (tools::file_ext(callsets[i, 2]) == 'bedpe')
  {
    sv_regions[[callsets[i, 1]]] <- pairs2breakpointgr(rtracklayer::import(callsets[i, 2]))
  }
}
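
# Expected layout of the callsets CSV passed via -i: header-less, one row per
# caller with the caller name in column 1 and the path to its VCF or BEDPE
# callset in column 2. Caller names 'lumpy' and 'delly' trigger the QUAL
# proxies in load_vcf(). The paths below are hypothetical examples:
#   manta,/path/to/manta.vcf
#   delly,/path/to/delly.vcf
#   sv-channels,/path/to/sv-channels.bedpe
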
# Exclude callsets with zero calls
sv_regions <- sv_regions[sapply(sv_regions, length) != 0]
for (c in names(sv_regions))
{
  sv_regions[[c]]$caller <- c
}

# Load the truth set (VCF or BEDPE)
if (tools::file_ext(argv$t) == 'vcf')
{
  truth_set <- load_vcf(argv$t, 'DEL', 'truth_set')
  ignore.strand.flag <- FALSE
} else if (tools::file_ext(argv$t) == 'bedpe')
{
  truth_set <- pairs2breakpointgr(rtracklayer::import(argv$t))
  ignore.strand.flag <- TRUE
}

if (argv$b)
{
  # Consider only SVs supported by split read positions
  split_reads <- pairs2breakpointgr(rtracklayer::import(argv$sr))

  for (c in names(sv_regions))
  {
    print(c)
    myHits <- findBreakpointOverlaps(
      sv_regions[[c]],
      split_reads,
      # Read pair based callers make imprecise calls.
      # A margin around the call position is required when matching with the truth set.
      maxgap = 100,
      # Since we added a maxgap, we also need to restrict the mismatch between the
      # size of the events. We don't want to match a 100bp deletion with a
      # 5bp duplication. This will happen if we have a 100bp margin but don't also
      # require an approximate size match as well.
      sizemargin = 0.25,
      ignore.strand = TRUE,
      # We also don't want to match a 20bp deletion with a 20bp deletion 80bp away.
      # By restricting the margin based on the size of the event, we can make sure
      # that simple events actually do overlap.
      restrictMarginToSizeMultiple = 0.5
    )
    print(length(unique(queryHits(myHits))))
    sv_regions[[c]] <- sv_regions[[c]][unique(queryHits(myHits)), ]
    sv_regions[[c]] <- sv_regions[[c]][sv_regions[[c]]$partner %in% names(sv_regions[[c]])]
  }

  # Restrict the truth set with the same matching parameters and rationale as above
  myHits <- findBreakpointOverlaps(
    truth_set,
    split_reads,
    maxgap = 100,
    sizemargin = 0.25,
    ignore.strand = TRUE,
    restrictMarginToSizeMultiple = 0.5
  )
  truth_set <- truth_set[unique(queryHits(myHits)), ]
}

for (c in names(sv_regions))
{
  if (svtype == 'INS') {
    sv_regions[[c]]$truth_matches <-
      countBreakpointOverlaps(
        sv_regions[[c]],
        truth_set,
        # Using a smaller margin for insertions, insertion location should be precise
        maxgap = 5,
        # sizemargin cannot be used for insertions
        # sizemargin = 0.25,
        ignore.strand = ignore.strand.flag,
        restrictMarginToSizeMultiple = 0.5
        # countOnlyBest cannot be used for insertions
        # countOnlyBest = TRUE
      )
  } else {
    sv_regions[[c]]$truth_matches <-
      countBreakpointOverlaps(
        sv_regions[[c]],
        truth_set,
        # Read pair based callers make imprecise calls.
        # A margin around the call position is required when matching with the truth set.
        maxgap = 100,
        # Since we added a maxgap, we also need to restrict the mismatch between the
        # size of the events. We don't want to match a 100bp deletion with a
        # 5bp duplication. This will happen if we have a 100bp margin but don't also
        # require an approximate size match as well.
        sizemargin = 0.25,
        ignore.strand = ignore.strand.flag,
        # We also don't want to match a 20bp deletion with a 20bp deletion 80bp away.
        # By restricting the margin based on the size of the event, we can make sure
        # that simple events actually do overlap.
        restrictMarginToSizeMultiple = 0.5,
        # Some callers make duplicate calls and will sometimes report a variant multiple
        # times with slightly different bounds. countOnlyBest prevents these being
        # double-counted as multiple true positives.
        countOnlyBest = TRUE
      )
  }
}

sv_regions_unlisted <- unlist(GRangesList(sv_regions))

main.title <- paste(argv$s, svtype, sep = ' ')
print(main.title)

# Precision-recall curve per caller: calls are ranked by descending QUAL and
# cumulative true/false positives are computed along that ranking.
ggplot(
  as.data.frame(sv_regions_unlisted) %>%
    dplyr::select(QUAL, caller, truth_matches) %>%
    dplyr::group_by(caller, QUAL) %>%
    dplyr::summarise(calls = n(),
                     tp = sum(truth_matches > 0)) %>%
    dplyr::group_by(caller) %>%
    dplyr::arrange(dplyr::desc(QUAL)) %>%
    dplyr::mutate(
      cum_tp = cumsum(tp),
      cum_n = cumsum(calls),
      cum_fp = cum_n - cum_tp,
      precision = cum_tp / cum_n,
      recall = cum_tp / length(truth_set)
    )
) +
  aes(x = recall,
      y = precision,
      colour = caller) +
  geom_point() +
  geom_line() +
  theme(text = element_text(size = 30)) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(
    labels = scales::percent,
    sec.axis = sec_axis(~ (.) * length(truth_set), name = "true positives")
  ) +
  labs(title = main.title) +
  # Callers are mapped to the colour aesthetic, so the custom palette must be
  # applied via the colour scale (a fill scale would have no effect here).
  scale_colour_manual(values = cbbPalette)

filename <- paste(argv$f, '.png', sep = '')

ggsave(file = filename, width = 15, height = 10)

res.df <- as.data.frame(sv_regions_unlisted) %>%
  dplyr::select(caller, truth_matches) %>%
  dplyr::group_by(caller) %>%
  dplyr::summarise(calls = n(),
                   TP = sum(truth_matches > 0)) %>%
  dplyr::group_by(caller) %>%
  dplyr::mutate(
    FP = calls - TP,
    precision = signif(TP / calls, digits = 4),
    recall = signif(TP / length(truth_set), digits = 4)
  )
res.df$F1_score <- with(res.df, 2 * (precision * recall) / (precision + recall))
res.df$precision <- make_percent(res.df$precision)
res.df$recall <- make_percent(res.df$recall)
res.df$F1_score <- make_percent(res.df$F1_score)

res.df

filename <- paste(argv$f, '.csv', sep = '')

write.csv(file = filename,
          res.df,
          quote = FALSE,
          row.names = FALSE)
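
# Example invocation (script name and paths are hypothetical; unspecified
# options fall back to the defaults defined above):
#   Rscript plot_benchmark_results.R \
#     -i callsets_HG03992.csv \
#     -t HG03992.DEL.vcf \
#     -s HG03992 \
#     -f results/HG03992
# The script writes <prefix>.png (per-caller precision-recall curves) and
# <prefix>.csv (per-caller calls, TP, FP, precision, recall, F1_score), where
# <prefix> is the value of -f.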