Skip to content

Commit

Permalink
bugfix guess barcodes for single-end reads (#51)
Browse files Browse the repository at this point in the history
* bugfix guess barcodes for single-end reads

bugfix guess barcodes for single-end reads; allow barcodes other than 8bp in the sorted common barcodes output file

* emit counts in single-end guessed barcode report
  • Loading branch information
tomkinsc authored Jan 23, 2021
1 parent 06888f0 commit f31be0a
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 32 deletions.
32 changes: 26 additions & 6 deletions illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,21 @@ def main_illumina_demux(args):
JVMmemory=args.JVMmemory)

if args.commonBarcodes:
barcode_lengths = re.findall(r'(\d+)B',read_structure)
try:
barcode1_len=int(barcode_lengths[0])
except IndexError:
barcode1_len = 0
try:
barcode2_len=int(barcode_lengths[1])
except IndexError:
barcode2_len = 0

# this step can take > 2 hours on a large high-output flowcell
# so kick it to the background while we demux
#count_and_sort_barcodes(barcodes_tmpdir, args.commonBarcodes)
executor = concurrent.futures.ProcessPoolExecutor()
executor.submit(count_and_sort_barcodes, barcodes_tmpdir, args.commonBarcodes, truncateToLength=args.max_barcodes, threads=util.misc.sanitize_thread_count(args.threads))
executor.submit(count_and_sort_barcodes, barcodes_tmpdir, args.commonBarcodes, barcode1_len, barcode2_len, truncateToLength=args.max_barcodes, threads=util.misc.sanitize_thread_count(args.threads))

# Picard IlluminaBasecallsToSam
basecalls_input = util.file.mkstempfname('.txt', prefix='.'.join(['library_params', flowcell, str(args.lane)]))
Expand Down Expand Up @@ -428,7 +438,17 @@ def main_common_barcodes(args):
picardOptions=picardOpts,
JVMmemory=args.JVMmemory)

count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, args.truncateToLength, args.includeNoise, args.omitHeader)
barcode_lengths = re.findall(r'(\d+)B',read_structure)
try:
barcode1_len=int(barcode_lengths[0])
except IndexError:
barcode1_len = 0
try:
barcode2_len=int(barcode_lengths[1])
except IndexError:
barcode2_len = 0

count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader)

# clean up
os.unlink(barcode_file)
Expand All @@ -438,7 +458,7 @@ def main_common_barcodes(args):

__commands__.append(('common_barcodes', parser_common_barcodes))

def count_and_sort_barcodes(barcodes_dir, outSummary, truncateToLength=None, includeNoise=False, omitHeader=False, threads=None):
def count_and_sort_barcodes(barcodes_dir, outSummary, barcode1_len=8, barcode2_len=8, truncateToLength=None, includeNoise=False, omitHeader=False, threads=None):
# collect the barcode file paths for all tiles
tile_barcode_files = [os.path.join(barcodes_dir, filename) for filename in os.listdir(barcodes_dir)]

Expand Down Expand Up @@ -479,8 +499,8 @@ def count_and_sort_barcodes(barcodes_dir, outSummary, truncateToLength=None, inc

barcode,count = row

writer.writerow((barcode[:8], ",".join([x for x in illumina_reference.guess_index(barcode[:8], distance=1)] or ["Unknown"]),
barcode[8:], ",".join([x for x in illumina_reference.guess_index(barcode[8:], distance=1)] or ["Unknown"]),
writer.writerow((barcode[:barcode1_len], ",".join([x for x in illumina_reference.guess_index(barcode[:barcode1_len], distance=1)] or ["Unknown"]),
barcode[barcode1_len:len(barcode)], ",".join([x for x in illumina_reference.guess_index(barcode[barcode1_len:len(barcode)], distance=1)] or ["Unknown"]),
count))

if num_processed%50000==0:
Expand Down Expand Up @@ -518,7 +538,7 @@ def parser_guess_barcodes(parser=argparse.ArgumentParser()):
parser.add_argument('--outlier_threshold',
help='threshold of how far from unbalanced a sample must be to be considered an outlier.',
type=float,
default=0.675)
default=0.775)
parser.add_argument('--expected_assigned_fraction',
help='The fraction of reads expected to be assigned. An exception is raised if fewer than this fraction are assigned.',
type=float,
Expand Down
10 changes: 5 additions & 5 deletions test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sample_name expected_barcode_1 expected_barcode_1_name guessed_barcode_1 guessed_barcode_1_name match_type
EBOV-VP24UTR_G10218A-C10220U.DMSO CAAAAGAT RPI28 alternative_indices_uncertain
EBOV-VP24UTR_G10218A-C10242U.DMSO CACCGGAT RPI30 alternative_indices_uncertain
EBOV-VP24UTR_G10218A-C10242U.NMIA CTAGCTAT RPI38 alternative_indices_uncertain
EBOV-VP24UTR_G10218A-C10242U.DENAT TCCCGAAT RPI46 alternative_indices_uncertain
sample_name expected_barcode_1 expected_barcode_1_name expected_barcodes_read_count guessed_barcode_1 guessed_barcode_1_name match_type
EBOV-VP24UTR_G10218A-C10220U.DMSO CAAAAGAT RPI28 264750 alternative_indices_uncertain
EBOV-VP24UTR_G10218A-C10242U.DMSO CACCGGAT RPI30 187070 alternative_indices_uncertain
EBOV-VP24UTR_G10218A-C10242U.NMIA CTAGCTAT RPI38 268529 alternative_indices_uncertain
EBOV-VP24UTR_G10218A-C10242U.DENAT TCCCGAAT RPI46 287559 alternative_indices_uncertain
63 changes: 42 additions & 21 deletions util/illumina_indices.py
Original file line number Diff line number Diff line change
Expand Up @@ -1644,7 +1644,7 @@ def __init__(self, barcode_counts, picard_metrics, sample_name, rows_limit=1000)
if "Barcode2" in row and row["Barcode2"]:
self.barcode_name_map[row["Barcode2"]] = row["Likely_Index_Names2"]

def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0.7, number_of_negative_controls=1):
def outlier_barcodes(self, outlier_threshold=0.775, expected_assigned_fraction=0.7, number_of_negative_controls=1):
"""
This identifies samples listed in the Picard metrics (derived from
the sample sheet) that are believed to have an anomalously low
Expand All @@ -1664,10 +1664,11 @@ def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0
The ASQC Basic References in Quality Control: Statistical Techniques
Params:
outlier_threshold (float): 0.675 corresponds to 75th percentile
outlier_threshold (float): 0.675, corresponds to 75th percentile
expected_assigned_fraction (float): fraction of reads assigned to samples
number_of_negative_controls (int): the number of samples in the pool expected to have zero reads
"""
##log.debug(f"outlier_threshold {outlier_threshold}")
assigned_read_count = sum(self.sample_to_read_counts.values())
total_read_count = assigned_read_count+self.unassigned_read_count
fraction_assigned = float(assigned_read_count)/float(total_read_count)
Expand All @@ -1676,24 +1677,33 @@ def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0
raise UncertainSamplesheetError("Only {:.0%} of reads were assigned to barcode pairs listed in the samplesheet. Check the sample sheet for errors.".format(fraction_assigned))

num_samples = len(self.sample_to_read_counts)
assert number_of_negative_controls<num_samples, "number_of_negative_controls must be < num_samples"

log_obs_fractions_of_pool = [ -math.log(float(x)/float(total_read_count),10) if x>0
else 0
else 0
for x in
list(self.sample_to_read_counts.values())+[self.unassigned_read_count]
list(self.sample_to_read_counts.values())
]

log_exp_fractions_of_pool = [-math.log(1.0/float(num_samples-number_of_negative_controls),10)]*num_samples + [0]

##log.debug(f"log_obs_fractions_of_pool {log_obs_fractions_of_pool}")
log_exp_fractions_of_pool = [-math.log(1.0/float(num_samples-number_of_negative_controls),10)]*num_samples
##log.warning(f"log_exp_fractions_of_pool {log_exp_fractions_of_pool}")
residuals = [obs-exp for (obs,exp) in zip(log_obs_fractions_of_pool,log_exp_fractions_of_pool)]
resid_stdev = self.stddevp(residuals) #essentially RMSE
resid_mean = self.mean(residuals) # mean error
resid_median = self.median(residuals) # median error

##log.debug(f"residuals {residuals}")
##log.debug(f"resid_stdev {resid_stdev}")
##log.debug(f"resid_mean {resid_mean}")
##log.debug(f"resid_median {resid_median}")

# modifed zscore using median to reduce influence of outliers
zscores_residual_relative_to_median = [float(1.0 * (x-resid_median))/resid_stdev for x in residuals]
##log.debug(f"zscores_residual_relative_to_median {zscores_residual_relative_to_median}")
# only consider LOW variance
indices_of_outliers = [i for i,v in enumerate(zscores_residual_relative_to_median[:-1]) if v > outlier_threshold]
indices_of_outliers += [i for i,v in enumerate(list(self.sample_to_read_counts.values())[:-1]) if v==0]
indices_of_outliers = [i for i,v in enumerate(zscores_residual_relative_to_median) if v > outlier_threshold]
indices_of_outliers += [i for i,v in enumerate(list(self.sample_to_read_counts.values())) if v==0]
##log.debug("self.sample_to_read_counts.keys() {}".format(self.sample_to_read_counts.keys()))
names_of_outlier_samples = [(list(self.sample_to_read_counts.keys()))[i] for i in indices_of_outliers]
log.warning("outlier samples")
for s in names_of_outlier_samples:
Expand Down Expand Up @@ -1760,7 +1770,8 @@ def guess_barcodes_for_sample(self, sample_name):
novel_barcode_pairs = []

for barcode_pair in self.barcodes_seen.keys():
if barcode_pair not in self.sample_to_barcodes.values():
barcode = tuple((b for b in barcode_pair if b is not None))
if barcode not in self.sample_to_barcodes.values():
novel_barcode_pairs.append(barcode_pair)
else:
del barcodes_seen_novel[barcode_pair]
Expand All @@ -1772,44 +1783,54 @@ def guess_barcodes_for_sample(self, sample_name):
if is_dual_index:
out_dict["expected_barcode_2"] = self.sample_to_barcodes[sample_name][1]
out_dict["expected_barcode_2_name"] = ",".join(self.index_reference.guess_index(self.sample_to_barcodes[sample_name][1]))
out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name]
out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name]

claimed_barcodes = self.sample_to_barcodes[sample_name]

found_partial_match = False
putative_match = None

if is_dual_index:
# barcodes_seen_novel is sorted by read count, desc
# barcodes_seen_novel is sorted by read count, desc
for (barcode_pair,count) in barcodes_seen_novel.items():
if barcode_pair[0]==claimed_barcodes[0] or barcode_pair[1]==claimed_barcodes[1]:
found_partial_match=True
putative_match = barcode_pair
out_dict["match_type"] = "one_barcode_match"
break

# find index of match to help determine if it is a reasonable guess
idx_of_match = -1
for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()):
if barcode_pair==putative_match:
idx_of_match=idx
else:
# barcodes_seen_novel is sorted by read count, desc
for (barcode_pair,count) in barcodes_seen_novel.items():
# get barcode with greatest count that isn't the one we're looking for
if barcode_pair[0]!=claimed_barcodes[0]:
found_partial_match=True
putative_match = barcode_pair
out_dict["match_type"] = "one_barcode_match"
break

# find index of match to help determine if it is a reasonable guess
idx_of_match = -1
for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()):
if barcode_pair==putative_match:
idx_of_match=idx
break

# if the one-barcode match is too far down the list of barcode pairs seen
# (farther down than 1.5x the number of samples)
if not found_partial_match or idx_of_match>(len(self.sample_to_read_counts)*1.5):
for (barcode_pair,count) in barcodes_seen_novel.items():
# return the most pair with greatest count
putative_match = barcode_pair
out_dict["match_type"] = "high_count_novel_pair"
out_dict["match_type"] = "high_count_novel_barcode"
break

out_dict["guessed_barcode_1"] = "unknown"
out_dict["guessed_barcode_1_name"] = "unknown"
#out_dict["guessed_barcodes_read_count"] = 0
if is_dual_index and (putative_match is not None and putative_match[1] != None):
out_dict["guessed_barcode_2"] = "unknown"
out_dict["guessed_barcode_2_name"] = "unknown"
out_dict["guessed_barcodes_read_count"] = "0"


if putative_match is not None and putative_match[0]!=None:
out_dict["guessed_barcode_1"] = putative_match[0]
Expand Down Expand Up @@ -1868,7 +1889,7 @@ def clear_guessed_fields(sample,match_reason,fields_to_clear=None):
log.warning("Ambiguous! Multiple samples corresponding to guessed barcodes %s:", barcode_pair)
for sample in samples:
log.warning("\t%s expected (%s,%s); -> Guessed (%s,%s); match type: %s", sample["sample_name"], sample["expected_barcode_1"],sample.get("expected_barcode_2",""),sample["guessed_barcode_1"],sample.get("guessed_barcode_2",""),sample["match_type"])

final_guesses.append(clear_guessed_fields(sample, "alternative_indices_uncertain"))
else:
for sample in samples:
Expand Down

0 comments on commit f31be0a

Please sign in to comment.