From f31be0a9fc826e5b7afcac1e0a45488c27d6b700 Mon Sep 17 00:00:00 2001
From: Chris Tomkins-Tinch
Date: Fri, 22 Jan 2021 21:30:34 -0500
Subject: [PATCH] bugfix guess barcodes for single-end reads (#51)

* bugfix guess barcodes for single-end reads

bugfix guess barcodes for single-end reads; allow barcodes other than 8bp
in the sorted common barcodes output file

* emit counts in single-end guessed barcode report
---
 illumina.py                                  | 32 ++++++++--
 .../single_index/expected.txt                | 10 +--
 util/illumina_indices.py                     | 63 ++++++++++++-------
 3 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/illumina.py b/illumina.py
index 711dc23f..4a1fdc4d 100755
--- a/illumina.py
+++ b/illumina.py
@@ -215,11 +215,21 @@ def main_illumina_demux(args):
                         JVMmemory=args.JVMmemory)
 
     if args.commonBarcodes:
+        barcode_lengths = re.findall(r'(\d+)B',read_structure)
+        try:
+            barcode1_len=int(barcode_lengths[0])
+        except IndexError:
+            barcode1_len = 0
+        try:
+            barcode2_len=int(barcode_lengths[1])
+        except IndexError:
+            barcode2_len = 0
+
         # this step can take > 2 hours on a large high-output flowcell
         # so kick it to the background while we demux
         #count_and_sort_barcodes(barcodes_tmpdir, args.commonBarcodes)
         executor = concurrent.futures.ProcessPoolExecutor()
-        executor.submit(count_and_sort_barcodes, barcodes_tmpdir, args.commonBarcodes, truncateToLength=args.max_barcodes, threads=util.misc.sanitize_thread_count(args.threads))
+        executor.submit(count_and_sort_barcodes, barcodes_tmpdir, args.commonBarcodes, barcode1_len, barcode2_len, truncateToLength=args.max_barcodes, threads=util.misc.sanitize_thread_count(args.threads))
 
     # Picard IlluminaBasecallsToSam
     basecalls_input = util.file.mkstempfname('.txt', prefix='.'.join(['library_params', flowcell, str(args.lane)]))
@@ -428,7 +438,17 @@ def main_common_barcodes(args):
                         picardOptions=picardOpts,
                         JVMmemory=args.JVMmemory)
 
-    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, args.truncateToLength, args.includeNoise, args.omitHeader)
+    barcode_lengths = re.findall(r'(\d+)B',read_structure)
+    try:
+        barcode1_len=int(barcode_lengths[0])
+    except IndexError:
+        barcode1_len = 0
+    try:
+        barcode2_len=int(barcode_lengths[1])
+    except IndexError:
+        barcode2_len = 0
+
+    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader)
 
     # clean up
     os.unlink(barcode_file)
@@ -438,7 +458,7 @@ def main_common_barcodes(args):
 __commands__.append(('common_barcodes', parser_common_barcodes))
 
 
-def count_and_sort_barcodes(barcodes_dir, outSummary, truncateToLength=None, includeNoise=False, omitHeader=False, threads=None):
+def count_and_sort_barcodes(barcodes_dir, outSummary, barcode1_len=8, barcode2_len=8, truncateToLength=None, includeNoise=False, omitHeader=False, threads=None):
     # collect the barcode file paths for all tiles
     tile_barcode_files = [os.path.join(barcodes_dir, filename) for filename in os.listdir(barcodes_dir)]
 
@@ -479,8 +499,8 @@ def count_and_sort_barcodes(barcodes_dir, outSummary, truncateToLength=None, inc
                 barcode,count = row
 
-                writer.writerow((barcode[:8], ",".join([x for x in illumina_reference.guess_index(barcode[:8], distance=1)] or ["Unknown"]),
-                                    barcode[8:], ",".join([x for x in illumina_reference.guess_index(barcode[8:], distance=1)] or ["Unknown"]),
+                writer.writerow((barcode[:barcode1_len], ",".join([x for x in illumina_reference.guess_index(barcode[:barcode1_len], distance=1)] or ["Unknown"]),
+                                    barcode[barcode1_len:len(barcode)], ",".join([x for x in illumina_reference.guess_index(barcode[barcode1_len:len(barcode)], distance=1)] or ["Unknown"]),
                                     count))
 
                 if num_processed%50000==0:
@@ -518,7 +538,7 @@ def parser_guess_barcodes(parser=argparse.ArgumentParser()):
     parser.add_argument('--outlier_threshold',
                         help='threshold of how far from unbalanced a sample must be to be considered an outlier.',
                         type=float,
-                        default=0.675)
+                        default=0.775)
     parser.add_argument('--expected_assigned_fraction',
                         help='The fraction of reads expected to be assigned. An exception is raised if fewer than this fraction are assigned.',
                         type=float,
diff --git a/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt b/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
index 25f81f52..86452935 100644
--- a/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
+++ b/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
@@ -1,5 +1,5 @@
-sample_name	expected_barcode_1	expected_barcode_1_name	guessed_barcode_1	guessed_barcode_1_name	match_type
-EBOV-VP24UTR_G10218A-C10220U.DMSO	CAAAAGAT	RPI28	alternative_indices_uncertain
-EBOV-VP24UTR_G10218A-C10242U.DMSO	CACCGGAT	RPI30	alternative_indices_uncertain
-EBOV-VP24UTR_G10218A-C10242U.NMIA	CTAGCTAT	RPI38	alternative_indices_uncertain
-EBOV-VP24UTR_G10218A-C10242U.DENAT	TCCCGAAT	RPI46	alternative_indices_uncertain
+sample_name	expected_barcode_1	expected_barcode_1_name	expected_barcodes_read_count	guessed_barcode_1	guessed_barcode_1_name	match_type
+EBOV-VP24UTR_G10218A-C10220U.DMSO	CAAAAGAT	RPI28	264750	alternative_indices_uncertain
+EBOV-VP24UTR_G10218A-C10242U.DMSO	CACCGGAT	RPI30	187070	alternative_indices_uncertain
+EBOV-VP24UTR_G10218A-C10242U.NMIA	CTAGCTAT	RPI38	268529	alternative_indices_uncertain
+EBOV-VP24UTR_G10218A-C10242U.DENAT	TCCCGAAT	RPI46	287559	alternative_indices_uncertain
diff --git a/util/illumina_indices.py b/util/illumina_indices.py
index 6e95faed..040f38a1 100755
--- a/util/illumina_indices.py
+++ b/util/illumina_indices.py
@@ -1644,7 +1644,7 @@ def __init__(self, barcode_counts, picard_metrics, sample_name, rows_limit=1000)
                 if "Barcode2" in row and row["Barcode2"]:
                     self.barcode_name_map[row["Barcode2"]] = row["Likely_Index_Names2"]
 
-    def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0.7, number_of_negative_controls=1):
+    def outlier_barcodes(self, outlier_threshold=0.775, expected_assigned_fraction=0.7, number_of_negative_controls=1):
         """
             This identifies samples listed in the Picard metrics (derived from the sample sheet)
             that are believed to have an anomalously low
@@ -1664,10 +1664,11 @@ def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0
                 The ASQC Basic References in Quality Control: Statistical Techniques
 
             Params:
-                outlier_threshold (float): 0.675 corresponds to 75th percentile
+                outlier_threshold (float): 0.675, corresponds to 75th percentile
                 expected_assigned_fraction (float): fraction of reads assigned to samples
                 number_of_negative_controls (int): the number of samples in the pool expected to have zero reads
         """
+        ##log.debug(f"outlier_threshold {outlier_threshold}")
        assigned_read_count = sum(self.sample_to_read_counts.values())
         total_read_count = assigned_read_count+self.unassigned_read_count
         fraction_assigned = float(assigned_read_count)/float(total_read_count)
@@ -1676,24 +1677,33 @@ def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0
             raise UncertainSamplesheetError("Only {:.0%} of reads were assigned to barcode pairs listed in the samplesheet. Check the sample sheet for errors.".format(fraction_assigned))
 
         num_samples = len(self.sample_to_read_counts)
+        assert number_of_negative_controls < num_samples
 
         log_obs_fractions_of_pool = [ -math.log(float(x)/float(total_read_count),10) if x>0
-                                        else 0
+                                        else 0
                                         for x in
-                                        list(self.sample_to_read_counts.values())+[self.unassigned_read_count]
+                                        list(self.sample_to_read_counts.values())
                                     ]
-
-        log_exp_fractions_of_pool = [-math.log(1.0/float(num_samples-number_of_negative_controls),10)]*num_samples + [0]
-
+        ##log.debug(f"log_obs_fractions_of_pool {log_obs_fractions_of_pool}")
+        log_exp_fractions_of_pool = [-math.log(1.0/float(num_samples-number_of_negative_controls),10)]*num_samples
+        ##log.warning(f"log_exp_fractions_of_pool {log_exp_fractions_of_pool}")
 
         residuals = [obs-exp for (obs,exp) in zip(log_obs_fractions_of_pool,log_exp_fractions_of_pool)]
         resid_stdev = self.stddevp(residuals) #essentially RMSE
         resid_mean = self.mean(residuals) # mean error
         resid_median = self.median(residuals) # median error
+        ##log.debug(f"residuals {residuals}")
+        ##log.debug(f"resid_stdev {resid_stdev}")
+        ##log.debug(f"resid_mean {resid_mean}")
+        ##log.debug(f"resid_median {resid_median}")
+        # modifed zscore using median to reduce influence of outliers
         zscores_residual_relative_to_median = [float(1.0 * (x-resid_median))/resid_stdev for x in residuals]
+        ##log.debug(f"zscores_residual_relative_to_median {zscores_residual_relative_to_median}")
 
         # only consider LOW variance
-        indices_of_outliers = [i for i,v in enumerate(zscores_residual_relative_to_median[:-1]) if v > outlier_threshold]
-        indices_of_outliers += [i for i,v in enumerate(list(self.sample_to_read_counts.values())[:-1]) if v==0]
+        indices_of_outliers = [i for i,v in enumerate(zscores_residual_relative_to_median) if v > outlier_threshold]
+        indices_of_outliers += [i for i,v in enumerate(list(self.sample_to_read_counts.values())) if v==0]
+        ##log.debug("self.sample_to_read_counts.keys() {}".format(self.sample_to_read_counts.keys()))
 
         names_of_outlier_samples = [(list(self.sample_to_read_counts.keys()))[i] for i in indices_of_outliers]
         log.warning("outlier samples")
         for s in names_of_outlier_samples:
@@ -1760,7 +1770,8 @@ def guess_barcodes_for_sample(self, sample_name):
         novel_barcode_pairs = []
 
         for barcode_pair in self.barcodes_seen.keys():
-            if barcode_pair not in self.sample_to_barcodes.values():
+            barcode = tuple((b for b in barcode_pair if b is not None))
+            if barcode not in self.sample_to_barcodes.values():
                 novel_barcode_pairs.append(barcode_pair)
             else:
                 del barcodes_seen_novel[barcode_pair]
@@ -1772,7 +1783,7 @@ def guess_barcodes_for_sample(self, sample_name):
         if is_dual_index:
             out_dict["expected_barcode_2"] = self.sample_to_barcodes[sample_name][1]
             out_dict["expected_barcode_2_name"] = ",".join(self.index_reference.guess_index(self.sample_to_barcodes[sample_name][1]))
-            out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name]
+        out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name]
 
         claimed_barcodes = self.sample_to_barcodes[sample_name]
@@ -1780,36 +1791,46 @@ def guess_barcodes_for_sample(self, sample_name):
         putative_match = None
 
         if is_dual_index:
-            # barcodes_seen_novel is sorted by read count, desc
+            # barcodes_seen_novel is sorted by read count, desc
             for (barcode_pair,count) in barcodes_seen_novel.items():
                 if barcode_pair[0]==claimed_barcodes[0] or barcode_pair[1]==claimed_barcodes[1]:
                     found_partial_match=True
                     putative_match = barcode_pair
                     out_dict["match_type"] = "one_barcode_match"
                     break
-
-            # find index of match to help determine if it is a reasonable guess
-            idx_of_match = -1
-            for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()):
-                if barcode_pair==putative_match:
-                    idx_of_match=idx
+        else:
+            # barcodes_seen_novel is sorted by read count, desc
+            for (barcode_pair,count) in barcodes_seen_novel.items():
+                # get barcode with greatest count that isn't the one we're looking for
+                if barcode_pair[0]!=claimed_barcodes[0]:
+                    found_partial_match=True
+                    putative_match = barcode_pair
+                    out_dict["match_type"] = "one_barcode_match"
                     break
 
+        # find index of match to help determine if it is a reasonable guess
+        idx_of_match = -1
+        for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()):
+            if barcode_pair==putative_match:
+                idx_of_match=idx
+                break
+
         # if the one-barcode match is too far down the list of barcode pairs seen
         # (farther down than 1.5x the number of samples)
         if not found_partial_match or idx_of_match>(len(self.sample_to_read_counts)*1.5):
             for (barcode_pair,count) in barcodes_seen_novel.items():
                 # return the most pair with greatest count
                 putative_match = barcode_pair
-                out_dict["match_type"] = "high_count_novel_pair"
+                out_dict["match_type"] = "high_count_novel_barcode"
                 break
 
         out_dict["guessed_barcode_1"] = "unknown"
         out_dict["guessed_barcode_1_name"] = "unknown"
+        #out_dict["guessed_barcodes_read_count"] = 0
         if is_dual_index and (putative_match is not None and putative_match[1] != None):
            out_dict["guessed_barcode_2"] = "unknown"
            out_dict["guessed_barcode_2_name"] = "unknown"
-            out_dict["guessed_barcodes_read_count"] = "0"
+
 
         if putative_match is not None and putative_match[0]!=None:
             out_dict["guessed_barcode_1"] = putative_match[0]
@@ -1868,7 +1889,7 @@ def clear_guessed_fields(sample,match_reason,fields_to_clear=None):
                 log.warning("Ambiguous! Multiple samples corresponding to guessed barcodes %s:", barcode_pair)
                 for sample in samples:
                     log.warning("\t%s expected (%s,%s); -> Guessed (%s,%s); match type: %s", sample["sample_name"], sample["expected_barcode_1"],sample.get("expected_barcode_2",""),sample["guessed_barcode_1"],sample.get("guessed_barcode_2",""),sample["match_type"])
-
+                    final_guesses.append(clear_guessed_fields(sample, "alternative_indices_uncertain"))
             else:
                 for sample in samples:
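
Notes on the change (illustrative sketches only; nothing below is part of the patch):

The illumina.py changes derive the index-read lengths from the Picard read structure (its numeric "B" segments, e.g. 101T8B8B101T for dual-index or 101T8B101T for single-index runs) instead of hard-coding two 8 bp barcodes when writing the sorted common-barcodes report. A minimal sketch of that idea, using a hypothetical helper split_barcode() that does not exist in illumina.py:

    import re

    def split_barcode(read_structure, barcode):
        # 'B' segments in a Picard read structure denote index (barcode) reads:
        # '101T8B8B101T' -> ['8', '8'] (dual index); '101T8B101T' -> ['8'] (single index)
        lengths = [int(n) for n in re.findall(r'(\d+)B', read_structure)]
        barcode1_len = lengths[0] if len(lengths) > 0 else 0
        # everything after the first index read is the second barcode (empty for single-index)
        return barcode[:barcode1_len], barcode[barcode1_len:]

    print(split_barcode('101T8B8B101T', 'CAAAAGATACGTACGT'))  # ('CAAAAGAT', 'ACGTACGT')
    print(split_barcode('101T8B101T', 'CAAAAGAT'))            # ('CAAAAGAT', '')

This mirrors the barcode1_len/barcode2_len values the patch passes into count_and_sort_barcodes(), where barcode[:barcode1_len] and barcode[barcode1_len:len(barcode)] replace the previous hard-coded barcode[:8] and barcode[8:].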
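
In util/illumina_indices.py, outlier_barcodes() stops treating the unassigned-read bucket as an extra pseudo-sample (the dropped +[self.unassigned_read_count] term and the [:-1] slices) and raises the default outlier_threshold from 0.675 to 0.775. The underlying technique is a median-centered ("modified") z-score over -log10 pool-fraction residuals. The sketch below is a condensed, hypothetical re-implementation for illustration only; flag_low_count_samples and the use of Python's statistics module are assumptions, not code from this repository:

    import math
    from statistics import median, pstdev

    def flag_low_count_samples(sample_to_read_counts, outlier_threshold=0.775,
                               number_of_negative_controls=1):
        num_samples = len(sample_to_read_counts)
        assert number_of_negative_controls < num_samples  # expected-fraction denominator must be positive
        total = sum(sample_to_read_counts.values())
        # observed -log10 fraction of the pool per sample (0 for samples with no reads)
        log_obs = [-math.log10(c / total) if c > 0 else 0
                   for c in sample_to_read_counts.values()]
        # expected -log10 fraction if reads were split evenly across non-control samples
        log_exp = [-math.log10(1.0 / (num_samples - number_of_negative_controls))] * num_samples
        residuals = [o - e for o, e in zip(log_obs, log_exp)]
        sd = pstdev(residuals)    # population standard deviation, like the class's stddevp()
        med = median(residuals)
        # samples with zero reads are always flagged, matching the explicit v==0 check
        flagged = set(name for name, c in sample_to_read_counts.items() if c == 0)
        if sd > 0:
            # median-centered z-scores damp the influence of extreme samples; a large
            # positive score means a sample received far fewer reads than an even split
            zscores = [(r - med) / sd for r in residuals]
            flagged.update(name for name, z in zip(sample_to_read_counts, zscores)
                           if z > outlier_threshold)
        return sorted(flagged)

For example, flag_low_count_samples({'s1': 500000, 's2': 480000, 's3': 1200, 'neg': 0}) returns ['neg', 's3']: the empty negative control plus the one sample whose read share falls far below an even split of the pool.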