From f31be0a9fc826e5b7afcac1e0a45488c27d6b700 Mon Sep 17 00:00:00 2001
From: Chris Tomkins-Tinch
Date: Fri, 22 Jan 2021 21:30:34 -0500
Subject: [PATCH] bugfix guess barcodes for single-end reads (#51)

* bugfix guess barcodes for single-end reads

bugfix guess barcodes for single-end reads; allow barcodes other than 8bp
in the sorted common barcodes output file

* emit counts in single-end guessed barcode report
---
 illumina.py                                  | 32 ++++++++--
 .../single_index/expected.txt                | 10 +--
 util/illumina_indices.py                     | 63 ++++++++++++-------
 3 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/illumina.py b/illumina.py
index 711dc23f..4a1fdc4d 100755
--- a/illumina.py
+++ b/illumina.py
@@ -215,11 +215,21 @@ def main_illumina_demux(args):
                         JVMmemory=args.JVMmemory)
 
     if args.commonBarcodes:
+        barcode_lengths = re.findall(r'(\d+)B',read_structure)
+        try:
+            barcode1_len=int(barcode_lengths[0])
+        except IndexError:
+            barcode1_len = 0
+        try:
+            barcode2_len=int(barcode_lengths[1])
+        except IndexError:
+            barcode2_len = 0
+
         # this step can take > 2 hours on a large high-output flowcell
         # so kick it to the background while we demux
         #count_and_sort_barcodes(barcodes_tmpdir, args.commonBarcodes)
         executor = concurrent.futures.ProcessPoolExecutor()
-        executor.submit(count_and_sort_barcodes, barcodes_tmpdir, args.commonBarcodes, truncateToLength=args.max_barcodes, threads=util.misc.sanitize_thread_count(args.threads))
+        executor.submit(count_and_sort_barcodes, barcodes_tmpdir, args.commonBarcodes, barcode1_len, barcode2_len, truncateToLength=args.max_barcodes, threads=util.misc.sanitize_thread_count(args.threads))
 
     # Picard IlluminaBasecallsToSam
     basecalls_input = util.file.mkstempfname('.txt', prefix='.'.join(['library_params', flowcell, str(args.lane)]))
@@ -428,7 +438,17 @@ def main_common_barcodes(args):
                         picardOptions=picardOpts,
                         JVMmemory=args.JVMmemory)
 
-    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, args.truncateToLength, args.includeNoise, args.omitHeader)
+    barcode_lengths = re.findall(r'(\d+)B',read_structure)
+    try:
+        barcode1_len=int(barcode_lengths[0])
+    except IndexError:
+        barcode1_len = 0
+    try:
+        barcode2_len=int(barcode_lengths[1])
+    except IndexError:
+        barcode2_len = 0
+
+    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader)
 
     # clean up
     os.unlink(barcode_file)
@@ -438,7 +458,7 @@ def main_common_barcodes(args):
 __commands__.append(('common_barcodes', parser_common_barcodes))
 
 
-def count_and_sort_barcodes(barcodes_dir, outSummary, truncateToLength=None, includeNoise=False, omitHeader=False, threads=None):
+def count_and_sort_barcodes(barcodes_dir, outSummary, barcode1_len=8, barcode2_len=8, truncateToLength=None, includeNoise=False, omitHeader=False, threads=None):
     # collect the barcode file paths for all tiles
     tile_barcode_files = [os.path.join(barcodes_dir, filename) for filename in os.listdir(barcodes_dir)]
 
@@ -479,8 +499,8 @@ def count_and_sort_barcodes(barcodes_dir, outSummary, truncateToLength=None, inc
                 barcode,count = row
 
-                writer.writerow((barcode[:8], ",".join([x for x in illumina_reference.guess_index(barcode[:8], distance=1)] or ["Unknown"]),
-                                    barcode[8:], ",".join([x for x in illumina_reference.guess_index(barcode[8:], distance=1)] or ["Unknown"]),
+                writer.writerow((barcode[:barcode1_len], ",".join([x for x in illumina_reference.guess_index(barcode[:barcode1_len], distance=1)] or ["Unknown"]),
+                                    barcode[barcode1_len:len(barcode)], ",".join([x for x in illumina_reference.guess_index(barcode[barcode1_len:len(barcode)], distance=1)] or ["Unknown"]),
                                     count))
 
                 if num_processed%50000==0:
@@ -518,7 +538,7 @@ def parser_guess_barcodes(parser=argparse.ArgumentParser()):
     parser.add_argument('--outlier_threshold',
                         help='threshold of how far from unbalanced a sample must be to be considered an outlier.',
                         type=float,
-                        default=0.675)
+                        default=0.775)
     parser.add_argument('--expected_assigned_fraction',
                         help='The fraction of reads expected to be assigned. An exception is raised if fewer than this fraction are assigned.',
                         type=float,
diff --git a/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt b/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
index 25f81f52..86452935 100644
--- a/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
+++ b/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt
@@ -1,5 +1,5 @@
-sample_name	expected_barcode_1	expected_barcode_1_name	guessed_barcode_1	guessed_barcode_1_name	match_type
-EBOV-VP24UTR_G10218A-C10220U.DMSO	CAAAAGAT	RPI28	alternative_indices_uncertain
-EBOV-VP24UTR_G10218A-C10242U.DMSO	CACCGGAT	RPI30	alternative_indices_uncertain
-EBOV-VP24UTR_G10218A-C10242U.NMIA	CTAGCTAT	RPI38	alternative_indices_uncertain
-EBOV-VP24UTR_G10218A-C10242U.DENAT	TCCCGAAT	RPI46	alternative_indices_uncertain
+sample_name	expected_barcode_1	expected_barcode_1_name	expected_barcodes_read_count	guessed_barcode_1	guessed_barcode_1_name	match_type
+EBOV-VP24UTR_G10218A-C10220U.DMSO	CAAAAGAT	RPI28	264750	alternative_indices_uncertain
+EBOV-VP24UTR_G10218A-C10242U.DMSO	CACCGGAT	RPI30	187070	alternative_indices_uncertain
+EBOV-VP24UTR_G10218A-C10242U.NMIA	CTAGCTAT	RPI38	268529	alternative_indices_uncertain
+EBOV-VP24UTR_G10218A-C10242U.DENAT	TCCCGAAT	RPI46	287559	alternative_indices_uncertain
diff --git a/util/illumina_indices.py b/util/illumina_indices.py
index 6e95faed..040f38a1 100755
--- a/util/illumina_indices.py
+++ b/util/illumina_indices.py
@@ -1644,7 +1644,7 @@ def __init__(self, barcode_counts, picard_metrics, sample_name, rows_limit=1000)
                 if "Barcode2" in row and row["Barcode2"]:
                     self.barcode_name_map[row["Barcode2"]] = row["Likely_Index_Names2"]
 
-    def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0.7, number_of_negative_controls=1):
+    def outlier_barcodes(self, outlier_threshold=0.775, expected_assigned_fraction=0.7, number_of_negative_controls=1):
         """
             This identifies samples listed in the Picard metrics (derived from the sample sheet)
             that are believed to have an anomalously low
@@ -1664,10 +1664,11 @@ def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0
                 The ASQC Basic References in Quality Control: Statistical Techniques
 
             Params:
-                outlier_threshold (float): 0.675 corresponds to 75th percentile
+                outlier_threshold (float): 0.675, corresponds to 75th percentile
                 expected_assigned_fraction (float): fraction of reads assigned to samples
                 number_of_negative_controls (int): the number of samples in the pool expected to have zero reads
         """
+        ##log.debug(f"outlier_threshold {outlier_threshold}")
        assigned_read_count = sum(self.sample_to_read_counts.values())
         total_read_count = assigned_read_count+self.unassigned_read_count
         fraction_assigned = float(assigned_read_count)/float(total_read_count)
@@ -1676,24 +1677,33 @@ def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0
             raise UncertainSamplesheetError("Only {:.0%} of reads were assigned to barcode pairs listed in the samplesheet. Check the sample sheet for errors.".format(fraction_assigned))
 
         num_samples = len(self.sample_to_read_counts)
+        assert number_of_negative_controls < num_samples
 
         log_obs_fractions_of_pool = [ -math.log(float(x)/float(total_read_count),10) if x>0
-                                        else 0
+                                        else 0
                                         for x in
-                                        list(self.sample_to_read_counts.values())+[self.unassigned_read_count]
+                                        list(self.sample_to_read_counts.values())
                                     ]
-
-        log_exp_fractions_of_pool = [-math.log(1.0/float(num_samples-number_of_negative_controls),10)]*num_samples + [0]
-
+        ##log.debug(f"log_obs_fractions_of_pool {log_obs_fractions_of_pool}")
+        log_exp_fractions_of_pool = [-math.log(1.0/float(num_samples-number_of_negative_controls),10)]*num_samples
+        ##log.warning(f"log_exp_fractions_of_pool {log_exp_fractions_of_pool}")
 
         residuals = [obs-exp for (obs,exp) in zip(log_obs_fractions_of_pool,log_exp_fractions_of_pool)]
         resid_stdev = self.stddevp(residuals) #essentially RMSE
         resid_mean = self.mean(residuals) # mean error
         resid_median = self.median(residuals) # median error
+        ##log.debug(f"residuals {residuals}")
+        ##log.debug(f"resid_stdev {resid_stdev}")
+        ##log.debug(f"resid_mean {resid_mean}")
+        ##log.debug(f"resid_median {resid_median}")
+        # modifed zscore using median to reduce influence of outliers
         zscores_residual_relative_to_median = [float(1.0 * (x-resid_median))/resid_stdev for x in residuals]
+        ##log.debug(f"zscores_residual_relative_to_median {zscores_residual_relative_to_median}")
 
         # only consider LOW variance
-        indices_of_outliers = [i for i,v in enumerate(zscores_residual_relative_to_median[:-1]) if v > outlier_threshold]
-        indices_of_outliers += [i for i,v in enumerate(list(self.sample_to_read_counts.values())[:-1]) if v==0]
+        indices_of_outliers = [i for i,v in enumerate(zscores_residual_relative_to_median) if v > outlier_threshold]
+        indices_of_outliers += [i for i,v in enumerate(list(self.sample_to_read_counts.values())) if v==0]
+        ##log.debug("self.sample_to_read_counts.keys() {}".format(self.sample_to_read_counts.keys()))
 
         names_of_outlier_samples = [(list(self.sample_to_read_counts.keys()))[i] for i in indices_of_outliers]
         log.warning("outlier samples")
         for s in names_of_outlier_samples:
@@ -1760,7 +1770,8 @@ def guess_barcodes_for_sample(self, sample_name):
         novel_barcode_pairs = []
 
         for barcode_pair in self.barcodes_seen.keys():
-            if barcode_pair not in self.sample_to_barcodes.values():
+            barcode = tuple((b for b in barcode_pair if b is not None))
+            if barcode not in self.sample_to_barcodes.values():
                 novel_barcode_pairs.append(barcode_pair)
             else:
                 del barcodes_seen_novel[barcode_pair]
@@ -1772,7 +1783,7 @@ def guess_barcodes_for_sample(self, sample_name):
         if is_dual_index:
             out_dict["expected_barcode_2"] = self.sample_to_barcodes[sample_name][1]
             out_dict["expected_barcode_2_name"] = ",".join(self.index_reference.guess_index(self.sample_to_barcodes[sample_name][1]))
-            out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name]
+        out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name]
 
         claimed_barcodes = self.sample_to_barcodes[sample_name]
@@ -1780,36 +1791,46 @@ def guess_barcodes_for_sample(self, sample_name):
         putative_match = None
 
         if is_dual_index:
-            # barcodes_seen_novel is sorted by read count, desc
+            # barcodes_seen_novel is sorted by read count, desc
             for (barcode_pair,count) in barcodes_seen_novel.items():
                 if barcode_pair[0]==claimed_barcodes[0] or barcode_pair[1]==claimed_barcodes[1]:
                     found_partial_match=True
                     putative_match = barcode_pair
                     out_dict["match_type"] = "one_barcode_match"
                     break
-
-            # find index of match to help determine if it is a reasonable guess
-            idx_of_match = -1
-            for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()):
-                if barcode_pair==putative_match:
-                    idx_of_match=idx
+        else:
+            # barcodes_seen_novel is sorted by read count, desc
+            for (barcode_pair,count) in barcodes_seen_novel.items():
+                # get barcode with greatest count that isn't the one we're looking for
+                if barcode_pair[0]!=claimed_barcodes[0]:
+                    found_partial_match=True
+                    putative_match = barcode_pair
+                    out_dict["match_type"] = "one_barcode_match"
                     break
 
+        # find index of match to help determine if it is a reasonable guess
+        idx_of_match = -1
+        for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()):
+            if barcode_pair==putative_match:
+                idx_of_match=idx
+                break
+
         # if the one-barcode match is too far down the list of barcode pairs seen
         # (farther down than 1.5x the number of samples)
         if not found_partial_match or idx_of_match>(len(self.sample_to_read_counts)*1.5):
             for (barcode_pair,count) in barcodes_seen_novel.items():
                 # return the most pair with greatest count
                 putative_match = barcode_pair
-                out_dict["match_type"] = "high_count_novel_pair"
+                out_dict["match_type"] = "high_count_novel_barcode"
                 break
 
         out_dict["guessed_barcode_1"] = "unknown"
         out_dict["guessed_barcode_1_name"] = "unknown"
+        #out_dict["guessed_barcodes_read_count"] = 0
         if is_dual_index and (putative_match is not None and putative_match[1] != None):
            out_dict["guessed_barcode_2"] = "unknown"
            out_dict["guessed_barcode_2_name"] = "unknown"
-            out_dict["guessed_barcodes_read_count"] = "0"
+
 
         if putative_match is not None and putative_match[0]!=None:
             out_dict["guessed_barcode_1"] = putative_match[0]
@@ -1868,7 +1889,7 @@ def clear_guessed_fields(sample,match_reason,fields_to_clear=None):
                 log.warning("Ambiguous! Multiple samples corresponding to guessed barcodes %s:", barcode_pair)
                 for sample in samples:
                     log.warning("\t%s expected (%s,%s); -> Guessed (%s,%s); match type: %s", sample["sample_name"], sample["expected_barcode_1"],sample.get("expected_barcode_2",""),sample["guessed_barcode_1"],sample.get("guessed_barcode_2",""),sample["match_type"])
-
+                    final_guesses.append(clear_guessed_fields(sample, "alternative_indices_uncertain"))
             else:
                 for sample in samples:
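
Notes on the change (illustrative sketches only; nothing below is part of the patch):

The illumina.py changes derive the index-read lengths from the Picard read structure (its numeric "B" segments, e.g. 101T8B8B101T for dual-index or 101T8B101T for single-index runs) instead of hard-coding two 8 bp barcodes when writing the sorted common-barcodes report. A minimal sketch of that idea, using a hypothetical helper split_barcode() that does not exist in illumina.py:

    import re

    def split_barcode(read_structure, barcode):
        # 'B' segments in a Picard read structure denote index (barcode) reads:
        # '101T8B8B101T' -> ['8', '8'] (dual index); '101T8B101T' -> ['8'] (single index)
        lengths = [int(n) for n in re.findall(r'(\d+)B', read_structure)]
        barcode1_len = lengths[0] if len(lengths) > 0 else 0
        # everything after the first index read is the second barcode (empty for single-index)
        return barcode[:barcode1_len], barcode[barcode1_len:]

    print(split_barcode('101T8B8B101T', 'CAAAAGATACGTACGT'))  # ('CAAAAGAT', 'ACGTACGT')
    print(split_barcode('101T8B101T', 'CAAAAGAT'))            # ('CAAAAGAT', '')

This mirrors the barcode1_len/barcode2_len values the patch passes into count_and_sort_barcodes(), where barcode[:barcode1_len] and barcode[barcode1_len:len(barcode)] replace the previous hard-coded barcode[:8] and barcode[8:].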
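
In util/illumina_indices.py, outlier_barcodes() stops treating the unassigned-read bucket as an extra pseudo-sample (the dropped +[self.unassigned_read_count] term and the [:-1] slices) and raises the default outlier_threshold from 0.675 to 0.775. The underlying technique is a median-centered ("modified") z-score over -log10 pool-fraction residuals. The sketch below is a condensed, hypothetical re-implementation for illustration only; flag_low_count_samples and the use of Python's statistics module are assumptions, not code from this repository:

    import math
    from statistics import median, pstdev

    def flag_low_count_samples(sample_to_read_counts, outlier_threshold=0.775,
                               number_of_negative_controls=1):
        num_samples = len(sample_to_read_counts)
        assert number_of_negative_controls < num_samples  # expected-fraction denominator must be positive
        total = sum(sample_to_read_counts.values())
        # observed -log10 fraction of the pool per sample (0 for samples with no reads)
        log_obs = [-math.log10(c / total) if c > 0 else 0
                   for c in sample_to_read_counts.values()]
        # expected -log10 fraction if reads were split evenly across non-control samples
        log_exp = [-math.log10(1.0 / (num_samples - number_of_negative_controls))] * num_samples
        residuals = [o - e for o, e in zip(log_obs, log_exp)]
        sd = pstdev(residuals)    # population standard deviation, like the class's stddevp()
        med = median(residuals)
        # samples with zero reads are always flagged, matching the explicit v==0 check
        flagged = set(name for name, c in sample_to_read_counts.items() if c == 0)
        if sd > 0:
            # median-centered z-scores damp the influence of extreme samples; a large
            # positive score means a sample received far fewer reads than an even split
            zscores = [(r - med) / sd for r in residuals]
            flagged.update(name for name, z in zip(sample_to_read_counts, zscores)
                           if z > outlier_threshold)
        return sorted(flagged)

For example, flag_low_count_samples({'s1': 500000, 's2': 480000, 's3': 1200, 'neg': 0}) returns ['neg', 's3']: the empty negative control plus the one sample whose read share falls far below an even split of the pool.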