From 60997cd6a912136224b8d75f599c95bdfadc0beb Mon Sep 17 00:00:00 2001
From: joshuailevy <joshua.levy16@gmail.com>
Date: Fri, 8 Mar 2024 17:39:15 -0800
Subject: [PATCH 1/5] updates to collapse, summarization

---
 freyja/_cli.py          |  30 +++++++--
 freyja/sample_deconv.py |   6 +-
 freyja/utils.py         | 137 ++++++++++++++++++++++++++++++----------
 3 files changed, 135 insertions(+), 38 deletions(-)

diff --git a/freyja/_cli.py b/freyja/_cli.py
index 872ef43a..e659da35 100644
--- a/freyja/_cli.py
+++ b/freyja/_cli.py
@@ -58,9 +58,15 @@ def print_barcode_version(ctx, param, value):
 @click.option('--region_of_interest', default='', help='JSON file containing'
               'region(s) of interest for which to compute additional coverage'
               'estimates')
+@click.option('--nonstrictmrca', is_flag=True, default=False,
+              help='for use with depth cutoff,'
+              'clusters are assigned robust mrca to handle outliers')
+@click.option('--nonstrictthresh', default=0.9,
+              help='associated threshold for robust mrca function')
 def demix(variants, depths, output, eps, barcodes, meta,
           covcut, confirmedonly, depthcutoff, lineageyml,
-          adapt, a_eps, region_of_interest):
+          adapt, a_eps, region_of_interest,
+          nonstrictmrca, nonstrictthresh):
     """
     Generate prevalence of lineages per sample
 
@@ -85,7 +91,10 @@ def demix(variants, depths, output, eps, barcodes, meta,
      :param lineageyml: used to pass a custom lineage hierarchy file
      :param adapt: used to set adaptive lasso penalty parameter
      :param a_eps: used to set adaptive lasso
-     penalty parameter hard threshold
+     penalty parameter hard threshold'
+     :param nonstrictmrca: for use with depth cutoff,
+     clusters are assigned robust mrca to handle outliers
+     :param nonstrictthresh: associated threshold for robust mrca function
      :return : a tsv file that includes the
      lineages present,their corresponding abundances,
       and summarization by constellation.
@@ -114,7 +123,8 @@ def demix(variants, depths, output, eps, barcodes, meta,
     df_depth = pd.read_csv(depths, sep='\t', header=None, index_col=1)
     if depthcutoff != 0:
         df_barcodes = collapse_barcodes(df_barcodes, df_depth, depthcutoff,
-                                        lineageyml, locDir, output)
+                                        lineageyml, locDir, output,
+                                        nonstrictmrca, nonstrictthresh)
     muts = list(df_barcodes.columns)
     mapDict = buildLineageMap(meta)
     print('building mix/depth matrices')
@@ -382,8 +392,14 @@ def variants(bamfile, ref, variants, depths, refname, minq, annot):
 @click.option('--depthcutoff', default=0,
               help='exclude sites with coverage depth below this value and'
               'group identical barcodes')
+@click.option('--nonstrictmrca', is_flag=True, default=False,
+              help='for use with depth cutoff,'
+              'clusters are assigned robust mrca to handle outliers')
+@click.option('--nonstrictthresh', default=0.9,
+              help='associated threshold for robust mrca function')
 def boot(variants, depths, output_base, eps, barcodes, meta,
-         nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff, rawboots):
+         nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff,
+         rawboots, nonstrictmrca, nonstrictthresh):
     """
     Perform bootstrapping method for freyja
 
@@ -404,6 +420,9 @@ def boot(variants, depths, output_base, eps, barcodes, meta,
      :param lineageyml: used to pass a custom lineage hierarchy file
      :param depthcutoff: used to exclude sites with coverage depth
      below this value andgroup identical barcodes
+     :param nonstrictmrca: for use with depth cutoff,
+     clusters are assigned robust mrca to handle outliers
+     :param nonstrictthresh: associated threshold for robust mrca function
 
      :return : base-name_lineages.csv and base-name_summarized.csv
     """
@@ -435,7 +454,8 @@ def boot(variants, depths, output_base, eps, barcodes, meta,
     if depthcutoff != 0:
         df_barcodes = collapse_barcodes(
             df_barcodes, df_depths, depthcutoff,
-            lineageyml, locDir, output_base)
+            lineageyml, locDir, output_base,
+            nonstrictmrca, nonstrictthresh)
 
     muts = list(df_barcodes.columns)
     mapDict = buildLineageMap(meta)
diff --git a/freyja/sample_deconv.py b/freyja/sample_deconv.py
index 836cc16a..62334f6a 100644
--- a/freyja/sample_deconv.py
+++ b/freyja/sample_deconv.py
@@ -128,7 +128,11 @@ def reindex_dfs(df_barcodes, mix, depths):
 def map_to_constellation(sample_strains, vals, mapDict):
     # maps lineage names to constellations
     localDict = {}
-    for jj, lin in enumerate(sample_strains):
+    for jj, lin0 in enumerate(sample_strains):
+        if '-like' in lin0:
+            lin = lin0.split('-like')[0]
+        else:
+            lin = lin0
         if lin in mapDict.keys():
             if mapDict[lin] not in localDict.keys():
                 localDict[mapDict[lin]] = vals[jj]
diff --git a/freyja/utils.py b/freyja/utils.py
index d219049c..cc9ff2fc 100644
--- a/freyja/utils.py
+++ b/freyja/utils.py
@@ -205,7 +205,8 @@ def get_color_scheme(df, default_color_scheme, config=None):
     return color_scheme
 
 
-def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None):
+def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None,
+                    mergeLikes=False):
 
     if len(agg_d0.index[agg_d0.index.duplicated(keep=False)]) > 0:
         print('WARNING: multiple samples have the same ID/filename.')
@@ -234,7 +235,22 @@ def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None):
           .apply(lambda x:
                  re.sub(' +', ' ', x)
                    .split(' ')).copy()
-    # print([float(abund) for abund in agg_d0.iloc[0].loc['abundances']])
+
+    if mergeLikes:
+        agg_d0.loc[:, 'lineages'] = agg_d0['lineages'].apply(
+            lambda x: [x0.split('-like')[0] for x0 in x])
+        newLins, newAbunds = [], []
+        for lins, abunds in zip(agg_d0['lineages'], agg_d0['abundances']):
+            linsUnique, indicesUnique = np.unique(lins, return_inverse=True)
+            newLins.append(linsUnique)
+            newAbund = [0]*len(linsUnique)
+            for ind0, j in enumerate(indicesUnique):
+                newAbund[j] += float(abunds[ind0])
+            newAbunds.append(newAbund)
+        agg_d0['lineages'] = newLins
+        agg_d0['abundances'] = newAbunds
+        # agg_d0.loc[:,'abundances'] = agg_d0[['lineages']
+        # merge abundances
     agg_d0.loc[:, 'linDict'] = [{lin: float(abund) for lin, abund in
                                 zip(agg_d0.loc[samp, 'lineages'],
                                     agg_d0.loc[samp, 'abundances'])}
@@ -496,7 +512,6 @@ def makePlot_time(agg_df, lineages, times_df, interval, outputFn,
         weekInfo = [Week.fromdate(dfi).weektuple()
                     for dfi in df_abundances.index]
         df_abundances.index = [str(wi[0])+'-'+str(wi[1]) for wi in weekInfo]
-        print(df_abundances)
         for i in range(0, df_abundances.shape[1]):
             label = df_abundances.columns[i]
             ax.bar(df_abundances.index, df_abundances.iloc[:, i],
@@ -902,7 +917,8 @@ def make_dashboard(agg_df, meta_df, thresh, title, introText,
 
 
 def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
-                      lineageyml, locDir, output):
+                      lineageyml, locDir, output,
+                      nonstrict, nonstrictthresh):
     # drop low coverage sites
     low_cov_sites = df_depth[df_depth[3].astype(int) < depthcutoff] \
         .index.astype(str)
@@ -913,7 +929,6 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
     max_depth = df_depth[3].astype(int).max()
 
     # find lineages with identical barcodes
-
     try:
         duplicates = df_barcodes.groupby(df_barcodes.columns.tolist()).apply(
             lambda x: tuple(x.index) if len(x.index) > 1 else None
@@ -937,39 +952,97 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
     for tup in duplicates:
         pango_aliases = [lineage_data[lin]['alias']
                          for lin in tup]
-        alias_dict = {lineage_data[lin]['alias']: lin for lin in tup}
-
         # handle cases where multiple lineage classes are being merged
         # e.g. (A.5, B.12) or (XBB, XBN)
         multiple_lin_classes = len(
-            set([alias[0] for alias in pango_aliases])) > 1
+            set([alias.split('.')[0] for alias in pango_aliases])) > 1
 
         if multiple_lin_classes:
+            recombs = [alias for alias in pango_aliases
+                       if 'recombinant_parents' in
+                       lineage_data[alias.split('.')[0]]]
+
             # for recombinant lineages, find the parent lineages
-            parent_aliases = []
-            for alias in pango_aliases:
-
-                if 'recombinant_parents' in lineage_data[alias_dict[alias]]:
-
-                    # replace with its recombinant parents
-                    pango_aliases.remove(alias)
-                    parents = lineage_data[alias]['recombinant_parents'] \
-                        .replace('*', '').split(',')
-                    parents = [lineage_data[lin]['alias'] for lin in parents]
-
-                    # only consider parents related to the other lineages
-                    for parent in parents:
-                        if any([alias.startswith(parent)
-                                for alias in pango_aliases]) and \
-                                parent not in pango_aliases:
-                            parent_aliases.append(parent)
-
-            pango_aliases += parent_aliases
-
-        # get MRCA
-        mrca = os.path.commonpath(
-            [lin.replace('.', '/') for lin in pango_aliases]
-        ).replace('/', '.')
+            startTypes = set([alias.split('.')[0] for alias in pango_aliases])
+            # figure out which are the candidates for recomb merging
+            # if they exist
+            while len(recombs) > 0:
+                parent_aliases = []
+                for alias in recombs:
+                    if 'recombinant_parents' in lineage_data[alias.split('.')
+                                                             [0]]:
+                        # trace up tree until a recombination event.
+                        # grab parents of recombinant
+                        parents = lineage_data[alias.split('.')
+                                               [0]]['recombinant_parents'
+                                                    ].replace('*', ''
+                                                              ).split(',')
+                        parent_aliases.append([lineage_data[lin]['alias']
+                                               for lin in parents])
+
+                distinct = []
+                newRecombs = []
+                mergedIn = False
+                for alias, pa in zip(recombs, parent_aliases):
+                    for aliasP in pa:
+                        if aliasP.split('.')[0] in startTypes:
+                            mergedIn = True
+                            # if now using same start as others,
+                            # add to list of aliases
+                            pango_aliases.append(aliasP)
+                            if alias in pango_aliases:
+                                pango_aliases.remove(alias)
+                        elif 'recombinant_parents' in lineage_data[aliasP.
+                                                                   split('.')
+                                                                   [0]]:
+                            # check if it's a different recombinant
+                            newRecombs.append(aliasP)
+                        else:
+                            # non-recombinant, but not in current start types.
+                            distinct.append(aliasP)
+                if not mergedIn:
+                    # if no merges, remove the recombinants
+                    # and add in the parents
+                    for r in recombs:
+                        pango_aliases.remove(r)
+                    pango_aliases.extend(distinct+newRecombs)
+
+                startTypes = set([alias.split('.')[0]
+                                  for alias in pango_aliases])
+                if len(startTypes) == 1:
+                    break
+                recombs = [alias for alias in pango_aliases if
+                           'recombinant_parents' in
+                           lineage_data[alias.split('.')[0]]]
+
+        if not nonstrict:
+            mrca = os.path.commonpath(
+                [lin.replace('.', '/') for lin in pango_aliases]
+            ).replace('/', '.')
+        else:
+            j0 = 1
+            groupCt = float(len(pango_aliases))
+            ext_counts = np.unique([lin.split('.')[0:j0]
+                                    for lin in pango_aliases],
+                                   return_counts=True)
+            coherentFrac = np.max(ext_counts[1]) / groupCt
+            if coherentFrac < nonstrictthresh:
+                mrca = ''
+            else:
+                maxLength = np.max([len(lin.split('.'))
+                                    for lin in pango_aliases])
+                while coherentFrac >= nonstrictthresh and j0 <= maxLength:
+                    ext_counts = np.unique([lin.split('.')[0:j0]
+                                            if j0 <= len(lin.split('.'))
+                                            else lin.split('.') +
+                                            ['']*(j0-len(lin.split('.')))
+                                            for lin in pango_aliases],
+                                           return_counts=True,
+                                           axis=0)
+                    max_ind = np.argmax(ext_counts[1])
+                    coherentFrac = ext_counts[1][max_ind] / groupCt
+                    mrca = '.'.join(ext_counts[0][max_ind])
+                    j0 += 1
 
         # assign placeholder if no MRCA found
         if len(mrca) == 0:

From c157d5204475d3d0ceb78f828b263f6327d15854 Mon Sep 17 00:00:00 2001
From: joshuailevy <joshua.levy16@gmail.com>
Date: Fri, 8 Mar 2024 17:48:30 -0800
Subject: [PATCH 2/5] update utils test

---
 freyja/tests/test_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/freyja/tests/test_utils.py b/freyja/tests/test_utils.py
index e72fc5b8..e9712c91 100644
--- a/freyja/tests/test_utils.py
+++ b/freyja/tests/test_utils.py
@@ -411,7 +411,8 @@ def test_collapse_barcodes(self):
         original = barcodes.shape
         lineages_yml = 'freyja/data/lineages.yml'
         barcodes = collapse_barcodes(barcodes, self.depth,
-                                     100, lineages_yml, 'freyja', 'test')
+                                     100, lineages_yml, 'freyja', 'test',
+                                     False, 0.9)
         collapsed = barcodes.shape
 
         self.assertLess(collapsed[0], original[0])

From 5643087a9413c7236388bc20fcdf13ad45575ed4 Mon Sep 17 00:00:00 2001
From: joshuailevy <joshua.levy16@gmail.com>
Date: Fri, 8 Mar 2024 18:15:55 -0800
Subject: [PATCH 3/5] fix alias lookup issue

---
 freyja/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/freyja/utils.py b/freyja/utils.py
index cc9ff2fc..dd94bdeb 100644
--- a/freyja/utils.py
+++ b/freyja/utils.py
@@ -1048,6 +1048,8 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
         if len(mrca) == 0:
             mrca = 'Misc'
         else:
+            if mrca[-1] == '.':
+                mrca = mrca[0:(len(mrca)-1)]
             # otherwise, get the shortened alias, if available
             for lineage in lineage_data:
                 if lineage_data[lineage]['alias'] == mrca:

From 2995c762496274db6c82d00707b536956157bb96 Mon Sep 17 00:00:00 2001
From: joshuailevy <joshua.levy16@gmail.com>
Date: Mon, 18 Mar 2024 12:59:26 -0700
Subject: [PATCH 4/5] change option name

---
 freyja/_cli.py  | 24 ++++++++++++------------
 freyja/utils.py |  8 ++++----
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/freyja/_cli.py b/freyja/_cli.py
index e659da35..06ab9dac 100644
--- a/freyja/_cli.py
+++ b/freyja/_cli.py
@@ -58,15 +58,15 @@ def print_barcode_version(ctx, param, value):
 @click.option('--region_of_interest', default='', help='JSON file containing'
               'region(s) of interest for which to compute additional coverage'
               'estimates')
-@click.option('--nonstrictmrca', is_flag=True, default=False,
+@click.option('--relaxedmrca', is_flag=True, default=False,
               help='for use with depth cutoff,'
               'clusters are assigned robust mrca to handle outliers')
-@click.option('--nonstrictthresh', default=0.9,
+@click.option('--relaxedthresh', default=0.9,
               help='associated threshold for robust mrca function')
 def demix(variants, depths, output, eps, barcodes, meta,
           covcut, confirmedonly, depthcutoff, lineageyml,
           adapt, a_eps, region_of_interest,
-          nonstrictmrca, nonstrictthresh):
+          relaxedmrca, relaxedthresh):
     """
     Generate prevalence of lineages per sample
 
@@ -92,9 +92,9 @@ def demix(variants, depths, output, eps, barcodes, meta,
      :param adapt: used to set adaptive lasso penalty parameter
      :param a_eps: used to set adaptive lasso
      penalty parameter hard threshold'
-     :param nonstrictmrca: for use with depth cutoff,
+     :param relaxedmrca: for use with depth cutoff,
      clusters are assigned robust mrca to handle outliers
-     :param nonstrictthresh: associated threshold for robust mrca function
+     :param relaxedthresh: associated threshold for robust mrca function
      :return : a tsv file that includes the
      lineages present,their corresponding abundances,
       and summarization by constellation.
@@ -124,7 +124,7 @@ def demix(variants, depths, output, eps, barcodes, meta,
     if depthcutoff != 0:
         df_barcodes = collapse_barcodes(df_barcodes, df_depth, depthcutoff,
                                         lineageyml, locDir, output,
-                                        nonstrictmrca, nonstrictthresh)
+                                        relaxedmrca, relaxedthresh)
     muts = list(df_barcodes.columns)
     mapDict = buildLineageMap(meta)
     print('building mix/depth matrices')
@@ -392,14 +392,14 @@ def variants(bamfile, ref, variants, depths, refname, minq, annot):
 @click.option('--depthcutoff', default=0,
               help='exclude sites with coverage depth below this value and'
               'group identical barcodes')
-@click.option('--nonstrictmrca', is_flag=True, default=False,
+@click.option('--relaxedmrca', is_flag=True, default=False,
               help='for use with depth cutoff,'
               'clusters are assigned robust mrca to handle outliers')
-@click.option('--nonstrictthresh', default=0.9,
+@click.option('--relaxedthresh', default=0.9,
               help='associated threshold for robust mrca function')
 def boot(variants, depths, output_base, eps, barcodes, meta,
          nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff,
-         rawboots, nonstrictmrca, nonstrictthresh):
+         rawboots, relaxedmrca, relaxedthresh):
     """
     Perform bootstrapping method for freyja
 
@@ -420,9 +420,9 @@ def boot(variants, depths, output_base, eps, barcodes, meta,
      :param lineageyml: used to pass a custom lineage hierarchy file
      :param depthcutoff: used to exclude sites with coverage depth
      below this value andgroup identical barcodes
-     :param nonstrictmrca: for use with depth cutoff,
+     :param relaxedmrca: for use with depth cutoff,
      clusters are assigned robust mrca to handle outliers
-     :param nonstrictthresh: associated threshold for robust mrca function
+     :param relaxedthresh: associated threshold for robust mrca function
 
      :return : base-name_lineages.csv and base-name_summarized.csv
     """
@@ -455,7 +455,7 @@ def boot(variants, depths, output_base, eps, barcodes, meta,
         df_barcodes = collapse_barcodes(
             df_barcodes, df_depths, depthcutoff,
             lineageyml, locDir, output_base,
-            nonstrictmrca, nonstrictthresh)
+            relaxedmrca, relaxedthresh)
 
     muts = list(df_barcodes.columns)
     mapDict = buildLineageMap(meta)
diff --git a/freyja/utils.py b/freyja/utils.py
index dd94bdeb..8eee276a 100644
--- a/freyja/utils.py
+++ b/freyja/utils.py
@@ -918,7 +918,7 @@ def make_dashboard(agg_df, meta_df, thresh, title, introText,
 
 def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
                       lineageyml, locDir, output,
-                      nonstrict, nonstrictthresh):
+                      relaxed, relaxedthresh):
     # drop low coverage sites
     low_cov_sites = df_depth[df_depth[3].astype(int) < depthcutoff] \
         .index.astype(str)
@@ -1015,7 +1015,7 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
                            'recombinant_parents' in
                            lineage_data[alias.split('.')[0]]]
 
-        if not nonstrict:
+        if not relaxed:
             mrca = os.path.commonpath(
                 [lin.replace('.', '/') for lin in pango_aliases]
             ).replace('/', '.')
@@ -1026,12 +1026,12 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
                                     for lin in pango_aliases],
                                    return_counts=True)
             coherentFrac = np.max(ext_counts[1]) / groupCt
-            if coherentFrac < nonstrictthresh:
+            if coherentFrac < relaxedthresh:
                 mrca = ''
             else:
                 maxLength = np.max([len(lin.split('.'))
                                     for lin in pango_aliases])
-                while coherentFrac >= nonstrictthresh and j0 <= maxLength:
+                while coherentFrac >= relaxedthresh and j0 <= maxLength:
                     ext_counts = np.unique([lin.split('.')[0:j0]
                                             if j0 <= len(lin.split('.'))
                                             else lin.split('.') +

From 9ca18776e685e94865ed405d93ed0c5b40ff6a34 Mon Sep 17 00:00:00 2001
From: joshuailevy <joshua.levy16@gmail.com>
Date: Mon, 18 Mar 2024 13:32:56 -0700
Subject: [PATCH 5/5] add option to check if hierarchy file is out of date

---
 freyja/utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/freyja/utils.py b/freyja/utils.py
index 8eee276a..bdc818fa 100644
--- a/freyja/utils.py
+++ b/freyja/utils.py
@@ -950,8 +950,13 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff,
 
     # collapse lineages into MRCA, where possible
     for tup in duplicates:
-        pango_aliases = [lineage_data[lin]['alias']
-                         for lin in tup]
+        try:
+            pango_aliases = [lineage_data[lin]['alias']
+                             for lin in tup]
+        except KeyError:
+            print('Lineage hierarchy file is likely behind'
+                  ' the selected barcode file. Try updating'
+                  ' the hierarchy file.')
         # handle cases where multiple lineage classes are being merged
         # e.g. (A.5, B.12) or (XBB, XBN)
         multiple_lin_classes = len(