From 60997cd6a912136224b8d75f599c95bdfadc0beb Mon Sep 17 00:00:00 2001 From: joshuailevy Date: Fri, 8 Mar 2024 17:39:15 -0800 Subject: [PATCH 1/5] updates to collapse, summarization --- freyja/_cli.py | 30 +++++++-- freyja/sample_deconv.py | 6 +- freyja/utils.py | 137 ++++++++++++++++++++++++++++++---------- 3 files changed, 135 insertions(+), 38 deletions(-) diff --git a/freyja/_cli.py b/freyja/_cli.py index 872ef43a..e659da35 100644 --- a/freyja/_cli.py +++ b/freyja/_cli.py @@ -58,9 +58,15 @@ def print_barcode_version(ctx, param, value): @click.option('--region_of_interest', default='', help='JSON file containing' 'region(s) of interest for which to compute additional coverage' 'estimates') +@click.option('--nonstrictmrca', is_flag=True, default=False, + help='for use with depth cutoff,' + 'clusters are assigned robust mrca to handle outliers') +@click.option('--nonstrictthresh', default=0.9, + help='associated threshold for robust mrca function') def demix(variants, depths, output, eps, barcodes, meta, covcut, confirmedonly, depthcutoff, lineageyml, - adapt, a_eps, region_of_interest): + adapt, a_eps, region_of_interest, + nonstrictmrca, nonstrictthresh): """ Generate prevalence of lineages per sample @@ -85,7 +91,10 @@ def demix(variants, depths, output, eps, barcodes, meta, :param lineageyml: used to pass a custom lineage hierarchy file :param adapt: used to set adaptive lasso penalty parameter :param a_eps: used to set adaptive lasso - penalty parameter hard threshold + penalty parameter hard threshold' + :param nonstrictmrca: for use with depth cutoff, + clusters are assigned robust mrca to handle outliers + :param nonstrictthresh: associated threshold for robust mrca function :return : a tsv file that includes the lineages present,their corresponding abundances, and summarization by constellation. @@ -114,7 +123,8 @@ def demix(variants, depths, output, eps, barcodes, meta, df_depth = pd.read_csv(depths, sep='\t', header=None, index_col=1) if depthcutoff != 0: df_barcodes = collapse_barcodes(df_barcodes, df_depth, depthcutoff, - lineageyml, locDir, output) + lineageyml, locDir, output, + nonstrictmrca, nonstrictthresh) muts = list(df_barcodes.columns) mapDict = buildLineageMap(meta) print('building mix/depth matrices') @@ -382,8 +392,14 @@ def variants(bamfile, ref, variants, depths, refname, minq, annot): @click.option('--depthcutoff', default=0, help='exclude sites with coverage depth below this value and' 'group identical barcodes') +@click.option('--nonstrictmrca', is_flag=True, default=False, + help='for use with depth cutoff,' + 'clusters are assigned robust mrca to handle outliers') +@click.option('--nonstrictthresh', default=0.9, + help='associated threshold for robust mrca function') def boot(variants, depths, output_base, eps, barcodes, meta, - nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff, rawboots): + nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff, + rawboots, nonstrictmrca, nonstrictthresh): """ Perform bootstrapping method for freyja @@ -404,6 +420,9 @@ def boot(variants, depths, output_base, eps, barcodes, meta, :param lineageyml: used to pass a custom lineage hierarchy file :param depthcutoff: used to exclude sites with coverage depth below this value andgroup identical barcodes + :param nonstrictmrca: for use with depth cutoff, + clusters are assigned robust mrca to handle outliers + :param nonstrictthresh: associated threshold for robust mrca function :return : base-name_lineages.csv and base-name_summarized.csv """ @@ -435,7 +454,8 @@ def boot(variants, depths, output_base, eps, barcodes, meta, if depthcutoff != 0: df_barcodes = collapse_barcodes( df_barcodes, df_depths, depthcutoff, - lineageyml, locDir, output_base) + lineageyml, locDir, output_base, + nonstrictmrca, nonstrictthresh) muts = list(df_barcodes.columns) mapDict = buildLineageMap(meta) diff --git a/freyja/sample_deconv.py b/freyja/sample_deconv.py index 836cc16a..62334f6a 100644 --- a/freyja/sample_deconv.py +++ b/freyja/sample_deconv.py @@ -128,7 +128,11 @@ def reindex_dfs(df_barcodes, mix, depths): def map_to_constellation(sample_strains, vals, mapDict): # maps lineage names to constellations localDict = {} - for jj, lin in enumerate(sample_strains): + for jj, lin0 in enumerate(sample_strains): + if '-like' in lin0: + lin = lin0.split('-like')[0] + else: + lin = lin0 if lin in mapDict.keys(): if mapDict[lin] not in localDict.keys(): localDict[mapDict[lin]] = vals[jj] diff --git a/freyja/utils.py b/freyja/utils.py index d219049c..cc9ff2fc 100644 --- a/freyja/utils.py +++ b/freyja/utils.py @@ -205,7 +205,8 @@ def get_color_scheme(df, default_color_scheme, config=None): return color_scheme -def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None): +def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None, + mergeLikes=False): if len(agg_d0.index[agg_d0.index.duplicated(keep=False)]) > 0: print('WARNING: multiple samples have the same ID/filename.') @@ -234,7 +235,22 @@ def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None): .apply(lambda x: re.sub(' +', ' ', x) .split(' ')).copy() - # print([float(abund) for abund in agg_d0.iloc[0].loc['abundances']]) + + if mergeLikes: + agg_d0.loc[:, 'lineages'] = agg_d0['lineages'].apply( + lambda x: [x0.split('-like')[0] for x0 in x]) + newLins, newAbunds = [], [] + for lins, abunds in zip(agg_d0['lineages'], agg_d0['abundances']): + linsUnique, indicesUnique = np.unique(lins, return_inverse=True) + newLins.append(linsUnique) + newAbund = [0]*len(linsUnique) + for ind0, j in enumerate(indicesUnique): + newAbund[j] += float(abunds[ind0]) + newAbunds.append(newAbund) + agg_d0['lineages'] = newLins + agg_d0['abundances'] = newAbunds + # agg_d0.loc[:,'abundances'] = agg_d0[['lineages'] + # merge abundances agg_d0.loc[:, 'linDict'] = [{lin: float(abund) for lin, abund in zip(agg_d0.loc[samp, 'lineages'], agg_d0.loc[samp, 'abundances'])} @@ -496,7 +512,6 @@ def makePlot_time(agg_df, lineages, times_df, interval, outputFn, weekInfo = [Week.fromdate(dfi).weektuple() for dfi in df_abundances.index] df_abundances.index = [str(wi[0])+'-'+str(wi[1]) for wi in weekInfo] - print(df_abundances) for i in range(0, df_abundances.shape[1]): label = df_abundances.columns[i] ax.bar(df_abundances.index, df_abundances.iloc[:, i], @@ -902,7 +917,8 @@ def make_dashboard(agg_df, meta_df, thresh, title, introText, def collapse_barcodes(df_barcodes, df_depth, depthcutoff, - lineageyml, locDir, output): + lineageyml, locDir, output, + nonstrict, nonstrictthresh): # drop low coverage sites low_cov_sites = df_depth[df_depth[3].astype(int) < depthcutoff] \ .index.astype(str) @@ -913,7 +929,6 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, max_depth = df_depth[3].astype(int).max() # find lineages with identical barcodes - try: duplicates = df_barcodes.groupby(df_barcodes.columns.tolist()).apply( lambda x: tuple(x.index) if len(x.index) > 1 else None @@ -937,39 +952,97 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, for tup in duplicates: pango_aliases = [lineage_data[lin]['alias'] for lin in tup] - alias_dict = {lineage_data[lin]['alias']: lin for lin in tup} - # handle cases where multiple lineage classes are being merged # e.g. (A.5, B.12) or (XBB, XBN) multiple_lin_classes = len( - set([alias[0] for alias in pango_aliases])) > 1 + set([alias.split('.')[0] for alias in pango_aliases])) > 1 if multiple_lin_classes: + recombs = [alias for alias in pango_aliases + if 'recombinant_parents' in + lineage_data[alias.split('.')[0]]] + # for recombinant lineages, find the parent lineages - parent_aliases = [] - for alias in pango_aliases: - - if 'recombinant_parents' in lineage_data[alias_dict[alias]]: - - # replace with its recombinant parents - pango_aliases.remove(alias) - parents = lineage_data[alias]['recombinant_parents'] \ - .replace('*', '').split(',') - parents = [lineage_data[lin]['alias'] for lin in parents] - - # only consider parents related to the other lineages - for parent in parents: - if any([alias.startswith(parent) - for alias in pango_aliases]) and \ - parent not in pango_aliases: - parent_aliases.append(parent) - - pango_aliases += parent_aliases - - # get MRCA - mrca = os.path.commonpath( - [lin.replace('.', '/') for lin in pango_aliases] - ).replace('/', '.') + startTypes = set([alias.split('.')[0] for alias in pango_aliases]) + # figure out which are the candidates for recomb merging + # if they exist + while len(recombs) > 0: + parent_aliases = [] + for alias in recombs: + if 'recombinant_parents' in lineage_data[alias.split('.') + [0]]: + # trace up tree until a recombination event. + # grab parents of recombinant + parents = lineage_data[alias.split('.') + [0]]['recombinant_parents' + ].replace('*', '' + ).split(',') + parent_aliases.append([lineage_data[lin]['alias'] + for lin in parents]) + + distinct = [] + newRecombs = [] + mergedIn = False + for alias, pa in zip(recombs, parent_aliases): + for aliasP in pa: + if aliasP.split('.')[0] in startTypes: + mergedIn = True + # if now using same start as others, + # add to list of aliases + pango_aliases.append(aliasP) + if alias in pango_aliases: + pango_aliases.remove(alias) + elif 'recombinant_parents' in lineage_data[aliasP. + split('.') + [0]]: + # check if it's a different recombinant + newRecombs.append(aliasP) + else: + # non-recombinant, but not in current start types. + distinct.append(aliasP) + if not mergedIn: + # if no merges, remove the recombinants + # and add in the parents + for r in recombs: + pango_aliases.remove(r) + pango_aliases.extend(distinct+newRecombs) + + startTypes = set([alias.split('.')[0] + for alias in pango_aliases]) + if len(startTypes) == 1: + break + recombs = [alias for alias in pango_aliases if + 'recombinant_parents' in + lineage_data[alias.split('.')[0]]] + + if not nonstrict: + mrca = os.path.commonpath( + [lin.replace('.', '/') for lin in pango_aliases] + ).replace('/', '.') + else: + j0 = 1 + groupCt = float(len(pango_aliases)) + ext_counts = np.unique([lin.split('.')[0:j0] + for lin in pango_aliases], + return_counts=True) + coherentFrac = np.max(ext_counts[1]) / groupCt + if coherentFrac < nonstrictthresh: + mrca = '' + else: + maxLength = np.max([len(lin.split('.')) + for lin in pango_aliases]) + while coherentFrac >= nonstrictthresh and j0 <= maxLength: + ext_counts = np.unique([lin.split('.')[0:j0] + if j0 <= len(lin.split('.')) + else lin.split('.') + + ['']*(j0-len(lin.split('.'))) + for lin in pango_aliases], + return_counts=True, + axis=0) + max_ind = np.argmax(ext_counts[1]) + coherentFrac = ext_counts[1][max_ind] / groupCt + mrca = '.'.join(ext_counts[0][max_ind]) + j0 += 1 # assign placeholder if no MRCA found if len(mrca) == 0: From c157d5204475d3d0ceb78f828b263f6327d15854 Mon Sep 17 00:00:00 2001 From: joshuailevy Date: Fri, 8 Mar 2024 17:48:30 -0800 Subject: [PATCH 2/5] update utils test --- freyja/tests/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/freyja/tests/test_utils.py b/freyja/tests/test_utils.py index e72fc5b8..e9712c91 100644 --- a/freyja/tests/test_utils.py +++ b/freyja/tests/test_utils.py @@ -411,7 +411,8 @@ def test_collapse_barcodes(self): original = barcodes.shape lineages_yml = 'freyja/data/lineages.yml' barcodes = collapse_barcodes(barcodes, self.depth, - 100, lineages_yml, 'freyja', 'test') + 100, lineages_yml, 'freyja', 'test', + False, 0.9) collapsed = barcodes.shape self.assertLess(collapsed[0], original[0]) From 5643087a9413c7236388bc20fcdf13ad45575ed4 Mon Sep 17 00:00:00 2001 From: joshuailevy Date: Fri, 8 Mar 2024 18:15:55 -0800 Subject: [PATCH 3/5] fix alias lookup issue --- freyja/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/freyja/utils.py b/freyja/utils.py index cc9ff2fc..dd94bdeb 100644 --- a/freyja/utils.py +++ b/freyja/utils.py @@ -1048,6 +1048,8 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, if len(mrca) == 0: mrca = 'Misc' else: + if mrca[-1] == '.': + mrca = mrca[0:(len(mrca)-1)] # otherwise, get the shortened alias, if available for lineage in lineage_data: if lineage_data[lineage]['alias'] == mrca: From 2995c762496274db6c82d00707b536956157bb96 Mon Sep 17 00:00:00 2001 From: joshuailevy Date: Mon, 18 Mar 2024 12:59:26 -0700 Subject: [PATCH 4/5] change option name --- freyja/_cli.py | 24 ++++++++++++------------ freyja/utils.py | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/freyja/_cli.py b/freyja/_cli.py index e659da35..06ab9dac 100644 --- a/freyja/_cli.py +++ b/freyja/_cli.py @@ -58,15 +58,15 @@ def print_barcode_version(ctx, param, value): @click.option('--region_of_interest', default='', help='JSON file containing' 'region(s) of interest for which to compute additional coverage' 'estimates') -@click.option('--nonstrictmrca', is_flag=True, default=False, +@click.option('--relaxedmrca', is_flag=True, default=False, help='for use with depth cutoff,' 'clusters are assigned robust mrca to handle outliers') -@click.option('--nonstrictthresh', default=0.9, +@click.option('--relaxedthresh', default=0.9, help='associated threshold for robust mrca function') def demix(variants, depths, output, eps, barcodes, meta, covcut, confirmedonly, depthcutoff, lineageyml, adapt, a_eps, region_of_interest, - nonstrictmrca, nonstrictthresh): + relaxedmrca, relaxedthresh): """ Generate prevalence of lineages per sample @@ -92,9 +92,9 @@ def demix(variants, depths, output, eps, barcodes, meta, :param adapt: used to set adaptive lasso penalty parameter :param a_eps: used to set adaptive lasso penalty parameter hard threshold' - :param nonstrictmrca: for use with depth cutoff, + :param relaxedmrca: for use with depth cutoff, clusters are assigned robust mrca to handle outliers - :param nonstrictthresh: associated threshold for robust mrca function + :param relaxedthresh: associated threshold for robust mrca function :return : a tsv file that includes the lineages present,their corresponding abundances, and summarization by constellation. @@ -124,7 +124,7 @@ def demix(variants, depths, output, eps, barcodes, meta, if depthcutoff != 0: df_barcodes = collapse_barcodes(df_barcodes, df_depth, depthcutoff, lineageyml, locDir, output, - nonstrictmrca, nonstrictthresh) + relaxedmrca, relaxedthresh) muts = list(df_barcodes.columns) mapDict = buildLineageMap(meta) print('building mix/depth matrices') @@ -392,14 +392,14 @@ def variants(bamfile, ref, variants, depths, refname, minq, annot): @click.option('--depthcutoff', default=0, help='exclude sites with coverage depth below this value and' 'group identical barcodes') -@click.option('--nonstrictmrca', is_flag=True, default=False, +@click.option('--relaxedmrca', is_flag=True, default=False, help='for use with depth cutoff,' 'clusters are assigned robust mrca to handle outliers') -@click.option('--nonstrictthresh', default=0.9, +@click.option('--relaxedthresh', default=0.9, help='associated threshold for robust mrca function') def boot(variants, depths, output_base, eps, barcodes, meta, nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff, - rawboots, nonstrictmrca, nonstrictthresh): + rawboots, relaxedmrca, relaxedthresh): """ Perform bootstrapping method for freyja @@ -420,9 +420,9 @@ def boot(variants, depths, output_base, eps, barcodes, meta, :param lineageyml: used to pass a custom lineage hierarchy file :param depthcutoff: used to exclude sites with coverage depth below this value andgroup identical barcodes - :param nonstrictmrca: for use with depth cutoff, + :param relaxedmrca: for use with depth cutoff, clusters are assigned robust mrca to handle outliers - :param nonstrictthresh: associated threshold for robust mrca function + :param relaxedthresh: associated threshold for robust mrca function :return : base-name_lineages.csv and base-name_summarized.csv """ @@ -455,7 +455,7 @@ def boot(variants, depths, output_base, eps, barcodes, meta, df_barcodes = collapse_barcodes( df_barcodes, df_depths, depthcutoff, lineageyml, locDir, output_base, - nonstrictmrca, nonstrictthresh) + relaxedmrca, relaxedthresh) muts = list(df_barcodes.columns) mapDict = buildLineageMap(meta) diff --git a/freyja/utils.py b/freyja/utils.py index dd94bdeb..8eee276a 100644 --- a/freyja/utils.py +++ b/freyja/utils.py @@ -918,7 +918,7 @@ def make_dashboard(agg_df, meta_df, thresh, title, introText, def collapse_barcodes(df_barcodes, df_depth, depthcutoff, lineageyml, locDir, output, - nonstrict, nonstrictthresh): + relaxed, relaxedthresh): # drop low coverage sites low_cov_sites = df_depth[df_depth[3].astype(int) < depthcutoff] \ .index.astype(str) @@ -1015,7 +1015,7 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, 'recombinant_parents' in lineage_data[alias.split('.')[0]]] - if not nonstrict: + if not relaxed: mrca = os.path.commonpath( [lin.replace('.', '/') for lin in pango_aliases] ).replace('/', '.') @@ -1026,12 +1026,12 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, for lin in pango_aliases], return_counts=True) coherentFrac = np.max(ext_counts[1]) / groupCt - if coherentFrac < nonstrictthresh: + if coherentFrac < relaxedthresh: mrca = '' else: maxLength = np.max([len(lin.split('.')) for lin in pango_aliases]) - while coherentFrac >= nonstrictthresh and j0 <= maxLength: + while coherentFrac >= relaxedthresh and j0 <= maxLength: ext_counts = np.unique([lin.split('.')[0:j0] if j0 <= len(lin.split('.')) else lin.split('.') + From 9ca18776e685e94865ed405d93ed0c5b40ff6a34 Mon Sep 17 00:00:00 2001 From: joshuailevy Date: Mon, 18 Mar 2024 13:32:56 -0700 Subject: [PATCH 5/5] add option to check if hierarchy file is out of date --- freyja/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/freyja/utils.py b/freyja/utils.py index 8eee276a..bdc818fa 100644 --- a/freyja/utils.py +++ b/freyja/utils.py @@ -950,8 +950,13 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, # collapse lineages into MRCA, where possible for tup in duplicates: - pango_aliases = [lineage_data[lin]['alias'] - for lin in tup] + try: + pango_aliases = [lineage_data[lin]['alias'] + for lin in tup] + except KeyError: + print('Lineage hierarchy file is likely behind' + ' the selected barcode file. Try updating' + ' the hierarchy file.') # handle cases where multiple lineage classes are being merged # e.g. (A.5, B.12) or (XBB, XBN) multiple_lin_classes = len(