diff --git a/freyja/utils.py b/freyja/utils.py index 6b314c81..ba5e17d5 100644 --- a/freyja/utils.py +++ b/freyja/utils.py @@ -862,49 +862,58 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, locDir, output): lineage_data = {lineage['name']: lineage for lineage in lineage_yml} alias_count = {} - merging_recomb = False collapsed_lineages = {} # collapse lineages into MRCA, where possible for tup in duplicates: pango_aliases = [lineage_data[lin]['alias'] for lin in tup] + alias_dict = {lineage_data[lin]['alias']: lin for lin in tup} - # handle case where recombinant and non-recombinant lins are merged - recomb_and_nonrecomb = len( + # handle cases where multiple lineage classes are being merged + # e.g. (A.5, B.12) or (XBB, XBN) + multiple_lin_classes = len( set([alias[0] for alias in pango_aliases])) > 1 - if recomb_and_nonrecomb: + + if multiple_lin_classes: + # for recombinant lineages, find the parent lineages + parent_aliases = [] for alias in pango_aliases: - if alias.startswith('X'): + + if 'recombinant_parents' in lineage_data[alias_dict[alias]]: + + # replace with its recombinant parents pango_aliases.remove(alias) - parents = lineage_data[alias]['recombinant_parents']\ + parents = lineage_data[alias]['recombinant_parents'] \ .replace('*', '').split(',') parents = [lineage_data[lin]['alias'] for lin in parents] - for alias in pango_aliases: - for parent in parents: - if alias.startswith(parent) and\ - parent not in pango_aliases: - pango_aliases.append(parent) - merging_recomb = True + # only consider parents related to the other lineages + for parent in parents: + if any([alias.startswith(parent) + for alias in pango_aliases]) and \ + parent not in pango_aliases: + parent_aliases.append(parent) + pango_aliases += parent_aliases + + # get MRCA mrca = os.path.commonpath( [lin.replace('.', '/') for lin in pango_aliases] ).replace('/', '.') - # attempting to collapse multiple recombinant lineages - if len(mrca) == 0 and all([alias.startswith('X') - for alias in pango_aliases]): - mrca = 'Recombinant' - merging_recomb = True + # assign placeholder if no MRCA found + if len(mrca) == 0: + mrca = 'Misc' else: + # otherwise, get the shortened alias, if available for lineage in lineage_data: if lineage_data[lineage]['alias'] == mrca: - mrca = lineage + # add flag to indicate that this is a merged lineage + mrca = lineage + '-like' break - # add flag to indicate that this is a merged lineage - mrca += '-like' + # include index for multiple barcode classes with same MRCA if mrca not in alias_count: alias_count[mrca] = 0 else: @@ -912,7 +921,6 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, locDir, output): mrca += f'({alias_count[mrca]})' collapsed_lineages[mrca] = list(tup) - df_barcodes = df_barcodes.rename({lin: mrca for lin in tup}) df_barcodes = df_barcodes.drop_duplicates() @@ -926,10 +934,6 @@ def collapse_barcodes(df_barcodes, df_depth, depthcutoff, locDir, output): yaml.dump(collapsed_lineages, f, default_flow_style=False) print(f'collapsed lineages saved to {output}') - if merging_recomb: - print('warning: Recombinant and non-recombinant lineage barcodes' - ' being merged based on available sequence coverage and' - ' --depthcutoff value. Solution may be inaccurate.') return df_barcodes