ablab · albidgy · Feb 13, 2023 · Feb 13, 2023 · Feb 13, 2023 · Feb 19, 2023
diff --git a/README.md b/README.md
@@ -2,10 +2,11 @@
 
 ### Genome assembly evaluation tool
 
-QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics.
+QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome/diploid assemblies by computing various metrics.
 The current QUAST toolkit includes the general QUAST tool for genome assemblies, 
 MetaQUAST, the extension for metagenomic datasets, 
-QUAST-LG, the extension for large genomes (e.g., mammalians), and Icarus, the interactive visualizer for these tools.
+QUAST-LG, the extension for large genomes (e.g., mammalians),
+dipQUAST, the extension for diploid assemblies, and Icarus, the interactive visualizer for these tools.
 
 The QUAST package works both with and without reference genomes. 
 However, it is much more informative if at least a close reference genome is provided along with the assemblies.
@@ -99,10 +100,12 @@ or MetaGeneMark (for metagenomes).
 
 When a reference is given:
 
-* Numbers of misassemblies of different kinds (inversions, relocations, translocations, interspecies translocations (metaQUAST only) or local).
+* Numbers of misassemblies of different kinds (inversions, relocations, translocations, interspecies translocations (metaQUAST only), interhaplotype translocations (dipQUAST only) or local).
 * Number and total length of unaligned contigs.  
-* Numbers of mismatches and indels, over the assembly and per 100 kb.  
-* Genome fraction %, assembled part of the reference.  
+* Numbers of mismatches and indels, over the assembly and per 100 kb.
+* Number of cases of missed heterozygosity (dipQUAST only).
+* Genome fraction %, assembled part of the reference.
+* Genome fraction % of haplotype 1 / haplotype 2, assembled parts of each haplotype reference (dipQUAST only).
 * Duplication ratio, the total number of aligned bases in the assembly divided by the total number of those in the reference. 
 If the assembly contains many contigs that cover the same regions, its duplication ratio will significantly exceed 1. 
 This occurs due to multiple reasons, including overestimating repeat multiplicities and overlaps between contigs.  

diff --git a/dipquast.py b/dipquast.py
@@ -0,0 +1 @@
+quast.py
diff --git a/manual.html b/manual.html
diff --git a/quast.py b/quast.py
@@ -12,7 +12,7 @@
 import sys
 import shutil
 
-from quast_libs import qconfig
+from quast_libs import qconfig, diputils
 
 qconfig.check_python_version()
 
@@ -132,6 +132,12 @@ def main(args):
 
     qconfig.assemblies_fpaths = contigs_fpaths
 
+    # Fill dip_dict and calculate length of haplotypes
+    if qconfig.ploid_mode:
+        diputils.dip_genome_by_chr = diputils.fill_dip_dict_by_chromosomes(ref_fpath)
+        diputils.length_of_haplotypes = diputils.get_haplotypes_len(ref_fpath)
+        diputils.ploid_aligned = dict(zip(diputils.dip_genome_by_chr.keys(), [0]*len(diputils.dip_genome_by_chr.keys())))
+
     # Where all pdfs will be saved
     all_pdf_fpath = None
     if qconfig.draw_plots and plotter.can_draw_plots:
@@ -161,9 +167,59 @@ def main(args):
         ########################################################################
         from quast_libs import contigs_analyzer
         is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
+
+        # Counting cases of missed heterozygosity
+        if qconfig.ploid_mode and (not qconfig.large_genome) and qconfig.show_snps:
+            aux_files_dirpath = os.path.join(output_dirpath, 'dipquast_auxiliary_files')
+            if not os.path.isdir(aux_files_dirpath):
+                os.mkdir(aux_files_dirpath)
+
+            # Creating separate reference files for each haplotype
+            ref1_fpath, ref2_fpath = diputils.split_ref_file_by_haplotypes(ref_fpath, diputils.dip_genome_by_chr,
+                                                                           aux_files_dirpath)
+            ref1_fpaths, old_ref1_fpaths = qutils.correct_contigs([ref1_fpath], corrected_dirpath, ['haplotype_1'],
+                                                                  reporting)
+            ref2_fpaths, old_ref2_fpaths = qutils.correct_contigs([ref2_fpath], corrected_dirpath, ['haplotype_2'],
+                                                                  reporting)
+            ref1_fpath, ref2_fpath = ref1_fpaths[0], ref2_fpaths[0]
+
+            # Making initial .used_snps file
+            snps_fpath = contigs_analyzer.do_ref_to_ref_align(
+                ref_fpath, contigs_fpaths, is_cyclic,
+                os.path.join(aux_files_dirpath, qconfig.detailed_contigs_reports_dirname),
+                old_contigs_fpaths, qconfig.bed)
+
+            # Changing ploid_mode temporarily in order to align haplotypes to one another
+            qconfig.ploid_mode = False # Check!
+
+            # Aligning haplotype_1 to haplotype_2:
+            snps_ref1_to_ref2_fpath = contigs_analyzer.do_ref_to_ref_align(
+                ref2_fpath, ref1_fpaths, is_cyclic,
+                os.path.join(aux_files_dirpath, qconfig.detailed_contigs_reports_dirname + '_ref1_to_ref2'),
+                old_ref1_fpaths, qconfig.bed)
+
+            # Aligning haplotype_2 to haplotype_1:
+            snps_ref2_to_ref1_fpath = contigs_analyzer.do_ref_to_ref_align(
+                ref1_fpath, ref2_fpaths, is_cyclic,
+                os.path.join(aux_files_dirpath, qconfig.detailed_contigs_reports_dirname + '_ref2_to_ref1'),
+                old_ref2_fpaths, qconfig.bed)
+
+            # Remove ref to ref alignments from assembly_fpaths
+            reporting.assembly_fpaths.remove(ref1_fpath)
+            reporting.assembly_fpaths.remove(ref2_fpath)
+
+            # Counting cases of missed heterozygosity
+            diputils.n_missed_heterozygosity = diputils.count_missed_heterozygosity(snps_fpath[0],
+                                                                                    snps_ref1_to_ref2_fpath[0],
+                                                                                    snps_ref2_to_ref1_fpath[0])
+
+            # Changing ploid_mode back
+            qconfig.ploid_mode = True
+
         aligner_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
-            ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, qconfig.detailed_contigs_reports_dirname),
-            old_contigs_fpaths, qconfig.bed)
+            ref_fpath, contigs_fpaths, is_cyclic,
+            os.path.join(output_dirpath, qconfig.detailed_contigs_reports_dirname), old_contigs_fpaths, qconfig.bed)
+
         for contigs_fpath in contigs_fpaths:
             if aligner_statuses[contigs_fpath] == contigs_analyzer.AlignerStatus.OK:
                 aligned_contigs_fpaths.append(contigs_fpath)
@@ -303,7 +359,6 @@ def main(args):
     cleanup(corrected_dirpath)
     return logger.finish_up(check_test=qconfig.test)
 
-
 if __name__ == '__main__':
     try:
         return_code = main(sys.argv[1:])

diff --git a/quast_libs/basic_stats.py b/quast_libs/basic_stats.py
@@ -14,6 +14,8 @@
 from quast_libs import fastaparser, qconfig, qutils, reporting, plotter
 from quast_libs.circos import set_window_size
 from quast_libs.log import get_logger
+from quast_libs.diputils import length_of_haplotypes
+
 logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)
 MIN_HISTOGRAM_POINTS = 5
 MIN_GC_WINDOW_SIZE = qconfig.GC_window_size // 2
@@ -324,6 +326,12 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
         if ref_fpath:
             report.add_field(reporting.Fields.REFLEN, int(reference_length))
             report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
+
+            if qconfig.ploid_mode:
+                report.add_field(reporting.Fields.REFLEN_HAPLOTYPES,
+                                 [int(l) for l in length_of_haplotypes.values()])
+
+
             if not qconfig.is_combined_ref:
                 report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
         elif reference_length:

diff --git a/quast_libs/ca_utils/analyze_contigs.py b/quast_libs/ca_utils/analyze_contigs.py
@@ -10,6 +10,7 @@
 from quast_libs.ca_utils.analyze_misassemblies import process_misassembled_contig, IndelsInfo, find_all_sv, Misassembly
 from quast_libs.ca_utils.best_set_selection import get_best_aligns_sets, get_used_indexes, score_single_align
 from quast_libs.ca_utils.misc import ref_labels_by_chromosomes
+from quast_libs.diputils import dip_genome_by_chr, l_names_ambiguity_contigs
 
 
 def add_potential_misassembly(ref, misassemblies_by_ref, refs_with_translocations):
@@ -192,16 +193,45 @@ def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fp
                         for align in top_aligns:
                             ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                     elif qconfig.ambiguity_usage == "one":
-                        ca_output.stdout_f.write('\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n')
-                        ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
-                        ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
-                        ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
-                        aligned_lengths.append(top_aligns[0].len2)
-                        contigs_aligned_lengths[-1] = top_aligns[0].len2
-                        ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
-                        top_aligns = top_aligns[1:]
-                        for align in top_aligns:
-                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
+                        if qconfig.ploid_mode:
+                            ploidy = len(dip_genome_by_chr)
+                            ca_output.stdout_f.write(
+                                f'\t\tThere are {ploidy} haplotypes. Using no more than one alignment for each haplotype\n')
+                            used_haplotypes = []
+                            skipped_aligns = []
+                            while len(top_aligns):
+                                if len(used_haplotypes) == ploidy:
+                                    skipped_aligns += top_aligns
+                                    break
+                                for key, value in dip_genome_by_chr.items():  # Create method for this later!
+                                    if top_aligns[0].ref in value:
+                                        haplotype = key
+                                if haplotype not in used_haplotypes:
+                                    ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
+                                    ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
+                                    ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
+                                    aligned_lengths.append(top_aligns[0].len2)
+                                    contigs_aligned_lengths[-1] = top_aligns[0].len2
+                                    ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
+                                    used_haplotypes.append(haplotype)
+                                    if qconfig.ploid_assembly_type == 'phased' and len(used_haplotypes) > 1 and top_aligns[0].contig not in l_names_ambiguity_contigs:
+                                        l_names_ambiguity_contigs.append(top_aligns[0].contig)
+                                else:
+                                    skipped_aligns.append(top_aligns[0])
+                                top_aligns = top_aligns[1:]
+                            for align in skipped_aligns:
+                                ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
+                        else:
+                            ca_output.stdout_f.write('\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n')
+                            ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
+                            ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
+                            ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
+                            aligned_lengths.append(top_aligns[0].len2)
+                            contigs_aligned_lengths[-1] = top_aligns[0].len2
+                            ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
+                            top_aligns = top_aligns[1:]
+                            for align in top_aligns:
+                                ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                     elif qconfig.ambiguity_usage == "all":
                         ca_output.stdout_f.write('\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n')
                         # we count only extra bases, so we shouldn't include bases in the first alignment

diff --git a/quast_libs/ca_utils/analyze_misassemblies.py b/quast_libs/ca_utils/analyze_misassemblies.py
@@ -14,14 +14,15 @@
 from quast_libs.log import get_logger
 logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)
 from quast_libs.qutils import correct_name
-
+from quast_libs.diputils import is_homologous_ref
 
 class Misassembly:
     LOCAL = 0
     INVERSION = 1
     RELOCATION = 2
     TRANSLOCATION = 3
     INTERSPECTRANSLOCATION = 4  # for metaquast, if translocation occurs between chromosomes of different references
+    INTERHAPLOTRANSLOCATION = 200  # for dipquast
     SCAFFOLD_GAP = 5
     LOCAL_SCAFFOLD_GAP = 6
     FRAGMENTED = 7
@@ -34,6 +35,7 @@ class Misassembly:
     SCF_RELOCATION = 13
     SCF_TRANSLOCATION = 14
     SCF_INTERSPECTRANSLOCATION = 15
+    SCF_INTERHAPLOTRANSLOCATION = 16
 
 
 class StructuralVariations(object):
@@ -43,9 +45,10 @@ def __init__(self):
         self.inversions = []
         self.relocations = []
         self.translocations = []
+        self.inter_haplotype_translocations = []
 
     def get_count(self):
-        return len(self.inversions) + len(self.relocations) + len(self.translocations)
+        return len(self.inversions) + len(self.relocations) + len(self.translocations) + len(self.inter_haplotype_translocations)
 
 
 class Mapping(object):
@@ -289,7 +292,10 @@ def find_all_sv(bed_fpath):
                     align1 = Mapping(s1=int(fs[1]), e1=int(fs[2]), ref=correct_name(fs[0]), sv_type=fs[6])
                     align2 = Mapping(s1=int(fs[4]), e1=int(fs[5]), ref=correct_name(fs[3]), sv_type=fs[6])
                     if align1.ref != align2.ref:
-                        region_struct_variations.translocations.append((align1, align2))
+                        if qconfig.ploid_mode and is_homologous_ref(align1.ref, align2.ref):
+                            region_struct_variations.inter_haplotype_translocations.append((align1, align2))
+                        else:
+                            region_struct_variations.translocations.append((align1, align2))
                     elif 'INV' in fs[6]:
                         region_struct_variations.inversions.append((align1, align2))
                     elif 'DEL' in fs[6] or 'INS' in fs[6] or 'BND' in fs[6]:
@@ -469,6 +475,8 @@ def process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, regio
             if prev_align.ref != next_align.ref:  # if chromosomes from different references
                 if qconfig.is_combined_ref and prev_ref != next_ref:
                     misassembly_type = 'interspecies translocation'
+                elif qconfig.ploid_mode and is_homologous_ref(prev_align.ref, next_align.ref):
+                    misassembly_type = 'interhaplotype translocation'
                 else:
                     misassembly_type = 'translocation'
             elif abs(aux_data["inconsistency"]) > qconfig.extensive_misassembly_threshold:
@@ -544,6 +552,8 @@ def process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, regio
                 misassembly_id = Misassembly.INTERSPECTRANSLOCATION
                 istranslocations_by_ref[prev_ref][next_ref] += 1
                 istranslocations_by_ref[next_ref][prev_ref] += 1
+            elif misassembly_type == 'interhaplotype translocation':
+                misassembly_id = Misassembly.INTERHAPLOTRANSLOCATION
             elif misassembly_type == 'translocation':
                 misassembly_id = Misassembly.TRANSLOCATION
             elif misassembly_type == 'relocation':

diff --git a/quast_libs/ca_utils/best_set_selection.py b/quast_libs/ca_utils/best_set_selection.py
@@ -14,6 +14,7 @@
 from quast_libs.ca_utils.analyze_misassemblies import is_misassembly, exclude_internal_overlaps, Misassembly, \
     is_fragmented_ref_fake_translocation
 from quast_libs.ca_utils.misc import is_same_reference
+from quast_libs.diputils import is_homologous_ref
 
 
 class ScoredSet(object):
@@ -199,7 +200,7 @@ def get_best_aligns_sets(sorted_aligns, ctg_len, stdout_f, seq, ref_lens, is_cyc
                 break
             cur_set_aligns = [sorted_aligns[i] for i in scored_set.indexes] + [align]
             score, uncovered = get_score(scored_set.score, cur_set_aligns, ref_lens, is_cyclic, scored_set.uncovered,
-                                         seq, region_struct_variations, penalties)
+                                         seq, region_struct_variations, penalties, ctg_len)
             if score is None:  # incorrect set, i.e. internal overlap excluding resulted in incorrectly short alignment
                 continue
             if score > local_max_score:
@@ -262,7 +263,7 @@ def get_best_aligns_sets(sorted_aligns, ctg_len, stdout_f, seq, ref_lens, is_cyc
                 break
             cur_set_aligns = [sorted_aligns[i] for i in scored_set.indexes] + [align]
             score, uncovered = get_score(scored_set.score, cur_set_aligns, ref_lens, is_cyclic, scored_set.uncovered,
-                                         seq, region_struct_variations, penalties)
+                                         seq, region_struct_variations, penalties, ctg_len)
             if score is not None:
                 putative_predecessors[scored_set] = (score, uncovered)
                 if score > local_max_score:
@@ -303,7 +304,7 @@ def get_added_len(set_aligns, cur_align):
     return added_right + added_left
 
 
-def get_score(score, aligns, ref_lens, is_cyclic, uncovered_len, seq, region_struct_variations, penalties):
+def get_score(score, aligns, ref_lens, is_cyclic, uncovered_len, seq, region_struct_variations, penalties, ctg_len):
     if len(aligns) > 1:
         align1, align2 = aligns[-2], aligns[-1] = aligns[-2].clone(), aligns[-1].clone()
         is_fake_translocation = is_fragmented_ref_fake_translocation(align1, align2, ref_lens)
@@ -326,6 +327,8 @@ def get_score(score, aligns, ref_lens, is_cyclic, uncovered_len, seq, region_str
             if align1.ref != align2.ref:
                 if qconfig.is_combined_ref and not is_same_reference(align1.ref, align2.ref):
                     misassembly = Misassembly.INTERSPECTRANSLOCATION
+                elif qconfig.ploid_mode and is_homologous_ref(align1.ref, align2.ref):
+                    misassembly = max(Misassembly.INTERHAPLOTRANSLOCATION, ctg_len * 0.001)
                 else:
                     misassembly = Misassembly.TRANSLOCATION
             elif abs(aux_data["inconsistency"]) > qconfig.extensive_misassembly_threshold: