ontology repacking and exporting - relates to #34 #44

Open · wants to merge 30 commits into master

Commits (30)
acb580f  do the data repack into the ontology containers with further RDF 'tur…  (dimatr, Apr 28, 2020)
5c39f2a  Merge remote-tracking branch 'remotes/origin/master' into ontology  (dimatr, Apr 30, 2020)
44caccc  further ontology fixes: real path names; better forward* and reverse*…  (dimatr, Apr 30, 2020)
29550fd  - do not forget to store the path_id  (dimatr, May 11, 2020)
b8d0e16  - explicit faldo:ExactPosition containers  (dimatr, May 12, 2020)
1eaacd0  Merge branch 'master' into ontology  (dimatr, Jun 16, 2020)
26e6901  write faldo:ForwardStrandPosition, faldo:position and faldo:reference…  (dimatr, Jun 16, 2020)
db95d1d  create ontology folder when needed  (dimatr, Jun 16, 2020)
fb1a3ec  proper name and strand directions  (dimatr, Jun 16, 2020)
8e233ec  directions  (dimatr, Jun 16, 2020)
d0bff9a  fix bin_edge in .ttl output; but might be slow  (subwaystation, Jun 17, 2020)
3741e10  recycle temp objects, do not create new ones  (dimatr, Jun 18, 2020)
d50210b  positionPercent and inversionPercent are printed as doubles  (dimatr, Jun 18, 2020)
3f4984c  fix orientation  (subwaystation, Jun 22, 2020)
a96974e  add base IRI  (subwaystation, Jun 24, 2020)
350e85d  added example SPARQL queries  (subwaystation, Jun 24, 2020)
d2eb34c  Link -> ZoomLevel, replace pg with vg, add 'path/'  (subwaystation, Jun 26, 2020)
209d697  Actually emit the position percentage instead of the coverage of a bin.  (subwaystation, Jun 26, 2020)
7dc028b  a big update:  (dimatr, Jul 4, 2020)
46d9c20  Update requirements.txt  (6br, Jul 18, 2020)
1f5a379  Add 'path/' on cells  (6br, Jul 26, 2020)
e014b73  Add assertion  (6br, Jul 26, 2020)
9beafb3  Add assertion  (6br, Jul 26, 2020)
8a336d9  Add logger info  (6br, Jul 26, 2020)
61ceb22  Register logger  (6br, Jul 26, 2020)
7e9f6f5  Merge branch 'ontology' of https://github.com/graph-genome/component_…  (6br, Jul 26, 2020)
6378acd  Register logger  (6br, Jul 26, 2020)
545e4c1  Register logger  (6br, Jul 26, 2020)
86eca81  Add bin logger  (6br, Jul 26, 2020)
f704767  Disable parallel on rdf writer  (6br, Jul 26, 2020)
2 changes: 1 addition & 1 deletion data/chrk_ath_12samples_10kb.w100000_S.json
@@ -1,4 +1,4 @@
-{"odgi_version": 10,"bin_width": 100000,"pangenome_length": 210026186}
{"odgi_version": 12,"bin_width": 100000,"pangenome_length": 210026186}
{"bin_id":1}
{"bin_id":2}
{"bin_id":3}
2 changes: 1 addition & 1 deletion matrixcomponent/JSONparser.py
@@ -41,7 +41,7 @@ def process_path(line=None):
        for r in ranges:
            compressed_ranges.extend(r)

-        bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges)
        bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges, 0, b[3])
        p.bins.setdefault(bin.bin_id, bin)

        # do the major part of the segmentation.find_dividers() method
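The two extra constructor arguments line up with the Bin fields added in matrixcomponent/matrix.py at the bottom of this diff. As a minimal sketch, assuming bin_id is the record's first field (it sits above the visible matrix.py hunk) and hypothetical values for one parsed odgi JSON record b:

# sketch only, not code from this PR: the extended Bin signature
import recordclass

class Bin(recordclass.dataobject):
    bin_id: int          # assumed first field, hidden above the matrix.py hunk
    coverage: float
    inversion: float
    nucleotide_ranges: list
    path_id: int         # new: index into schematic.path_names, 0 until filled in
    position: float      # new: position percentage, passed through as b[3]

b = [7, 0.95, 0.0, 42.5]  # hypothetical record: bin_id, coverage, inversion, position
bin = Bin(b[0], b[1], b[2], [100, 199], 0, b[3])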
213 changes: 176 additions & 37 deletions matrixcomponent/PangenomeSchematic.py
@@ -1,4 +1,9 @@
import gzip
import json
import logging
import os
import shutil

from collections import OrderedDict
from statistics import mean

@@ -10,11 +15,159 @@

from dataclasses import dataclass

-from matrixcomponent import JSON_VERSION
from joblib import delayed
from rdflib import URIRef, Graph, Namespace

from matrixcomponent import JSON_VERSION, ontology
from matrixcomponent.matrix import Component, Bin, LinkColumn

from DNASkittleUtils.Contigs import Contig, write_contigs_to_file

LOGGER = logging.getLogger(__name__)
"""logging.Logger: The logger for this module"""

# moved out of the class so it can be used with joblib
def write_rdf(schematic, ontology_output_path):
    zoom_level = ontology.ZoomLevel()
    zoom_level.zoom_factor = schematic.bin_width
    zoom_level.ns = URIRef('http://example.org/vg/')

    prev_comp_id = -1
    cell_counter = 0
    ocomp_dict = {}
    obin_dict = {}
    oposition_dict = {}

    min_bin_id = schematic.first_bin
    max_bin_id = schematic.last_bin

    for ic, component in enumerate(schematic.components):
        ocomp = ontology.Component(ic + 1)
        ocomp.ns = zoom_level.ns_term() + '/'
        zoom_level.components.append(ocomp)

        # save the sequence 1-2-3-..-n as a bi-directed list
        if prev_comp_id in ocomp_dict:
            prev_comp = ocomp_dict[prev_comp_id]
            ocomp.reverse_component_edge = prev_comp.ns_term()
            prev_comp.forward_component_edge = ocomp.ns_term()

        ocomp_dict[ic] = ocomp
        prev_comp_id = ic

        obin_ns = ocomp.ns_term() + '/'
        obin_tmp = ontology.Bin()
        obin_tmp.ns = obin_ns

        # bins
        for bins in component.matrix:
            for bin in bins[1][1]:  # follow the compressed format
                if bin:
                    cur_bin_id = bin.bin_id
                    obin = ontology.Bin()
                    obin.ns = obin_ns
                    obin.bin_rank = cur_bin_id
                    obin_dict[cur_bin_id] = obin

                    if cur_bin_id > min_bin_id:
                        obin_tmp.bin_rank = cur_bin_id - 1
                        obin.reverse_bin_edge = obin_tmp.ns_term()  # string value
                    if cur_bin_id < max_bin_id:
                        obin_tmp.bin_rank = cur_bin_id + 1
                        obin.forward_bin_edge = obin_tmp.ns_term()  # string value

                    ocomp.bins.append(obin)

                    cell_counter = cell_counter + 1
                    ocell = ontology.Cell()
                    ocell.id = cell_counter
                    ocell.path_id = schematic.path_names[bin.path_id]  # saved in populate_component_matrix
                    ocell.inversion_percent = bin.inversion
                    ocell.position_percent = bin.position

                    # todo: are begin,end the real bin_ids or the compressed ones (in the sparse-list sense)?
                    cell_ns = URIRef("{0}/".format(ocell.path_id))
                    for be in range(0, len(bin.nucleotide_ranges), 2):
                        begin, end = bin.nucleotide_ranges[be], bin.nucleotide_ranges[be + 1]
                        real_begin = begin if begin else end
                        real_end = end if end else begin

                        oregion = ontology.Region()
                        oregion.begin = real_begin
                        oregion.end = real_end
                        ocell.cell_region.append(oregion)

                        path = schematic.path_names[bin.path_id]
                        oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns)
                        oposition_end = ontology.Position(real_end, begin < end, path, cell_ns)
                        oposition_dict[oposition_begin.ns_term()] = oposition_begin
                        oposition_dict[oposition_end.ns_term()] = oposition_end

                    obin.cells.append(ocell)

    # links between components and their bins
    LOGGER.info(f"Bin dictionary {obin_dict}")

    olink_dict = {}
    link_counter = 0
    for component in schematic.components:
        # iterating the departures covers all component <-> component links:
        # every departure link appears in some other component's arrivals
        for link in component.departures:
            if len(link.participants):
                link_counter = link_counter + 1
                olink = ontology.Link()
                olink.id = link_counter
                olink_dict[link_counter] = olink

    link_counter = 0
    for component in schematic.components:
        # same traversal as above, now filling in the link endpoints
        for link in component.departures:
            if len(link.participants):
                link_counter = link_counter + 1
                olink = olink_dict[link_counter]

                if link.upstream in obin_dict:
                    from_bin = obin_dict[link.upstream]
                    olink.departure = from_bin.ns_term()
                else:
                    LOGGER.info(f"No upstream {link.upstream}")

                if link.downstream in obin_dict:
                    to_bin = obin_dict[link.downstream]
                    olink.arrival = to_bin.ns_term()
                else:
                    LOGGER.info(f"No downstream {link.downstream}")

                olink.paths = [schematic.path_names[k] for k in link.participants]
                olink.linkZoomLevel = zoom_level.ns_term()
                zoom_level.links.append(olink)

    g = Graph()
    vg = Namespace('http://biohackathon.org/resource/vg#')
    faldo = Namespace('http://biohackathon.org/resource/faldo#')
    g.bind('vg', vg)
    g.bind('faldo', faldo)

    # here the magic happens
    zoom_level.add_to_graph(g, vg, faldo)
    for oposition in oposition_dict.values():
        oposition.add_to_graph(g, vg, faldo)
    for path in schematic.path_names:
        ontology.Path(path).add_to_graph(g, vg, faldo)

    # format='nt' writes about 10x faster than 'turtle', though the files are 3x bigger;
    # compress them afterwards for a 15x disk-space reduction
    g.serialize(destination=ontology_output_path, format='nt')
    with open(ontology_output_path, 'rb') as fin:
        with gzip.open(ontology_output_path + '.gz', 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    os.remove(ontology_output_path)
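Because write_rdf deletes the raw N-Triples file once it is gzipped, the quickest sanity check of a chunk is to parse the .gz back with rdflib and run a small SPARQL query. A sketch, using a hypothetical chunk name built from the ttl_filename pattern and counting the faldo:ExactPosition nodes introduced by the commits above:

# sketch for inspecting the emitted RDF; the file name is hypothetical
import gzip
from rdflib import Graph

with gzip.open('seq_chunk00_bin100000.nt.gz', 'rt') as f:
    g = Graph()
    g.parse(f, format='nt')

query = """
    PREFIX faldo: <http://biohackathon.org/resource/faldo#>
    SELECT (COUNT(?p) AS ?n) WHERE { ?p a faldo:ExactPosition }
"""
for row in g.query(query):
    print(row.n)  # number of exact positions in this chunk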


@dataclass
class PangenomeSchematic:
    json_version: int
@@ -37,14 +190,7 @@ def dumper(obj):
            ranges.append([flat_ranges[i], flat_ranges[i+1]])
        return [obj.coverage, obj.inversion, ranges]
    if isinstance(obj, LinkColumn):
-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            bools = [False] * len(self.path_names)
-            for i in obj.participants:
-                bools[i] = True
-            return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':bools}
-        else:
-            return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':obj.participants.tolist()}
        return {'upstream': obj.upstream, 'downstream': obj.downstream, 'participants': obj.participants.tolist()}
    if isinstance(obj, set):
        return list(obj)
    try:
@@ -68,28 +214,7 @@ def update_first_last_bin(self):
        self.first_bin = 1  # these have not been properly initialized
        self.last_bin = self.components[-1].last_bin

-    def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_links):
-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            empty = []
-            for comp in self.components:
-                bools = [False] * len(self.path_names)
-                for i in comp.occupants:
-                    bools[i] = True
-                comp.occupants = bools
-
-                matrix = [empty] * len(self.path_names)
-                fb, lb = comp.first_bin, comp.last_bin
-                for item in comp.matrix:
-                    padded = [empty] * (lb - fb + 1)
-                    sliced = item[1]
-                    for id, val in zip(sliced[0], sliced[1]):
-                        padded[id] = val
-                    matrix[item[0]] = padded
-
-                comp.matrix = matrix
-
    def split_and_write(self, cells_per_file, folder, fasta: Contig, no_adjacent_links, ontology_folder, parallel):
        """Splits one Schematic into multiple files with their own
        unique first and last_bin based on the volume of data desired per
        file specified by cells_per_file. """
@@ -137,6 +262,22 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
            c = folder.joinpath(schematic.fasta_filename(i))
            write_contigs_to_file(c, chunk)

        if ontology_folder:
            # generator of (schematic, path) pairs - instantiated lazily on item access!
            prepared_schematics = ((PangenomeSchematic(JSON_VERSION, self.bin_width,
                                                       self.components[cut:cut_points[i + 1]][0].first_bin,
                                                       self.components[cut:cut_points[i + 1]][-1].last_bin,
                                                       self.includes_connectors,
                                                       self.components[cut:cut_points[i + 1]], self.path_names,
                                                       self.total_nr_files, self.pangenome_length),
                                    str(ontology_folder.joinpath(self.ttl_filename(i))))
                                   for i, cut in enumerate(cut_points[:-1]) if self.components[cut:cut_points[i + 1]])

            # TODO: parallel writing is disabled for now because links can span partial
            # pangenome schematics, and the cross-thread communication needed to stitch
            # them together is not implemented yet.
            # if parallel:
            #     results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics)
            # else:
            write_rdf(self, str(ontology_folder.joinpath(self.ttl_filename(0))))

        return bin2file_mapping

    def find_cut_points_in_file_split(self, columns_per_file, column_counts):
@@ -154,12 +295,7 @@ def find_cut_points_in_file_split(self, columns_per_file, column_counts):
    def lazy_average_occupants(self):
        """sample components spread across the schematic and check how many occupants they have"""
        samples = [self.components[int(len(self.components) * (perc/100))] for perc in range(1, 99)]

-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            avg_paths = mean([sum(x.occupants) for x in samples])
-        else:
-            avg_paths = mean([len(x.occupants) for x in samples])
        avg_paths = mean([len(x.occupants) for x in samples])

        return avg_paths

@@ -172,6 +308,9 @@ def filename(self, nth_file):
    def fasta_filename(self, nth_file):
        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa'

    def ttl_filename(self, nth_file):
        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.nt'

    def write_index_file(self, folder, bin2file_mapping):

        file_contents = {'bin_width': self.bin_width,
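The dumper change above also pins down the JSON shape this branch commits to: LinkColumn participants are serialized as a sparse list of path indices instead of one boolean per path. Illustrated with hypothetical values for a four-path pangenome:

# old (json_version <= 14): dense boolean mask, one entry per path
{'upstream': 3, 'downstream': 4, 'participants': [False, True, False, True]}

# new: only the indices of the participating paths
{'upstream': 3, 'downstream': 4, 'participants': [1, 3]}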
2 changes: 2 additions & 0 deletions matrixcomponent/matrix.py
@@ -12,6 +12,8 @@ class Bin(recordclass.dataobject):
    coverage: float
    inversion: float
    nucleotide_ranges: 'numpy.array'  # List[List[int]] is encoded as a Numpy flat array - this saves memory
    path_id: int
    position: float

## Path is all for input files

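Taken together, a caller would drive the new split_and_write signature roughly as below. This is a sketch with hypothetical argument values, not a call site from this diff; parallel is accepted but currently ignored because the RDF writer runs serially:

from pathlib import Path

schematic.split_and_write(
    cells_per_file=5000,                 # hypothetical chunking volume
    folder=Path('output/json'),
    fasta=None,                          # optional Contig holding the sequence
    no_adjacent_links=False,
    ontology_folder=Path('output/rdf'),  # new: enables the RDF export via write_rdf
    parallel=None,                       # new: joblib Parallel handle, unused for now
)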