ontology repacking and exporting - relates to #34 #44

Open · wants to merge 30 commits into master

Commits (30)
acb580f  do the data repack into the ontology containers with further RDF 'tur…  (dimatr, Apr 28, 2020)
5c39f2a  Merge remote-tracking branch 'remotes/origin/master' into ontology  (dimatr, Apr 30, 2020)
44caccc  further ontology fixes: real path names; better forward* and reverse*…  (dimatr, Apr 30, 2020)
29550fd  - do not forget to store the path_id  (dimatr, May 11, 2020)
b8d0e16  - explicit faldo:ExactPosition containers  (dimatr, May 12, 2020)
1eaacd0  Merge branch 'master' into ontology  (dimatr, Jun 16, 2020)
26e6901  write faldo:ForwardStrandPosition, faldo:position and faldo:reference…  (dimatr, Jun 16, 2020)
db95d1d  create ontology folder when needed  (dimatr, Jun 16, 2020)
fb1a3ec  proper name and strand directions  (dimatr, Jun 16, 2020)
8e233ec  directions  (dimatr, Jun 16, 2020)
d0bff9a  fix bin_edge in .ttl output; but might be slow  (subwaystation, Jun 17, 2020)
3741e10  recycle temp objects, do not create new ones  (dimatr, Jun 18, 2020)
d50210b  positionPercent and inversionPercent are printed as doubles  (dimatr, Jun 18, 2020)
3f4984c  fix orientation  (subwaystation, Jun 22, 2020)
a96974e  add base IRI  (subwaystation, Jun 24, 2020)
350e85d  added example SPARQL queries  (subwaystation, Jun 24, 2020)
d2eb34c  Link -> ZoomLevel, replace pg with vg, add 'path/'  (subwaystation, Jun 26, 2020)
209d697  Actually emit the position percentage instead of the coverage of a bin.  (subwaystation, Jun 26, 2020)
7dc028b  a big update:  (dimatr, Jul 4, 2020)
46d9c20  Update requirements.txt  (6br, Jul 18, 2020)
1f5a379  Add 'path/' on cells  (6br, Jul 26, 2020)
e014b73  Add assertion  (6br, Jul 26, 2020)
9beafb3  Add assertion  (6br, Jul 26, 2020)
8a336d9  Add logger info  (6br, Jul 26, 2020)
61ceb22  Register logger  (6br, Jul 26, 2020)
7e9f6f5  Merge branch 'ontology' of https://github.com/graph-genome/component_…  (6br, Jul 26, 2020)
6378acd  Register logger  (6br, Jul 26, 2020)
545e4c1  Register logger  (6br, Jul 26, 2020)
86eca81  Add bin logger  (6br, Jul 26, 2020)
f704767  Disable parallel on rdf writer  (6br, Jul 26, 2020)
2 changes: 1 addition & 1 deletion data/chrk_ath_12samples_10kb.w100000_S.json
@@ -1,4 +1,4 @@
-{"odgi_version": 10,"bin_width": 100000,"pangenome_length": 210026186}
{"odgi_version": 12,"bin_width": 100000,"pangenome_length": 210026186}
{"bin_id":1}
{"bin_id":2}
{"bin_id":3}
2 changes: 1 addition & 1 deletion matrixcomponent/JSONparser.py
@@ -41,7 +41,7 @@ def process_path(line=None):
        for r in ranges:
            compressed_ranges.extend(r)

-        bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges)
        bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges, 0, b[3])
        p.bins.setdefault(bin.bin_id, bin)

        # do the major part of the segmentation.find_dividers() method
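The two extra constructor arguments line up with the Bin fields added in matrixcomponent/matrix.py at the bottom of this diff. As a minimal sketch, assuming bin_id is the record's first field (it sits above the visible matrix.py hunk) and hypothetical values for one parsed odgi JSON record b:

# sketch only, not code from this PR: the extended Bin signature
import recordclass

class Bin(recordclass.dataobject):
    bin_id: int          # assumed first field, hidden above the matrix.py hunk
    coverage: float
    inversion: float
    nucleotide_ranges: list
    path_id: int         # new: index into schematic.path_names, 0 until filled in
    position: float      # new: position percentage, passed through as b[3]

b = [7, 0.95, 0.0, 42.5]  # hypothetical record: bin_id, coverage, inversion, position
bin = Bin(b[0], b[1], b[2], [100, 199], 0, b[3])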
213 changes: 176 additions & 37 deletions matrixcomponent/PangenomeSchematic.py
@@ -1,4 +1,9 @@
import gzip
import json
import logging
import os
import shutil

from collections import OrderedDict
from statistics import mean

@@ -10,11 +15,159 @@

from dataclasses import dataclass

-from matrixcomponent import JSON_VERSION
from joblib import delayed
from rdflib import URIRef, Graph, Namespace

from matrixcomponent import JSON_VERSION, ontology
from matrixcomponent.matrix import Component, Bin, LinkColumn

from DNASkittleUtils.Contigs import Contig, write_contigs_to_file

LOGGER = logging.getLogger(__name__)
"""logging.Logger: The logger for this module"""

# moved out of the class so it can be used with joblib
def write_rdf(schematic, ontology_output_path):
    zoom_level = ontology.ZoomLevel()
    zoom_level.zoom_factor = schematic.bin_width
    zoom_level.ns = URIRef('http://example.org/vg/')

    prev_comp_id = -1
    cell_counter = 0
    ocomp_dict = {}
    obin_dict = {}
    oposition_dict = {}

    min_bin_id = schematic.first_bin
    max_bin_id = schematic.last_bin

    for ic, component in enumerate(schematic.components):
        ocomp = ontology.Component(ic + 1)
        ocomp.ns = zoom_level.ns_term() + '/'
        zoom_level.components.append(ocomp)

        # save the sequence 1-2-3-..-n as a bi-directed list
        if prev_comp_id in ocomp_dict:
            prev_comp = ocomp_dict[prev_comp_id]
            ocomp.reverse_component_edge = prev_comp.ns_term()
            prev_comp.forward_component_edge = ocomp.ns_term()

        ocomp_dict[ic] = ocomp
        prev_comp_id = ic

        obin_ns = ocomp.ns_term() + '/'
        obin_tmp = ontology.Bin()
        obin_tmp.ns = obin_ns

        # bins
        for bins in component.matrix:
            for bin in bins[1][1]:  # follow the compressed format
                if bin:
                    cur_bin_id = bin.bin_id
                    obin = ontology.Bin()
                    obin.ns = obin_ns
                    obin.bin_rank = cur_bin_id
                    obin_dict[cur_bin_id] = obin

                    if cur_bin_id > min_bin_id:
                        obin_tmp.bin_rank = cur_bin_id - 1
                        obin.reverse_bin_edge = obin_tmp.ns_term()  # string value
                    if cur_bin_id < max_bin_id:
                        obin_tmp.bin_rank = cur_bin_id + 1
                        obin.forward_bin_edge = obin_tmp.ns_term()  # string value

                    ocomp.bins.append(obin)

                    cell_counter = cell_counter + 1
                    ocell = ontology.Cell()
                    ocell.id = cell_counter
                    ocell.path_id = schematic.path_names[bin.path_id]  # saved in populate_component_matrix
                    ocell.inversion_percent = bin.inversion
                    ocell.position_percent = bin.position

                    # todo: are begin,end the real bin_ids or the compressed ones (in the sparse-list sense)?
                    cell_ns = URIRef("{0}/".format(ocell.path_id))
                    for be in range(0, len(bin.nucleotide_ranges), 2):
                        begin, end = bin.nucleotide_ranges[be], bin.nucleotide_ranges[be + 1]
                        real_begin = begin if begin else end
                        real_end = end if end else begin

                        oregion = ontology.Region()
                        oregion.begin = real_begin
                        oregion.end = real_end
                        ocell.cell_region.append(oregion)

                        path = schematic.path_names[bin.path_id]
                        oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns)
                        oposition_end = ontology.Position(real_end, begin < end, path, cell_ns)
                        oposition_dict[oposition_begin.ns_term()] = oposition_begin
                        oposition_dict[oposition_end.ns_term()] = oposition_end

                    obin.cells.append(ocell)

    # links between components and their bins
    LOGGER.info(f"Bin dictionary {obin_dict}")

    olink_dict = {}
    link_counter = 0
    for component in schematic.components:
        # iterating the departures covers all component <-> component links:
        # every departure link appears in some other component's arrivals
        for link in component.departures:
            if len(link.participants):
                link_counter = link_counter + 1
                olink = ontology.Link()
                olink.id = link_counter
                olink_dict[link_counter] = olink

    link_counter = 0
    for component in schematic.components:
        # same traversal as above, now filling in the link endpoints
        for link in component.departures:
            if len(link.participants):
                link_counter = link_counter + 1
                olink = olink_dict[link_counter]

                if link.upstream in obin_dict:
                    from_bin = obin_dict[link.upstream]
                    olink.departure = from_bin.ns_term()
                else:
                    LOGGER.info(f"No upstream {link.upstream}")

                if link.downstream in obin_dict:
                    to_bin = obin_dict[link.downstream]
                    olink.arrival = to_bin.ns_term()
                else:
                    LOGGER.info(f"No downstream {link.downstream}")

                olink.paths = [schematic.path_names[k] for k in link.participants]
                olink.linkZoomLevel = zoom_level.ns_term()
                zoom_level.links.append(olink)

    g = Graph()
    vg = Namespace('http://biohackathon.org/resource/vg#')
    faldo = Namespace('http://biohackathon.org/resource/faldo#')
    g.bind('vg', vg)
    g.bind('faldo', faldo)

    # here the magic happens
    zoom_level.add_to_graph(g, vg, faldo)
    for oposition in oposition_dict.values():
        oposition.add_to_graph(g, vg, faldo)
    for path in schematic.path_names:
        ontology.Path(path).add_to_graph(g, vg, faldo)

    # format='nt' writes about 10x faster than 'turtle', though the files are 3x bigger;
    # compress them afterwards for a 15x disk-space reduction
    g.serialize(destination=ontology_output_path, format='nt')
    with open(ontology_output_path, 'rb') as fin:
        with gzip.open(ontology_output_path + '.gz', 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    os.remove(ontology_output_path)
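Because write_rdf deletes the raw N-Triples file once it is gzipped, the quickest sanity check of a chunk is to parse the .gz back with rdflib and run a small SPARQL query. A sketch, using a hypothetical chunk name built from the ttl_filename pattern and counting the faldo:ExactPosition nodes introduced by the commits above:

# sketch for inspecting the emitted RDF; the file name is hypothetical
import gzip
from rdflib import Graph

with gzip.open('seq_chunk00_bin100000.nt.gz', 'rt') as f:
    g = Graph()
    g.parse(f, format='nt')

query = """
    PREFIX faldo: <http://biohackathon.org/resource/faldo#>
    SELECT (COUNT(?p) AS ?n) WHERE { ?p a faldo:ExactPosition }
"""
for row in g.query(query):
    print(row.n)  # number of exact positions in this chunk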


@dataclass
class PangenomeSchematic:
    json_version: int
@@ -37,14 +190,7 @@ def dumper(obj):
            ranges.append([flat_ranges[i], flat_ranges[i+1]])
        return [obj.coverage, obj.inversion, ranges]
    if isinstance(obj, LinkColumn):
-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            bools = [False] * len(self.path_names)
-            for i in obj.participants:
-                bools[i] = True
-            return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':bools}
-        else:
-            return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':obj.participants.tolist()}
        return {'upstream': obj.upstream, 'downstream': obj.downstream, 'participants': obj.participants.tolist()}
    if isinstance(obj, set):
        return list(obj)
    try:
@@ -68,28 +214,7 @@ def update_first_last_bin(self):
        self.first_bin = 1  # these have not been properly initialized
        self.last_bin = self.components[-1].last_bin

-    def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_links):
-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            empty = []
-            for comp in self.components:
-                bools = [False] * len(self.path_names)
-                for i in comp.occupants:
-                    bools[i] = True
-                comp.occupants = bools
-
-                matrix = [empty] * len(self.path_names)
-                fb, lb = comp.first_bin, comp.last_bin
-                for item in comp.matrix:
-                    padded = [empty] * (lb - fb + 1)
-                    sliced = item[1]
-                    for id, val in zip(sliced[0], sliced[1]):
-                        padded[id] = val
-                    matrix[item[0]] = padded
-
-                comp.matrix = matrix
-
    def split_and_write(self, cells_per_file, folder, fasta: Contig, no_adjacent_links, ontology_folder, parallel):
        """Splits one Schematic into multiple files with their own
        unique first and last_bin based on the volume of data desired per
        file specified by cells_per_file. """
@@ -137,6 +262,22 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
            c = folder.joinpath(schematic.fasta_filename(i))
            write_contigs_to_file(c, chunk)

        if ontology_folder:
            # generator of (schematic, path) pairs - instantiated lazily on item access!
            prepared_schematics = ((PangenomeSchematic(JSON_VERSION, self.bin_width,
                                                       self.components[cut:cut_points[i + 1]][0].first_bin,
                                                       self.components[cut:cut_points[i + 1]][-1].last_bin,
                                                       self.includes_connectors,
                                                       self.components[cut:cut_points[i + 1]], self.path_names,
                                                       self.total_nr_files, self.pangenome_length),
                                    str(ontology_folder.joinpath(self.ttl_filename(i))))
                                   for i, cut in enumerate(cut_points[:-1]) if self.components[cut:cut_points[i + 1]])

            # TODO: parallel writing is disabled for now because links can span partial
            # pangenome schematics, and the cross-thread communication needed to stitch
            # them together is not implemented yet.
            # if parallel:
            #     results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics)
            # else:
            write_rdf(self, str(ontology_folder.joinpath(self.ttl_filename(0))))

        return bin2file_mapping

    def find_cut_points_in_file_split(self, columns_per_file, column_counts):
@@ -154,12 +295,7 @@ def find_cut_points_in_file_split(self, columns_per_file, column_counts):
    def lazy_average_occupants(self):
        """sample components spread across the schematic and check how many occupants they have"""
        samples = [self.components[int(len(self.components) * (perc/100))] for perc in range(1, 99)]

-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            avg_paths = mean([sum(x.occupants) for x in samples])
-        else:
-            avg_paths = mean([len(x.occupants) for x in samples])
        avg_paths = mean([len(x.occupants) for x in samples])

        return avg_paths

@@ -172,6 +308,9 @@ def filename(self, nth_file):
    def fasta_filename(self, nth_file):
        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa'

    def ttl_filename(self, nth_file):
        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.nt'

    def write_index_file(self, folder, bin2file_mapping):

        file_contents = {'bin_width': self.bin_width,
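The dumper change above also pins down the JSON shape this branch commits to: LinkColumn participants are serialized as a sparse list of path indices instead of one boolean per path. Illustrated with hypothetical values for a four-path pangenome:

# old (json_version <= 14): dense boolean mask, one entry per path
{'upstream': 3, 'downstream': 4, 'participants': [False, True, False, True]}

# new: only the indices of the participating paths
{'upstream': 3, 'downstream': 4, 'participants': [1, 3]}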
2 changes: 2 additions & 0 deletions matrixcomponent/matrix.py
@@ -12,6 +12,8 @@ class Bin(recordclass.dataobject):
    coverage: float
    inversion: float
    nucleotide_ranges: 'numpy.array'  # List[List[int]] is encoded as a Numpy flat array - this saves memory
    path_id: int
    position: float

## Path is all for input files

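Taken together, a caller would drive the new split_and_write signature roughly as below. This is a sketch with hypothetical argument values, not a call site from this diff; parallel is accepted but currently ignored because the RDF writer runs serially:

from pathlib import Path

schematic.split_and_write(
    cells_per_file=5000,                 # hypothetical chunking volume
    folder=Path('output/json'),
    fasta=None,                          # optional Contig holding the sequence
    no_adjacent_links=False,
    ontology_folder=Path('output/rdf'),  # new: enables the RDF export via write_rdf
    parallel=None,                       # new: joblib Parallel handle, unused for now
)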