From 673844a81fb5a21a330fe2157b595664a4ce8e53 Mon Sep 17 00:00:00 2001 From: Michael Baudis <675030+mbaudis@users.noreply.github.com> Date: Mon, 4 Mar 2024 09:40:05 +0100 Subject: [PATCH] 1.6.0 -> see bycon docs --- bin/ISCNsegmenter.py | 3 +- bin/analysesStatusmapsRefresher.py | 0 bin/collationsCreator.py | 75 +- bin/collationsPlotter.py | 59 - bin/frequencymapsCreator.py | 123 +- bin/lib/mongodb_utils.py | 4 +- bin/local/beacon_defaults.yaml | 46 +- bin/local/instance_overrides.yaml | 62 + bin/local/local_paths.yaml | 5 - bin/local/services_defaults.yaml | 3 + bin/publicationsInserter.py | 2 +- bin/templateTablesCreator.py | 2 +- bin/variantsInserter.py | 3 +- exports/multicollationtest-collationplots.svg | 1715 +++++++++++++++++ exports/multicollationtest.svg | 1488 +++++++------- local/beacon_defaults.yaml | 46 +- local/instance_overrides.yaml | 62 + local/local_paths.yaml | 5 - local/services_defaults.yaml | 3 + rsrc/templates/analysis_template.tsv | 2 +- rsrc/templates/biosample_template.tsv | 2 +- rsrc/templates/genomicVariant_template.tsv | 2 +- rsrc/templates/individual_template.tsv | 2 +- rsrc/templates/metadata_template.tsv | 2 +- services/collationplots.py | 41 +- services/config/genespans.yaml | 6 - services/intervalFrequencies.py | 14 +- services/lib/bycon_bundler.py | 63 +- services/lib/cytoband_utils.py | 15 +- services/lib/datatable_utils.py | 10 +- services/lib/export_file_generation.py | 117 +- services/lib/file_utils.py | 37 +- services/lib/interval_utils.py | 38 +- services/lib/service_response_generation.py | 31 +- services/local/beacon_defaults.yaml | 46 +- services/local/instance_overrides.yaml | 62 + services/local/local_paths.yaml | 5 - services/local/services_defaults.yaml | 3 + services/services.py | 2 +- services/variantsbedfile.py | 47 + tmp/aggregator.yaml | 2 +- 41 files changed, 3082 insertions(+), 1173 deletions(-) mode change 100644 => 100755 bin/analysesStatusmapsRefresher.py delete mode 100755 bin/collationsPlotter.py create mode 100644 bin/local/instance_overrides.yaml mode change 100644 => 100755 bin/templateTablesCreator.py create mode 100644 exports/multicollationtest-collationplots.svg create mode 100644 local/instance_overrides.yaml create mode 100644 services/local/instance_overrides.yaml create mode 100644 services/variantsbedfile.py diff --git a/bin/ISCNsegmenter.py b/bin/ISCNsegmenter.py index c5c40f33..29d887a6 100755 --- a/bin/ISCNsegmenter.py +++ b/bin/ISCNsegmenter.py @@ -38,7 +38,6 @@ def iscn_segmenter(): group_parameter = BYC_PARS.get("groupBy", "histological_diagnosis_id") input_file = BYC_PARS.get("inputfile") output_file = BYC_PARS.get("outputfile") - dt_m = byc.get("datatable_mappings", {}) technique = "cCGH" iscn_field = "iscn_ccgh" @@ -85,7 +84,7 @@ def iscn_segmenter(): "callset_id": s.get("callset_id", "exp-"+n), "individual_id": s.get("individual_id", "ind-"+n), } - update_bs = import_datatable_dict_line(dt_m, update_bs, fieldnames, s, "biosample") + update_bs = import_datatable_dict_line(update_bs, fieldnames, s, "biosample") h_line = pgxseg_biosample_meta_line(byc, update_bs, group_parameter) pgxseg.write( "{}\n".format(h_line) ) diff --git a/bin/analysesStatusmapsRefresher.py b/bin/analysesStatusmapsRefresher.py old mode 100644 new mode 100755 diff --git a/bin/collationsCreator.py b/bin/collationsCreator.py index 90a62f47..0b8038bf 100755 --- a/bin/collationsCreator.py +++ b/bin/collationsCreator.py @@ -69,7 +69,6 @@ def collations_creator(): data_coll = data_db[ collection ] onto_ids = _get_ids_for_prefix( data_coll, coll_defs ) - is_series = coll_defs.get("is_series", False) onto_keys = list( set( onto_ids ) & hier.keys() ) # get the set of all parents for sample codes @@ -78,10 +77,6 @@ def collations_creator(): if o_id in hier.keys(): onto_keys.update( hier[ o_id ][ "parent_terms" ] ) - if is_series is True: - child_ids = _get_child_ids_for_prefix(data_coll, coll_defs) - onto_keys.update(child_ids) - sel_hiers = [ ] no = len(hier.keys()) matched = 0 @@ -90,13 +85,13 @@ def collations_creator(): for count, code in enumerate(hier.keys(), start=1): if not BYC["TEST_MODE"]: bar.next() - children = list( set( hier[ code ][ "child_terms" ] ) & onto_keys ) - hier[ code ].update( { "child_terms": children } ) + children = list(set(hier[ code ]["child_terms"]) & onto_keys) + hier[ code ].update( {"child_terms": children}) if len( children ) < 1: if BYC["TEST_MODE"]: print(code+" w/o children") continue - code_no = data_coll.count_documents( { db_key: code } ) + code_no = data_coll.count_documents({db_key: code}) if code_no < 1: code_no = 0 if len( children ) < 2: @@ -106,7 +101,7 @@ def collations_creator(): if child_no > 0: # sub_id = re.sub(pre, coll_type, code) sub_id = code - update_obj = hier[ code ].copy() + update_obj = hier[code].copy() update_obj.update({ "id": sub_id, "ft_type": coll_defs.get("ft_type", "ontologyTerm"), @@ -115,6 +110,7 @@ def collations_creator(): "reference": "https://progenetix.org/services/ids/"+code, "namespace_prefix": coll_defs.get("namespace_prefix", ""), "scope": coll_defs.get("scope", ""), + "entity": coll_defs.get("entity", ""), "code_matches": code_no, "code": code, "count": child_no, @@ -131,8 +127,7 @@ def collations_creator(): if not BYC["TEST_MODE"]: sel_hiers.append( update_obj ) else: - print("{}:\t{} ({} deep) samples - {} / {} {}".format(sub_id, code_no, child_no, count, no, pre)) - + print(f'{sub_id}:\t{code_no} ({child_no} deep) samples - {count} / {no} {pre}') # UPDATE if not BYC["TEST_MODE"]: bar.finish() @@ -141,12 +136,12 @@ def collations_creator(): coll_coll.delete_many( { "collation_type": coll_type } ) coll_coll.insert_many( sel_hiers ) - print("===> Found {} of {} {} codes & added them to {}.collations <===".format(matched, no, coll_type, ds_id)) - + print(f'===> Found {matched} of {no} {coll_type} codes & added them to {ds_id}.collations <===') + + ################################################################################ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc): - coll_defs = byc["filter_definitions"][coll_type] hier = hierarchy_from_file(ds_id, coll_type, pre_h_f, byc) no = len(hier.keys()) @@ -174,21 +169,18 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc): "collation_type": coll_type, "namespace_prefix": coll_defs.get("namespace_prefix", ""), "scope": coll_defs.get("scope", ""), + "entity": coll_defs.get("entity", ""), "db_key": coll_defs.get("db_key", ""), "hierarchy_paths": [ { "order": no, "depth": 1, "path": [ "NCIT:C3262", "NCIT:C000000" ] } ] } } ) for o in onto_ids: - if o in hier.keys(): continue - added_no += 1 no += 1 - l = _get_label_for_code(data_coll, coll_defs, o) - if coll_type == "NCIT": hier.update( { o: { "id": o, "label": l, "hierarchy_paths": @@ -200,15 +192,13 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc): o_p = { "order": int(no), "depth": 0, "path": [ o ] } hier.update( { o: { "id": o, "label": l, "hierarchy_paths": [ o_p ] } } ) print("Added:\t{}\t{}".format(o, l)) - if added_no > 0: print("===> Added {} {} codes from {}.{} <===".format(added_no, coll_type, ds_id, coll_defs["scope"] ) ) - ############################################################################ + #--------------------------------------------------------------------------# no = len(hier) bar = Bar(" parsing parents ", max = no, suffix='%(percent)d%%'+" of "+str(no) ) - for c, h in hier.items(): bar.next() all_parents = { } @@ -219,29 +209,16 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc): bar.finish() - ############################################################################ + #--------------------------------------------------------------------------# bar = Bar(" parsing children ", max = no, suffix='%(percent)d%%'+" of "+str(no) ) - for c, h in hier.items(): bar.next() all_children = set() for c_2, h_2 in hier.items(): if c in h_2["parent_terms"]: all_children.add( c_2 ) - hier[ c ].update( { "child_terms": list( all_children ) } ) - - if "series_pattern" in coll_defs: - ch_re = re.compile( coll_defs["series_pattern"] ) - for c, h in hier.items(): - all_children = set( ) - for p in h["child_terms"]: - gsms = data_coll.distinct( db_key, { db_key: p } ) - gsms = list(filter(lambda d: ch_re.match(d), gsms)) - all_children.update(gsms) - all_children.add(p) - h.update({ "child_terms": list(all_children) }) - + hier[c].update({"child_terms": list(all_children)}) bar.finish() return hier @@ -249,11 +226,10 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc): ################################################################################ def _make_dummy_publication_hierarchy(byc): - coll_type = "pubmed" coll_defs = byc["filter_definitions"][coll_type] data_db = "progenetix" - data_coll = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))[ data_db ][ "publications" ] + data_coll = MongoClient(host=DB_MONGOHOST)[data_db]["publications"] query = { "id": { "$regex": r'^PMID\:\d+?$' } } no = data_coll.count_documents( query ) bar = Bar("Publications...", max = no, suffix='%(percent)d%%'+" of "+str(no) ) @@ -272,6 +248,7 @@ def _make_dummy_publication_hierarchy(byc): "collation_type": coll_type, "namespace_prefix": coll_defs.get("namespace_prefix", ""), "scope": coll_defs.get("scope", ""), + "entity": coll_defs.get("entity", ""), "db_key": coll_defs.get("db_key", ""), "updated": datetime.datetime.now().isoformat(), "hierarchy_paths": [ { "order": int(order), "depth": 0, "path": [ code ] } ], @@ -279,48 +256,31 @@ def _make_dummy_publication_hierarchy(byc): "child_terms": [ code ] } } ) - bar.finish() - return hier ################################################################################ def _get_dummy_hierarchy(ds_id, coll_type, coll_defs, byc): - data_client = MongoClient(host=DB_MONGOHOST) data_db = data_client[ ds_id ] data_coll = data_db[ coll_defs["scope"] ] data_pat = coll_defs["pattern"] db_key = coll_defs["db_key"] - is_series = coll_defs.get("is_series", False) - - if is_series is True: - s_pat = coll_defs["series_pattern"] - s_re = re.compile( s_pat ) - - pre_ids = _get_ids_for_prefix( data_coll, coll_defs ) - + pre_ids = _get_ids_for_prefix(data_coll, coll_defs) hier = { } no = len(pre_ids) bar = Bar(coll_type, max = no, suffix='%(percent)d%%'+" of "+str(no) ) for order, c in enumerate(sorted(pre_ids), start=1): - bar.next() hier.update( { c: _get_hierarchy_item( data_coll, coll_defs, coll_type, c, order, 0, [ c ] ) } ) - - if is_series is True: - - ser_ids = data_coll.distinct( db_key, { db_key: c } ) - ser_ids = list(filter(lambda d: s_re.match(d), ser_ids)) - hier[c].update( { "child_terms": list( set(ser_ids) | set(hier[c]["child_terms"]) ) } ) bar.finish() - return hier + ################################################################################ def _get_hierarchy_item(data_coll, coll_defs, coll_type, code, order, depth, path): @@ -332,6 +292,7 @@ def _get_hierarchy_item(data_coll, coll_defs, coll_type, code, order, depth, pat "collation_type": coll_type, "namespace_prefix": coll_defs.get("namespace_prefix", ""), "scope": coll_defs.get("scope", ""), + "entity": coll_defs.get("entity", ""), "db_key": coll_defs.get("db_key", ""), "updated": datetime.datetime.now().isoformat(), "hierarchy_paths": [ { "order": int(order), "depth": int(depth), "path": list(path) } ], diff --git a/bin/collationsPlotter.py b/bin/collationsPlotter.py deleted file mode 100755 index 61e3d27a..00000000 --- a/bin/collationsPlotter.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 -import argparse, datetime, re, sys -from pymongo import MongoClient -from humps import camelize -from os import path, environ, pardir - -from bycon import * - -services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" ) -sys.path.append( services_lib_path ) -from bycon_bundler import * -from bycon_plot import * -from interval_utils import generate_genome_bins - -""" -./bin/collationsPlotter.py -d "progenetix,cellz" --filters "pgx:icdom-85003,pgx:icdom-81703,pgx:icdom-87003,pgx:icdom-87203,pgx:icdom-94003,pgx:icdom-95003,pgx:icdom-81403" -o ./exports/multicollationtest.svg -p "plot_area_height=50&plot_axis_y_max=80&plot_histogram_frequency_labels=30,60" - -""" - -################################################################################ -################################################################################ -################################################################################ - -def main(): - collations_plotter() - -################################################################################ - -def collations_plotter(): - initialize_bycon_service(byc, "collations_plotter") - run_beacon_init_stack(byc) - generate_genome_bins(byc) - - BYC_PARS.update({"plot_type": "histoplot"}) - out_putfile = BYC_PARS.get("outputfile") - - if len(byc["dataset_ids"]) < 1: - print("Please indicate one or more dataset ids using `-d`") - exit() - if len(BYC_PARS.get("filters", [])) < 1: - print("Please indicate one or more collation ids using `--filters`") - if not out_putfile: - print("No output file specified (-o, --outputfile) => quitting ...") - exit() - svg_file = out_putfile - if not ".svg" in svg_file.lower(): - print("The output file should be an `.svg` => quitting ...") - exit() - - pdb = ByconBundler(byc).collationsPlotbundles() - ByconPlot(byc, pdb).svg2file(svg_file) - -################################################################################ -################################################################################ -################################################################################ - - -if __name__ == '__main__': - main() diff --git a/bin/frequencymapsCreator.py b/bin/frequencymapsCreator.py index b1390b82..755b375a 100755 --- a/bin/frequencymapsCreator.py +++ b/bin/frequencymapsCreator.py @@ -13,6 +13,7 @@ services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" ) sys.path.append( services_lib_path ) +from bycon_bundler import ByconBundler from interval_utils import generate_genome_bins, interval_cnv_arrays, interval_counts_from_callsets from collation_utils import set_collation_types @@ -53,44 +54,35 @@ def frequencymaps_creator(): coll_ids = _filter_coll_ids(coll_coll, byc) coll_no = len(coll_ids) - bar = Bar(f'{coll_no} {ds_id} fMaps', max = coll_no, suffix='%(percent)d%%'+f' of {coll_no}' ) + if not BYC["TEST_MODE"]: + bar = Bar(f'{coll_no} {ds_id} fMaps', max = coll_no, suffix='%(percent)d%%'+f' of {coll_no}' ) coll_i = 0 for c_id in coll_ids: - - bar.next() - + if not BYC["TEST_MODE"]: + bar.next() coll = coll_coll.find_one({"id": c_id}) c_o_id = coll.get("_id") if not coll: - print("¡¡¡ some error - collation {} not found !!!".format(c_id)) + print(f"¡¡¡ some error - collation {c_id} not found !!!") continue - - pre, code = re.split("[:-]", c_id, 1) - coll_type = coll.get("collation_type", "undefined") - db_key = coll["db_key"] - - exclude_normals = True - for normal in ("EFO:0009654", "oneKgenomes"): - if normal in c_id: - print(f'\n---> keeping normals for {c_id}') - exclude_normals = False - coll_i += 1 - query = { db_key: { '$in': coll["child_terms"] } } - bios_no, cs_cursor = _cs_cursor_from_bios_query(byc, bios_coll, ind_coll, cs_coll, c_id, coll["scope"], query, exclude_normals) - cs_no = len(list(cs_cursor)) + byc.update({"filters":[{"id":c_id}, {"id": "EDAM:operation_3961"}]}) + RSS = ByconResultSets(byc).datasetsResults() + pdb = ByconBundler(byc).resultsets_frequencies_bundles(RSS) + if_bundles = pdb.get("interval_frequencies_bundles") + if len(if_bundles) < 1: + prdbug(f'No interval_frequencies for {c_id}') + continue - if cs_no < 1: - coll_coll.update_one({"_id": c_o_id}, {"$set": {"cnv_analyses": 0}}) + cnv_cs_count = if_bundles[0].get("sample_count", 0) + coll_coll.update_one({"_id": c_o_id}, {"$set": {"cnv_analyses": cnv_cs_count}}) + if cnv_cs_count < 1: continue - i_t = coll_i % 100 start_time = time.time() - # if i_t == 0 or cs_no > 1000: - # print("{}: {} bios, {} cs\t{}/{}\t{:.1f}%".format(c_id, bios_no, cs_no, coll_i, coll_no, 100*coll_i/coll_no)) update_obj = { "id": c_id, @@ -101,22 +93,15 @@ def frequencymaps_creator(): "collation_type": coll["collation_type"], "child_terms": coll["child_terms"], "updated": datetime.datetime.now().isoformat(), - "counts": {"biosamples": bios_no, "analyses": cs_no }, + "counts": {"analyses": cnv_cs_count }, "frequencymap": { "interval_count": byc["genomic_interval_count"], "binning": BYC_PARS.get("genome_binning", ""), - "biosample_count": bios_no + "intervals": if_bundles[0].get("interval_frequencies", []), + "analysis_count": cnv_cs_count } } - intervals, cnv_cs_count = interval_counts_from_callsets(cs_cursor, byc) - update_obj["frequencymap"].update({ - "intervals": intervals, - "analysis_count": cnv_cs_count - }) - - coll_coll.update_one({"_id": c_o_id}, {"$set": {"cnv_analyses": cnv_cs_count}}) - proc_time = time.time() - start_time # if cs_no > 1000: # print(" => Processed in {:.2f}s: {:.4f}s per callset".format(proc_time, (proc_time/cs_no))) @@ -125,29 +110,32 @@ def frequencymaps_creator(): fm_coll.delete_many( { "id": c_id } ) fm_coll.insert_one( update_obj ) - if coll["code_matches"] > 0: - if int(cs_no) > int(coll["code_matches"]): - query_cm = { db_key: c_id } - bios_no_cm, cs_cursor_cm = _cs_cursor_from_bios_query(byc, bios_coll, ind_coll, cs_coll, c_id, coll["scope"], query_cm) - cs_no_cm = len(list(cs_cursor_cm)) - if cs_no_cm > 0: - cm_obj = { "frequencymap_codematches": { - "interval_count": len(byc["genomic_intervals"]), - "binning": BYC_PARS.get("genome_binning", ""), - "biosample_count": bios_no_cm - } + if cnv_cs_count > coll.get("code_matches", cnv_cs_count): + byc.update({"filters":[{"id":c_id, "includeDescendantTerms": False}, {"id": "EDAM:operation_3961"}]}) + CMRSS = ByconResultSets(byc).datasetsResults() + cmpdb = ByconBundler(byc).resultsets_frequencies_bundles(CMRSS) + + cmif_bundles = cmpdb.get("interval_frequencies_bundles") + if len(cmif_bundles) < 1: + # print(f'No code match interval_frequencies for {c_id}') + continue + + cnv_cmcs_count = cmif_bundles[0].get("sample_count", 0) + if cnv_cmcs_count > 0: + cm_obj = {"frequencymap_codematches": + { + "interval_count": len(byc["genomic_intervals"]), + "binning": BYC_PARS.get("genome_binning", ""), + "intervals": cmif_bundles[0].get("interval_frequencies", []), + "analysis_count": cnv_cmcs_count } + } + prdbug(f'\n{c_id}: {cnv_cmcs_count} exact of {cnv_cs_count} total code matches ({coll["code_matches"]} indicated)') + if not BYC["TEST_MODE"]: + fm_coll.update_one( { "id": c_id }, { '$set': cm_obj }, upsert=False ) - intervals, cnv_cs_count = interval_counts_from_callsets(cs_cursor_cm, byc) - cm_obj["frequencymap_codematches"].update({ - "intervals": intervals, - "analysis_count": cs_no_cm - }) - prdbug(f'\n{c_id}: {cs_no_cm} exact of {cs_no} total code matches ({coll["code_matches"]} indicated)') - if not BYC["TEST_MODE"]: - fm_coll.update_one( { "id": c_id }, { '$set': cm_obj }, upsert=False ) - - bar.finish() + if not BYC["TEST_MODE"]: + bar.finish() ################################################################################ @@ -177,31 +165,6 @@ def _filter_coll_ids(coll_coll, byc): return coll_ids -################################################################################ - -def _cs_cursor_from_bios_query(byc, bios_coll, ind_coll, cs_coll, coll_id, scope, query, exclude_normals=True): - if scope == "individuals": - ind_ids = ind_coll.distinct( "id" , query ) - bios_ids = bios_coll.distinct( "id" , {"individual_id":{"$in": ind_ids } } ) - elif scope == "analyses": - bios_ids = cs_coll.distinct( "biosample_id" , query ) - else: - bios_ids = bios_coll.distinct( "id" , query ) - - pre_b = len(bios_ids) - - # for most entities samples labeled as "normal" will be excluded for frequency calculations - if exclude_normals: - bios_ids = bios_coll.distinct( "id" , { "id": { "$in": bios_ids } , "biosample_status.id": {"$ne": "EFO:0009654" }} ) - bios_no = len(bios_ids) - - if pre_b > bios_no: - prdbug(f'\nWARNING: {pre_b} samples for {coll_id}, while {bios_no} after excluding normals by EFO:0009654') - - cs_query = { "biosample_id": { "$in": bios_ids } , "variant_class": { "$ne": "SNV" } } - cs_cursor = cs_coll.find(cs_query) - - return bios_no, cs_cursor ################################################################################ ################################################################################ diff --git a/bin/lib/mongodb_utils.py b/bin/lib/mongodb_utils.py index 8c2c18da..03fa9fdd 100644 --- a/bin/lib/mongodb_utils.py +++ b/bin/lib/mongodb_utils.py @@ -2,12 +2,12 @@ from os import environ from pymongo import MongoClient, GEOSPHERE -from bycon import DB_MONGOHOST +from bycon import BYC, DB_MONGOHOST ################################################################################ def mongodb_update_indexes(ds_id, byc): - dt_m = byc["datatable_mappings"] + dt_m = BYC["datatable_mappings"] b_rt_s = byc["service_config"]["indexed_response_types"] mongo_client = MongoClient(host=DB_MONGOHOST) data_db = mongo_client[ds_id] diff --git a/bin/local/beacon_defaults.yaml b/bin/local/beacon_defaults.yaml index e5c94826..55ef4d17 100644 --- a/bin/local/beacon_defaults.yaml +++ b/bin/local/beacon_defaults.yaml @@ -13,11 +13,12 @@ defaults: # the aliases here are for non-standard speling or additional entry types service_path_aliases: - analyses: analyses - filteringTerms: filteringTerms - phenopackets: phenopackets - variants: genomicVariations - genomicVariations: genomicVariations + filteringTerms: filtering_terms # just for speling variations + entryTypes: entry_types # just for speling variations + variants: genomicVariations # just for speling variations + genomicVariations: genomicVariations # just for speling variations + phenopackets: phenopackets # Beacon+ specific example + ################################################################################ # here you can map additional path values to the corresponding (additional) @@ -25,16 +26,16 @@ service_path_aliases: ################################################################################ path_entry_type_mappings: - phenopackets: phenopacket + phenopackets: phenopacket # Beacon+ specific example ################################################################################ # her you can add additional path ids to the data query aggregation pipeline # that usually mapps/reduces queries against biosamples, genomicVariations, -#individuals ... +# individuals ... ################################################################################ data_pipeline_path_ids: - - phenopackets + - phenopackets # Beacon+ specific example ################################################################################ # Beacon entry type defaults - please adjust esp. info and schema paths... @@ -42,36 +43,7 @@ data_pipeline_path_ids: # framework and might be disentangled further on ... ################################################################################ -# => snake_casing - -# standard examples - -# ################################################################################ -# filteringTerm: -# is_entry_type: False -# request_entity_path_id: filteringTerms -# response_entity_id: filteringTerm -# collection: collations -# response_schema: beaconFilteringTermsResponse -# beacon_schema: -# entity_type: filteringTerm -# schema: https://progenetix.org/services/schemas/filteringTermsSchema/ -# h->o_access_key: Null -# ################################################################################ -# biosample: -# is_entry_type: True -# request_entity_path_id: biosamples -# response_entity_id: biosample -# collection: biosamples -# response_schema: beaconResultsetsResponse -# beacon_schema: -# entity_type: biosample -# schema: https://progenetix.org/services/schemas/biosample/ -# h->o_access_key: biosamples._id -# ################################################################################ - entity_defaults: - info: is_entry_type: False collection: Null diff --git a/bin/local/instance_overrides.yaml b/bin/local/instance_overrides.yaml new file mode 100644 index 00000000..8c8fc7a6 --- /dev/null +++ b/bin/local/instance_overrides.yaml @@ -0,0 +1,62 @@ +progenetix: + domains: + - progenetix.org + - www.progenetix.org + - progenetix.test + beacon_defaults: + defaults: + default_dataset_id: progenetix + test_domains: + - progenetix.test + +beaconplus: + domains: + - beaconplus.progenetix.org + - beaconplus.test + beacon_defaults: + defaults: + default_dataset_id: examplez + test_domains: + - beaconplus.test + +cancercelllines: + domains: + - cancercelllines.org + - www.cancercelllines.org + - cancercelllines.test + beacon_defaults: + defaults: + default_dataset_id: cellz + test_domains: + - cancercelllines.test + entity_defaults: + info: + content: + beacon_id: org.cancercelllines + name: Cancer Cell Line Genomics Beacon+ + id: org.cancercelllines.beacon + environment: prod + description: >- + The cancercelllines.org Beacon is a specific instance of the Progenetix + Beacon+ environment providing information about genommic variations in + cancer cell lines. + type: + group: org.ga4gh + artifact: beacon + version: v2.1.0-beaconplus + documentation_url: http://docs.cancercelllines.org + service_url: http://cancercelllines.org/beacon/ + welcome_url: https://cancercelllines.org/biosamples/ + alternative_url: https://cancercelllines.org + contact_url: mailto:contact@progenetix.org + created_at: 2023-07-01T00:00:00 + updated_at: 2024-02-24T13:00:00 + organization: + welcome_url: https://cancercelllines.org/ + contact_url: mailto:contact@progenetix.org + logoUrl: https://cancercelllines.org/img/cancercelllines-icon-400x300.png + info: + update_date_time: 2024-02-24T12:45:00 + create_date_time: 2023-07-01T00:00:00 + update_date_time: 2024-02-24T13:00:00 + diff --git a/bin/local/local_paths.yaml b/bin/local/local_paths.yaml index 29dc1724..a35deda8 100644 --- a/bin/local/local_paths.yaml +++ b/bin/local/local_paths.yaml @@ -19,8 +19,3 @@ server_callsets_dir_loc: - grch38 probefile_name: probes,cn.tsv - -test_domains: - - progenetix.test - - cancercelllines.test - - beaconplus.test diff --git a/bin/local/services_defaults.yaml b/bin/local/services_defaults.yaml index e60ce82f..6d50321f 100644 --- a/bin/local/services_defaults.yaml +++ b/bin/local/services_defaults.yaml @@ -1,3 +1,5 @@ +# Definitions here in fact are treated like `beacon_defaults` and merged into +# the global `beacon_defaults` dictionary defaults: {} ################################################################################ @@ -35,6 +37,7 @@ service_path_aliases: schemas: schemas uploader: uploader uploadplotter: uploadplotter + variantsbedfile: variantsbedfile vcf: vcfvariants vcfvariants: vcfvariants diff --git a/bin/publicationsInserter.py b/bin/publicationsInserter.py index e230e46a..2608585d 100755 --- a/bin/publicationsInserter.py +++ b/bin/publicationsInserter.py @@ -238,7 +238,7 @@ def get_ncit_tumor_types(n_p, pub): ############################################################################## def get_empty_publication(byc): - publication = object_instance_from_schema_name(byc, "Publication", "") + publication = object_instance_from_schema_name("Publication", "") publication.update({ "updated": date_isoformat(datetime.datetime.now()), "provenance": { diff --git a/bin/templateTablesCreator.py b/bin/templateTablesCreator.py old mode 100644 new mode 100755 index ae3b8055..e8b01b7c --- a/bin/templateTablesCreator.py +++ b/bin/templateTablesCreator.py @@ -28,7 +28,7 @@ def main(): def templates_creator(): initialize_bycon_service(byc, "templates_creator") - dt_m = byc["datatable_mappings"].get("definitions", {}) + dt_m = BYC["datatable_mappings"].get("definitions", {}) rsrc_p = path.join(pkg_path, "rsrc", "templates") all_cols = [] diff --git a/bin/variantsInserter.py b/bin/variantsInserter.py index 0ff4c4e3..7a7d6415 100755 --- a/bin/variantsInserter.py +++ b/bin/variantsInserter.py @@ -35,7 +35,6 @@ def variants_inserter(): ds_id = byc["dataset_ids"][0] input_file = BYC_PARS.get("inputfile") - dt_m = byc.get("datatable_mappings", {}) if not input_file: print("No input file file specified (-i, --inputfile) => quitting ...") @@ -121,7 +120,7 @@ def variants_inserter(): "individual_id": v.get("individual_id", re.sub("pgxbs-", "pgxind-", bs_id)) }) - insert_v = import_datatable_dict_line(dt_m, insert_v, variants.fieldnames, v, "genomicVariant") + insert_v = import_datatable_dict_line(insert_v, variants.fieldnames, v, "genomicVariant") prdbug(insert_v) insert_v = ByconVariant(byc).pgxVariant(insert_v) insert_v.update({"updated": datetime.datetime.now().isoformat()}) diff --git a/exports/multicollationtest-collationplots.svg b/exports/multicollationtest-collationplots.svg new file mode 100644 index 00000000..da16af40 --- /dev/null +++ b/exports/multicollationtest-collationplots.svg @@ -0,0 +1,1715 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Hepatocellular carcinoma, NOS +pgx:icdom-81703 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Neuroblastoma, NOS +pgx:icdom-95003 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Astrocytoma, NOS +pgx:icdom-94003 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Malignant melanoma, NOS +pgx:icdom-87203 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Adenocarcinoma, NOS +pgx:icdom-81403 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Infiltrating duct carcinoma, NOS +pgx:icdom-85003 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Pheochromocytoma, malignant +pgx:icdom-87003 (progenetix, 56 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Neuroblastoma, NOS +pgx:icdom-95003 (cellz, 112 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Hepatocellular carcinoma, NOS +pgx:icdom-81703 (cellz, 72 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Adenocarcinoma, NOS +pgx:icdom-81403 (cellz, 951 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Infiltrating duct carcinoma, NOS +pgx:icdom-85003 (cellz, 835 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Malignant melanoma, NOS +pgx:icdom-87203 (cellz, 673 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Astrocytoma, NOS +pgx:icdom-94003 (cellz, 53 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +© CC-BY 2001 - 2024 progenetix.org +13 collations + \ No newline at end of file diff --git a/exports/multicollationtest.svg b/exports/multicollationtest.svg index e4cdd6c7..da16af40 100644 --- a/exports/multicollationtest.svg +++ b/exports/multicollationtest.svg @@ -4,7 +4,7 @@ xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="genomeplot" width="1024px" -height="785px" +height="1435px" style="margin: auto; font-family: Helvetica, sans-serif;"> - + @@ -918,694 +918,798 @@ style="margin: auto; font-family: Helvetica, sans-serif;"> -Adenocarcinoma, NOS -pgx:icdom-81403 (progenetix, 18589 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Infiltrating duct carcinoma, NOS -pgx:icdom-85003 (progenetix, 12621 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Hepatocellular carcinoma, NOS -pgx:icdom-81703 (progenetix, 2024 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Pheochromocytoma, malignant -pgx:icdom-87003 (progenetix, 56 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Neuroblastoma, NOS -pgx:icdom-95003 (cellz, 112 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Astrocytoma, NOS -pgx:icdom-94003 (progenetix, 556 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Neuroblastoma, NOS -pgx:icdom-95003 (progenetix, 1982 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Hepatocellular carcinoma, NOS -pgx:icdom-81703 (cellz, 72 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Infiltrating duct carcinoma, NOS -pgx:icdom-85003 (cellz, 835 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Adenocarcinoma, NOS -pgx:icdom-81403 (cellz, 951 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Astrocytoma, NOS -pgx:icdom-94003 (cellz, 53 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Malignant melanoma, NOS -pgx:icdom-87203 (progenetix, 2538 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Malignant melanoma, NOS -pgx:icdom-87203 (cellz, 673 samples) - - - -30% - -30% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -© CC-BY 2001 - 2023 progenetix.org -13 collations +Hepatocellular carcinoma, NOS +pgx:icdom-81703 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Neuroblastoma, NOS +pgx:icdom-95003 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Astrocytoma, NOS +pgx:icdom-94003 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Malignant melanoma, NOS +pgx:icdom-87203 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Adenocarcinoma, NOS +pgx:icdom-81403 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Infiltrating duct carcinoma, NOS +pgx:icdom-85003 (progenetix, 200 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Pheochromocytoma, malignant +pgx:icdom-87003 (progenetix, 56 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Neuroblastoma, NOS +pgx:icdom-95003 (cellz, 112 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Hepatocellular carcinoma, NOS +pgx:icdom-81703 (cellz, 72 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Adenocarcinoma, NOS +pgx:icdom-81403 (cellz, 951 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Infiltrating duct carcinoma, NOS +pgx:icdom-85003 (cellz, 835 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Malignant melanoma, NOS +pgx:icdom-87203 (cellz, 673 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Astrocytoma, NOS +pgx:icdom-94003 (cellz, 53 samples) + + + +25% + +25% + +50% + +50% + +75% + +75% + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +© CC-BY 2001 - 2024 progenetix.org +13 collations \ No newline at end of file diff --git a/local/beacon_defaults.yaml b/local/beacon_defaults.yaml index e5c94826..55ef4d17 100644 --- a/local/beacon_defaults.yaml +++ b/local/beacon_defaults.yaml @@ -13,11 +13,12 @@ defaults: # the aliases here are for non-standard speling or additional entry types service_path_aliases: - analyses: analyses - filteringTerms: filteringTerms - phenopackets: phenopackets - variants: genomicVariations - genomicVariations: genomicVariations + filteringTerms: filtering_terms # just for speling variations + entryTypes: entry_types # just for speling variations + variants: genomicVariations # just for speling variations + genomicVariations: genomicVariations # just for speling variations + phenopackets: phenopackets # Beacon+ specific example + ################################################################################ # here you can map additional path values to the corresponding (additional) @@ -25,16 +26,16 @@ service_path_aliases: ################################################################################ path_entry_type_mappings: - phenopackets: phenopacket + phenopackets: phenopacket # Beacon+ specific example ################################################################################ # her you can add additional path ids to the data query aggregation pipeline # that usually mapps/reduces queries against biosamples, genomicVariations, -#individuals ... +# individuals ... ################################################################################ data_pipeline_path_ids: - - phenopackets + - phenopackets # Beacon+ specific example ################################################################################ # Beacon entry type defaults - please adjust esp. info and schema paths... @@ -42,36 +43,7 @@ data_pipeline_path_ids: # framework and might be disentangled further on ... ################################################################################ -# => snake_casing - -# standard examples - -# ################################################################################ -# filteringTerm: -# is_entry_type: False -# request_entity_path_id: filteringTerms -# response_entity_id: filteringTerm -# collection: collations -# response_schema: beaconFilteringTermsResponse -# beacon_schema: -# entity_type: filteringTerm -# schema: https://progenetix.org/services/schemas/filteringTermsSchema/ -# h->o_access_key: Null -# ################################################################################ -# biosample: -# is_entry_type: True -# request_entity_path_id: biosamples -# response_entity_id: biosample -# collection: biosamples -# response_schema: beaconResultsetsResponse -# beacon_schema: -# entity_type: biosample -# schema: https://progenetix.org/services/schemas/biosample/ -# h->o_access_key: biosamples._id -# ################################################################################ - entity_defaults: - info: is_entry_type: False collection: Null diff --git a/local/instance_overrides.yaml b/local/instance_overrides.yaml new file mode 100644 index 00000000..8c8fc7a6 --- /dev/null +++ b/local/instance_overrides.yaml @@ -0,0 +1,62 @@ +progenetix: + domains: + - progenetix.org + - www.progenetix.org + - progenetix.test + beacon_defaults: + defaults: + default_dataset_id: progenetix + test_domains: + - progenetix.test + +beaconplus: + domains: + - beaconplus.progenetix.org + - beaconplus.test + beacon_defaults: + defaults: + default_dataset_id: examplez + test_domains: + - beaconplus.test + +cancercelllines: + domains: + - cancercelllines.org + - www.cancercelllines.org + - cancercelllines.test + beacon_defaults: + defaults: + default_dataset_id: cellz + test_domains: + - cancercelllines.test + entity_defaults: + info: + content: + beacon_id: org.cancercelllines + name: Cancer Cell Line Genomics Beacon+ + id: org.cancercelllines.beacon + environment: prod + description: >- + The cancercelllines.org Beacon is a specific instance of the Progenetix + Beacon+ environment providing information about genommic variations in + cancer cell lines. + type: + group: org.ga4gh + artifact: beacon + version: v2.1.0-beaconplus + documentation_url: http://docs.cancercelllines.org + service_url: http://cancercelllines.org/beacon/ + welcome_url: https://cancercelllines.org/biosamples/ + alternative_url: https://cancercelllines.org + contact_url: mailto:contact@progenetix.org + created_at: 2023-07-01T00:00:00 + updated_at: 2024-02-24T13:00:00 + organization: + welcome_url: https://cancercelllines.org/ + contact_url: mailto:contact@progenetix.org + logoUrl: https://cancercelllines.org/img/cancercelllines-icon-400x300.png + info: + update_date_time: 2024-02-24T12:45:00 + create_date_time: 2023-07-01T00:00:00 + update_date_time: 2024-02-24T13:00:00 + diff --git a/local/local_paths.yaml b/local/local_paths.yaml index 29dc1724..a35deda8 100644 --- a/local/local_paths.yaml +++ b/local/local_paths.yaml @@ -19,8 +19,3 @@ server_callsets_dir_loc: - grch38 probefile_name: probes,cn.tsv - -test_domains: - - progenetix.test - - cancercelllines.test - - beaconplus.test diff --git a/local/services_defaults.yaml b/local/services_defaults.yaml index e60ce82f..6d50321f 100644 --- a/local/services_defaults.yaml +++ b/local/services_defaults.yaml @@ -1,3 +1,5 @@ +# Definitions here in fact are treated like `beacon_defaults` and merged into +# the global `beacon_defaults` dictionary defaults: {} ################################################################################ @@ -35,6 +37,7 @@ service_path_aliases: schemas: schemas uploader: uploader uploadplotter: uploadplotter + variantsbedfile: variantsbedfile vcf: vcfvariants vcfvariants: vcfvariants diff --git a/rsrc/templates/analysis_template.tsv b/rsrc/templates/analysis_template.tsv index 55fb41a1..46eab2c9 100644 --- a/rsrc/templates/analysis_template.tsv +++ b/rsrc/templates/analysis_template.tsv @@ -1 +1 @@ -analysis_id biosample_id individual_id legacy_ids variant_class experiment_id series_id platform_id platform_label data_provenance +analysis_id biosample_id individual_id analysis_legacy_id legacy_ids analysis_operation_id analysis_operation_label experiment_id series_id platform_id platform_label data_provenance calling_pipeline diff --git a/rsrc/templates/biosample_template.tsv b/rsrc/templates/biosample_template.tsv index 35dd41b0..7a1f5ed0 100644 --- a/rsrc/templates/biosample_template.tsv +++ b/rsrc/templates/biosample_template.tsv @@ -1 +1 @@ -biosample_id group_id group_label individual_id callset_ids external_references_id___PMID external_references_label___PMID external_references_id___arrayexpress external_references_label___arrayexpress external_references_id___cbioportal external_references_label___cbioportal external_references_id___cellosaurus external_references_label___cellosaurus legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label +biosample_id biosample_label biosample_legacy_id individual_id callset_ids group_id group_label pubmed_id pubmed_label cellosaurus_id cellosaurus_label cbioportal_id cbioportal_label external_references_id___arrayexpress external_references_label___arrayexpress cohort_ids legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label diff --git a/rsrc/templates/genomicVariant_template.tsv b/rsrc/templates/genomicVariant_template.tsv index 5fd7c5b5..6ceabf49 100644 --- a/rsrc/templates/genomicVariant_template.tsv +++ b/rsrc/templates/genomicVariant_template.tsv @@ -1 +1 @@ -variant_id variant_internal_id callset_id biosample_id individual_id sequence_id reference_name start end variant_state_id variant_state_label reference_sequence sequence annotation_derived aminoacid_changes genomic_hgvs_id log2 variant_type reference_bases alternate_bases +variant_id variant_internal_id callset_id biosample_id individual_id sequence_id reference_name start end variant_state_id variant_state_label reference_sequence sequence annotation_derived aminoacid_changes genomic_hgvs_id log2 variant_type diff --git a/rsrc/templates/individual_template.tsv b/rsrc/templates/individual_template.tsv index 3c75eaa7..82be3265 100644 --- a/rsrc/templates/individual_template.tsv +++ b/rsrc/templates/individual_template.tsv @@ -1 +1 @@ -individual_id legacy_ids sex_id sex_label age_iso age_days data_use_conditions_id data_use_conditions_label histological_diagnosis_id histological_diagnosis_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cell_line_donation_id cell_line_donation_label +individual_id individual_legacy_id legacy_ids sex_id sex_label age_iso age_days data_use_conditions_id data_use_conditions_label histological_diagnosis_id histological_diagnosis_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cell_line_donation_id cell_line_donation_label diff --git a/rsrc/templates/metadata_template.tsv b/rsrc/templates/metadata_template.tsv index 039ec8df..7857c2c0 100644 --- a/rsrc/templates/metadata_template.tsv +++ b/rsrc/templates/metadata_template.tsv @@ -1 +1 @@ -biosample_id group_id group_label individual_id callset_ids external_references_id___PMID external_references_label___PMID external_references_id___arrayexpress external_references_label___arrayexpress external_references_id___cbioportal external_references_label___cbioportal external_references_id___cellosaurus external_references_label___cellosaurus legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label age_days data_use_conditions_id data_use_conditions_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes cell_line_donation_id cell_line_donation_label analysis_id variant_class platform_label data_provenance +biosample_id biosample_label biosample_legacy_id individual_id callset_ids group_id group_label pubmed_id pubmed_label cellosaurus_id cellosaurus_label cbioportal_id cbioportal_label external_references_id___arrayexpress external_references_label___arrayexpress cohort_ids legacy_ids notes histological_diagnosis_id histological_diagnosis_label icdo_morphology_id icdo_morphology_label icdo_topography_id icdo_topography_label pathological_stage_id pathological_stage_label biosample_status_id biosample_status_label sampled_tissue_id sampled_tissue_label tnm stage grade age_iso sex_id sex_label followup_state_id followup_state_label followup_time geoprov_city geoprov_country geoprov_iso_alpha3 geoprov_long_lat cnv_fraction cnv_del_fraction cnv_dup_fraction cell_line experiment_id series_id platform_id cell_line_id cell_line_label individual_legacy_id age_days data_use_conditions_id data_use_conditions_label index_disease_notes index_disease_followup_time index_disease_followup_state_id index_disease_followup_state_label auxiliary_disease_id auxiliary_disease_label auxiliary_disease_notes cell_line_donation_id cell_line_donation_label analysis_id analysis_legacy_id analysis_operation_id analysis_operation_label platform_label data_provenance calling_pipeline diff --git a/services/collationplots.py b/services/collationplots.py index a9b23419..c3c31262 100755 --- a/services/collationplots.py +++ b/services/collationplots.py @@ -4,12 +4,22 @@ import sys, datetime, argparse from pymongo import MongoClient -from bycon import * +from bycon import ( + BeaconErrorResponse, + byc, + initialize_bycon_service, + print_text_response, + rest_path_value, + run_beacon_init_stack, + BYC, + BYC_PARS +) services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) sys.path.append( services_lib_path ) from bycon_bundler import * from bycon_plot import * +from file_utils import ExportFile from interval_utils import generate_genome_bins from service_helpers import * from service_response_generation import * @@ -19,9 +29,9 @@ * https://progenetix.org/services/collationplots/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167,pgx:icdom-85003 * https://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167&output=histoplot * https://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&id=pgxcohort-TCGAcancers -* https://progenetix.org/cgi/bycon/services/intervalFrequencies.py/?output=pgxseg&datasetIds=progenetix&filters=NCIT:C7376 * http://progenetix.org/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT&filterPrecision=start&withSamples=20&collationTypes=NCIT&output=histoplot&plot_area_height=20&plot_labelcol_font_size=6&plot_axislab_y_width=2&plot_label_y_values=0&plot_axis_y_max=80&plot_region_gap_width=1&debug= * http://progenetix.test/services/intervalFrequencies/?datasetIds=progenetix&filters=NCIT:C7376,PMID:22824167&output=histoheatplot +* http://progenetix.test/services/collationplots/?datasetIds=progenetix&collationTypes=NCIT&minNumber=500&plotType=histoheatplot&method=codematches podmd""" ################################################################################ @@ -29,7 +39,6 @@ ################################################################################ def main(): - try: collationplots() except Exception: @@ -42,23 +51,23 @@ def collationplots(): run_beacon_init_stack(byc) generate_genome_bins(byc) - plot_type = BYC_PARS.get("plot_type", "___none___") - if plot_type not in ["histoplot", "histoheatplot", "histosparkplot"]: - plot_type = "histoplot" - - BYC_PARS.update({"plot_type": plot_type}) - id_from_path = rest_path_value("collationplots") - if id_from_path: - byc[ "filters" ] = [ {"id": id_from_path } ] + if (id_from_path := rest_path_value("collationplots")): + byc["filters"] = [ {"id": id_from_path } ] elif "id" in BYC_PARS: - byc[ "filters" ] = [ {"id": BYC_PARS["id"]} ] + byc["filters"] = [ {"id": BYC_PARS["id"]} ] + if BYC_PARS.get("plot_type", "___none___") not in ["histoplot", "histoheatplot", "histosparkplot"]: + BYC_PARS.update({"plot_type": "histoplot"}) - if not "filters" in byc: - BYC["ERRORS"].append("No value was provided for collation `id` or `filters`.") + svg_f = ExportFile("svg").checkOutputFile() + pdb = ByconBundler(byc).collationsPlotbundles() + if len(BYC["ERRORS"]) >1: BeaconErrorResponse(byc).response(422) - pdb = ByconBundler(byc).collationsPlotbundles() - ByconPlot(byc, pdb).svgResponse() + BP = ByconPlot(byc, pdb) + if svg_f: + BP.svg2file(svg_f) + else: + BP.svgResponse() ################################################################################ diff --git a/services/config/genespans.yaml b/services/config/genespans.yaml index 185c72cc..9b49087e 100644 --- a/services/config/genespans.yaml +++ b/services/config/genespans.yaml @@ -1,17 +1,11 @@ defaults: - query_fields: - - symbol - - ensembl_gene_ids - - synonyms response_entity_id: gene meta: received_request_summary: - requested_granularity: record assembly_id: GRCh38 requested_schemas: - entity_type: gene schema: https://progenetix.org/services/schemas/ProgenetixGene - returned_granularity: record info: > The main genes payload can be accessed in `response.results`. assembly_ids: diff --git a/services/intervalFrequencies.py b/services/intervalFrequencies.py index 6cd5a343..dfc57dc5 100755 --- a/services/intervalFrequencies.py +++ b/services/intervalFrequencies.py @@ -47,22 +47,18 @@ def interval_frequencies(): run_beacon_init_stack(byc) generate_genome_bins(byc) - id_from_path = rest_path_value("intervalFrequencies") - if id_from_path: - byc[ "filters" ] = [ {"id": id_from_path } ] + if (id_from_path := rest_path_value("collationplots")): + byc["filters"] = [ {"id": id_from_path } ] elif "id" in BYC_PARS: - byc[ "filters" ] = [ {"id": BYC_PARS["id"]} ] - - if not "filters" in byc: - BYC["ERRORS"].append("No value was provided for collation `id` or `filters`.") + byc["filters"] = [ {"id": BYC_PARS["id"]} ] + pdb = ByconBundler(byc).collationsPlotbundles() + if len(BYC["ERRORS"]) >1: BeaconErrorResponse(byc).response(422) file_type = BYC_PARS.get("output", "___none___") if file_type not in ["pgxfreq", "pgxmatrix", "pgxseg"]: file_type = "pgxfreq" output = file_type - pdb = ByconBundler(byc).collationsPlotbundles() - if "pgxseg" in output or "pgxfreq" in output: export_pgxseg_frequencies(byc, pdb["interval_frequencies_bundles"]) elif "matrix" in output: diff --git a/services/lib/bycon_bundler.py b/services/lib/bycon_bundler.py index 723344cb..50ed29ad 100644 --- a/services/lib/bycon_bundler.py +++ b/services/lib/bycon_bundler.py @@ -18,7 +18,6 @@ ################################################################################ class ByconBundler: - """ # The `ByconBundler` class @@ -36,8 +35,8 @@ def __init__(self, byc): self.local_paths = byc.get("local_paths", {}) self.datasets_results = None self.dataset_ids = byc.get("dataset_ids", []) - self.datatable_mappings = byc.get("datatable_mappings", {}) self.filters = byc.get("filters", []) + self.collation_types = BYC_PARS.get("collation_types", []) self.min_number = BYC_PARS.get("min_number", 0) self.delivery_method = BYC_PARS.get("method") self.header = [] @@ -79,7 +78,6 @@ def __init__(self, byc): #--------------------------------------------------------------------------# def read_pgx_file(self, filepath): - self.filepath = filepath h_lines = [] @@ -101,7 +99,6 @@ def read_pgx_file(self, filepath): #--------------------------------------------------------------------------# def read_probedata_file(self, filepath): - self.filepath = filepath self.probedata = [] @@ -139,11 +136,9 @@ def read_probedata_file(self, filepath): def pgxseg_to_keyed_bundle(self, filepath): self.read_pgx_file(filepath) - if not "biosample_id" in self.fieldnames: self.errors.append("¡¡¡ The `biosample_id` parameter is required for variant assignment !!!") return - self.__deparse_pgxseg_samples_header() self.__keyed_bundle_add_variants_from_lines() @@ -153,33 +148,27 @@ def pgxseg_to_keyed_bundle(self, filepath): #--------------------------------------------------------------------------# def pgxseg_to_plotbundle(self, filepath): - self.pgxseg_to_keyed_bundle(filepath) self.__flatten_keyed_bundle() - return { "interval_frequencies_bundles": self.callsets_frequencies_bundles(), "callsets_variants_bundles": self.callsets_variants_bundles() } + #--------------------------------------------------------------------------# def callsets_variants_bundles(self): - # TODO: This is similar to a keyed bundle component ... - bb = self.bundle - c_p_l = [] for p_o in bb.get("analyses", []): cs_id = p_o.get("id") p_o.update({ "variants": list(filter(lambda v: v.get("callset_id", "___none___") == cs_id, bb["variants"])) }) - c_p_l.append(p_o) - + c_p_l.append(p_o) self.callsetVariantsBundles = c_p_l - return self.callsetVariantsBundles @@ -240,7 +229,7 @@ def __deparse_pgxseg_samples_header(self): continue bios = {"id": bs_id} - bios = import_datatable_dict_line(self.datatable_mappings, bios, fieldnames, bios_d, "biosample") + bios = import_datatable_dict_line(bios, fieldnames, bios_d, "biosample") cs_id = bios.get("callset_id", re.sub("pgxbs", "pgxcs", bs_id) ) ind_id = bios.get("individual_id", re.sub("pgxbs", "pgxind", bs_id) ) ind = {"id": ind_id} @@ -295,9 +284,6 @@ def __callsets_bundle_from_result_set(self, bundle_type="analyses"): if cnv_chro_stats is False or cnv_statusmaps is False: continue - prdbug(f'dataset_id: {ds_id}') - prdbug(f'label in bundler: {s.get("label")}') - p_o = { "dataset_id": ds_id, "callset_id": s.get(analysis_key, "NA"), @@ -328,9 +314,6 @@ def __callsets_add_database_variants(self): cs_id = p_o.get("callset_id", "___none___") v_q = {"callset_id": cs_id} p_o.update({"variants": list(var_coll.find(v_q))}) - # for v in var_coll.find(v_q): - # p_o["variants"].append(ByconVariant(self.byc).byconVariant(v)) - c_p_l.append(p_o) self.callsetVariantsBundles = c_p_l @@ -380,7 +363,7 @@ def __keyed_bundle_add_variants_from_lines(self): "callset_id": cs_id, } - update_v = import_datatable_dict_line(self.datatable_mappings, update_v, fieldnames, v, "genomicVariant") + update_v = import_datatable_dict_line(update_v, fieldnames, v, "genomicVariant") update_v = ByconVariant(self.byc).pgxVariant(update_v) update_v.update({ @@ -403,6 +386,7 @@ def __keyed_bundle_add_variants_from_lines(self): "variants_by_callset_id": vars_ided }) + #--------------------------------------------------------------------------# def __flatten_keyed_bundle(self): @@ -419,6 +403,7 @@ def __flatten_keyed_bundle(self): "variants": [elem for sublist in ( v_cs_k.values() ) for elem in sublist] }) + #--------------------------------------------------------------------------# def __callsetBundleCreateIsets(self, label=""): @@ -426,10 +411,8 @@ def __callsetBundleCreateIsets(self, label=""): for ds_id in self.dataset_ids: dscs = list(filter(lambda cs: cs.get("dataset_id", "NA") == ds_id, self.bundle["analyses"])) intervals, cnv_cs_count = interval_counts_from_callsets(self.bundle["analyses"], self.byc) - if cnv_cs_count < self.min_number: continue - iset = { "dataset_id": ds_id, "group_id": ds_id, @@ -446,44 +429,50 @@ def __callsetBundleCreateIsets(self, label=""): def __isetBundlesFromCollationParameters(self): if len(self.dataset_ids) < 1: + BYC["ERRORS"].append("¡¡¡ No `datasetdIds` parameter !!!") return - if len(self.filters) < 1: + if len(self.filters) < 1 and len(self.collation_types) < 1: + BYC["ERRORS"].append("¡¡¡ No `filters` or `collationTypes` parameter !!!") return - fmap_name = "frequencymap" if "codematches" in str(self.delivery_method): fmap_name = "frequencymap_codematches" - mongo_client = MongoClient(host=DB_MONGOHOST) + id_q = {} + if len(self.filters) > 0: + fids = [x.get("id", "___none___") for x in self.filters] + id_q = {"id": {"$in": fids}} + elif len(self.collation_types) > 0: + id_q = {"collation_type": {"$in": self.collation_types}} + prdbug(f'... __isetBundlesFromCollationParameters query {id_q}') + + mongo_client = MongoClient(host=DB_MONGOHOST) for ds_id in self.dataset_ids: coll_db = mongo_client[ds_id] - for f in self.filters: - f_val = f["id"] + coll_ids = coll_db[ "collations" ].distinct("id", id_q) + prdbug(f'prefetched coll ids: {coll_ids}') + for f_val in coll_ids: f_q = { "id": f_val } - collation_f = coll_db[ "frequencymaps" ].find_one( { "id": f_val } ) - collation_c = coll_db[ "collations" ].find_one( { "id": f_val } ) - + collation_f = coll_db[ "frequencymaps" ].find_one( f_q ) + collation_c = coll_db[ "collations" ].find_one( f_q ) if not collation_f: continue if not collation_c: continue if not fmap_name in collation_f: continue - fmap_count = collation_f[ fmap_name ].get("analysis_count", 0) if fmap_count < self.min_number: continue - r_o = { "dataset_id": ds_id, "group_id": f_val, "label": re.sub(r';', ',', collation_c["label"]), "sample_count": fmap_count, - "interval_frequencies": collation_f[ fmap_name ]["intervals"] } - + "interval_frequencies": collation_f[ fmap_name ]["intervals"] } self.intervalFrequenciesBundles.append(r_o) - mongo_client.close( ) + ################################################################################ diff --git a/services/lib/cytoband_utils.py b/services/lib/cytoband_utils.py index 3ab9de9d..463d7cc9 100644 --- a/services/lib/cytoband_utils.py +++ b/services/lib/cytoband_utils.py @@ -239,37 +239,26 @@ def deparse_ISCN_to_variants(iscn, byc): errors = [] for cnv_t, cnv_defs in v_t_defs.items(): - revish = cnv_defs.get("revish_label") if not revish: continue iscn_re = re.compile(rf"^.*?{revish}\(([\w.,]+)\).*?$", re.IGNORECASE) - if iscn_re.match(iscn): - m = iscn_re.match(iscn).group(1) - - for i_v in re.split(",", m): - + for i_v in re.split(",", m): if not cb_pat.match(i_v): continue - cytoBands, chro, start, end, error = bands_from_cytobands(i_v, c_b_d, a_d) if len(error) > 0: errors.append(error) continue - v_l = end - start t = cnv_defs.get("DUPDEL", "CNV") - cytostring = "{}({})".format(cnv_t, i_v).lower() - if "amp" in revish and v_l > i_d.get("cnv_amp_max_size", 3000000): revish = "hldup" - - v_s = {} - + v_s = {} v = ({ "variant_state": cnv_defs.get("variant_state"), "location": { diff --git a/services/lib/datatable_utils.py b/services/lib/datatable_utils.py index 44aede25..31105c11 100644 --- a/services/lib/datatable_utils.py +++ b/services/lib/datatable_utils.py @@ -3,7 +3,7 @@ from random import sample as randomSamples # bycon -from bycon import assign_nested_value, get_nested_value, prdbug, prjsonnice, BYC_PARS, ENV +from bycon import assign_nested_value, get_nested_value, prdbug, prjsonnice, BYC, BYC_PARS, ENV ################################################################################ @@ -11,9 +11,7 @@ def export_datatable_download(results, byc): # TODO: separate table generation from HTTP response output = BYC_PARS.get("output", "___none___") prdbug(f'... in export_datatable_download => {output}') - dt_m = byc.get("datatable_mappings") - if not dt_m: - return + dt_m = BYC["datatable_mappings"] r_t = byc.get("response_entity_id", "___none___") if not r_t in dt_m["definitions"]: return @@ -62,8 +60,8 @@ def export_datatable_download(results, byc): ################################################################################ -def import_datatable_dict_line(datatable_mappings, parent, fieldnames, lineobj, primary_scope="biosample"): - dt_m = datatable_mappings +def import_datatable_dict_line(parent, fieldnames, lineobj, primary_scope="biosample"): + dt_m = BYC["datatable_mappings"] if not primary_scope in dt_m["definitions"]: return io_params = dt_m["definitions"][ primary_scope ]["parameters"] diff --git a/services/lib/export_file_generation.py b/services/lib/export_file_generation.py index d6cdcf8f..4123f608 100644 --- a/services/lib/export_file_generation.py +++ b/services/lib/export_file_generation.py @@ -1,7 +1,7 @@ -import pymongo from os import path, environ +from pymongo import MongoClient -from bycon_helpers import get_nested_value, return_paginated_list +from bycon_helpers import get_nested_value, return_paginated_list, select_this_server from cgi_parsing import * from config import * from variant_mapping import ByconVariant @@ -18,7 +18,7 @@ def stream_pgx_meta_header(ds_id, ds_results, byc): ds_d = byc.get("dataset_definitions", {}) ds_ds_d = ds_d.get(ds_id, {}) - mongo_client = pymongo.MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost")) + mongo_client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost")) bs_coll = mongo_client[ ds_id ][ "biosamples" ] open_text_streaming() @@ -44,7 +44,7 @@ def stream_pgx_meta_header(ds_id, ds_results, byc): ################################################################################ def pgxseg_biosample_meta_line(byc, biosample, group_id_key="histological_diagnosis_id"): - dt_m = byc["datatable_mappings"] + dt_m = BYC["datatable_mappings"] io_params = dt_m["definitions"][ "biosample" ]["parameters"] g_id_k = group_id_key @@ -118,11 +118,11 @@ def print_filters_meta_line(byc): ################################################################################ def export_pgxseg_download(datasets_results, ds_id, byc): - data_client = pymongo.MongoClient(host=DB_MONGOHOST) + data_client = MongoClient(host=DB_MONGOHOST) v_coll = data_client[ ds_id ][ "variants" ] ds_results = datasets_results.get(ds_id, {}) if not "variants._id" in ds_results: - # TODO: error message here + BYC["ERRORS"].append("No variants found in the dataset results.") return v__ids = ds_results["variants._id"].get("target_values", []) if test_truthy( BYC_PARS.get("paginate_results", True) ): @@ -139,14 +139,113 @@ def export_pgxseg_download(datasets_results, ds_id, byc): v_instances = list(sorted(v_instances, key=lambda x: (f'{x["reference_name"].replace("X", "XX").replace("Y", "YY").zfill(2)}', x['start']))) for v in v_instances: print_variant_pgxseg(v) - close_text_streaming() + +################################################################################ + +def write_variants_bedfile(datasets_results, ds_id, byc): + """podmd + ##### Accepts + + * a Bycon `byc` object + * a Bycon `h_o` handover object with its `target_values` representing `_id` + objects of a `variants` collection + + The function creates a basic BED file and returns its local path. A standard + use would be to create a link to this file and submit it as `hgt.customText` + parameter to the UCSC browser. + + ##### TODO + + * The creation of the different variant types is still rudimentary and has to be + expanded in lockstep with improving Beacon documentation and examples. The + definition of the types and their match patterns should also be moved to a + +separate configuration entry and subroutine. + * evaluate to use "bedDetails" format + + podmd""" + local_paths = byc.get("local_paths") + if not local_paths: + return False + tmp_path = path.join( *local_paths[ "server_tmp_dir_loc" ]) + if not path.isdir(tmp_path): + BYC["ERRORS"].append(f"Temporary directory `{tmp_path}` not found.") + return False + h_o_server = select_this_server(byc) + ext_url = f'http://genome.ucsc.edu/cgi-bin/hgTracks?org=human&db=hg38' + bed_url = f'' + + vs = { "DUP": [ ], "DEL": [ ], "LOH": [ ], "SNV": [ ]} + colors = { + "plot_DUP_color": (255, 198, 51), + "plot_AMP_color": (255,102,0), + "plot_DEL_color": (51, 160, 255), + "plot_HOMODEL_color": (0, 51, 204), + "plot_LOH_color": (102, 170, 153), + "plot_SNV_color": (255, 51, 204) + } + + data_client = MongoClient(host=DB_MONGOHOST) + v_coll = data_client[ ds_id ][ "variants" ] + ds_results = datasets_results.get(ds_id, {}) + if not "variants._id" in ds_results: + BYC["ERRORS"].append("No variants found in the dataset results.") + return [ext_url, bed_url] + v__ids = ds_results["variants._id"].get("target_values", []) + v_count = ds_results["variants._id"].get("target_count", 0) + accessid = ds_results["variants._id"].get("id", "___none___") + if test_truthy( BYC_PARS.get("paginate_results", True) ): + v__ids = return_paginated_list(v__ids, BYC_PARS.get("skip", 0), BYC_PARS.get("limit", 0)) + + bed_file_name = f'{accessid}.bed' + bed_file = path.join( tmp_path, bed_file_name ) + + for v__id in v__ids: + v = v_coll.find_one( { "_id": v__id }, { "_id": 0 } ) + pv = ByconVariant(byc).byconVariant(v) + if (pvt := pv.get("variant_type", "___none___")) not in vs.keys(): + continue + vs[pvt].append(pv) + + b_f = open( bed_file, 'w' ) + pos = set() + ucsc_chr = "" + for vt in vs.keys(): + if len(vs[vt]) > 0: + try: + vs[vt] = sorted(vs[vt], key=lambda k: k['variant_length'], reverse=True) + except: + pass + col_key = f"plot_{vt}_color" + col_rgb = colors.get(col_key, (127, 127, 127)) + # col_rgb = [127, 127, 127] + b_f.write(f'track name={vt} visibility=squish description=\"overall {v_count} variants matching the query; {len(vs[vt])} in this track\" color={col_rgb[0]},{col_rgb[1]},{col_rgb[2]}\n') + b_f.write("#chrom\tchromStart\tchromEnd\tbiosampleId\n") + for v in vs[vt]: + ucsc_chr = "chr"+v["reference_name"] + ucsc_min = int( v["start"] + 1 ) + ucsc_max = int( v["end"] ) + l = f'{ucsc_chr}\t{ucsc_min}\t{ucsc_max}\t{v.get("biosample_id", "___none___")}\n' + pos.add(ucsc_min) + pos.add(ucsc_max) + b_f.write( l ) + + b_f.close() + ucsc_range = sorted(pos) + ucsc_pos = "{}:{}-{}".format(ucsc_chr, ucsc_range[0], ucsc_range[-1]) + ext_url = f'{ext_url}&position={ucsc_pos}&hgt.customText=' + bed_url = f'{h_o_server}{local_paths.get("server_tmp_dir_web", "/tmp")}/{bed_file_name}' + + return [ext_url, bed_url] + + ################################################################################ def print_variant_pgxseg(v_pgxseg): print( pgxseg_variant_line(v_pgxseg) ) + ################################################################################ def print_pgxseg_header_line(): @@ -196,7 +295,7 @@ def export_callsets_matrix(datasets_results, ds_id, byc): cs_r = datasets_results[ds_id].get("analyses._id") if not cs_r: return - mongo_client = pymongo.MongoClient(host=DB_MONGOHOST) + mongo_client = MongoClient(host=DB_MONGOHOST) bs_coll = mongo_client[ ds_id ][ "biosamples" ] cs_coll = mongo_client[ ds_id ][ "analyses" ] @@ -354,7 +453,7 @@ def export_vcf_download(datasets_results, ds_id, byc): "INFO": "" } - data_client = pymongo.MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost")) + data_client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost")) v_coll = data_client[ ds_id ][ "variants" ] ds_results = datasets_results.get(ds_id, {}) if not "variants._id" in ds_results: diff --git a/services/lib/file_utils.py b/services/lib/file_utils.py index f0113cda..f3d7b150 100644 --- a/services/lib/file_utils.py +++ b/services/lib/file_utils.py @@ -6,13 +6,46 @@ from copy import deepcopy from random import sample as random_samples -from bycon import ByconVariant, prjsonnice, return_paginated_list +from bycon import ( + ByconVariant, + BYC, + BYC_PARS, + ENV, + prdbug, + prjsonnice, + return_paginated_list +) -from datatable_utils import import_datatable_dict_line from interval_utils import interval_cnv_arrays, interval_counts_from_callsets ################################################################################ +class ExportFile: + + def __init__(self, file_type=None): + self.file_path = BYC_PARS.get("outputfile") + self.file_type = file_type + + # -------------------------------------------------------------------------# + # ----------------------------- public ------------------------------------# + # -------------------------------------------------------------------------# + + def checkOutputFile(self): + if not self.file_path: + if "local" in ENV: + BYC["ERRORS"].append("No output file specified (-o, --outputfile) => quitting ...") + return False + if self.file_type: + if not self.file_path.endswith(self.file_type): + if "local" in ENV: + BYC["ERRORS"].append(f"The output file should be an `{self.file_type}` => quitting ...") + return False + return self.file_path + + +################################################################################ + + def read_tsv_to_dictlist(filepath, max_count=0): dictlist = [] with open(filepath, newline='') as csvfile: diff --git a/services/lib/interval_utils.py b/services/lib/interval_utils.py index daf8fdc1..c682d5b8 100644 --- a/services/lib/interval_utils.py +++ b/services/lib/interval_utils.py @@ -64,6 +64,15 @@ ################################################################################ ################################################################################ +class GenomeBins: + def __init__(self, byc): + self.genomic_intervals = [] + + #--------------------------------------------------------------------------# + #----------------------------- public -------------------------------------# + #--------------------------------------------------------------------------# + + def generate_genome_bins(byc): parse_cytoband_file(byc) __generate_cytoband_intervals(byc) @@ -220,48 +229,33 @@ def interval_cnv_arrays(cs_vars, byc): # the values_map collects all values for the given interval to retrieve # the min and max values of each interval values_map = [[] for i in range(int_no)] - digests = [] - if type(cs_vars).__name__ == "Cursor": cs_vars.rewind() - for v in cs_vars: - if "variant_state" not in v: - continue - - v_t_c = v["variant_state"].get("id", "__NA__") + v_t_c = v.get("variant_state", {}).get("id", "__NA__") if v_t_c not in v_t_defs.keys(): continue - dup_del = v_t_defs[v_t_c].get("DUPDEL") # skipping non-CNV vars if dup_del is None: continue - cov_lab = cov_labs[dup_del] - - if "reference_name" not in v: - v.update({"reference_name": v["location"]["chromosome"]}) - v_i_id = v.get("variant_internal_id", None) v_cs_id = v.get("callset_id", None) - if v_i_id in digests: if "local" in ENV: print(f'\n¡¡¡ {v_i_id} already counted for {v_cs_id}') + continue else: digests.append(v_i_id) for i, intv in enumerate(intervals): - if _has_overlap(intv, v): - ov_end = min(intv["end"], v["location"]["end"]) ov_start = max(intv["start"], v["location"]["start"]) ov = ov_end - ov_start maps[cov_lab][i] += ov - try: # print(type(v["info"]["cnv_value"])) if type(v["info"]["cnv_value"]) == int or type(v["info"]["cnv_value"]) == float: @@ -277,7 +271,7 @@ def interval_cnv_arrays(cs_vars, byc): if maps[cov_lab][i] > 0: cov = maps[cov_lab][i] lab = f'{cov_lab}coverage' - chro = str(intv["reference_name"]) + chro = str(v["location"].get("chromosome")) c_a = chro + intv["arm"] cnv_stats[lab] += cov chro_stats[chro][lab] += cov @@ -335,7 +329,6 @@ def interval_counts_from_callsets(analyses, byc): analyses with CNV statusmaps and return a list of standard genomic interval objects with added per-interval quantitative data. """ - min_f = byc["interval_definitions"]["interval_min_fraction"].get("value", 0.001) int_fs = deepcopy(byc["genomic_intervals"]) int_no = len(int_fs) @@ -346,32 +339,25 @@ def interval_counts_from_callsets(analyses, byc): cs_no = len(list(analyses)) f_factor = 0 - if cs_no > 0: f_factor = 100 / cs_no - pars = { "gain": {"cov_l": "dup", "val_l": "max"}, "loss": {"cov_l": "del", "val_l": "min"} } for t in pars.keys(): - covs = np.zeros((cs_no, int_no)) vals = np.zeros((cs_no, int_no)) - if type(analyses).__name__ == "Cursor": analyses.rewind() - for i, cs in enumerate(analyses): covs[i] = cs["cnv_statusmaps"][pars[t]["cov_l"]] vals[i] = cs["cnv_statusmaps"][pars[t]["val_l"]] - counts = np.count_nonzero(covs >= min_f, axis=0) frequencies = np.around(counts * f_factor, 3) medians = np.around(np.ma.median(np.ma.masked_where(covs < min_f, vals), axis=0).filled(0), 3) means = np.around(np.ma.mean(np.ma.masked_where(covs < min_f, vals), axis=0).filled(0), 3) - for i, interval in enumerate(int_fs): int_fs[i].update({ t + "_frequency": frequencies[i], diff --git a/services/lib/service_response_generation.py b/services/lib/service_response_generation.py index a5d6d274..eb0adca8 100644 --- a/services/lib/service_response_generation.py +++ b/services/lib/service_response_generation.py @@ -18,17 +18,15 @@ class ByconautServiceResponse: def __init__(self, byc: dict, response_schema="byconautServiceResponse"): self.byc = byc - self.beacon_defaults = byc.get("beacon_defaults", {}) - self.services_defaults = byc.get("services_defaults", {}) - self.entity_defaults = self.beacon_defaults.get("entity_defaults", {"info":{}}) + self.entity_defaults = BYC["beacon_defaults"].get("entity_defaults", {"info":{}}) self.service_config = self.byc.get("service_config", {}) self.response_schema = response_schema self.requested_granularity = BYC_PARS.get("requested_granularity", "record") # TBD for authentication? - self.returned_granularity = self.requested_granularity + self.returned_granularity = byc.get("returned_granularity", "boolean") self.beacon_schema = self.byc["response_entity"].get("beacon_schema", "___none___") - self.data_response = object_instance_from_schema_name(byc, response_schema, "") - self.error_response = object_instance_from_schema_name(byc, "beaconErrorResponse", "") + self.data_response = object_instance_from_schema_name(response_schema, "") + self.error_response = object_instance_from_schema_name("beaconErrorResponse", "") self.__meta_add_received_request_summary_parameters() self.__meta_add_parameters() @@ -192,11 +190,9 @@ def __init__(self, byc: dict): self.delivery_method = BYC_PARS.get("method", "___none___") self.output = BYC_PARS.get("output", "___none___") self.dataset_ids = byc.get("dataset_ids", []) - self.beacon_defaults = byc.get("beacon_defaults", {}) self.service_config = byc.get("service_config", {}) - self.entity_defaults = self.beacon_defaults.get("entity_defaults", {"info":{}}) + self.entity_defaults = BYC["beacon_defaults"].get("entity_defaults", {"info":{}}) self.filter_definitions = byc.get("filter_definitions", {}) - self.form_data = byc.get("form_data", {}) self.filters = byc.get("filters", []) self.response_entity_id = byc.get("response_entity_id", "filteringTerm") self.path_id_value = byc.get("request_entity_path_id_value", False) @@ -221,18 +217,17 @@ def __return_collations(self): f_coll = "collations" d_k = set_selected_delivery_keys(self.service_config.get("method_keys")) - c_id = self.form_data.get("id", "") + c_id = BYC_PARS.get("id", "") # TODO: This should be derived from some entity definitions # TODO: whole query generation in separate function ... query = {} if BYC["TEST_MODE"] is True: - t_m_c = self.form_data.get("test_mode_count", 5) + t_m_c = BYC_PARS.get("test_mode_count", 5) query = mongo_test_mode_query(self.dataset_ids[0], f_coll, t_m_c) elif len(c_id) > 0: query = { "id": c_id } else: - q_list = [] ft_fs = [] for f in self.filters: @@ -242,11 +237,9 @@ def __return_collations(self): f_re = re.compile(r'^' + '|'.join(ft_fs)) else: f_re = None - if f_re is not None: q_list.append({"id": { "$regex": f_re}}) - - q_types = self.form_data.get("collation_types", []) + q_types = BYC_PARS.get("collation_types", []) if len(q_types) > 0: q_list.append({"collation_type": {"$in": q_types }}) @@ -254,13 +247,14 @@ def __return_collations(self): query = q_list[0] elif len(q_list) > 1: query = {"$and": q_list} + + prdbug(f'Collation query: {query}') # TODO # if not query: # warning = 'No limit (filters, collationTypes, id) on collation listing -> abortin...' s_s = { } - for ds_id in self.dataset_ids: fields = {"_id": 0} f_s = mongo_result_list(ds_id, f_coll, query, fields) @@ -268,14 +262,11 @@ def __return_collations(self): if "codematches" in str(self.delivery_method): if int(f.get("code_matches", 0)) < 1: continue - i_d = f.get("id", "NA") if i_d not in s_s: s_s[ i_d ] = { } - if len(d_k) < 1: - d_k = list(f.keys()) - + d_k = list(f.keys()) for k in d_k: if k in self.service_config.get("integer_keys", []): s_s[ i_d ].update({k: s_s[ i_d ].get(k, 0) + f.get(k, 0)}) diff --git a/services/local/beacon_defaults.yaml b/services/local/beacon_defaults.yaml index e5c94826..55ef4d17 100644 --- a/services/local/beacon_defaults.yaml +++ b/services/local/beacon_defaults.yaml @@ -13,11 +13,12 @@ defaults: # the aliases here are for non-standard speling or additional entry types service_path_aliases: - analyses: analyses - filteringTerms: filteringTerms - phenopackets: phenopackets - variants: genomicVariations - genomicVariations: genomicVariations + filteringTerms: filtering_terms # just for speling variations + entryTypes: entry_types # just for speling variations + variants: genomicVariations # just for speling variations + genomicVariations: genomicVariations # just for speling variations + phenopackets: phenopackets # Beacon+ specific example + ################################################################################ # here you can map additional path values to the corresponding (additional) @@ -25,16 +26,16 @@ service_path_aliases: ################################################################################ path_entry_type_mappings: - phenopackets: phenopacket + phenopackets: phenopacket # Beacon+ specific example ################################################################################ # her you can add additional path ids to the data query aggregation pipeline # that usually mapps/reduces queries against biosamples, genomicVariations, -#individuals ... +# individuals ... ################################################################################ data_pipeline_path_ids: - - phenopackets + - phenopackets # Beacon+ specific example ################################################################################ # Beacon entry type defaults - please adjust esp. info and schema paths... @@ -42,36 +43,7 @@ data_pipeline_path_ids: # framework and might be disentangled further on ... ################################################################################ -# => snake_casing - -# standard examples - -# ################################################################################ -# filteringTerm: -# is_entry_type: False -# request_entity_path_id: filteringTerms -# response_entity_id: filteringTerm -# collection: collations -# response_schema: beaconFilteringTermsResponse -# beacon_schema: -# entity_type: filteringTerm -# schema: https://progenetix.org/services/schemas/filteringTermsSchema/ -# h->o_access_key: Null -# ################################################################################ -# biosample: -# is_entry_type: True -# request_entity_path_id: biosamples -# response_entity_id: biosample -# collection: biosamples -# response_schema: beaconResultsetsResponse -# beacon_schema: -# entity_type: biosample -# schema: https://progenetix.org/services/schemas/biosample/ -# h->o_access_key: biosamples._id -# ################################################################################ - entity_defaults: - info: is_entry_type: False collection: Null diff --git a/services/local/instance_overrides.yaml b/services/local/instance_overrides.yaml new file mode 100644 index 00000000..8c8fc7a6 --- /dev/null +++ b/services/local/instance_overrides.yaml @@ -0,0 +1,62 @@ +progenetix: + domains: + - progenetix.org + - www.progenetix.org + - progenetix.test + beacon_defaults: + defaults: + default_dataset_id: progenetix + test_domains: + - progenetix.test + +beaconplus: + domains: + - beaconplus.progenetix.org + - beaconplus.test + beacon_defaults: + defaults: + default_dataset_id: examplez + test_domains: + - beaconplus.test + +cancercelllines: + domains: + - cancercelllines.org + - www.cancercelllines.org + - cancercelllines.test + beacon_defaults: + defaults: + default_dataset_id: cellz + test_domains: + - cancercelllines.test + entity_defaults: + info: + content: + beacon_id: org.cancercelllines + name: Cancer Cell Line Genomics Beacon+ + id: org.cancercelllines.beacon + environment: prod + description: >- + The cancercelllines.org Beacon is a specific instance of the Progenetix + Beacon+ environment providing information about genommic variations in + cancer cell lines. + type: + group: org.ga4gh + artifact: beacon + version: v2.1.0-beaconplus + documentation_url: http://docs.cancercelllines.org + service_url: http://cancercelllines.org/beacon/ + welcome_url: https://cancercelllines.org/biosamples/ + alternative_url: https://cancercelllines.org + contact_url: mailto:contact@progenetix.org + created_at: 2023-07-01T00:00:00 + updated_at: 2024-02-24T13:00:00 + organization: + welcome_url: https://cancercelllines.org/ + contact_url: mailto:contact@progenetix.org + logoUrl: https://cancercelllines.org/img/cancercelllines-icon-400x300.png + info: + update_date_time: 2024-02-24T12:45:00 + create_date_time: 2023-07-01T00:00:00 + update_date_time: 2024-02-24T13:00:00 + diff --git a/services/local/local_paths.yaml b/services/local/local_paths.yaml index 29dc1724..a35deda8 100644 --- a/services/local/local_paths.yaml +++ b/services/local/local_paths.yaml @@ -19,8 +19,3 @@ server_callsets_dir_loc: - grch38 probefile_name: probes,cn.tsv - -test_domains: - - progenetix.test - - cancercelllines.test - - beaconplus.test diff --git a/services/local/services_defaults.yaml b/services/local/services_defaults.yaml index e60ce82f..6d50321f 100644 --- a/services/local/services_defaults.yaml +++ b/services/local/services_defaults.yaml @@ -1,3 +1,5 @@ +# Definitions here in fact are treated like `beacon_defaults` and merged into +# the global `beacon_defaults` dictionary defaults: {} ################################################################################ @@ -35,6 +37,7 @@ service_path_aliases: schemas: schemas uploader: uploader uploadplotter: uploadplotter + variantsbedfile: variantsbedfile vcf: vcfvariants vcfvariants: vcfvariants diff --git a/services/services.py b/services/services.py index 220b32cd..88489385 100755 --- a/services/services.py +++ b/services/services.py @@ -45,7 +45,7 @@ def services(): # for d_k, d_v in defaults.items(): # byc.update( { d_k: d_v } ) read_service_prefs(service, services_conf_path, byc) - defs = byc.get("beacon_defaults", {}) + defs = BYC["beacon_defaults"] s_a_s = defs.get("service_path_aliases", {}) r_w = defs.get("rewrites", {}) diff --git a/services/variantsbedfile.py b/services/variantsbedfile.py new file mode 100644 index 00000000..378d5d9c --- /dev/null +++ b/services/variantsbedfile.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +import sys +from os import path, environ, pardir + +from bycon import * + +services_lib_path = path.join( path.dirname( path.abspath(__file__) ), "lib" ) +sys.path.append( services_lib_path ) +from export_file_generation import write_variants_bedfile + +""" +The plot service uses the standard bycon data retrieval pipeline with `biosample` +as entity type. Therefore, all standard Beacon query parameters work and also +the path is interpreted for an biosample `id` value if there is an entry at +`.../pgxsegvariants/{id}` + +* http://progenetix.org/services/pgxsegvariants/pgxbs-kftvjv8w + +""" + +################################################################################ +################################################################################ +################################################################################ + +def main(): + variantsbedfile() + +################################################################################ + +def variantsbedfile(): + initialize_bycon_service(byc, "g_variants") + run_beacon_init_stack(byc) + rss = ByconResultSets(byc).datasetsResults() + ds_id = list(rss.keys())[0] + ucsclink, bedfilelink = write_variants_bedfile(rss, ds_id, byc) + # TODO: Error + if "ucsc" in BYC_PARS.get("output", "bed"): + print_uri_rewrite_response(ucsclink, bedfilelink) + print_uri_rewrite_response(bedfilelink) + + +################################################################################ +################################################################################ +################################################################################ + +if __name__ == '__main__': + main() diff --git a/tmp/aggregator.yaml b/tmp/aggregator.yaml index 6162e6b3..66ef6927 100644 --- a/tmp/aggregator.yaml +++ b/tmp/aggregator.yaml @@ -3,7 +3,7 @@ description: >- federated Beacon queries through translating / sending / retrieving / converting Beacon queries in v2 format to the format of the respective Beacon instances. Please be aware that `bycon` uses `snake_cased` keys for its internal parameters; - _i.e._ a URL parameter `assemblyId` will be accessible as `byc["form_data"]["assembly_id"]` + _i.e._ a URL parameter `assemblyId` will be accessible as `BYC_PARS["assembly_id"]` to the internal methods. selected_beacons: