Commit 4e52ff0: 1.5.0
mbaudis committed Feb 20, 2024
1 parent 46c5fde
Showing 60 changed files with 6,010 additions and 100,266 deletions.
6 changes: 2 additions & 4 deletions bin/ISCNsegmenter.py

@@ -26,14 +26,12 @@
 ################################################################################
 
 def main():
-
     iscn_segmenter()
 
 ################################################################################
 
 def iscn_segmenter():
-
-    initialize_bycon_service(byc)
+    initialize_bycon_service(byc, "iscn_segmenter")
     run_beacon_init_stack(byc)
     generate_genome_bins(byc)
 
@@ -59,7 +57,7 @@ def iscn_segmenter():
     else:
         output_file = path.splitext(output_file)[0]
 
-    if byc["test_mode"] is True:
+    if BYC["TEST_MODE"] is True:
        output_file += "_test"
 
    output_file += ".pgxseg"
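The recurring change of this release is visible in the two hunks above: each script now passes its service name to `initialize_bycon_service`, and the per-request `byc["test_mode"]` flag has moved to a module-level `BYC["TEST_MODE"]` constant. A minimal, self-contained sketch of that calling pattern; the `BYC` dict content and the stub function body are stand-ins, not the actual bycon internals:

BYC = {
    "TEST_MODE": False,   # module-level constant, formerly byc["test_mode"]
}

def initialize_bycon_service(byc, service_name=None):
    # hypothetical stand-in; the real initializer loads service-specific defaults
    byc.update({"service_name": service_name or "generic"})

byc = {}
initialize_bycon_service(byc, "iscn_segmenter")

output_file = "segments"
if BYC["TEST_MODE"] is True:
    output_file += "_test"   # test runs write to a separate file, as in the diff
output_file += ".pgxseg"
print(output_file)           # segments.pgxseg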
36 changes: 17 additions & 19 deletions bin/analysesStatusmapsRefresher.py

@@ -9,9 +9,8 @@
 
 services_lib_path = path.join( path.dirname( path.abspath(__file__) ), pardir, "services", "lib" )
 sys.path.append( services_lib_path )
 from bycon_bundler import ByconBundler
-from bycon_plot import *
 from interval_utils import generate_genome_bins, interval_cnv_arrays
 from collation_utils import set_collation_types
 
 """
@@ -28,39 +27,38 @@
 ################################################################################
 
 def main():
-
     callsets_refresher()
 
 ################################################################################
 
 def callsets_refresher():
-
-    initialize_bycon_service(byc)
+    initialize_bycon_service(byc, "callsets_refresher")
+    run_beacon_init_stack(byc)
+    generate_genome_bins(byc)
 
     if len(byc["dataset_ids"]) > 1:
         print("Please give only one dataset using -d")
         exit()
 
     ds_id = byc["dataset_ids"][0]
-    print(f'=> Using data values from {ds_id}')
 
-    generate_genome_bins(byc)
     set_collation_types(byc)
+    print(f'=> Using data values from {ds_id} for {byc.get("genomic_interval_count", 0)} intervals...')
 
-    data_client = MongoClient(host=byc["mongohost"])
+    form = byc.get("form_data", {})
+    data_client = MongoClient(host=DB_MONGOHOST)
     data_db = data_client[ ds_id ]
     cs_coll = data_db[ "analyses" ]
     v_coll = data_db[ "variants" ]
 
     record_queries = ByconQuery(byc).recordsQuery()
-
-    execute_bycon_queries( ds_id, record_queries, byc )
-
-    ds_results = byc["dataset_results"][ds_id]
+    res = execute_bycon_queries( ds_id, record_queries, byc )
+    ds_results = res.get(ds_id, {})
+    has_analyses = ds_results.get("analyses._id")
 
     no_cnv_type = 0
 
-    if not "analyses._id" in ds_results.keys():
+    if not has_analyses:
         cs_ids = []
         for cs in cs_coll.find( {} ):
             cs_ids.append(cs["_id"])
@@ -87,7 +85,7 @@ def callsets_refresher():
 
         bar.next()
 
-        if not "CNV" in cs.get("variant_class", "CNV"):
+        if "SNV" in cs.get("variant_class", "CNV"):
             no_cnv_type += 1
             continue
 
@@ -102,13 +100,13 @@ def callsets_refresher():
         cs_update_obj.update({"cnv_statusmaps": maps})
         cs_update_obj.update({"cnv_stats": cs_cnv_stats})
         cs_update_obj.update({"cnv_chro_stats": cs_chro_stats})
-        cs_update_obj.update({ "updated": datetime.now().isoformat() })
+        cs_update_obj.update({ "updated": datetime.datetime.now().isoformat() })
 
-        if not byc["test_mode"]:
+        if BYC.get("TEST_MODE", False) is True:
+            prjsonnice(cs_chro_stats)
+        else:
             cs_coll.update_one( { "_id": _id }, { '$set': cs_update_obj } )
             updated += 1
-        else:
-            prjsonnice(cs_chro_stats)
 
         ####################################################################
         ####################################################################
@@ -118,7 +116,7 @@ def callsets_refresher():
 
     print(f"{counter} analyses were processed")
    print(f"{no_cnv_type} analyses were not from CNV calling")
-    print(f'{updated} analyses were updated for\n `cnv_statusmaps`\n `cnv_stats`\n `cnv_chro_stats`\nusing {byc["genomic_interval_count"]} bins ({byc["interval_definitions"].get("genome_binning", "")})')
+    print(f'{updated} analyses were updated for\n `cnv_statusmaps`\n `cnv_stats`\n `cnv_chro_stats`\nusing {byc["genomic_interval_count"]} bins ({form.get("genome_binning", "")})')
 
 ################################################################################
 ################################################################################
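Besides the same initialization change, the refresher now captures the return value of `execute_bycon_queries` instead of reading results back out of the shared `byc` object. A sketch of just that calling pattern, with a mocked query function; the return shape `{dataset_id: {result_key: ...}}` is inferred from the diff, not confirmed from the library:

def execute_bycon_queries(ds_id, record_queries, byc):
    # mock stand-in; only the return shape mirrors the hunk above
    return {ds_id: {"analyses._id": ["a1", "a2"]}}

byc = {}
ds_id = "progenetix"
res = execute_bycon_queries(ds_id, {}, byc)      # new: use the returned results
ds_results = res.get(ds_id, {})                  # safe per-dataset lookup
has_analyses = ds_results.get("analyses._id")    # falsy when nothing matched

if not has_analyses:
    print("no matched analyses - the script then falls back to the full collection")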
53 changes: 16 additions & 37 deletions bin/collationsCreator.py

@@ -30,34 +30,30 @@ def main():
 ################################################################################
 
 def collations_creator():
-
-    initialize_bycon_service(byc)
-    select_dataset_ids(byc)
+    initialize_bycon_service(byc, "collations_creator")
+    run_beacon_init_stack(byc)
 
     if len(byc["dataset_ids"]) > 1:
         print("Please give only one dataset using -d")
         exit()
 
     ds_id = byc["dataset_ids"][0]
-
-    print( "Creating collations for " + ds_id)
+    print(f'Creating collations for {ds_id}')
 
     set_collation_types(byc)
 
     for coll_type, coll_defs in byc["filter_definitions"].items():
-
         collationed = coll_defs.get("collationed")
         if not collationed:
             continue
 
         pre = coll_defs["namespace_prefix"]
         pre_h_f = path.join( pkg_path, "rsrc", "classificationTrees", coll_type, "numbered_hierarchies.tsv" )
         collection = coll_defs["scope"]
         db_key = coll_defs["db_key"]
-
-        if "PMID" in coll_type:
+        if "pubmed" in coll_type:
             hier = _make_dummy_publication_hierarchy(byc)
-        elif path.exists( pre_h_f ):
+        elif path.exists( pre_h_f ):
             print( "Creating hierarchy for " + coll_type)
             hier = get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc)
         else:
@@ -87,39 +83,27 @@ def collations_creator():
             onto_keys.update(child_ids)
-
         sel_hiers = [ ]
-
         no = len(hier.keys())
         matched = 0
-
-        if not byc["test_mode"]:
-            bar = Bar("Writing "+pre, max = no, suffix='%(percent)d%%'+" of "+str(no) )
-
+        if not BYC["TEST_MODE"]:
+            bar = Bar("Writing "+pre, max = no, suffix='%(percent)d%%'+" of "+str(no) )
         for count, code in enumerate(hier.keys(), start=1):
-
-            if not byc["test_mode"]:
+            if not BYC["TEST_MODE"]:
                 bar.next()
-
             children = list( set( hier[ code ][ "child_terms" ] ) & onto_keys )
-
             hier[ code ].update( { "child_terms": children } )
-
             if len( children ) < 1:
-                if byc["test_mode"]:
+                if BYC["TEST_MODE"]:
                     print(code+" w/o children")
                 continue
-
             code_no = data_coll.count_documents( { db_key: code } )
-
             if code_no < 1:
                 code_no = 0
-
             if len( children ) < 2:
                 child_no = code_no
             else:
                 child_no = data_coll.count_documents( { db_key: { "$in": children } } )
 
             if child_no > 0:
-
                 # sub_id = re.sub(pre, coll_type, code)
                 sub_id = code
                 update_obj = hier[ code ].copy()
@@ -138,23 +122,19 @@ def collations_creator():
                     "updated": datetime.datetime.now().isoformat(),
                     "db_key": db_key
                 })
 
                 if "reference" in coll_defs:
                     url = coll_defs["reference"].get("root", "https://progenetix.org/services/ids/")
                     r = coll_defs["reference"].get("replace", ["___nothing___", ""])
                     ref = url+re.sub(r[0], r[1], code)
                     update_obj.update({"reference": ref })
-
                 matched += 1
-
-                if not byc["test_mode"]:
+                if not BYC["TEST_MODE"]:
                     sel_hiers.append( update_obj )
                 else:
                     print("{}:\t{} ({} deep) samples - {} / {} {}".format(sub_id, code_no, child_no, count, no, pre))
-
-
         # UPDATE
-        if not byc["test_mode"]:
+        if not BYC["TEST_MODE"]:
             bar.finish()
             print("==> Updating database ...")
             if matched > 0:
@@ -169,18 +149,15 @@ def get_prefix_hierarchy( ds_id, coll_type, pre_h_f, byc):
 
     coll_defs = byc["filter_definitions"][coll_type]
     hier = hierarchy_from_file(ds_id, coll_type, pre_h_f, byc)
-
     no = len(hier.keys())
 
     # now adding terms missing from the tree ###################################
-
     print("Looking for missing {} codes in {}.{} ...".format(coll_type, ds_id, coll_defs["scope"]))
     data_client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
     data_db = data_client[ ds_id ]
     data_coll = data_db[coll_defs["scope"]]
 
-    db_key = coll_defs.get("db_key", "")
-
+    db_key = coll_defs.get("db_key", "")
     onto_ids = _get_ids_for_prefix( data_coll, coll_defs )
 
     added_no = 0
@@ -273,7 +250,7 @@
 
 def _make_dummy_publication_hierarchy(byc):
 
-    coll_type = "PMID"
+    coll_type = "pubmed"
     coll_defs = byc["filter_definitions"][coll_type]
     data_db = "progenetix"
     data_coll = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))[ data_db ][ "publications" ]
@@ -311,7 +288,7 @@ def _make_dummy_publication_hierarchy(byc):
 
 def _get_dummy_hierarchy(ds_id, coll_type, coll_defs, byc):
 
-    data_client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
+    data_client = MongoClient(host=DB_MONGOHOST)
     data_db = data_client[ ds_id ]
     data_coll = data_db[ coll_defs["scope"] ]
     data_pat = coll_defs["pattern"]
@@ -369,8 +346,10 @@ def _get_ids_for_prefix(data_coll, coll_defs):
     db_key = coll_defs["db_key"]
     pre_re = re.compile( coll_defs["pattern"] )
 
+    prdbug(f'_get_ids_for_prefix ... : "{db_key}"" - pattern {pre_re}')
     pre_ids = data_coll.distinct( db_key, { db_key: { "$regex": pre_re } } )
     pre_ids = list(filter(lambda d: pre_re.match(d), pre_ids))
+    prdbug(f'_get_ids_for_prefix ... : found {len(pre_ids)}')
 
     return pre_ids
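One small algorithm worth pulling out of the collationsCreator hunks: the `reference` URL for a collation is built from a `root` URL plus the code, run through an optional regex replacement. A runnable sketch with illustrative `coll_defs` content; the keys and defaults match the diff, while the example code value is made up:

import re

coll_defs = {
    "reference": {
        "root": "https://progenetix.org/services/ids/",
        "replace": ["___nothing___", ""],   # default pattern matches nothing
    }
}

code = "NCIT:C3262"   # illustrative collation code
url = coll_defs["reference"].get("root", "https://progenetix.org/services/ids/")
r = coll_defs["reference"].get("replace", ["___nothing___", ""])
ref = url + re.sub(r[0], r[1], code)   # a no-op substitution with the default
print(ref)   # https://progenetix.org/services/ids/NCIT:C3262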
3 changes: 1 addition & 2 deletions bin/collationsPlotter.py

@@ -27,8 +27,7 @@ def main():
 ################################################################################
 
 def collations_plotter():
-
-    initialize_bycon_service(byc)
+    initialize_bycon_service(byc, "collations_plotter")
     run_beacon_init_stack(byc)
     generate_genome_bins(byc)
 
1 change: 0 additions & 1 deletion bin/config/iscn_segmenter.yaml

@@ -1,6 +1,5 @@
 ---
 defaults:
-  genome_binning: 1Mb
   import_files_root:
     - imports
 
3 changes: 1 addition & 2 deletions bin/databaseArchiver.py

@@ -24,8 +24,7 @@ def main():
 ################################################################################
 
 def database_archiver():
-
-    initialize_bycon_service(byc)
+    initialize_bycon_service(byc, "database_archiver")
     run_beacon_init_stack(byc)
 
     if len(byc["dataset_ids"]) != 1:
1 change: 0 additions & 1 deletion bin/examplezSampler.py

@@ -6,7 +6,6 @@
 biosample_id_list=[]
 client = MongoClient(host=environ.get("BYCON_MONGO_HOST", "localhost"))
 
-
 progenetix_biosample_id_list=[]
 progenetix_individual_id_list=[]
 progenetix_variant_id_list=[]
2 changes: 0 additions & 2 deletions bin/examplezUpdater.py

@@ -24,7 +24,6 @@ def main():
 ################################################################################
 
 def examplez_updater():
-
     # Note: This doesn't use the standard `bycon` dataset id argument input since
     # you may want to create a new database not in the configuration list ...
     parser = argparse.ArgumentParser()
@@ -43,7 +42,6 @@ def examplez_updater():
     }
 
     # collecting the actions
-
     print('Database to create/update:', e_ds_id)
     if e_ds_id in db_names:
         ddb_resp = input(f'Drop existing {e_ds_id} database first?\n(Y|n): ')
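As the retained comment notes, this updater bypasses the standard `bycon` dataset-id handling so that new, unlisted databases can be created. A condensed sketch of its confirm-before-drop flow; the `--database` flag name, the fallback value, and the `db_names` list are guesses, while the prompt text comes from the diff:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--database", help="database to create/update")
args = parser.parse_args()

e_ds_id = args.database or "examplez"        # fallback name is illustrative
db_names = ["progenetix", "examplez"]        # would come from the MongoDB client

print('Database to create/update:', e_ds_id)
if e_ds_id in db_names:
    ddb_resp = input(f'Drop existing {e_ds_id} database first?\n(Y|n): ')
    if ddb_resp.lower().startswith("y"):
        print(f"... {e_ds_id} would be dropped and recreated here")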
(Diff truncated here; the remaining changed files are not shown.)