Merge pull request #227 from FlyBase/gp

geneproduct data for testing
FlyBase · Sep 13, 2023 · 5ae75b3 · 5ae75b3
2 parents bc3f7db + 96b272b
commit 5ae75b3
Show file tree

Hide file tree

Showing 4 changed files with 105 additions and 3 deletions.
diff --git a/Load/geneproduct.py b/Load/geneproduct.py
@@ -0,0 +1,98 @@
+r"""
+:synopsis: Create genes and alleles etc needed for testing of geneproducts.
+
+:moduleauthor: Ian Longden <[email protected]>
+
+
+Create 30 genes for testing geneproducts.
+
+The first ten genes are linked only to alleles:
+fb_test=# select f.name, f.uniquename, cvt.name from feature f, cvterm cvt where f.type_id = cvt.cvterm_id and f.name like 'gpt5%';
+    name    | uniquename  |  name
+------------+-------------+--------
+ gpt5       | FBgn0000126 | gene
+ gpt5[Clk1] | FBal0000165 | allele
+ gpt5[Clk2] | FBal0000166 | allele
+
+The second ten genes are also linked to mRNA:
+fb_test=# select f.name, f.uniquename, cvt.name from feature f, cvterm cvt where f.type_id = cvt.cvterm_id and f.name like 'gpt15%';
+    name     | uniquename  |  name
+-------------+-------------+--------
+ gpt15       | FBgn0000136 | gene
+ gpt15-RA    | FBtr0000106 | mRNA
+ gpt15-RB    | FBtr0000107 | mRNA
+ gpt15[Clk1] | FBal0000185 | allele
+ gpt15[Clk2] | FBal0000186 | allele
+
+The last ten genes are linked to mRNA and polypeptides:
+fb_test=# select f.name, f.uniquename, cvt.name from feature f, cvterm cvt where f.type_id = cvt.cvterm_id and f.name like 'gpt25%';
+    name     | uniquename  |    name
+-------------+-------------+-------------
+ gpt25       | FBgn0000146 | gene
+ gpt25-RA    | FBtr0000126 | mRNA
+ gpt25-RB    | FBtr0000127 | mRNA
+ gpt25[Clk1] | FBal0000205 | allele
+ gpt25[Clk2] | FBal0000206 | allele
+ gpt25-PB    | FBpp0000027 | polypeptide
+ gpt25-PA    | FBpp0000026 | polypeptide
+"""
+from .gene_alleles import create_gene_alleles
+
+# Parameterised SQL statements used by create_geneproducts(); the %s placeholders
+# are filled in by cursor.execute().
+
+# Insert one feature row; yields the new feature_id via RETURNING.
+feat_sql = """ INSERT INTO feature (dbxref_id, organism_id, name, uniquename, residues, seqlen, type_id)
+               VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING feature_id"""
+# Link an existing synonym to a feature for a given pub (no RETURNING clause).
+fs_sql = """ INSERT INTO feature_synonym (synonym_id, feature_id,  pub_id, is_current) VALUES (%s, %s, %s, %s) """
+# Insert one synonym row; yields the new synonym_id via RETURNING.
+syn_sql = """ INSERT INTO synonym (name, type_id, synonym_sgml) VALUES (%s, %s, %s) RETURNING synonym_id """
+# Insert one subject/object/type relationship between two features; yields feature_relationship_id.
+feat_rel_sql = """ INSERT INTO feature_relationship (subject_id, object_id,  type_id)
+                   VALUES (%s, %s, %s) RETURNING feature_relationship_id """
+
+
+def create_geneproducts(cursor, organism_id, feature_id, cvterm_id, dbxref_id, db_id, pub_id):
+
+    create_gene_alleles(
+        cursor, organism_id, feature_id, cvterm_id, db_id, pub_id,
+        num_genes=30,
+        num_alleles=2,
+        gene_prefix='gpt',
+        allele_prefix=None,
+        tool_prefix='Clk'
+    )
+
+    # first 10 genes have alleles only.
+    for gene_count in range(10, 30):
+        # 10 ->29 have transcripts (mRNA)
+        # gptx-Ry x=gene_count y='A', 'B';
+        gene_name = f"gpt{gene_count}"
+        for postfix in ['A', 'B']:
+            tr_name = f"{gene_name}-R{postfix}"
+            cursor.execute(feat_sql, (None, organism_id['Dmel'], tr_name,
+                                      'FBtr:temp_0', None, None, cvterm_id['mRNA']))
+            feature_id[tr_name] = mrna_id = cursor.fetchone()[0]
+
+            # add synonyms
+            cursor.execute(syn_sql, (tr_name, cvterm_id['symbol'], tr_name))
+            symbol_id = cursor.fetchone()[0]
+
+            # add feature_synonym
+            cursor.execute(fs_sql, (symbol_id, mrna_id, pub_id, True))
+
+            # add relationship to gene
+            cursor.execute(feat_rel_sql, (feature_id[tr_name], feature_id[gene_name], cvterm_id['partof']))
+
+        # 20 -> 29 have polypeptides too.
+        if gene_count < 20:
+            continue
+        for postfix in ['A', 'B']:
+            pp_name = f"{gene_name}-P{postfix}"
+            cursor.execute(feat_sql, (None, organism_id['Dmel'], pp_name,
+                                      'FBpp:temp_0', None, None, cvterm_id['polypeptide']))
+            feature_id[pp_name] = cursor.fetchone()[0]
+
+            # add synonyms
+            cursor.execute(syn_sql, (pp_name, cvterm_id['symbol'], pp_name))
+            symbol_id = cursor.fetchone()[0]
+
+            # add feature_synonym
+            cursor.execute(fs_sql, (symbol_id, mrna_id, pub_id, True))
+
+            # add relationship to tr
+            cursor.execute(feat_rel_sql, (feature_id[tr_name], feature_id[pp_name], cvterm_id['producedby']))
diff --git a/add-test_data.py b/add-test_data.py
@@ -22,6 +22,7 @@
 from Load.cell_line import add_cell_line_data
 from Load.aberration import add_aberration_data
 from Load.drivers import add_driver_data
+from Load.geneproduct import create_geneproducts
 
 conn = psycopg2.connect(database="fb_test")
 cursor = conn.cursor()
@@ -558,6 +559,9 @@ def load_pub_author_pubprop(parsed_yaml):
     fr_id = cursor.fetchone()[0]
     cursor.execute(frp_sql, (fr_id, pub_id))
 
+# gene product data
+create_geneproducts(cursor, organism_id, feature_id, cvterm_id, dbxref_id, db_id, pub_id)
+
 conn.commit()
 conn.close()
 print("SUCCESS")
diff --git a/data/cv_cvterm.yaml b/data/cv_cvterm.yaml
@@ -2,13 +2,13 @@
 
 # order dependent cv/cvterms. i.e. accession are specific and numbered.
 #######################################################################
-# Order is important only add to end of SO list. Tests relie on this!!
+# Order is important only add to end of SO list. Tests rely on this!!
 SO: ['chromosome_arm', 'chromosome', 'gene', 'mRNA', 'DNA', 'golden_path', 'ncRNA_gene',
      'regulatory_region', 'chromosome_structure_variation', 'chromosomal_inversion',
      'natural population', 'cloned_region', 'engineered_region', 'transgenic_transposable_element',
      'transposable_element_insertion_site', 'chromosome_band', 'allele', 'transposable_element',
      'natural_transposable_element', 'gene_group', 'polypeptide', 'chromosome_breakpoint', 'engineered_plasmid', 'sgRNA',
-     'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence']
+     'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence', 'RNA']
 molecular_function: ['mRNA binding']
 cellular_component: ['nucleolus', 'something' ,'extracellular space', 'endoplasmic reticulum']
 biological_process: ['activation of immune response', 'defense response to other organism', 'rRNA processing']

diff --git a/data/db_dbxref.yaml b/data/db_dbxref.yaml
@@ -1,7 +1,7 @@
 testdb: ['hh-1']
 testdb2: []
 EMBL-EBI Single Cell Expression Atlas Datasets:  []
-FBbt: []
+FBbt: ['dissociated larval fat cell', 'CP1 lineage neuron']
 FBcv: []
 FBdv: []
 GB: ['GB1', 'GB2']