From d87cb522c790f5bf82ac0bacf85300694ba99672 Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:29:30 +0100 Subject: [PATCH 1/7] version 0.1.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bca7425..c5bfe0e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="hgnc_queries", - version="0.0.3", + version="0.1.0", author="Yujin Kim", author_email="yujin.kim@hotmail.fr", description="Make HGNC queries for gene symbols", From 39f0a48fbd4b827d3cdeb3c378b8f5ed4f9cc1ea Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:29:56 +0100 Subject: [PATCH 2/7] new imports --- hgnc_queries/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hgnc_queries/__init__.py b/hgnc_queries/__init__.py index dd06ab3..b342dc1 100644 --- a/hgnc_queries/__init__.py +++ b/hgnc_queries/__init__.py @@ -1,6 +1,10 @@ +from .api import get_api_response, URL +from .conversion import ( + convert_ensembl2refseq, convert_refseq2ensembl +) from .queries import ( get_new_symbol, get_gene_starting_with, get_alias, get_main_symbol, get_prev_symbol, get_id, - get_symbol_from_id, get_hgnc_symbol -) -from .api import get_api_response + get_symbol_from_id, get_hgnc_symbol, + get_ensembl, get_refseq +) \ No newline at end of file From e38fa66c3c1bcb8a194a87569fb9ef5a4d16d61a Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:30:18 +0100 Subject: [PATCH 3/7] convert refseq <-> ensembl --- hgnc_queries/conversion.py | 78 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 hgnc_queries/conversion.py diff --git a/hgnc_queries/conversion.py b/hgnc_queries/conversion.py new file mode 100644 index 0000000..71e5077 --- /dev/null +++ b/hgnc_queries/conversion.py @@ -0,0 +1,78 @@ +from .api import get_api_response, URL + + +def convert_refseq2ensembl(refseq: str, verbose: bool = True): + """ Convert refseq number to ensembl 
gene id + + Args: + refseq (str): Refseq accession number + verbose (bool, optional): Print output. Defaults to True. + + Returns: + None + ensembl_id (str) + """ + + refseq = refseq.strip().upper() + + if not refseq.startswith("NM"): + print("Refseq given: {} doesn't start with \"NM\"".format(refseq)) + return + + ext = "fetch/refseq_accession/{}".format(refseq) + data = get_api_response("{}/{}".format(URL, ext)) + res = data["response"]["docs"] + + if not res: + if verbose: + print("Refseq \"{}\" not found".format(refseq)) + + return + else: + ensembl_id = res[0]["ensembl_gene_id"] + + if verbose: + print("Refseq \"{}\" -> Ensembl \"{}\"".format(refseq, ensembl_id)) + + return ensembl_id + + +def convert_ensembl2refseq(ensembl_id: str, verbose: bool = True): + """ Convert Ensembl id to refseq number + + Args: + ensembl_id (str): Ensembl id + verbose (bool, optional): Prints the output. Defaults to True. + + Returns: + None + refseq (str) + """ + + ensembl_id = ensembl_id.strip().upper() + + if not ensembl_id.startswith("ENSG"): + print("Ensembl_id given: {} doesn't start with \"ENSG\"".format( + ensembl_id + )) + return + + ext = "fetch/ensembl_gene_id/{}".format(ensembl_id) + data = get_api_response("{}/{}".format(URL, ext)) + res = data["response"]["docs"] + + if not res: + if verbose: + print("Ensembl_id \"{}\" not found".format(ensembl_id)) + + return + else: + refseq = res[0]["refseq_accession"] + + if verbose: + print("Ensembl_id \"{}\" -> Refseq \"{}\"".format( + ensembl_id, + refseq + )) + + return refseq From 15170b04e7a7eb425e2b8aeb1bea9f094f96301b Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:30:57 +0100 Subject: [PATCH 4/7] added strips, removed main, added refseq and ensembl queries --- hgnc_queries/queries.py | 149 +++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 77 deletions(-) diff --git a/hgnc_queries/queries.py b/hgnc_queries/queries.py index 4d3bfb9..1a5f392 100644 --- a/hgnc_queries/queries.py +++ 
b/hgnc_queries/queries.py @@ -1,9 +1,3 @@ -""" hgnc_queries.py - -Fetch data from HGNC api -""" - -import argparse from .api import get_api_response, URL @@ -19,7 +13,7 @@ def get_new_symbol(gene_symbol: str, verbose: bool = True): - None """ - gene_symbol = gene_symbol.upper() + gene_symbol = gene_symbol.strip().upper() ext = "search/prev_symbol/{}".format(gene_symbol) data = get_api_response("{}/{}".format(URL, ext)) @@ -61,7 +55,7 @@ def get_gene_starting_with(gene_symbol: str, verbose: bool = True): - None """ - gene_symbol = gene_symbol.upper() + gene_symbol = gene_symbol.strip().upper() ext = "search/symbol/{}*".format(gene_symbol) data = get_api_response("{}/{}".format(URL, ext)) @@ -97,7 +91,7 @@ def get_alias(gene_symbol: str, verbose: bool = True): - None """ - gene_symbol = gene_symbol.upper() + gene_symbol = gene_symbol.strip().upper() ext = "fetch/symbol/{}".format(gene_symbol) data = get_api_response("{}/{}".format(URL, ext)) @@ -146,7 +140,7 @@ def get_main_symbol(gene_symbol: str, verbose: bool = True): - None """ - gene_symbol = gene_symbol.upper() + gene_symbol = gene_symbol.strip().upper() ext = "search/alias_symbol/{}".format(gene_symbol) data = get_api_response("{}/{}".format(URL, ext)) @@ -184,7 +178,7 @@ def get_prev_symbol(gene_symbol: str, verbose: bool = True): - None """ - gene_symbol = gene_symbol.upper() + gene_symbol = gene_symbol.strip().upper() ext = "fetch/symbol/{}".format(gene_symbol) data = get_api_response("{}/{}".format(URL, ext)) @@ -225,7 +219,7 @@ def get_id(gene_symbol: str, verbose: bool = True): - None """ - gene_symbol = gene_symbol.upper() + gene_symbol = gene_symbol.strip().upper() ext = "fetch/symbol/{}".format(gene_symbol) data = get_api_response("{}/{}".format(URL, ext)) @@ -267,6 +261,8 @@ def get_symbol_from_id(gene_id: str, verbose: bool = True): return + gene_id = gene_id.strip() + ext = "search/hgnc_id/{}".format(gene_id) data = get_api_response("{}/{}".format(URL, ext)) res = data["response"]["docs"] @@ -298,6 
+294,8 @@ def get_hgnc_symbol(gene_symbol: str): - None """ + gene_symbol = gene_symbol.strip() + new_symbol = get_new_symbol(gene_symbol, False) if new_symbol: @@ -311,68 +309,65 @@ def get_hgnc_symbol(gene_symbol: str): return -def main(): - pass - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Script to interface with the HGNC api" - ) - subparsers = parser.add_subparsers(help="Commands") - - new_symbol = subparsers.add_parser("new_symbol", help="Get the new symbol") - new_symbol.add_argument("gene_symbol", help="Gene symbol") - new_symbol.set_defaults(func=get_new_symbol) - - alias = subparsers.add_parser( - "alias", - help="Get the aliases of given symbol" - ) - alias.add_argument("gene_symbol", help="Gene symbol") - alias.set_defaults(func=get_alias) - - main_symbol = subparsers.add_parser( - "main_symbol", - help="Get the main symbol from alias" - ) - main_symbol.add_argument("gene_symbol", help="Gene symbol") - main_symbol.set_defaults(func=get_main_symbol) - - prev_symbol = subparsers.add_parser( - "prev_symbol", - help="Get the previous symbol" - ) - prev_symbol.add_argument("gene_symbol", help="Gene symbol") - prev_symbol.set_defaults(func=get_prev_symbol) - - gene_symbol = subparsers.add_parser( - "gene", - help="Get the gene symbols starting with" - ) - gene_symbol.add_argument("gene_symbol", help="Gene symbol") - gene_symbol.set_defaults(func=get_gene_starting_with) - - gene_id = subparsers.add_parser( - "id", - help="Get the ID from a gene symbol" - ) - gene_id.add_argument("gene_symbol", help="Gene symbol") - gene_id.set_defaults(func=get_id) - - id2symbol = subparsers.add_parser( - "id2symbol", - help="Get the gene symbol from the id" - ) - id2symbol.add_argument("gene_id", help="Gene ID") - id2symbol.set_defaults(func=get_symbol_from_id) - - args = parser.parse_args() - - if hasattr(args, "gene_symbol"): - gene_symbol = args.gene_symbol.upper() - args.func(gene_symbol) - - elif hasattr(args, "gene_id"): - gene_id 
= args.gene_id - args.func(gene_id) +def get_refseq(gene_symbol: str, verbose: bool = True): + """ Get refseq given a gene symbol + + Args: + gene_symbol (str): Gene symbol + verbose (bool, optional): Prints the output. Defaults to True. + + Returns: + None + refseq (str) + """ + + gene_symbol = gene_symbol.strip().upper() + + ext = "fetch/symbol/{}".format(gene_symbol) + data = get_api_response("{}/{}".format(URL, ext)) + res = data["response"]["docs"] + + if not res: + if verbose: + print("Gene \"{}\" not found".format(gene_symbol)) + + return + else: + refseq = res[0]["refseq_accession"] + + if verbose: + print("Refseq for \"{}\": {}".format(gene_symbol, refseq)) + + return refseq + + +def get_ensembl(gene_symbol: str, verbose: bool = True): + """ Get the ensembl id for given gene symbol + + Args: + gene_symbol (str): Gene symbol + verbose (bool, optional): Prints the output. Defaults to True. + + Returns: + None + ensembl_id (str) + """ + + gene_symbol = gene_symbol.strip().upper() + + ext = "fetch/symbol/{}".format(gene_symbol) + data = get_api_response("{}/{}".format(URL, ext)) + res = data["response"]["docs"] + + if not res: + if verbose: + print("Gene \"{}\" not found".format(gene_symbol)) + + return + else: + ensembl_id = res[0]["ensembl_gene_id"] + + if verbose: + print("Ensembl_id for \"{}\": {}".format(gene_symbol, ensembl_id)) + + return ensembl_id From 065f725e8f7709a4b3bcc3595c24330ed49b687f Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:31:14 +0100 Subject: [PATCH 5/7] added tests for new functions --- test/test_hgnc_queries.py | 125 +++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 23 deletions(-) diff --git a/test/test_hgnc_queries.py b/test/test_hgnc_queries.py index 568443b..7a37400 100644 --- a/test/test_hgnc_queries.py +++ b/test/test_hgnc_queries.py @@ -1,18 +1,25 @@ import pytest -import sys -from hgnc_queries import queries +from hgnc_queries.conversion import ( + convert_ensembl2refseq, 
convert_refseq2ensembl +) +from hgnc_queries.queries import ( + get_new_symbol, get_gene_starting_with, get_alias, + get_main_symbol, get_prev_symbol, get_id, + get_symbol_from_id, get_hgnc_symbol, + get_ensembl, get_refseq +) def test_get_new_symbol_no_res(): assert( - queries.get_new_symbol("BRCA1P", False) is None + get_new_symbol("BRCA1P", False) is None ) def test_get_new_symbol_one_res(): assert( - queries.get_new_symbol("RN5S49", False) == "RNA5SP49" + get_new_symbol("RN5S49", False) == "RNA5SP49" ) @@ -21,21 +28,21 @@ def test_get_new_symbol_one_res(): def test_get_gene_starting_with_no_res(): assert( - queries.get_gene_starting_with("AIFJAEIGJI", False) is None + get_gene_starting_with("AIFJAEIGJI", False) is None ) def test_get_gene_starting_with_single_res(): truth = ["BRCA1P1"] assert( - queries.get_gene_starting_with("BRCA1P", False) == truth + get_gene_starting_with("BRCA1P", False) == truth ) def test_get_gene_starting_with_multiple_res(): truth_list = ["BRCA1", "BRCA1P1", "BRCA2", "BRCA3"] assert( - queries.get_gene_starting_with("BRCA", False) == truth_list + get_gene_starting_with("BRCA", False) == truth_list ) @@ -44,21 +51,21 @@ def test_get_gene_starting_with_multiple_res(): def test_get_alias_no_alias(): assert( - queries.get_alias("CARD9", False) is None + get_alias("CARD9", False) is None ) def test_get_alias_one_alias(): truth = ["DA9"] assert( - queries.get_alias("FBN2", False) == truth + get_alias("FBN2", False) == truth ) def test_get_alias_multiple_aliases(): truth = ["KIAA1235", "ELD/OSA1", "p250R", "BAF250b", "DAN15", "6A3-5"] assert( - queries.get_alias("ARID1B", False) == truth + get_alias("ARID1B", False) == truth ) @@ -67,14 +74,14 @@ def test_get_alias_multiple_aliases(): def test_get_main_symbol_no_res(): assert( - queries.get_main_symbol("CARD9", False) is None + get_main_symbol("CARD9", False) is None ) def test_get_main_symbol_res(): truth = "BRAF" assert( - queries.get_main_symbol("BRAF1", False) == truth + 
get_main_symbol("BRAF1", False) == truth ) @@ -83,21 +90,21 @@ def test_get_main_symbol_res(): def test_get_prev_symbol_no_res(): assert( - queries.get_prev_symbol("No exist", False) is None + get_prev_symbol("No exist", False) is None ) def test_get_prev_symbol_one_res(): truth = ["CCA"] assert( - queries.get_prev_symbol("FBN2", False) == truth + get_prev_symbol("FBN2", False) == truth ) def test_get_prev_symbol_multiple_res(): truth = ["SEDC", "AOM"] assert( - queries.get_prev_symbol("COL2A1", False) == truth + get_prev_symbol("COL2A1", False) == truth ) @@ -106,14 +113,14 @@ def test_get_prev_symbol_multiple_res(): def test_get_id_no_res(): assert( - queries.get_id("No exist", False) is None + get_id("No exist", False) is None ) def test_get_id_one_res(): truth = "HGNC:1097" assert( - queries.get_id("BRAF", False) == truth + get_id("BRAF", False) == truth ) @@ -122,20 +129,20 @@ def test_get_id_one_res(): def test_get_symbol_from_id_not_ID(): assert( - queries.get_symbol_from_id("No exist", False) is None + get_symbol_from_id("No exist", False) is None ) def test_get_symbol_from_id_no_res(): assert( - queries.get_symbol_from_id("54974894564156", False) is None + get_symbol_from_id("54974894564156", False) is None ) def test_get_symbol_from_id_one_res(): truth = "BRAF" assert( - queries.get_symbol_from_id("1097", False) == truth + get_symbol_from_id("1097", False) == truth ) @@ -144,17 +151,89 @@ def test_get_symbol_from_id_one_res(): def test_get_hgnc_symbol_none(): assert( - queries.get_hgnc_symbol("BRCA") is None + get_hgnc_symbol("BRCA") is None ) def test_get_hgnc_symbol_use_new(): assert( - queries.get_hgnc_symbol("RN5S49") == "RNA5SP49" + get_hgnc_symbol("RN5S49") == "RNA5SP49" ) def test_get_hgnc_symbol_use_main(): assert( - queries.get_hgnc_symbol("BRAF1") == "BRAF" + get_hgnc_symbol("BRAF1") == "BRAF" + ) + + +########################################################################### + + +def test_convert_refseq2ensembl_wrong_id(): + assert( + 
convert_refseq2ensembl("Not_ENSG") is None + ) + + +def test_convert_refseq2ensembl_no_res(): + assert( + convert_refseq2ensembl("ENSG09184875872") is None + ) + + +def test_convert_refseq2ensembl_res(): + assert( + convert_refseq2ensembl("NM_007294", False) == "ENSG00000012048" + ) + + +########################################################################### + + +def test_convert_ensembl2refseq_wrong_id(): + assert( + convert_ensembl2refseq("Not_NM") is None + ) + + +def test_convert_ensembl2refseq_no_res(): + assert( + convert_ensembl2refseq("NM_9189483857") is None + ) + + +def test_convert_ensembl2refseq_res(): + assert( + convert_ensembl2refseq("ENSG00000012048") == ['NM_007294'] + ) + + +########################################################################### + + +def test_get_refseq_wrong_id(): + assert( + get_refseq("thingy") is None + ) + + +def test_get_refseq_res(): + assert( + get_refseq("BRCA1") == ['NM_007294'] + ) + + +########################################################################### + + +def test_get_ensembl_wrong_id(): + assert( + get_ensembl("thingy") is None + ) + + +def test_get_ensembl_res(): + assert( + get_ensembl("BRCA1") == "ENSG00000012048" ) From 18b6691a908917dd9c64a297232cda40e3ef3f9f Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:40:44 +0100 Subject: [PATCH 6/7] 0.1.0 update --- README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a34208c..2c162d9 100644 --- a/README.md +++ b/README.md @@ -32,13 +32,23 @@ BRAF >> hgnc_queries.get_id("BRAF", verbose = False) HGNC:1097 ->>hgnc_queries.get_symbol_from_id("1097", verbose = False) +>> hgnc_queries.get_symbol_from_id("1097", verbose = False) BRAF # new function more for in-script use # get the hgnc symbol if gene symbol not recognized by the process you're using ->>print(queries.get_hgnc_symbol("RN5S49")) +>> print(hgnc_queries.get_hgnc_symbol("RN5S49")) RNA5SP49 
->>print(queries.get_hgnc_symbol("BRAF1")) +>> print(hgnc_queries.get_hgnc_symbol("BRAF1")) BRAF + +>> hgnc_queries.get_refseq("BRCA1", False) +['NM_007294'] +>> hgnc_queries.get_ensembl("BRCA1", False) +"ENSG00000012048" + +>> hgnc_queries.convert_refseq2ensembl("NM_007294", False) +"ENSG00000012048" +>> hgnc_queries.convert_ensembl2refseq("ENSG00000012048", False) +['NM_007294'] ``` \ No newline at end of file From 37fd891683001e0176d682339a226eb6153718d3 Mon Sep 17 00:00:00 2001 From: Yu-jinKim Date: Thu, 2 Jul 2020 15:40:53 +0100 Subject: [PATCH 7/7] order imports --- hgnc_queries/api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hgnc_queries/api.py b/hgnc_queries/api.py index a61e6aa..8de0d79 100644 --- a/hgnc_queries/api.py +++ b/hgnc_queries/api.py @@ -1,6 +1,7 @@ -import requests -import sys import json +import sys + +import requests URL = "http://rest.genenames.org"