-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgettfidf.py
99 lines (73 loc) · 2.45 KB
/
gettfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import sys
import pandas as pd
import gzip
import json
from pymongo import MongoClient
from jsontodb import __get_database
import click
# Click context settings for the command below: the help text is reachable
# through "-h" as well as "--help".
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@click.command(no_args_is_help=True, context_settings=CONTEXT_SETTINGS)
@click.option(
    "-f", "--foreground",
    help="Foreground genes file.",
    # Let click open the file: get_identifiers() reads lines from a file
    # handle, so passing the raw path string through would not work.
    type=click.File("r"),
    required=True,
)
@click.option(
    "-b", "--background",
    help="Background genes file.",
    type=click.File("r"),
    required=True,
)
@click.option(
    "--input-type",
    help="Input type.",
    type=click.Choice(["entrezid", "symbol"], case_sensitive=False),
    default="entrezid",
    show_default=True,
)
def main(**params):
    """Map foreground and background gene identifiers to PubMed IDs."""
    # NOTE: click uses the docstring above as the command's --help text.

    # TODO: Connect to database (the next 3 lines are placeholders)
    db = __get_database('pubmeddb')
    pm_collection = db.pubmed_id
    gene_collection = db.gene_id

    input_type = params["input_type"]

    # Identifiers
    foreground = get_identifiers(params["foreground"], input_type)
    background = get_identifiers(params["background"], input_type)

    # Identifiers to PMIDS
    fore_entrezids, fore_symbols, fore_pmids = identifier2pmids(
        foreground, gene_collection, input_type
    )
    back_entrezids, back_symbols, back_pmids = identifier2pmids(
        background, gene_collection, input_type
    )

    # TODO: From pmids, collect words from the pm_collection and then
    # calculate tfidfs from foreground and background (maybe another function?)
def get_identifiers(input_file, input_type):
    """Read one gene identifier per line from a file.

    Args:
        input_file: Path to a text file (``str``, ``bytes`` or
            ``os.PathLike``), an already opened text handle, or ``None``.
            A handle passed in is closed once it has been read.
        input_type: ``"entrezid"`` to convert every identifier to ``int``;
            any other value leaves the identifiers as strings.

    Returns:
        List of identifiers in file order. Blank lines are skipped and
        surrounding whitespace is removed. Empty list if ``input_file`` is
        ``None``.

    Raises:
        OSError: If a path is given and the file cannot be opened.
        ValueError: If ``input_type`` is ``"entrezid"`` and a line is not an
            integer.
    """
    if input_file is None:
        return []

    # Accept a path as well as a handle, so callers holding only a file name
    # (e.g. a plain string CLI option) do not have to open the file first.
    if isinstance(input_file, (str, bytes, os.PathLike)):
        handle = open(input_file, "r")
    else:
        handle = input_file

    try:
        stripped = [line.strip() for line in handle]
    finally:
        # Release the handle even when reading fails part-way through.
        handle.close()

    # Drop blank lines (typically a trailing newline at the end of the file);
    # they would otherwise break the int() conversion below.
    identifiers = [entry for entry in stripped if entry]

    if input_type == "entrezid":
        identifiers = [int(entry) for entry in identifiers]

    return identifiers
def identifier2pmids(idlist, collection, input_type):
    """Look up each identifier in the gene collection.

    Args:
        idlist: Iterable of gene identifiers to look up.
        collection: Object whose ``find(query)`` result can be indexed with
            ``[0]`` to get a document exposing the ``GeneID``, ``Symbol`` and
            ``PubMed_ID`` keys (presumably a pymongo collection - confirm
            against the caller).
        input_type: ``'entrezid'`` to match identifiers against ``GeneID``;
            any other value matches against ``Symbol``.

    Returns:
        Tuple ``(entrezid_list, symbol_list, pmids_list)`` of three parallel
        lists, one entry per identifier, in the order of ``idlist``. Each
        entry of ``pmids_list`` is the matched document's ``PubMed_ID`` value.

    Raises:
        IndexError: If no document matches an identifier. The type is the one
            the bare ``[0]`` lookup raises; the message names the identifier.
        KeyError: If a matched document lacks one of the expected keys.
    """
    # Only the queried field differs between the two input types.
    lookup_field = "GeneID" if input_type == 'entrezid' else "Symbol"

    entrezid_list = []
    symbol_list = []
    pmids_list = []

    for identifier in idlist:
        try:
            # Only the first matching document is used.
            document = collection.find({lookup_field: identifier})[0]
        except IndexError:
            raise IndexError(
                "No gene document found with {}={!r}".format(
                    lookup_field, identifier
                )
            ) from None

        entrezid_list.append(document['GeneID'])
        symbol_list.append(document['Symbol'])
        pmids_list.append(document['PubMed_ID'])

    return (entrezid_list, symbol_list, pmids_list)
# Invoke main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()