Adding GO annotation column.

And several bug fixes.
ajmaurais · Apr 5, 2019 · a3ec7de · a3ec7de
1 parent bdfa50e
commit a3ec7de
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
 # locScraper
-Given a .tsv file with a column containing Uniprot protein IDs, scrape annotations for subcellular location from uniprot.org
+Given a .tsv file with a column containing Uniprot protein IDs, scrape annotations for subcellular location from Uniprot.org
 
 ## Installation
-The simplest way to install `locScraper` is to download one of the precompiled binaries under the [releases](https://github.com/ajmaurais/locScraper/releases) tab. Binaries are available for OSX and CentOS. 
+The simplest way to install `locScraper` is to download one of the precompiled binaries under the [releases](https://github.com/ajmaurais/locScraper) tab. Binaries are available for OSX and CentOS. 
 
 You can also clone this repository with the command.
 ```
@@ -17,11 +17,15 @@ lxml
 
 ## Usage
 ```
-usage: locScraper [-h] [-i IDCOL] [-l LOCCOL] [--nThread NTHREAD]
+usage: locScraper [-h] [-i IDCOL] [--columns {sl,go,all}] [--locCol LOCCOL]
+                  [--goCol GOCOL] [--allCol ALLCOL] [--nThread NTHREAD]
                   [-o OFNAME] [--inPlace]
                   input_file [input_file ...]
 
-Get subcellular location annotations for a list of uniprot protein IDs.
+Get subcellular location annotations for a list of Uniprot protein IDs. A
+column in input_file should contain Uniprot IDs. After locScraper runs,
+columns will be added for Uniprot location annotations and GO cellular
+component annotations.
 
 positional arguments:
   input_file            .tsv or .csv files to process.
@@ -32,10 +36,22 @@ optional arguments:
   -i IDCOL, --idCol IDCOL
                         Name of column containing Uniprot IDs.
 
-  -l LOCCOL, --locCol LOCCOL
-                        Name of new column to add with subcellular location.
+  --columns {sl,go,all}
+                        Which new columns should be added?
+                        sl : Uniprot annotation for subcellular location
+                        go : GO annotation for cellular component
+                        all : both sl and go
+                        Default is all.
 
-  --nThread NTHREAD     Number of threads to use to lookup uniprot
+  --locCol LOCCOL       Name of new column to add with subcellular location.
+
+  --goCol GOCOL         Name of new column to add with GO cellular component
+                        annotation.
+
+  --allCol ALLCOL       Name of new column to add with GO and Uniprot
+                        annotations combined.
+
+  --nThread NTHREAD     Number of threads to use to lookup Uniprot
                         annotations. Default is the number of logical cores on
                         your system.
 

diff --git a/protein_loc_scraper/dataframe/dataframe.py b/protein_loc_scraper/dataframe/dataframe.py
@@ -3,6 +3,11 @@
 import csv
 
 class DataFrame(object):
+    '''
+    Lightweight DataFrame class which recreates some of the
+    functionality as a Pandas.DataFrame without having to
+    import the entire Pandas library.
+    '''
 
     _ROW_NAME = '_row'
 
@@ -22,7 +27,8 @@ def __getitem__(self, item: str) -> List:
 
     def __setitem__(self, key: str, value: List):
         self.data[key] = value
-        self._columns.append(key)
+        if key not in self._columns:
+            self._columns.append(key)
 
     def _fromDict(self, data: Dict):
         for i, k, v in enumerate(data.items()):
@@ -32,7 +38,8 @@ def _fromDict(self, data: Dict):
                 if self.nrow != len(v):
                     raise ValueError('Columns must all be same length!')
             self.data[k] = v
-            self._columns.append(k)
+            if k not in self._columns:
+                self._columns.append(k)
         self.ncol = len(self._columns)
         self.data[DataFrame._ROW_NAME] = [x for x in range(self.nrow)]
 

diff --git a/protein_loc_scraper/locScraper.py b/protein_loc_scraper/locScraper.py
@@ -8,16 +8,29 @@
 import dataframe
 
 def getArgs():
-    parser = argparse.ArgumentParser(description='Get subcellular location annotations for a list of uniprot protein IDs.')
+    parser = argparse.ArgumentParser(prog = 'locScraper',
+                                     description='Get subcellular location annotations for a list of Uniprot protein IDs. '
+                                                 'A column in input_file should contain Uniprot IDs. After locScraper '
+                                                 'runs, columns will be added for Unipriot location annotations, '
+                                                 'GO celluar component annotations.')
 
     parser.add_argument('-i', '--idCol', default = 'ID', type = str,
                         help = 'Name of column containing Uniprot IDs.')
 
-    parser.add_argument('-l', '--locCol', default = 'subcellular_loc',
-                        help = 'Name of new column to add with subcellular location.')
+    parser.add_argument('--columns', choices= ['sl', 'go', 'all'], default =  'all',
+                        help = 'Which new columns should be added? Default is all.')
+
+    parser.add_argument('--locCol', default='subcellular_loc',
+                        help='Name of new column to add with subcellular location.')
+
+    parser.add_argument('--goCol', default='go_cellular_component',
+                        help='Name of new column to add with GO cellular component annotation.')
+
+    parser.add_argument('--allCol', default='all_locations',
+                        help='Name of new column to add with GO and Uniprot annotations combined.')
 
     parser.add_argument('--nThread', default = None, type = int,
-                        help = 'Number of threads to use to lookup uniprot annotations. '
+                        help = 'Number of threads to use to lookup Uniprot annotations. '
                                'Default is the number of logical cores on your system.')
 
     parser.add_argument('-o', '--ofname', type = str, default = None,
@@ -53,8 +66,8 @@ def main():
         sys.stdout.write('Working on {}...\n'.format(ifname))
         df = dataframe.read_tsv(ifname)
 
-        #get list of uniprot IDs
-        sys.stdout.write('Using \'{}\' as the uniprot ID column.\n'.format(args.idCol))
+        #get list of Uniprot IDs
+        sys.stdout.write('Using \'{}\' as the Uniprot ID column.\n'.format(args.idCol))
         try:
             ids = df[args.idCol]
         except KeyError as e:
@@ -63,11 +76,22 @@ def main():
 
         #get locations
         locations = scraper.getLocList(ids, nThread = args.nThread)
-        df[args.locCol] = locations
+
+        #transpose locations so columns can easily be added to df
+        locations = list(zip(*locations))
+
+        #add columns to df
+        if args.columns == 'all' or args.columns == 'sl':
+            df[args.locCol] = locations[0]
+        if args.columns == 'all' or args.columns == 'go':
+            df[args.goCol] = locations[1]
+        if args.columns == 'all':
+            df[args.allCol] = locations[2]
 
         #write results
         df.to_csv(ofnames[i], sep = '\t')
         sys.stdout.write('Results written to {}\n\n'.format(ofnames[i]))
 
+
 if __name__ == '__main__':
     main()
diff --git a/protein_loc_scraper/scraper/parallelization.py b/protein_loc_scraper/scraper/parallelization.py
@@ -30,7 +30,7 @@ def getLocList(uniProtIDs: List, nThread: int = None) -> List:
         _nThread = nThread
 
     #lookup locs using thread pool
-    sys.stdout.write('Searching for locations with {} threads...\n'.format(_nThread))
+    sys.stdout.write('Searching for locations with {} thread(s)...\n'.format(_nThread))
     with Pool(processes=_nThread) as pool:
         ret = list(tqdm(pool.imap(getLocs, uniProtIDs),
                              total = listLen,

diff --git a/protein_loc_scraper/scraper/scraper.py b/protein_loc_scraper/scraper/scraper.py
@@ -1,6 +1,6 @@
 
 import requests
-from typing import List
+from typing import List, Tuple
 from lxml import html
 
 SKIP_LOCS = ['other locations']
@@ -16,23 +16,33 @@ def removeDuplicates(values):
 
 
 def _concatLocs(locList: List, delim: str = ';') -> str:
-    ret = [x for x in locList if x not in SKIP_LOCS]
+    #process text and remove any string in SKIP_LOCS
+    ret = list(filter(lambda x: x not in SKIP_LOCS, [y.lower().strip() for y in locList]))
     ret = removeDuplicates(ret)
     if not ret:
         return 'no_annotated_location'
     return delim.join(ret)
 
 
-def getLocs(uniprotID: str) -> str:
+def getLocs(uniprotID: str) -> Tuple:
     url = 'http://www.uniprot.org/uniprot/' + uniprotID + '.html'
     response = requests.get(url)
     if response.status_code >= 400:
-        return "No_uniprot_records_found"
+        return 'no_uniprot_records_found', 'no_uniprot_records_found', 'no_uniprot_records_found'
 
     tree = html.fromstring(response.content)
+
+    #get sl uniprot annotation
     sl = tree.xpath('//*[@id="table-uniprot_annotation"]/div/ul/li/h6/text()')
     ssl = tree.xpath('//*[@id="table-uniprot_annotation"]/div/ul/li/ul/li/a/text()')
-    locs = _concatLocs([x.lower().strip() for x in sl + ssl])
+    locs = _concatLocs(sl + ssl)
+
+    #get go term for cellular component
+    go = tree.xpath('//*[@id="table-go_annotation"]/div/ul/li/h6/text()')
+    sgo = tree.xpath('//*[@id="table-go_annotation"]/div/ul/li/ul/li/a/text()')
+    gos = _concatLocs(go + sgo)
+
+    concat = _concatLocs(sl + ssl + go + sgo)
 
-    return locs
+    return locs, gos, concat