Skip to content

Commit

Permalink
Adding GO annotation column.
Browse files Browse the repository at this point in the history
And several bug fixes.
  • Loading branch information
mauraisa committed Apr 5, 2019
1 parent bdfa50e commit a3ec7de
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 23 deletions.
30 changes: 23 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# locScraper
Given a .tsv file with a column containing Uniprot protein IDs, scrape annotations for subcellular location from uniprot.org
Given a .tsv file with a column containing Uniprot protein IDs, scrape annotations for subcellular location from Uniprot.org

## Installation
The simplest way to install `locScraper` is to download one of the precompiled binaries under the [releases](https://github.com/ajmaurais/locScraper/releases) tab. Binaries are available for OSX and CentOS.
The simplest way to install `locScraper` is to download one of the precompiled binaries under the [releases](https://github.com/ajmaurais/locScraper) tab. Binaries are available for OSX and CentOS.

You can also clone this repository with the command.
```
Expand All @@ -17,11 +17,15 @@ lxml

## Usage
```
usage: locScraper [-h] [-i IDCOL] [-l LOCCOL] [--nThread NTHREAD]
usage: locScraper [-h] [-i IDCOL] [--columns {sl,go,all}] [--locCol LOCCOL]
[--goCol GOCOL] [--allCol ALLCOL] [--nThread NTHREAD]
[-o OFNAME] [--inPlace]
input_file [input_file ...]
Get subcellular location annotations for a list of uniprot protein IDs.
Get subcellular location annotations for a list of Uniprot protein IDs. A
column in input_file should contain Uniprot IDs. After locScraper runs,
columns will be added for Uniprot location annotations and GO cellular component
annotations.
positional arguments:
input_file .tsv or .csv files to process.
Expand All @@ -32,10 +36,22 @@ optional arguments:
-i IDCOL, --idCol IDCOL
Name of column containing Uniprot IDs.
-l LOCCOL, --locCol LOCCOL
Name of new column to add with subcellular location.
--columns {sl,go,all}
Which new columns should be added?
sl : Uniprot annotation for subcellular location
go : GO annotation for cellular component
all : both sl and go
Default is all.
--nThread NTHREAD Number of threads to use to lookup uniprot
--locCol LOCCOL Name of new column to add with subcellular location.
--goCol GOCOL Name of new column to add with GO cellular component
annotation.
--allCol ALLCOL Name of new column to add with GO and Uniprot
annotations combined.
--nThread NTHREAD Number of threads to use to lookup Uniprot
annotations. Default is the number of logical cores on
your system.
Expand Down
11 changes: 9 additions & 2 deletions protein_loc_scraper/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
import csv

class DataFrame(object):
'''
Lightweight DataFrame class which recreates some of the
functionality of a Pandas.DataFrame without having to
import the entire Pandas library.
'''

_ROW_NAME = '_row'

Expand All @@ -22,7 +27,8 @@ def __getitem__(self, item: str) -> List:

def __setitem__(self, key: str, value: List):
self.data[key] = value
self._columns.append(key)
if key not in self._columns:
self._columns.append(key)

def _fromDict(self, data: Dict):
for i, k, v in enumerate(data.items()):
Expand All @@ -32,7 +38,8 @@ def _fromDict(self, data: Dict):
if self.nrow != len(v):
raise ValueError('Columns must all be same length!')
self.data[k] = v
self._columns.append(k)
if k not in self._columns:
self._columns.append(k)
self.ncol = len(self._columns)
self.data[DataFrame._ROW_NAME] = [x for x in range(self.nrow)]

Expand Down
38 changes: 31 additions & 7 deletions protein_loc_scraper/locScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,29 @@
import dataframe

def getArgs():
parser = argparse.ArgumentParser(description='Get subcellular location annotations for a list of uniprot protein IDs.')
parser = argparse.ArgumentParser(prog = 'locScraper',
description='Get subcellular location annotations for a list of Uniprot protein IDs. '
'A column in input_file should contain Uniprot IDs. After locScraper '
'runs, columns will be added for Unipriot location annotations, '
'GO celluar component annotations.')

parser.add_argument('-i', '--idCol', default = 'ID', type = str,
help = 'Name of column containing Uniprot IDs.')

parser.add_argument('-l', '--locCol', default = 'subcellular_loc',
help = 'Name of new column to add with subcellular location.')
parser.add_argument('--columns', choices= ['sl', 'go', 'all'], default = 'all',
help = 'Which new columns should be added? Default is all.')

parser.add_argument('--locCol', default='subcellular_loc',
help='Name of new column to add with subcellular location.')

parser.add_argument('--goCol', default='go_cellular_component',
help='Name of new column to add with GO cellular component annotation.')

parser.add_argument('--allCol', default='all_locations',
help='Name of new column to add with GO and Uniprot annotations combined.')

parser.add_argument('--nThread', default = None, type = int,
help = 'Number of threads to use to lookup uniprot annotations. '
help = 'Number of threads to use to lookup Uniprot annotations. '
'Default is the number of logical cores on your system.')

parser.add_argument('-o', '--ofname', type = str, default = None,
Expand Down Expand Up @@ -53,8 +66,8 @@ def main():
sys.stdout.write('Working on {}...\n'.format(ifname))
df = dataframe.read_tsv(ifname)

#get list of uniprot IDs
sys.stdout.write('Using \'{}\' as the uniprot ID column.\n'.format(args.idCol))
#get list of Uniprot IDs
sys.stdout.write('Using \'{}\' as the Uniprot ID column.\n'.format(args.idCol))
try:
ids = df[args.idCol]
except KeyError as e:
Expand All @@ -63,11 +76,22 @@ def main():

#get locations
locations = scraper.getLocList(ids, nThread = args.nThread)
df[args.locCol] = locations

#transpose locations so columns can easily be added to df
locations = list(zip(*locations))

#add columns to df
if args.columns == 'all' or args.columns == 'sl':
df[args.locCol] = locations[0]
if args.columns == 'all' or args.columns == 'go':
df[args.goCol] = locations[1]
if args.columns == 'all':
df[args.allCol] = locations[2]

#write results
df.to_csv(ofnames[i], sep = '\t')
sys.stdout.write('Results written to {}\n\n'.format(ofnames[i]))


if __name__ == '__main__':
main()
2 changes: 1 addition & 1 deletion protein_loc_scraper/scraper/parallelization.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def getLocList(uniProtIDs: List, nThread: int = None) -> List:
_nThread = nThread

#lookup locs using thread pool
sys.stdout.write('Searching for locations with {} threads...\n'.format(_nThread))
sys.stdout.write('Searching for locations with {} thread(s)...\n'.format(_nThread))
with Pool(processes=_nThread) as pool:
ret = list(tqdm(pool.imap(getLocs, uniProtIDs),
total = listLen,
Expand Down
22 changes: 16 additions & 6 deletions protein_loc_scraper/scraper/scraper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

import requests
from typing import List
from typing import List, Tuple
from lxml import html

SKIP_LOCS = ['other locations']
Expand All @@ -16,23 +16,33 @@ def removeDuplicates(values):


def _concatLocs(locList: List, delim: str = ';') -> str:
ret = [x for x in locList if x not in SKIP_LOCS]
#process text and remove any string in SKIP_LOCS
ret = list(filter(lambda x: x not in SKIP_LOCS, [y.lower().strip() for y in locList]))
ret = removeDuplicates(ret)
if not ret:
return 'no_annotated_location'
return delim.join(ret)


def getLocs(uniprotID: str) -> str:
def getLocs(uniprotID: str) -> Tuple:
url = 'http://www.uniprot.org/uniprot/' + uniprotID + '.html'
response = requests.get(url)
if response.status_code >= 400:
return "No_uniprot_records_found"
return 'no_uniprot_records_found', 'no_uniprot_records_found', 'no_uniprot_records_found'

tree = html.fromstring(response.content)

#get sl uniprot annotation
sl = tree.xpath('//*[@id="table-uniprot_annotation"]/div/ul/li/h6/text()')
ssl = tree.xpath('//*[@id="table-uniprot_annotation"]/div/ul/li/ul/li/a/text()')
locs = _concatLocs([x.lower().strip() for x in sl + ssl])
locs = _concatLocs(sl + ssl)

#get go term for cellular component
go = tree.xpath('//*[@id="table-go_annotation"]/div/ul/li/h6/text()')
sgo = tree.xpath('//*[@id="table-go_annotation"]/div/ul/li/ul/li/a/text()')
gos = _concatLocs(go + sgo)

concat = _concatLocs(sl + ssl + go + sgo)

return locs
return locs, gos, concat

0 comments on commit a3ec7de

Please sign in to comment.