-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexample.py
90 lines (82 loc) · 3.6 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#! /usr/bin/env python
# Dom Bennett
# 22/01/2015
# TODO: add tutorial on advanced TaxDict usage
'''
Example script for using taxon_names_resolver
'''
# this is for forward compatibility with python 3
from __future__ import absolute_import
from __future__ import print_function
# SETUP LOGGING (OPTIONAL)
import logging
# build a console handler (StreamHandler's default stream is stderr) that
# prints only the bare message text, then attach it to the root logger with
# the threshold lowered to DEBUG
console = logging.StreamHandler()
console.setFormatter(logging.Formatter('%(message)s'))
logger = logging.getLogger('')
logger.setLevel(logging.DEBUG)
logger.addHandler(console)
# PACKAGES
from taxon_names_resolver import Resolver
from taxon_names_resolver import TaxDict
from taxon_names_resolver import taxTree
# EXAMPLE NAMES
# NOTE(review): a few of these look misspelled (e.g. 'Macca mulatta',
# 'Bacillus subtilus') -- presumably on purpose, to show the resolver coping
# with imperfect input; confirm before "correcting" them
terms = [
    'Homo sapiens',
    'Gorilla gorilla',
    'Pongo pongo',
    'Macca mulatta',
    'Mus musculus',
    'Ailuropoda melanoleuca',
    'Ailurus fulgens',
    'Chlorotalpa tytonis',
    'Arabidopsis thaliana',
    'Bacillus subtilus',
]
# RESOLVE
# hand the resolver the search terms and the datasource to query; passing a
# logger is optional
resolver = Resolver(
    terms=terms,
    datasource='NCBI',
    logger=logger,
)
resolver.main()  # resolve!
# CREATE TAXDICT
# 'idents': one unique name per term. query_name is the best choice as it is
# guaranteed to be unique
idents = resolver.retrieve('query_name')
# 'lineages': for each term, the list of names of all its known parental
# taxonomic groups (e.g. Homo, Primate, Mammalia)
lineages = resolver.retrieve('classification_path')
# for taxonomic IDs instead of names, use:
# lineages = resolver.retrieve('classification_path_ids')
# 'ranks': for each term, the list of rank names corresponding to 'lineages'
# (e.g. species, genus etc.)
ranks = resolver.retrieve('classification_path_ranks')
# optional extra data slots are also possible, for example a list of 1s and
# 0s. It could be anything, just as long as it is in the same order as the
# other inputs
extra = [
    1, 1, 1, 0, 0,
    1, 1, 0, 1, 0,
]
# the names and order of the 'ranks', listed from the lowest rank
# (subspecies) up to the highest (superkingdom). N.B. this is the default and
# is based on NCBI's taxonomy.
taxonomy = [
    'subspecies', 'species',
    'subgenus', 'genus',
    'tribe',
    'subfamily', 'family', 'superfamily',
    'parvorder', 'infraorder', 'suborder', 'order', 'superorder',
    'parvclass', 'infraclass', 'subclass', 'class', 'superclass',
    'subphylum', 'phylum',
    'kingdom', 'superkingdom',
]
# construct the TaxDict from 'idents', 'ranks' and 'lineages'; 'taxonomy' is
# optional, and 'extra' supplies the additional data slot
taxdict = TaxDict(
    idents=idents,
    ranks=ranks,
    lineages=lineages,
    taxonomy=taxonomy,
    extra=extra,
)
# EXPLORE TAXDICT
# each ident maps to a dictionary with the keys: 'lineage', 'taxref', 'ident',
# 'cident' and 'rank' (+ 'extra')
# N.B. the bare lookups below only display a value when typed into an
# interactive session; wrap them in print() to see them when run as a script
# the lineage
taxdict['Homo sapiens']['lineage']  # N.B. not all lineages are named (unnamed: '')
# the ident is in the same format as the lineage, e.g. it could be an ID
taxdict['Homo sapiens']['ident']
# the 'cident' (Contextual Ident): the highest named taxonomic group unique to
# this ident among all other idents
taxdict['Arabidopsis thaliana']['cident']  # A. thaliana is the only plant
# the 'taxref': a holder of 'ident' and taxonomic position. Requires printing.
# e.g. C. tytonis could only be resolved to the genus level (22/01/2015)
print(taxdict['Chlorotalpa tytonis']['taxref'])
# check the taxonomy
print(taxdict.taxonomy)
# check the hierarchy, a dictionary of taxrefs ranked and grouped in the form:
# {'rank':[([taxref1, taxref2, ....],'lineage1'),
# ([taxref3, taxref4, ....],'lineage2'), ....]}
print(taxdict.hierarchy)
# we've also added an extra data slot
taxdict['Homo sapiens']['extra']
# CREATE TREE
# use the taxdict to create a Newick string
treestring = taxTree(taxdict)
# SAVE TREE
# write the Newick string to 'example.tre' in the current working directory.
# The handle is deliberately not named 'file', which would shadow the
# Python 2 builtin of that name.
with open('example.tre', 'w') as outfile:
    outfile.write(treestring)