Merge pull request #29 from knights-lab/release-v1.0.7
Release v1.0.7
bhillmann authored Dec 27, 2019
2 parents c5b1f54 + ae5773c commit da27a0d
Showing 13 changed files with 324 additions and 22 deletions.
12 changes: 9 additions & 3 deletions README.md
@@ -7,13 +7,13 @@ Shallow seq pipeline for optimal shotgun data usage

![alt-tag](docs/shogun_schematic.png)

Schematic overview of the shallow-shotgun computational pipeline SHOGUN. For every step in the SHOGUN pipeline, the user must supply the pre-formatted SHOGUN database folder. To run every step shown here in a single command, the user can select the pipeline subcommand. Otherwise, the analysis modules can be run independently.

a. *filter* - The input quality-controlled reads are aligned against the contaminant database using BURST to filter out all reads that hit human-associated genome content.

b. *align* - The contaminant-filtered reads are aligned against the reference database. The user can select one or all of the three alignment tools: BURST, Bowtie2, or UTree.

c. *assign_taxonomy* - Given the data artifacts from a SHOGUN alignment tool, output a Biological Observation Matrix [(BIOM)](http://biom-format.org/) format taxatable whose rows are rank-flexible taxonomies, whose columns are samples, and whose entries are counts for each taxonomy per sample (see the sketch after this list). The alignment tool BURST has two run modes, taxonomy and capitalist. If the capitalist mode is enabled, a rank-specific BIOM file is output instead.

d. *coverage* - The output from BURST can be utilized to analyze the coverage of each taxonomy across all samples in your alignment file. This can be useful for reducing the number of false-positive taxonomies.
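
For intuition, here is a minimal, hypothetical sketch (not part of SHOGUN) of inspecting such a taxatable with pandas; the file name, tab delimiter, and taxonomy-in-first-column layout are assumptions:

```python
# Hypothetical sketch: rows are rank-flexible taxonomy strings, columns are
# samples, entries are per-sample read counts. "taxatable.txt" is assumed.
import pandas as pd

taxatable = pd.read_csv("taxatable.txt", sep="\t", index_col=0)
print(taxatable.shape)        # (n_taxonomies, n_samples)
print(taxatable.sum(axis=0))  # reads assigned per sample

# Collapse counts to phylum level by truncating each taxonomy to two ranks.
phylum = taxatable.groupby(
    taxatable.index.map(lambda t: ";".join(t.split(";")[:2]))
).sum()
```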

@@ -168,7 +168,7 @@ Options:
-h, --help Show this message and exit.
```

-### normalize
+#### normalize

```
Usage: shogun normalize [OPTIONS]
@@ -273,3 +273,9 @@ shogun pipeline -i input.fna -d /path/to/database/parent/folder/ -o output -m bu
shogun pipeline -i input.fna -d /path/to/database/parent/folder/ -o output -m utree
shogun pipeline -i input.fna -d /path/to/database/parent/folder/ -o output -m bowtie2
```

For a prebuilt database, download the list of file locations [here](https://github.com/knights-lab/SHOGUN/tree/master/docs/shogun_db_links.txt) and run:

```
wget -i <path_to_folder>/shogun_db_links.txt
```
6 changes: 6 additions & 0 deletions docs/README.md
@@ -37,6 +37,12 @@ If you are on MSI the database for SHOGUN is located at:
/home/knightsd/hillm096/globus/SHOGUN/rep82
```

Otherwise, you can download the rep82 database from AWS by running:

```
wget -i <path_to_folder>/shogun_db_links.txt
```

If everything up through the tests is working, an example shallow-shotgun input file is located at:
```
/home/grad00/hillm096/combined_seqs.fna
52 changes: 52 additions & 0 deletions docs/Untitled.ipynb
@@ -0,0 +1,52 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"/home/benjamin/code/rep94/idmapping.RefSeq.ko.dat\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
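
The scratch notebook above inspects a pre-extracted RefSeq-to-KO table. For context, here is a hedged sketch of how such a table could be derived from UniProt's idmapping.dat (three tab-separated columns: UniProt accession, ID type, mapped ID); the output file name mirrors the notebook, everything else is an assumption:

```python
# Sketch: build a RefSeq -> KO table from UniProt's three-column idmapping.dat.
import pandas as pd

idmap = pd.read_csv("idmapping.dat", sep="\t",
                    names=["uniprot", "id_type", "mapped_id"])
refseq = idmap[idmap["id_type"] == "RefSeq"]
ko = idmap[idmap["id_type"] == "KO"]
# Join the two ID types on their shared UniProt accession.
refseq_to_ko = refseq.merge(ko, on="uniprot", suffixes=("_refseq", "_ko"))
refseq_to_ko[["mapped_id_refseq", "mapped_id_ko"]].to_csv(
    "idmapping.RefSeq.ko.dat", sep="\t", index=False)
```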
28 changes: 24 additions & 4 deletions docs/sample_shogun_db.md
@@ -1,13 +1,33 @@
-# Make database for BURST
+# Gene split database

Download the newest idmapping.dat.gz from UniProt: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/

https://raw.githubusercontent.com/knights-lab/analysis_SHOGUN/master/scripts/shear_db.py

```
wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
gunzip idmapping.dat.gz
```

```
mkdir burst
burst15 --references genomes.small.fna --taxonomy genomes.small.tax --output burst/genomes.small.edx --npenalize --makedb --fingerprint --accelerator burst/genomes.small.acx
```

```
-# Make database for Bowtie2
-mkdir bowtie2
-bowtie2-build -f genomes.small.fna bowtie2/genomes.small
+# How to make sheared_bayes.txt
+python shear_db.py Rep94.fasta 100 50 > Rep94.shear.fasta
+/project/flatiron2/sop/burst15 -t 16 -q Rep94.shear.fasta -a Rep94.acx -r Rep94.edx -o Rep94.shear.b6 -m ALLPATHS -fr -i 0.98
+sed 's/_/./1' Rep94.shear.b6 > Rep94.shear.fixed.b6
+/project/flatiron2/sop/embalmulate Rep94.shear.fixed.b6 Rep94.shear.otu.txt
+python shear_results_fix.py Rep94.shear.otu.txt Rep94.tax Rep94.shear
```
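
shear_db.py (linked above) cuts each reference sequence into fixed-length, overlapping fragments before BURST re-aligns them against the full database. A minimal Python sketch of that shearing idea — not the actual script — assuming fragment IDs of the form `<id>_<start>`, which would explain the `sed 's/_/./1'` rename step:

```python
# Sketch only: emit 100 bp windows every 50 bp, mirroring
# "python shear_db.py Rep94.fasta 100 50". Fragment naming is an assumption.
import sys

def shear_fasta(path, length=100, stride=50):
    def emit(header, seq):
        for start in range(0, max(len(seq) - length + 1, 1), stride):
            print(">%s_%d\n%s" % (header, start, seq[start:start + length]))

    header, chunks = None, []
    with open(path) as inf:
        for line in inf:
            line = line.strip()
            if line.startswith(">"):
                if header is not None:
                    emit(header, "".join(chunks))
                header, chunks = line[1:].split()[0], []
            else:
                chunks.append(line)
    if header is not None:
        emit(header, "".join(chunks))

if __name__ == "__main__":
    shear_fasta(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
```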


```
# Make database for UTree
mkdir utree
68 changes: 68 additions & 0 deletions docs/shear_results_fix.py
@@ -0,0 +1,68 @@
# usage: python shear_results_fix.py alignment.burst.otu.txt db.tax sheared_bayes.txt

import sys
import csv

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Read the shear OTU table into a sparse matrix.
with open(sys.argv[1], 'r') as inf:
    csv_inf = csv.reader(inf, delimiter="\t")
    columns = next(csv_inf)
    # Map each column name (skipping the row-name column) to its position.
    columns = dict(zip(columns[1:], range(len(columns))))
    indptr = [0]
    indices = np.array([], dtype=int)
    data = np.array([], dtype=int)
    names = []
    for ix, row in enumerate(csv_inf):
        if ix % 1000 == 0:
            print(ix)  # progress
        names.append(row[0])
        np_row = np.array(row[1:], dtype=int)
        temp_indx = np_row > 0
        data = np.concatenate((data, np_row[temp_indx]))
        indices = np.concatenate((indices, np.where(temp_indx)[0]))
        indptr.append(indices.shape[0])

csr = csr_matrix((data, indices, indptr), dtype=int).T

# Map sequence names to taxonomy strings.
with open(sys.argv[2]) as inf:
    csv_inf = csv.reader(inf, delimiter='\t')
    name2taxonomy = dict(csv_inf)

cols_tax = [name2taxonomy[name] for name in names]
# Shearing renamed "<id>_<start>" to "<id>.<start>"; undo that to look up taxa.
rows_tax = [name2taxonomy[_.replace(".", "_", 1)] for _ in sorted(columns, key=columns.get)]


def index_lca(str1, str2):
    """Return the index of the first taxonomic level at which two
    semicolon-delimited taxonomy strings disagree (8 if identical)."""
    for i, (s1, s2) in enumerate(zip(str1.split(';'), str2.split(';'))):
        if s1 != s2:
            return i
    return 8


# dat[i, j]: hit counts for column-sequence i, binned by the level j at which
# its taxonomy first disagrees with the row-sequence it hit (8 = identical).
dat = np.zeros((len(rows_tax), 9), dtype=int)
for i, row_name in enumerate(rows_tax):
    row = csr.getrow(i)
    for j, indx in enumerate(row.indices):
        dat[i, index_lca(rows_tax[i], cols_tax[indx])] += row.data[j]

# Hits that disagree even at the kingdom level.
print(str(dat[:, 0].sum()))

df = pd.DataFrame(dat, index=rows_tax)
df['sum'] = dat.sum(axis=1)
df.drop(0, axis=1, inplace=True)
df.to_csv(sys.argv[3], header=False, sep='\t')

uniqueness_rate_per_level = np.zeros(8, dtype=float)
for i in range(0, 8):
    # Hits that are unique down to this level
    num_hits = df.iloc[:, i].sum()
    # Total number of possible hits
    total_hits = df['sum'].sum()
    # Uniqueness rate
    uniqueness_rate_per_level[i] = num_hits / total_hits

levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
print(list(zip(levels, uniqueness_rate_per_level)))
print(uniqueness_rate_per_level.sum())
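
A small worked example of the index_lca binning used above (taxonomy strings are fabricated):

```python
def index_lca(str1, str2):
    # Same logic as in shear_results_fix.py above.
    for i, (s1, s2) in enumerate(zip(str1.split(';'), str2.split(';'))):
        if s1 != s2:
            return i
    return 8

a = "k__Bacteria;p__Firmicutes;c__Bacilli"
b = "k__Bacteria;p__Bacteroidetes;c__Bacteroidia"

assert index_lca(a, b) == 1  # first disagreement at phylum (column 1)
assert index_lca(a, a) == 8  # no disagreement: counted as unique to strain
```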

22 changes: 22 additions & 0 deletions docs/shogun_db_links.txt
@@ -0,0 +1,22 @@
https://s3.us-east-2.amazonaws.com/shogun-db/metadata.yaml
https://s3.us-east-2.amazonaws.com/shogun-db/rep82.fna
https://s3.us-east-2.amazonaws.com/shogun-db/rep82.tax
https://s3.us-east-2.amazonaws.com/shogun-db/sheared_bayes.txt
https://s3.us-east-2.amazonaws.com/shogun-db/utree/rep82.gg.ctr
https://s3.us-east-2.amazonaws.com/shogun-db/utree/rep82.gg.log
https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-enzyme-annotations.txt
https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-module-annotations.txt
https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-pathway-annotations.txt
https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-species2ko.80pct.txt
https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-species2ko.txt
https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-strain2ko.txt
https://s3.us-east-2.amazonaws.com/shogun-db/filter/humanD252.edx
https://s3.us-east-2.amazonaws.com/shogun-db/filter/humanD252.acx
https://s3.us-east-2.amazonaws.com/shogun-db/burst/rep82.edx
https://s3.us-east-2.amazonaws.com/shogun-db/burst/rep82.acx
https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.1.bt2l
https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.2.bt2l
https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.3.bt2l
https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.4.bt2l
https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.rev.1.bt2l
https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.rev.2.bt2l
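
Note that `wget -i` downloads everything into one flat directory, while the URLs above encode subfolders (burst/, bt2/, utree/, function/, filter/). If the local database should mirror that layout, a hedged Python sketch (the root folder name is an assumption):

```python
# Sketch: download each URL from shogun_db_links.txt into a local directory
# tree that mirrors the URL paths (e.g. burst/rep82.edx). The root folder
# name "rep82_db" is an assumption.
import os
import urllib.request
from urllib.parse import urlparse

ROOT = "rep82_db"
with open("shogun_db_links.txt") as inf:
    for url in (line.strip() for line in inf):
        if not url:
            continue
        # Strip the leading "/shogun-db/" bucket prefix from the URL path.
        rel = urlparse(url).path.split("/", 2)[2]
        dest = os.path.join(ROOT, rel)
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        urllib.request.urlretrieve(url, dest)
        print("downloaded", dest)
```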
31 changes: 31 additions & 0 deletions docs/taxmap_gene.py
@@ -0,0 +1,31 @@
# usage: python taxmap_gene.py idmapping.dat genomes.fna

import sys
import csv
import re

RefSeq2Uniprot = dict()
Uniprot2KO = dict()

# Pass 1: build RefSeq -> UniProt and UniProt -> KO lookups from the
# three-column UniProt idmapping file (accession, ID type, mapped ID).
with open(sys.argv[1], 'r') as inf:
    csv_inf = csv.reader(inf, delimiter="\t")
    for row in csv_inf:
        if row[1] == "RefSeq":
            RefSeq2Uniprot[row[2]] = row[0]
        elif row[1] == "KO":
            Uniprot2KO[row[0]] = row[2]

# Pass 2: for each FASTA header, chain protein_id -> UniProt -> KO and print
# "<header>\t<KO>" (the KO column is left empty when no mapping exists).
with open(sys.argv[2], 'r') as inf:
    for line in inf:
        if not line.startswith('>'):
            # Skip sequence lines; only headers carry [protein_id=...].
            continue
        title_search = re.search(r'\[protein_id=(.*?)\]', line, re.IGNORECASE)
        output_str = line[1:].strip() + "\t"
        if title_search:
            title = title_search.group(1)
            if title in RefSeq2Uniprot:
                uniprot_id = RefSeq2Uniprot[title]
                if uniprot_id in Uniprot2KO:
                    ko_id = Uniprot2KO[uniprot_id]
                    output_str += ko_id
        print(output_str)
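
A toy illustration of the two-hop lookup the script performs; every ID below is fabricated:

```python
import re

RefSeq2Uniprot = {"WP_000001.1": "A0A000"}  # fabricated RefSeq -> UniProt
Uniprot2KO = {"A0A000": "K00001"}           # fabricated UniProt -> KO

header = ">gene_1 [protein_id=WP_000001.1]"
match = re.search(r'\[protein_id=(.*?)\]', header)
uniprot_id = RefSeq2Uniprot.get(match.group(1), "")
print(header[1:] + "\t" + Uniprot2KO.get(uniprot_id, ""))
# -> gene_1 [protein_id=WP_000001.1]	K00001
```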

4 changes: 2 additions & 2 deletions shogun/__main__.py
@@ -9,7 +9,7 @@
from datetime import date
from multiprocessing import cpu_count
import click
-from yaml import load
+import yaml
import pandas as pd

from shogun import __version__, logger
@@ -306,7 +306,7 @@ def _load_metadata(database):
with open(metadata_file, 'r') as stream:
logger.debug(
"Attempting to load the database metadata file at %s" % (os.path.abspath(metadata_file)))
-data_files = load(stream)
+data_files = yaml.load(stream, Loader=yaml.SafeLoader)
return data_files
else:
logger.critical("Unable to load database at %s" % os.path.abspath(metadata_file))
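
The change here (repeated in the aligner and test modules below) replaces PyYAML's bare `load`, which can construct arbitrary Python objects from a YAML stream, with an explicit SafeLoader. Equivalently:

```python
import yaml

# yaml.safe_load(stream) is shorthand for
# yaml.load(stream, Loader=yaml.SafeLoader); both refuse to instantiate
# arbitrary Python objects from untrusted input.
with open("metadata.yaml") as stream:
    data_files = yaml.safe_load(stream)
```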
6 changes: 3 additions & 3 deletions shogun/aligners/_aligner.py
@@ -6,7 +6,7 @@

import os

-from yaml import load
+import yaml

from shogun import logger

@@ -20,7 +20,7 @@ def __init__(self, database_dir, threads=1, post_align=True, shell=False, percen
check, msg = self.check_database(database_dir)

with open(os.path.join(database_dir, 'metadata.yaml')) as stream:
-self.data_files = load(stream)
+self.data_files = yaml.load(stream, Loader=yaml.SafeLoader)

if not check:
raise Exception("Database %s is not formatted correctly: %s" % (database_dir, msg))
@@ -37,7 +37,7 @@ def __init__(self, database_dir, threads=1, post_align=True, shell=False, percen
@classmethod
def check_database(cls, dir):
with open(os.path.join(dir, 'metadata.yaml'), 'r') as stream:
-data_files = load(stream)
+data_files = yaml.load(stream, Loader=yaml.SafeLoader)

SUFFICES = {
'burst': ['.edx'],
4 changes: 2 additions & 2 deletions shogun/function/tests/test_function.py
@@ -9,7 +9,7 @@
import unittest

import pkg_resources
-from yaml import load
+import yaml

from shogun.__main__ import _function
from shogun.function import parse_function_db
@@ -26,7 +26,7 @@ def tearDown(self):
def test_check_database(self):
database = pkg_resources.resource_filename('shogun.tests', os.path.join('data'))
with open(os.path.join(database, 'metadata.yaml'), 'r') as stream:
-data_files = load(stream)
+data_files = yaml.load(stream, Loader=yaml.SafeLoader)
results = parse_function_db(data_files, database)
self.assertTrue(results is not None)

4 changes: 2 additions & 2 deletions shogun/utils/__init__.py
@@ -4,9 +4,9 @@
This software is released under the GNU Affero General Public License (AGPL) v3.0 License.
"""

-from .last_common_ancestor import build_lca_map
+from .last_common_ancestor import build_lca_map, least_common_ancestor
from .normalize import normalize_by_median_depth
from ._utils import run_command, hash_file, read_checksums, save_csr_matrix, load_csr_matrix, read_fasta, convert_to_relative_abundance

__all__ = ['build_lca_map', 'run_command', 'hash_file', 'read_checksums', 'save_csr_matrix', 'load_csr_matrix',
-'normalize_by_median_depth', 'read_fasta', 'convert_to_relative_abundance']
+'normalize_by_median_depth', 'read_fasta', 'convert_to_relative_abundance', 'least_common_ancestor']
22 changes: 17 additions & 5 deletions shogun/utils/last_common_ancestor.py
Expand Up @@ -31,9 +31,21 @@ def build_lca_map(gen: typing.Iterator, tree: Taxonomy) -> dict:


def least_common_ancestor(taxa_set):
    lca = []
    taxa_set = [_.split(';') for _ in taxa_set]
-   for i, level in enumerate(reversed(list(zip(*taxa_set)))):
-       if len(set(level)) == 1:
-           convert_i = lambda x: None if x == 0 else -x
-           return ';'.join([taxon_level[0] for taxon_level in list(zip(*taxa_set))[:convert_i(i)]])
-   return None
+   unclassified_flag = False
+   lca_classified = []
+   for level in zip(*taxa_set):
+       if len(set(level)) > 1:
+           if unclassified_flag:
+               lca = lca_classified
+           return ';'.join(lca) if lca else None
+       elif len(level[0]) == 3 and level[0].endswith("__"):
+           # hit an unclassified level
+           if not unclassified_flag:
+               lca_classified = lca.copy()
+           unclassified_flag = True
+       else:
+           # reset classified flag
+           unclassified_flag = False
+       lca.append(level[0])
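
The hunk above is truncated before the function's closing lines. Assuming it ends by returning the accumulated `lca`, mirroring the early-exit branch (an assumption, since the tail is not shown), here is a self-contained copy with usage examples:

```python
def least_common_ancestor(taxa_set):
    """Walk taxonomy strings level by level, tracking the last fully
    classified ancestor so trailing empty ranks (e.g. "c__") are not
    reported as the LCA when a later level disagrees."""
    lca = []
    taxa_set = [_.split(';') for _ in taxa_set]
    unclassified_flag = False
    lca_classified = []
    for level in zip(*taxa_set):
        if len(set(level)) > 1:
            if unclassified_flag:
                lca = lca_classified
            return ';'.join(lca) if lca else None
        elif len(level[0]) == 3 and level[0].endswith("__"):
            if not unclassified_flag:
                lca_classified = lca.copy()
            unclassified_flag = True
        else:
            unclassified_flag = False
        lca.append(level[0])
    return ';'.join(lca) if lca else None  # assumed final return (not in the shown hunk)

print(least_common_ancestor(["k__Bacteria;p__Firmicutes;c__Bacilli",
                             "k__Bacteria;p__Firmicutes;c__Clostridia"]))
# -> k__Bacteria;p__Firmicutes

print(least_common_ancestor(["k__Bacteria;p__Firmicutes;c__;o__A",
                             "k__Bacteria;p__Firmicutes;c__;o__B"]))
# -> k__Bacteria;p__Firmicutes (the empty "c__" rank is rolled back)
```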
