From 9c4f8d18790f114591ef9aa5b4b4f42ada5cf36c Mon Sep 17 00:00:00 2001
From: bhillmann <bhillmann@outlook.com>
Date: Sat, 14 Sep 2019 12:37:50 -0500
Subject: [PATCH 1/9] updating to include more documentation.

---
 docs/Untitled.ipynb       | 52 ++++++++++++++++++++++++++++++
 docs/sample_shogun_db.md  | 28 +++++++++++++---
 docs/shear_results_fix.py | 68 +++++++++++++++++++++++++++++++++++++++
 docs/taxmap_gene.py       | 31 ++++++++++++++++++
 4 files changed, 175 insertions(+), 4 deletions(-)
 create mode 100644 docs/Untitled.ipynb
 create mode 100644 docs/shear_results_fix.py
 create mode 100644 docs/taxmap_gene.py

diff --git a/docs/Untitled.ipynb b/docs/Untitled.ipynb
new file mode 100644
index 0000000..9c00365
--- /dev/null
+++ b/docs/Untitled.ipynb
@@ -0,0 +1,52 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"/home/benjamin/code/rep94/idmapping.RefSeq.ko.dat\", sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/sample_shogun_db.md b/docs/sample_shogun_db.md
index 4752cb0..8b6923e 100644
--- a/docs/sample_shogun_db.md
+++ b/docs/sample_shogun_db.md
@@ -1,13 +1,33 @@
-# Make database for BURST
+# Gene split database
+
+Download the newest idmapping.data.gz from UniProt ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/
+
+https://raw.githubusercontent.com/knights-lab/analysis_SHOGUN/master/scripts/shear_db.py
+
+```
+wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
+gunzip idmapping.data.gz
+```
+
+```
 mkdir burst
 burst15 --references genomes.small.fna --taxonomy genomes.small.tax --output burst/genomes.small.edx --npenalize --makedb --fingerprint --accelerator burst/genomes.small.acx
+```
 
 ```
-# Make database for Bowtie2
-mkdir bowtie2
-bowtie2-build -f genomes.small.fna bowtie2/genomes.small
+# How to make sheared_bayes.txt
+python shear_db.py Rep94.fasta 100 50 > Rep94.shear.fasta
+/project/flatiron2/sop/burst15 -t 16 -q Rep94.shear.fasta -a Rep94.acx -r Rep94.edx -o Rep94.shear.b6 -m ALLPATHS -fr -i 0.98
+sed 's/_/./1' Rep94.shear.b6 > Rep94.shear.fixed.b6
+/project/flatiron2/sop/embalmulate Rep94.shear.fixed.b6 Rep94.shear.otu.txt
+python shear_results_fix.py Rep94.shear.otu.txt Rep94.tax Rep94.shear
 ```
 
+
+https://raw.githubusercontent.com/knights-lab/analysis_SHOGUN/master/scripts/shear_db.py
+
+
+
 ```
 # Make database for UTree
 mkdir utree
diff --git a/docs/shear_results_fix.py b/docs/shear_results_fix.py
new file mode 100644
index 0000000..7b8c8f8
--- /dev/null
+++ b/docs/shear_results_fix.py
@@ -0,0 +1,68 @@
+# usage: python me.py \
+#  alignment.burst.otu.txt db.tax sheared_bayes.txt
+
+import os
+import sys
+import csv
+import pandas as pd
+import numpy as np
+from scipy.sparse import csr_matrix
+
+with open(sys.argv[1], 'r') as inf:
+    csv_inf = csv.reader(inf, delimiter="\t")
+    columns = next(csv_inf)
+    columns = dict(zip(columns[1:], range(len(columns))))
+    indptr = [0]
+    indices = np.array([], dtype=int)
+    data = np.array([], dtype=int)
+    names = []
+    for ix, row in enumerate(csv_inf):
+        if ix % 1000 == 0:
+            print(ix)
+        names.append(row[0])
+        np_row = np.array(row[1:], dtype=int)
+        temp_indx = [np_row > 0]
+        data = np.concatenate((data, np_row[temp_indx]))
+        indices = np.concatenate((indices, np.where(temp_indx)[1]))
+        indptr.append(indices.shape[0])
+
+csr = csr_matrix((data, indices, indptr), dtype=int).T
+
+with open(sys.argv[2]) as inf:
+    csv_inf = csv.reader(inf, delimiter='\t')
+    name2taxonomy = dict(csv_inf)
+
+cols_tax = [name2taxonomy[name] for name in names]
+rows_tax = [name2taxonomy[_.replace(".", "_", 1)] for _ in sorted(columns, key=columns.get)]
+
+def index_lca(str1, str2):
+    for i, (s1, s2) in enumerate(zip(str1.split(';'), str2.split(';'))):
+        if s1 != s2:
+            return i
+    return 8
+
+dat = np.zeros((len(rows_tax), 9), dtype=int)
+for i, row_name in enumerate(rows_tax):
+    row = csr.getrow(i)
+    for j, indx in enumerate(row.indices):
+        dat[i, index_lca(rows_tax[i], cols_tax[indx])] += row.data[j]
+
+print(str(dat[:, 0].sum()))
+
+df = pd.DataFrame(dat, index=rows_tax)
+df['sum'] = dat.sum(axis=1)
+df.drop(0, axis=1, inplace=True)
+df.to_csv(sys.argv[3], header=False, sep='\t')
+
+uniqueness_rate_per_level = np.zeros(8, dtype=float)
+for i in range(0, 8):
+    # Take the sum of those columns
+    num_hits =  df.iloc[:, i].sum()
+    # Total number of possible hits
+    total_hits = df['sum'].sum()
+    # Uniqueness Rate
+    uniqueness_rate_per_level[i] = num_hits/total_hits
+levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
+list(zip(levels, uniqueness_rate_per_level))
+print(uniqueness_rate_per_level.sum())
+
diff --git a/docs/taxmap_gene.py b/docs/taxmap_gene.py
new file mode 100644
index 0000000..ccc475c
--- /dev/null
+++ b/docs/taxmap_gene.py
@@ -0,0 +1,31 @@
+# usage: taxmap gene.py
+
+import sys
+import csv
+import re
+
+RefSeq2Uniprot = dict()
+Uniprot2KO = dict()
+
+with open(sys.argv[1], 'r') as inf:
+    csv_inf = csv.reader(inf, delimiter="\t")
+    for i, row in enumerate(csv_inf):
+        if row[1] == "RefSeq":
+            RefSeq2Uniprot[row[2]] = row[0]
+        elif row[1] == "KO":
+            Uniprot2KO[row[0]] = row[2]
+
+with open(sys.argv[2], 'r') as inf:
+    for line in inf:
+        title_search = re.search('\[protein_id=(.*?)\]', line, re.IGNORECASE)
+        output_str = line[1:] + "\t"
+        if title_search:
+            title = title_search.group(1)
+
+            if title in RefSeq2Uniprot:
+                uniprot_id = RefSeq2Uniprot[title]
+                if uniprot_id in Uniprot2KO:
+                    ko_id = Uniprot2KO[uniprot_id]
+                    output_str += ko_id
+        print(output_str)
+

From 9fc92e1a1634f2da3d06b4266673949477a840cd Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Tue, 10 Dec 2019 09:58:12 -0800
Subject: [PATCH 2/9] modified LCA function

---
 shogun/utils/last_common_ancestor.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/shogun/utils/last_common_ancestor.py b/shogun/utils/last_common_ancestor.py
index c55e115..a2b5784 100644
--- a/shogun/utils/last_common_ancestor.py
+++ b/shogun/utils/last_common_ancestor.py
@@ -31,9 +31,9 @@ def build_lca_map(gen: typing.Iterator, tree: Taxonomy) -> dict:
 
 
 def least_common_ancestor(taxa_set):
+    lca = []
     taxa_set = [_.split(';') for _ in taxa_set]
-    for i, level in enumerate(reversed(list(zip(*taxa_set)))):
-        if len(set(level)) == 1:
-            convert_i = lambda x: None if x == 0 else -x
-            return ';'.join([taxon_level[0] for taxon_level in list(zip(*taxa_set))[:convert_i(i)]])
-    return None
+    for level in zip(*taxa_set):
+        if len(set(level)) > 1 or any(_.endswith('__') for _ in level):
+            return ';'.join(lca) if lca else None
+        lca.append(level[0])

From 9bb23a1d38d39542bd0bbd2ae122db62cbd9b6d2 Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Tue, 10 Dec 2019 13:52:53 -0800
Subject: [PATCH 3/9] fixed YAML loader warning

---
 shogun/__main__.py                     | 4 ++--
 shogun/aligners/_aligner.py            | 6 +++---
 shogun/function/tests/test_function.py | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/shogun/__main__.py b/shogun/__main__.py
index 537a4be..4a2dfd8 100644
--- a/shogun/__main__.py
+++ b/shogun/__main__.py
@@ -9,7 +9,7 @@
 from datetime import date
 from multiprocessing import cpu_count
 import click
-from yaml import load
+import yaml
 import pandas as pd
 
 from shogun import __version__, logger
@@ -306,7 +306,7 @@ def _load_metadata(database):
         with open(metadata_file, 'r') as stream:
             logger.debug(
                 "Attempting to load the database metadata file at %s" % (os.path.abspath(metadata_file)))
-            data_files = load(stream)
+            data_files = yaml.load(stream, Loader=yaml.SafeLoader)
         return data_files
     else:
         logger.critical("Unable to load database at %s" % os.path.abspath(metadata_file))
diff --git a/shogun/aligners/_aligner.py b/shogun/aligners/_aligner.py
index 2506bdf..423d1d9 100644
--- a/shogun/aligners/_aligner.py
+++ b/shogun/aligners/_aligner.py
@@ -6,7 +6,7 @@
 
 import os
 
-from yaml import load
+import yaml
 
 from shogun import logger
 
@@ -20,7 +20,7 @@ def __init__(self, database_dir, threads=1, post_align=True, shell=False, percen
         check, msg = self.check_database(database_dir)
 
         with open(os.path.join(database_dir, 'metadata.yaml')) as stream:
-            self.data_files = load(stream)
+            self.data_files = yaml.load(stream, Loader=yaml.SafeLoader)
 
         if not check:
             raise Exception("Database %s is not formatted correctly: %s" % (database_dir, msg))
@@ -37,7 +37,7 @@ def __init__(self, database_dir, threads=1, post_align=True, shell=False, percen
     @classmethod
     def check_database(cls, dir):
         with open(os.path.join(dir, 'metadata.yaml'), 'r') as stream:
-            data_files = load(stream)
+            data_files = yaml.load(stream, Loader=yaml.SafeLoader)
 
         SUFFICES = {
             'burst': ['.edx'],
diff --git a/shogun/function/tests/test_function.py b/shogun/function/tests/test_function.py
index 3cc6792..be9d0a1 100644
--- a/shogun/function/tests/test_function.py
+++ b/shogun/function/tests/test_function.py
@@ -9,7 +9,7 @@
 import unittest
 
 import pkg_resources
-from yaml import load
+import yaml
 
 from shogun.__main__ import _function
 from shogun.function import parse_function_db
@@ -26,7 +26,7 @@ def tearDown(self):
     def test_check_database(self):
         database = pkg_resources.resource_filename('shogun.tests', os.path.join('data'))
         with open(os.path.join(database, 'metadata.yaml'), 'r') as stream:
-            data_files = load(stream)
+            data_files = yaml.load(stream, Loader=yaml.SafeLoader)
         results = parse_function_db(data_files, database)
         self.assertTrue(results is not None)
 

From 7d1e83afb8d4f408574508b150573eb85cae02de Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Tue, 10 Dec 2019 14:32:53 -0800
Subject: [PATCH 4/9] added unit tests for LCA function

---
 shogun/utils/__init__.py         |  4 +--
 shogun/utils/tests/test_utils.py | 47 +++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/shogun/utils/__init__.py b/shogun/utils/__init__.py
index 7d937a6..6578b40 100644
--- a/shogun/utils/__init__.py
+++ b/shogun/utils/__init__.py
@@ -4,9 +4,9 @@
 This software is released under the GNU Affero General Public License (AGPL) v3.0 License.
 """
 
-from .last_common_ancestor import build_lca_map
+from .last_common_ancestor import build_lca_map, least_common_ancestor
 from .normalize import normalize_by_median_depth
 from ._utils import run_command, hash_file, read_checksums, save_csr_matrix, load_csr_matrix, read_fasta, convert_to_relative_abundance
 
 __all__ = ['build_lca_map', 'run_command', 'hash_file', 'read_checksums', 'save_csr_matrix', 'load_csr_matrix',
-           'normalize_by_median_depth', 'read_fasta', 'convert_to_relative_abundance']
+           'normalize_by_median_depth', 'read_fasta', 'convert_to_relative_abundance', 'least_common_ancestor']
diff --git a/shogun/utils/tests/test_utils.py b/shogun/utils/tests/test_utils.py
index fd52fee..8ff92fd 100644
--- a/shogun/utils/tests/test_utils.py
+++ b/shogun/utils/tests/test_utils.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import numpy as np
 
-from shogun.utils import convert_to_relative_abundance
+from shogun.utils import convert_to_relative_abundance, least_common_ancestor
 
 class TestRelativeAbundance(unittest.TestCase):
     def test_convert_relative_abundance(self):
@@ -17,3 +17,48 @@ def test_convert_relative_abundance(self):
         df_ra = convert_to_relative_abundance(df)
         pd.testing.assert_frame_equal(df_ra, df_expected)
 
+
+class TestLeastCommonAncestor(unittest.TestCase):
+    def test_least_common_ancestor(self):
+        # normal scenario
+        taxa = [
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium acetobutylicum',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium botulinum',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium aldrichii',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Caminicella;s__Caminicella sporogenes'
+        ]
+        obs = least_common_ancestor(taxa)
+        exp = 'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae'
+        self.assertEqual(obs, exp)
+
+        # scenario 1: lowest level is unclassified
+        taxa = [
+            'k__Bacteria;p__Deinococcus-Thermus;c__Deinococci;o__Thermales;f__Thermaceae;g__Thermus;s__Thermus thermophilus;t__',
+            'k__Bacteria;p__Deinococcus-Thermus;c__Deinococci;o__Deinococcales;f__Deinococcaceae;g__Deinococcus;s__Deinococcus radiodurans;t__',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium acetobutylicum;t__',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium botulinum;t__'
+        ]
+        obs = least_common_ancestor(taxa)
+        exp = 'k__Bacteria'
+        self.assertEqual(obs, exp)
+
+        # scenario 2: middle level is unclassified
+        taxa = [
+            'k__Viruses;p__ssDNA_viruses;c__Geminiviridae;o__;f__;g__Begomovirus;s__Sida_mosaic_Sinaloa_virus',
+            'k__Viruses;p__ssDNA_viruses;c__Geminiviridae;o__;f__;g__Mastrevirus;s__Panicum_streak_virus',
+            'k__Viruses;p__ssDNA_viruses;c__Geminiviridae;o__;f__;g__Begomovirus;s__Cotton_leaf_crumple_virus',
+            'k__Viruses;p__ssDNA_viruses;c__Nanoviridae;o__;f__;g__Babuvirus;s__Banana_bunchy_top_virus'
+        ]
+        obs = least_common_ancestor(taxa)
+        exp = 'k__Viruses;p__ssDNA_viruses'
+        self.assertEqual(obs, exp)
+
+        # scenario 3: top level is inconsistent
+        taxa = [
+            'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Vibrio;s__Vibrio_tasmaniensis;t__Vibrio_tasmaniensis_LGP32',
+            'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Vibrio;s__Vibrio_maritimus;t__',
+            'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Aliivibrio;s__Aliivibrio_wodanis;t__',
+            'k__BacteriaPlasmid;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Aliivibrio;s__Aliivibrio_wodanis;t__'
+        ]
+        obs = least_common_ancestor(taxa)
+        self.assertIsNone(obs)

From bc4a94d5e0657ff3c7ad384447c48808770c7c86 Mon Sep 17 00:00:00 2001
From: bhillmann <hillmannben@gmail.com>
Date: Mon, 16 Dec 2019 04:53:33 -0600
Subject: [PATCH 5/9] split out the tests

---
 shogun/utils/tests/test_utils.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/shogun/utils/tests/test_utils.py b/shogun/utils/tests/test_utils.py
index 8ff92fd..7f224e7 100644
--- a/shogun/utils/tests/test_utils.py
+++ b/shogun/utils/tests/test_utils.py
@@ -10,6 +10,7 @@
 
 from shogun.utils import convert_to_relative_abundance, least_common_ancestor
 
+
 class TestRelativeAbundance(unittest.TestCase):
     def test_convert_relative_abundance(self):
         df = pd.DataFrame([[1, 1, 0], [1, 0, 0], [1, 0, 0]])
@@ -19,7 +20,7 @@ def test_convert_relative_abundance(self):
 
 
 class TestLeastCommonAncestor(unittest.TestCase):
-    def test_least_common_ancestor(self):
+    def test_lca_family(self):
         # normal scenario
         taxa = [
             'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium acetobutylicum',
@@ -31,6 +32,7 @@ def test_least_common_ancestor(self):
         exp = 'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae'
         self.assertEqual(obs, exp)
 
+    def test_lca_kingdom(self):
         # scenario 1: lowest level is unclassified
         taxa = [
             'k__Bacteria;p__Deinococcus-Thermus;c__Deinococci;o__Thermales;f__Thermaceae;g__Thermus;s__Thermus thermophilus;t__',
@@ -42,6 +44,7 @@ def test_least_common_ancestor(self):
         exp = 'k__Bacteria'
         self.assertEqual(obs, exp)
 
+    def test_lca_phylum(self):
         # scenario 2: middle level is unclassified
         taxa = [
             'k__Viruses;p__ssDNA_viruses;c__Geminiviridae;o__;f__;g__Begomovirus;s__Sida_mosaic_Sinaloa_virus',
@@ -53,6 +56,7 @@ def test_least_common_ancestor(self):
         exp = 'k__Viruses;p__ssDNA_viruses'
         self.assertEqual(obs, exp)
 
+    def test_lca_none(self):
         # scenario 3: top level is inconsistent
         taxa = [
             'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Vibrio;s__Vibrio_tasmaniensis;t__Vibrio_tasmaniensis_LGP32',
@@ -62,3 +66,28 @@ def test_least_common_ancestor(self):
         ]
         obs = least_common_ancestor(taxa)
         self.assertIsNone(obs)
+
+    def test_blank_class(self):
+        # normal scenario
+        taxa = [
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium acetobutylicum',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium botulinum',
+            'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__Clostridium aldrichii',
+            'k__Bacteria;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Caminicella;s__Caminicella sporogenes'
+        ]
+        obs = least_common_ancestor(taxa)
+        exp = 'k__Bacteria;p__Firmicutes'
+        self.assertEqual(obs, exp)
+
+    def test_blanks_kingdom_class(self):
+        # normal scenario
+        taxa = [
+            'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Clostridium',
+            'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Clostridium',
+            'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Clostridium',
+            'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Caminicella'
+        ]
+        obs = least_common_ancestor(taxa)
+        exp = 'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae'
+        # self.assertEqual(obs, exp)
+        self.assertEquals(obs, None)

From 2c3906157daa4c71ccf0bbb6dbf68986a829adf9 Mon Sep 17 00:00:00 2001
From: bhillmann <bhillmann@outlook.com>
Date: Wed, 18 Dec 2019 13:35:55 -0600
Subject: [PATCH 6/9] addd more tests and added noted changes to the classified
 blanks

---
 shogun/utils/last_common_ancestor.py | 16 ++++++++++++++--
 shogun/utils/tests/test_utils.py     | 15 +++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/shogun/utils/last_common_ancestor.py b/shogun/utils/last_common_ancestor.py
index a2b5784..daaa989 100644
--- a/shogun/utils/last_common_ancestor.py
+++ b/shogun/utils/last_common_ancestor.py
@@ -33,7 +33,19 @@ def build_lca_map(gen: typing.Iterator, tree: Taxonomy) -> dict:
 def least_common_ancestor(taxa_set):
     lca = []
     taxa_set = [_.split(';') for _ in taxa_set]
-    for level in zip(*taxa_set):
-        if len(set(level)) > 1 or any(_.endswith('__') for _ in level):
+    unclassified_flag = False
+    lca_classified = []
+    for i, level in enumerate(zip(*taxa_set)):
+        if len(set(level)) > 1:
+            if unclassified_flag:
+                lca = lca_classified
             return ';'.join(lca) if lca else None
+        elif len(level[0]) == 3 and level[0].endswith("__"):
+            # hit an unclassified level
+            if not unclassified_flag:
+                lca_classified = lca.copy()
+                unclassified_flag = True
+        else:
+            # reset classified flag
+            unclassified_flag = False
         lca.append(level[0])
diff --git a/shogun/utils/tests/test_utils.py b/shogun/utils/tests/test_utils.py
index 7f224e7..c0595b0 100644
--- a/shogun/utils/tests/test_utils.py
+++ b/shogun/utils/tests/test_utils.py
@@ -89,5 +89,16 @@ def test_blanks_kingdom_class(self):
         ]
         obs = least_common_ancestor(taxa)
         exp = 'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae'
-        # self.assertEqual(obs, exp)
-        self.assertEquals(obs, None)
+        self.assertEqual(obs, exp)
+
+    def test_continued_blanks(self):
+        # normal scenario
+        taxa = [
+            'k__;p__Firmicutes;c__;o__;f__;g__Clostridium',
+            'k__;p__Firmicutes;c__;o__;f__;g__Clostridium',
+            'k__;p__Firmicutes;c__;o__;f__;g__Clostridium',
+            'k__;p__Firmicutes;c__;o__;f__;g__Caminicella'
+        ]
+        obs = least_common_ancestor(taxa)
+        exp = 'k__;p__Firmicutes'
+        self.assertEqual(obs, exp)

From 17552529aed4476bbf7356ef9c577c7e83d6698f Mon Sep 17 00:00:00 2001
From: bhillmann <bhillmann@outlook.com>
Date: Mon, 23 Dec 2019 14:51:08 -0600
Subject: [PATCH 7/9] removed the unnecessary enumerate in lca

---
 shogun/utils/last_common_ancestor.py | 2 +-
 shogun/utils/tests/test_utils.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/shogun/utils/last_common_ancestor.py b/shogun/utils/last_common_ancestor.py
index daaa989..84e31bb 100644
--- a/shogun/utils/last_common_ancestor.py
+++ b/shogun/utils/last_common_ancestor.py
@@ -35,7 +35,7 @@ def least_common_ancestor(taxa_set):
     taxa_set = [_.split(';') for _ in taxa_set]
     unclassified_flag = False
     lca_classified = []
-    for i, level in enumerate(zip(*taxa_set)):
+    for level in zip(*taxa_set):
         if len(set(level)) > 1:
             if unclassified_flag:
                 lca = lca_classified
diff --git a/shogun/utils/tests/test_utils.py b/shogun/utils/tests/test_utils.py
index c0595b0..5fa654a 100644
--- a/shogun/utils/tests/test_utils.py
+++ b/shogun/utils/tests/test_utils.py
@@ -80,7 +80,7 @@ def test_blank_class(self):
         self.assertEqual(obs, exp)
 
     def test_blanks_kingdom_class(self):
-        # normal scenario
+        # unclassified until family
         taxa = [
             'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Clostridium',
             'k__;p__Firmicutes;c__;o__Clostridiales;f__Clostridiaceae;g__Clostridium',
@@ -92,7 +92,7 @@ def test_blanks_kingdom_class(self):
         self.assertEqual(obs, exp)
 
     def test_continued_blanks(self):
-        # normal scenario
+        # unclassified until phylum
         taxa = [
             'k__;p__Firmicutes;c__;o__;f__;g__Clostridium',
             'k__;p__Firmicutes;c__;o__;f__;g__Clostridium',

From d0c8758e2206f3aca8cf0e031b48ca3ed6965189 Mon Sep 17 00:00:00 2001
From: bhillmann <bhillmann@outlook.com>
Date: Mon, 23 Dec 2019 14:56:28 -0600
Subject: [PATCH 8/9] added the shogun db links folder

---
 docs/README.md           |  6 ++++++
 docs/shogun_db_links.txt | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 docs/shogun_db_links.txt

diff --git a/docs/README.md b/docs/README.md
index 46bb842..5be9a03 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -37,6 +37,12 @@ If you are on MSI the database for SHOGUN is located at:
 /home/knightsd/hillm096/globus/SHOGUN/rep82
 ```
 
+Else you can access the rep82 database on AWS by running :
+
+```
+wget <path_to_folder>/shogun_db_links.txt
+```
+
 If everything up to the testing is working, below is the location of an example shallow shotgun file.
 ```
 /home/grad00/hillm096/combined_seqs.fna
diff --git a/docs/shogun_db_links.txt b/docs/shogun_db_links.txt
new file mode 100644
index 0000000..c768f25
--- /dev/null
+++ b/docs/shogun_db_links.txt
@@ -0,0 +1,22 @@
+https://s3.us-east-2.amazonaws.com/shogun-db/metadata.yaml
+https://s3.us-east-2.amazonaws.com/shogun-db/rep82.fna
+https://s3.us-east-2.amazonaws.com/shogun-db/rep82.tax
+https://s3.us-east-2.amazonaws.com/shogun-db/sheared_bayes.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/utree/rep82.gg.ctr
+https://s3.us-east-2.amazonaws.com/shogun-db/utree/rep82.gg.log
+https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-enzyme-annotations.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-module-annotations.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-pathway-annotations.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-species2ko.80pct.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-species2ko.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/function/ko-strain2ko.txt
+https://s3.us-east-2.amazonaws.com/shogun-db/filter/humanD252.edx
+https://s3.us-east-2.amazonaws.com/shogun-db/filter/humanD252.acx
+https://s3.us-east-2.amazonaws.com/shogun-db/burst/rep82.edx
+https://s3.us-east-2.amazonaws.com/shogun-db/burst/rep82.acx
+https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.1.bt2l
+https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.2.bt2l
+https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.3.bt2l
+https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.4.bt2l
+https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.rev.1.bt2l
+https://s3.us-east-2.amazonaws.com/shogun-db/bt2/rep82.rev.2.bt2l

From ae5773cec200df74c2b2039125f6c11e1fd9a2e4 Mon Sep 17 00:00:00 2001
From: bhillmann <bhillmann@outlook.com>
Date: Mon, 23 Dec 2019 15:06:42 -0600
Subject: [PATCH 9/9] Added some documentation

---
 README.md | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 728ddb4..cdf5551 100644
--- a/README.md
+++ b/README.md
@@ -7,13 +7,13 @@ Shallow seq pipeline for optimal shotgun data usage
 
 ![alt-tag](docs/shogun_schematic.png)
 
-Schematic overview of the shallow-shotgun computational pipeline SHOGUN. For every step in the SHOGUN pipeline, the user must supply the pre-formatted SHOGUN database folder. To run every step shown here in a single command, the user can select the pipeline subcommand. Otherwise, the analysis modules can be run independently. 
+Schematic overview of the shallow-shotgun computational pipeline SHOGUN. For every step in the SHOGUN pipeline, the user must supply the pre-formatted SHOGUN database folder. To run every step shown here in a single command, the user can select the pipeline subcommand. Otherwise, the analysis modules can be run independently.
 
 a. *filter* - The input quality-controlled reads are aligned against the contaminate database using BURST to filter out all reads that hit human associated genome content.
 
 b. *align* - The input contaminate filtered reads are aligned against the reference database. The user has the option to select one or all of the three alignment tools BURST, Bowtie2, or UTree.
 
-c. *assign_taxonomy* - Given the data artifacts from a SHOGUN alignment tool, output a Biological Observation Matrix ![(BIOM)](http://biom-format.org/) format taxatable with the rows being rank-flexible taxonomies, the columns are samples, and the entries are counts for each given taxonomy per sample. The alignment tool BURST has two run modes, taxonomy and capitalist. If the capitalist mode is enabled, a rank-specific BIOM file is output instead. 
+c. *assign_taxonomy* - Given the data artifacts from a SHOGUN alignment tool, output a Biological Observation Matrix ![(BIOM)](http://biom-format.org/) format taxatable with the rows being rank-flexible taxonomies, the columns are samples, and the entries are counts for each given taxonomy per sample. The alignment tool BURST has two run modes, taxonomy and capitalist. If the capitalist mode is enabled, a rank-specific BIOM file is output instead.
 
 d. *coverage* - The output from BURST can be utilized to analyze the coverage of each taxonomy across all samples in your alignment file. This can useful for reducing the number of false positive taxonomies.
 
@@ -168,7 +168,7 @@ Options:
   -h, --help                      Show this message and exit.
 ```
 
-### normalize
+#### normalize
 
 ```
 Usage: shogun normalize [OPTIONS]
@@ -273,3 +273,9 @@ shogun pipeline -i input.fna -d /path/to/database/parent/folder/ -o output -m bu
 shogun pipeline -i input.fna -d /path/to/database/parent/folder/ -o output -m utree
 shogun pipeline -i input.fna -d /path/to/database/parent/folder/ -o output -m bowtie2
 ```
+
+For a prebuilt database, download the file locations [here](https://github.com/knights-lab/SHOGUN/tree/master/docs/shogun_db_links.txt) and run the command:
+
+```
+wget -i <path_to_folder>/shogun_db_links.txt
+```