initial commit of file to linearize genomes by taxonomy map

knights-lab · Dec 10, 2019 · 81f4fec · 81f4fec
1 parent b3ed7d2
commit 81f4fec
Show file tree

Hide file tree

Showing 4 changed files with 266 additions and 82 deletions.
diff --git a/shogun/scripts/linearize_fasta_by_genome.py b/shogun/scripts/linearize_fasta_by_genome.py
@@ -0,0 +1,74 @@
+#! /usr/bin/env python
+# concatenates all DNA sequences from a single strain
+# into one long genome with NNNNNNNNNNNNNNNNN between
+# sequences
+#
+# uses taxonomy file to determine strain ID of each fasta
+# sequence
+#
+# requires fasta in order by strain
+#
+# usage (NNNNNNNNN is what to separate strains with):
+# *.py input.fna input.tax output.fna output.tax NNNNNNNNNN
+import os
+import sys
+
+infna = sys.argv[1]
+intax = sys.argv[2]
+outfna = sys.argv[3]
+outtax = sys.argv[4]
+delim = sys.argv[5]
+
+# load tax map
+taxmap = {} # header:strain
+taxa = set() # set of all strains
+print('Reading/writing taxmap...')
+count = 0
+with open(intax,'r') as f:
+    with open(outtax,'w') as g:
+        for line in f:
+            count += 1
+            if count % 100000 == 0:
+                sys.stdout.write(str(count) + ' ')
+                sys.stdout.flush()
+            words = line.strip().split('\t')
+            header = words[0]
+            tax = words[1]
+            # write new taxmap line only if first instance of this strain
+            if not tax in taxa:
+                g.write(tax + '\t' + tax + '\n')
+                taxa.add(tax)
+            taxmap[header] = tax
+sys.stdout.write('\n')
+
+# read/write streaming fasta
+# track strains found to see if a
+# strain has reads that already showed up
+# earlier in file (and fail in that case)
+currentstrain = ''
+currentdna = ''
+count = 0
+print('Reading/writing FASTA...')
+with open(infna,'r') as f:
+    with open(outfna,'w') as g:
+        for line in f:            
+            if line.startswith('>'):
+                count += 1
+                if count % 100000 == 0:
+                    sys.stdout.write(str(count) + ' ')
+                    sys.stdout.flush()
+                header = line.strip().split()[0][1:]
+                strain = taxmap[header]
+                if strain != currentstrain:
+                    if currentstrain != '':
+                        g.write('>' + currentstrain + '\n')
+                        g.write(currentdna + '\n')
+                        currentdna = ''
+                    currentstrain = strain
+            else:
+                if currentdna != '':
+                    currentdna += delim
+                currentdna += line.strip()
+        g.write('>' + currentstrain + '\n' + currentdna + '\n') # don't forget to write the last sequence
+sys.stdout.write('\n')
+
diff --git a/shogun/scripts/make_db.py b/shogun/scripts/make_db.py
@@ -22,6 +22,7 @@
     dbname = sys.argv[3]
     dbpath = os.path.join(outdir,dbname + '.fna') # full path
     taxpath = os.path.join(outdir,dbname + '.tax') # full path
+    taxpath = os.path.join(outdir,dbname + '-ko.tax') # full path
 
     # download the raw CDS from genomes
     make_refseq_fasta_and_taxonomy(assemblypath,dbpath,taxpath)