Skip to content

Commit

Permalink
initial commit of file to linearize genomes by taxonomy map
Browse files Browse the repository at this point in the history
  • Loading branch information
danknights committed Dec 10, 2019
1 parent b3ed7d2 commit 81f4fec
Show file tree
Hide file tree
Showing 4 changed files with 266 additions and 82 deletions.
74 changes: 74 additions & 0 deletions shogun/scripts/linearize_fasta_by_genome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#! /usr/bin/env python
# concatenates all DNA sequences from a single strain
# into one long genome with NNNNNNNNNNNNNNNNN between
# sequences
#
# uses taxonomy file to determine strain ID of each fasta
# sequence
#
# requires fasta in order by strain
#
# usage (NNNNNNNNN is what to separate strains with):
# *.py input.fna input.tax output.fna output.tax NNNNNNNNNN
import os
import sys

infna = sys.argv[1]
intax = sys.argv[2]
outfna = sys.argv[3]
outtax = sys.argv[4]
delim = sys.argv[5]

# load tax map
taxmap = {} # header:strain
taxa = set() # set of all strains
print('Reading/writing taxmap...')
count = 0
with open(intax,'r') as f:
with open(outtax,'w') as g:
for line in f:
count += 1
if count % 100000 == 0:
sys.stdout.write(str(count) + ' ')
sys.stdout.flush()
words = line.strip().split('\t')
header = words[0]
tax = words[1]
# write new taxmap line only if first instance of this strain
if not tax in taxa:
g.write(tax + '\t' + tax + '\n')
taxa.add(tax)
taxmap[header] = tax
sys.stdout.write('\n')

# read/write streaming fasta
# track strains found to see if a
# strain has reads that already showed up
# earlier in file (and fail in that case)
currentstrain = ''
currentdna = ''
count = 0
print('Reading/writing FASTA...')
with open(infna,'r') as f:
with open(outfna,'w') as g:
for line in f:
if line.startswith('>'):
count += 1
if count % 100000 == 0:
sys.stdout.write(str(count) + ' ')
sys.stdout.flush()
header = line.strip().split()[0][1:]
strain = taxmap[header]
if strain != currentstrain:
if currentstrain != '':
g.write('>' + currentstrain + '\n')
g.write(currentdna + '\n')
currentdna = ''
currentstrain = strain
else:
if currentdna != '':
currentdna += delim
currentdna += line.strip()
g.write('>' + currentstrain + '\n' + currentdna + '\n') # don't forget to write the last sequence
sys.stdout.write('\n')

1 change: 1 addition & 0 deletions shogun/scripts/make_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
dbname = sys.argv[3]
dbpath = os.path.join(outdir,dbname + '.fna') # full path
taxpath = os.path.join(outdir,dbname + '.tax') # full path
taxpath = os.path.join(outdir,dbname + '-ko.tax') # full path

# download the raw CDS from genomes
make_refseq_fasta_and_taxonomy(assemblypath,dbpath,taxpath)
Expand Down
Loading

0 comments on commit 81f4fec

Please sign in to comment.