Skip to content

Commit

Permalink
o-dist.R was very useful, but I am tired of fixing it for changing AP…
Browse files Browse the repository at this point in the history
…Is of underlying R libraries. R is great for ad hoc, use-and-delete type of scripting. but writing and maintaining long-term code for R is a major pain.
  • Loading branch information
meren committed Nov 11, 2014
1 parent f15f5c5 commit 8ab1393
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 220 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.4
1.5
219 changes: 0 additions & 219 deletions bin/o-dist.R

This file was deleted.

75 changes: 75 additions & 0 deletions bin/o-sequence-distances
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2010 - 2012, A. Murat Eren
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.

import sys
import Oligotyping.lib.fastalib as u
from Oligotyping.utils.aligner import nw_align
from Oligotyping.utils.utils import Progress

progress = Progress()

def main(input_file, output_file, align = False):
sequences = {}
similarities = {}
fasta = u.SequenceSource(input_file)

while fasta.next():
sequences[fasta.id] = fasta.seq
similarities[fasta.id] = {}

keys = sequences.keys()

progress.new('Processing sequences')
for i in range(0, len(keys)):
progress.update('%d of %d' % (i + 1, len(keys)))
for j in range(i, len(keys)):
if i == j:
continue

if align:
seq1, seq2 = nw_align(sequences[keys[i]].replace('-', ''), sequences[keys[j]].replace('-', ''))
else:
seq1, seq2 = sequences[keys[i]], sequences[keys[j]]

percent_identity = 100 - len([True for x in range(0, len(seq1)) if seq1[x] != seq2[x]]) * 100.0 / len(seq1)

similarities[keys[i]][keys[j]] = similarities[keys[j]][keys[i]] =percent_identity

progress.end()

output = open(output_file, 'w')
output.write('\t'.join([''] + keys) + '\n')
for key1 in keys:
line = [key1]
for key2 in keys:
line.append('%.2f' % (100 if key1 == key2 else similarities[key1][key2]))
output.write('\t'.join(line) + '\n')
output.close()


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(description='Generates a ditance matrix for all sequences in a given FASTA file')
parser.add_argument('input_file', metavar = 'FASTA',
help = 'FASTA file that contains -representative?- sequences')
parser.add_argument('-o', '--output_file', metavar = 'FILE', default = None,
help = 'Output file to store results')
parser.add_argument('-A', '--align', action="store_true", default = False,
help = 'If sequences require pairwise alignment')

args = parser.parse_args()

if not args.output_file:
args.output_file = args.input_file + '-DIST.txt'

sys.exit(main(args.input_file, args.output_file, args.align))
Loading

0 comments on commit 8ab1393

Please sign in to comment.