-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patho-gen-stackbar-with-sets-from-ENVIRONMENT
executable file
·66 lines (51 loc) · 2.8 KB
/
o-gen-stackbar-with-sets-from-ENVIRONMENT
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2010 - 2012, A. Murat Eren
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.
# Takes an environment file and a cosine similarity threshold as command-line
# parameters, and generates an environment file in which units are grouped
# into sets according to how similar their frequency distribution patterns
# are across samples.
import sys
from Oligotyping.utils.utils import get_samples_dict_from_environment_file
from Oligotyping.utils.utils import get_oligos_sorted_by_abundance
from Oligotyping.utils.utils import get_units_across_samples_dicts
from Oligotyping.utils.utils import get_unit_counts_and_percents
from Oligotyping.utils.cosine_similarity import get_oligotype_sets
from Oligotyping.utils.cosine_similarity import get_oligotype_sets_greedy
from Oligotyping.visualization.oligotype_distribution_stack_bar import oligotype_distribution_stack_bar
from Oligotyping.utils.utils import generate_ENVIRONMENT_file
# Command line: <environment file> <cosine similarity threshold>
#
# Fail with a short usage message instead of a bare IndexError / ValueError
# traceback when an argument is missing or the threshold is not numeric.
# Extra trailing arguments are still tolerated, as before.
if len(sys.argv) < 3:
    sys.stderr.write('Usage: %s <environment_file> <cosine_similarity_threshold>\n' % sys.argv[0])
    sys.exit(1)

input_file_path = sys.argv[1]

try:
    cosine_similarity_value = float(sys.argv[2])
except ValueError:
    sys.stderr.write('Error: cosine similarity threshold must be a number, got "%s"\n' % sys.argv[2])
    sys.exit(1)

# Output names are derived from the input path, with the threshold embedded
# so that runs with different thresholds do not collide on the same name.
sets_output_file_name = input_file_path + '-cos-%s-SETS' % cosine_similarity_value
environ_output_file_name = input_file_path + '-cos-%s-SETS-ENVIRON' % cosine_similarity_value
# Load the environment file and rank its units by abundance.
samples_dict = get_samples_dict_from_environment_file(input_file_path)
oligos = get_oligos_sorted_by_abundance(samples_dict)

counts_and_percents = get_unit_counts_and_percents(oligos, samples_dict)
unit_counts, unit_percents = counts_and_percents

samples = list(samples_dict.keys())

# The helper receives its own copy of the sample list, as it did originally.
normalized_dicts = get_units_across_samples_dicts(oligos, list(samples), unit_percents)
across_samples_sum_normalized, across_samples_max_normalized = normalized_dicts

# Group units with the greedy variant, using the sum-normalized values and the
# user-supplied threshold. The SETS file name is handed to the helper --
# NOTE(review): presumably it writes the set assignments there; confirm in
# Oligotyping.utils.cosine_similarity.
greedy_args = (oligos, across_samples_sum_normalized, cosine_similarity_value, sets_output_file_name)
oligotype_sets = get_oligotype_sets_greedy(*greedy_args)

number_of_sets = len(oligotype_sets)
number_of_units = len(oligos)
print('%d sets from %d units' % (number_of_sets, number_of_units))
def _abundance_in_set(oligo_counts, members):
    """Return the summed counts of the oligos in `oligo_counts` that are in `members`."""
    return sum(oligo_counts[oligo] for oligo in oligo_counts if oligo in members)


# For every sample, collapse the per-oligo counts into one total per set.
# Every sample gets an entry for every set id (0 when none of the set's
# members occur in that sample), and set ids keep the iteration order of
# `oligotype_sets`.
samples_dict_with_agglomerated_oligos = {
    sample: {
        set_id: _abundance_in_set(samples_dict[sample], oligotype_sets[set_id])
        for set_id in oligotype_sets
    }
    for sample in samples
}
# Hand the agglomerated counts to the stack bar visualization. The second
# positional argument is None, as in the original call -- NOTE(review): its
# meaning is defined by oligotype_distribution_stack_bar; confirm there.
oligotype_distribution_stack_bar(samples_dict_with_agglomerated_oligos, None)

# Emit the agglomerated counts under the '-SETS-ENVIRON' output name.
generate_ENVIRONMENT_file(samples, samples_dict_with_agglomerated_oligos, environ_output_file_name)