-
Notifications
You must be signed in to change notification settings - Fork 6
/
generate_wgs_pyclone_input.py
253 lines (201 loc) · 9.99 KB
/
generate_wgs_pyclone_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
from argparse import ArgumentParser
import os
import pandas as pd
import sys
from collections import defaultdict
import numpy as np
"""
Usage:
generate_wgs_pyclone_input.py
SNVs_OxoGFFPE/GCT001-TP-NT-SM-DPBZL-SM-DPBZK.ffpeBias.maf.annotated
Indels/GCT001-TP-NT-SM-DPBZL-SM-DPBZK.indel.wgs.maf.annotated
--facets_output/titan_output CNAs/GCT001-TP-NT-SM-DPBZL-SM-DPBZK.titan.optimalClust_ploidy2.segs.txt
output_folder
Generate the tab-delimited file used as input to the build_mutations_file script for PyClone.
"""
DEFAULT_MAJOR_COPY_NUMBER = 1
DEFAULT_MINOR_COPY_NUMBER = 1
DEFAULT_CELL_COPY_NUMBER = 2
class TitanCols:
CHR = "Chromosome"
START = "Start_Position(bp)"
END = "End_Position(bp)"
MAJOR_CN = "MajorCN"
MINOR_CN = "MinorCN"
class FacetsCols:
CHR = "chrom"
START = "start"
END = "end"
TOTAL_CN = "tcn.em"
MINOR_CN = "lcn.em"
class MafCols:
START = "Start_position"
END = "End_position"
CHR = "Chromosome"
CLASSIFICATION = "Variant_Classification"
TYPE = "Variant_Type"
GENE = "Hugo_Symbol"
REF = "Reference_Allele"
ALT = "Tumor_Seq_Allele2"
REF_COUNT = "t_ref_count"
ALT_COUNT = "t_alt_count"
@classmethod
def all(cls):
return [cls.GENE, cls.CHR, cls.START, cls.END, cls.CLASSIFICATION, cls.TYPE, cls.GENE, cls.REF, cls.ALT,
cls.REF_COUNT, cls.ALT_COUNT]
def mutation_id(row):
"""Generate a unique mutation ID"""
return 'chr{}:{}({})'.format(row[MafCols.CHR], row[MafCols.START], row[MafCols.GENE].values[0])
def center_position(row):
"""Find the point halfway between the start and end of the mutation -- relevant for SNVs and indels"""
start_int = int(row[MafCols.START])
end_int = int(row[MafCols.END])
length = end_int - start_int
return end_int - (length / 2)
def combine_snv_and_indel_mafs(snv_maf_df, indel_maf_df):
"""Combine data from snv and indel maf dataframes into one dataframe"""
subset_snvs = snv_maf_df[MafCols.all()]
subset_indels = indel_maf_df[MafCols.all()]
combined_snvs_and_indels_df = pd.concat([subset_snvs, subset_indels])
sys.stdout.write("Adding mutation ID...\n")
combined_snvs_and_indels_df['mutation_id'] = combined_snvs_and_indels_df.apply(mutation_id, axis=1)
sys.stdout.write("Added mutation ID...\n")
sys.stdout.write("Adding section center...\n")
combined_snvs_and_indels_df['Center_Position'] = combined_snvs_and_indels_df.apply(center_position, axis=1)
sys.stdout.write("Added section center...\n")
return combined_snvs_and_indels_df
def total_copy_number_based_on_sex_and_chrom(sex, chrom):
"""Based on whether looking at a sex chromosome or not, determine the normal cell copy number"""
# Figure out normal copy number based on sex if this a sex chromosome
if chrom in ['X', 'Y']:
if sex == 'female':
assert(chrom == 'X')
return int(2)
elif sex == 'male':
return int(1)
else:
return int(DEFAULT_CELL_COPY_NUMBER)
def build_chrom_map(titan_df, sex, input_type='titan'):
"""Given a dataframe of Titan data, build a dictionary where the chromosomes are the keys and the values
are the segments"""
sys.stdout.write("Building chromosome map for CNA data...\n")
chr_map = defaultdict(lambda: defaultdict(dict))
for index, row in titan_df.iterrows():
if input_type == 'titan':
chrom = str(row[TitanCols.CHR])
major_cn = row[TitanCols.MAJOR_CN]
minor_cn = row[TitanCols.MINOR_CN]
start = row[TitanCols.START]
end = row[TitanCols.END]
elif input_type == 'facets':
try:
chrom = str(row[FacetsCols.CHR])
minor_cn = int(row[FacetsCols.MINOR_CN])
total_cn = int(row[FacetsCols.TOTAL_CN])
major_cn = total_cn - minor_cn
start = row[FacetsCols.START]
end = row[FacetsCols.END]
except:
pass
if not (chrom == 'Y' and sex == 'female'):
chr_map[chrom][start] = {"end": end,
"major_cn": major_cn,
"minor_cn": minor_cn,
"normal_cn": total_copy_number_based_on_sex_and_chrom(sex, chrom)
}
return chr_map
def add_cn_info_to_indel_snv_maf(indel_snv_maf, chrom_map, sex):
"""Add the titan allelic copy number information to the combined snv indel maf"""
all_chrs = chrom_map.keys()
def major_and_minor_cns(combined_maf_row):
# For all rows where the chromosome isn't even in the search tree, set the minor and major alleles to the
# default, non-aneuploidy values
chrom = combined_maf_row[MafCols.CHR]
position = combined_maf_row["Center_Position"]
if chrom not in all_chrs:
return int(DEFAULT_MAJOR_COPY_NUMBER), int(DEFAULT_MINOR_COPY_NUMBER), total_copy_number_based_on_sex_and_chrom(
sex, chrom)
else:
starts = list(chrom_map[chrom].keys())
start_index = np.searchsorted(starts, position)
relevant_start = starts[start_index - 1]
seg = chrom_map[chrom][relevant_start]
if seg.get("end") >= position:
return int(seg.get("major_cn")), int(seg.get("minor_cn")), int(seg.get("normal_cn"))
else:
return int(DEFAULT_MAJOR_COPY_NUMBER), int(DEFAULT_MINOR_COPY_NUMBER), total_copy_number_based_on_sex_and_chrom(
sex, chrom)
cn_info = indel_snv_maf.apply(major_and_minor_cns, axis=1)
# Get a series with a major_cn and minor_cn column of out this Series of tuples
cn_info_cols = cn_info.apply(pd.Series, index=["major_cn", "minor_cn", "normal_cn"])
cn_info_added_to_snv_and_maf_df = pd.concat([indel_snv_maf, cn_info_cols], axis=1)
return cn_info_added_to_snv_and_maf_df
def get_sex(snv_maf):
"""Based on whether both a Y chromosome is present, make a sex determination"""
chromosomes = snv_maf['Chromosome'].unique()
y_in_sample = 'Y' in chromosomes
if y_in_sample:
sys.stdout.write('Y chromosome found in sample SNV file. Determining male sex. \n')
return 'male'
else:
sys.stdout.write('Y chromosome not found in sample SNV. Determining female sex. Will ignore any Y chromosome '
'information from other files. \n')
return 'female'
def main():
parser = ArgumentParser(description='Generate .tsv input for PyClone from SNV maf,'
' GATK indel file, and TITAN output')
parser.add_argument('snv_maf', metavar='snv_maf', type=str)
parser.add_argument('indel_maf', metavar='indel_maf', type=str)
parser.add_argument('--titan_output', metavar='titan_output', type=str)
parser.add_argument('--facets_output', metavar='facets_output', type=str)
parser.add_argument('output_dir', metavar='output_dir', type=str)
parser.add_argument('--handle', metavar='handle', type=str)
args = parser.parse_args()
snv_maf = args.snv_maf
indel_maf = args.indel_maf
if args.titan_output:
cn_output = args.titan_output
input_type = 'titan'
elif args.facets_output:
cn_output = args.facets_output
input_type = 'facets'
else:
sys.exit('Either a titan or facets output file must be provided with allelic copy number information')
handle = args.handle
if not handle:
# If not analysis handle is provided, simply name the file after the titan output file prefix
handle = os.path.split(cn_output)[-1].split('.')[0]
output_dir = args.output_dir
snv_maf_df = pd.read_csv(snv_maf, delimiter='\t', comment='#', header='infer')
sex = get_sex(snv_maf_df)
indel_maf_df = pd.read_csv(indel_maf, delimiter='\t', comment='#', skiprows=3, encoding="ISO-8859-1",
header='infer')
if sex == 'female':
# if there are no SNVs on the Y chromosome then let's ignore the data on Y chromosome for indels as well
indel_maf_df = indel_maf_df[indel_maf_df.Chromosome != 'Y']
sys.stdout.write("Combining SNV and indel maf file data...\n")
indel_snv_maf = combine_snv_and_indel_mafs(snv_maf_df, indel_maf_df)
sys.stdout.write("Loading CNA data...\n")
cn_df = pd.read_csv(cn_output, delimiter='\t', comment='#', header='infer')
# Build a data structure that allows for easier searching across the CNA data
chrom_map = build_chrom_map(cn_df, sex, input_type)
sys.stdout.write("Merging copy number, SNV, and indel information...\n")
final_df = add_cn_info_to_indel_snv_maf(indel_snv_maf, chrom_map, sex)
final_output_filepath = os.path.join(output_dir, '{}_pyclone_ready.tsv'.format(handle))
sys.stdout.write("Writing output to {}...\n".format(final_output_filepath))
sys.stdout.write("Filtering homozygous deletion sites out of output (major cn == 0)...\n")
sys.stdout.write("Length before filtering: {}\n".format(len(final_df)))
final_df = final_df[final_df['major_cn'] > 0]
sys.stdout.write("Length after filtering: {}\n".format(len(final_df)))
final_df[MafCols.REF_COUNT] = final_df[MafCols.REF_COUNT].fillna(0)
final_df[MafCols.REF_COUNT] = final_df[MafCols.REF_COUNT].astype(int)
final_df[MafCols.ALT_COUNT] = final_df[MafCols.ALT_COUNT].fillna(0)
final_df[MafCols.ALT_COUNT] = final_df[MafCols.ALT_COUNT].astype(int)
final_df.to_csv(final_output_filepath, sep='\t',
columns=['mutation_id', MafCols.REF_COUNT, MafCols.ALT_COUNT, 'normal_cn', 'major_cn', 'minor_cn',
MafCols.CHR, MafCols.START, MafCols.END, MafCols.CLASSIFICATION, MafCols.TYPE],
header=['mutation_id', 'ref_counts', 'var_counts', 'normal_cn', 'major_cn', 'minor_cn',
MafCols.CHR, MafCols.START, MafCols.END, MafCols.CLASSIFICATION, MafCols.TYPE], index=False)
sys.stdout.write("Done!\n")
if __name__ == '__main__':
main()