-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_pombase_tables.py
87 lines (67 loc) · 4.19 KB
/
make_pombase_tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas
# Load the per-gene change summary. na_filter=False keeps empty cells as ''
# instead of NaN, so the string operations below work on every row.
data = pandas.read_csv(
    'results/genome_changes_summary_comments.tsv',
    sep='\t',
    na_filter=False,
)

# Remove some known errors
is_controlled_curation = data['systematic_id'].str.contains('controlled_curation')
data = data.loc[~is_controlled_curation, :].copy()

# Comment and reference cells are '|'-delimited. Empty pieces are dropped and
# the remaining ones are joined with the separator chosen for that column.
for column, separator in (
    ('comment_addition', '. '),
    ('comment_removal', '. '),
    ('reference_addition', ','),
    ('reference_removal', ','),
):
    data[column] = data[column].apply(
        lambda cell, sep=separator: sep.join([piece for piece in cell.split('|') if piece != ''])
    )

# --- Added genes -----------------------------------------------------------
added_columns = [
    'systematic_id',
    'primary_name',
    'earliest_change',
    'comment_addition',
    'reference_addition',
    'feature_type',
]
was_added = data['category'].isin(['added', 'added_and_changed'])
added_genes = (
    data.loc[was_added, added_columns]
    .rename(columns={
        'systematic_id': 'Systematic ID',
        'primary_name': 'Primary name',
        'earliest_change': 'Date added',
        'comment_addition': 'Comment',
        'reference_addition': 'Reference',
    })
    .sort_values(by='Date added', ascending=False)
)

# CDS features go to the protein-coding table, every other feature type to the
# RNA table; feature_type itself is not written out.
added_is_cds = added_genes['feature_type'] == 'CDS'
added_genes.loc[added_is_cds, :].drop(columns='feature_type').to_csv(
    'results/pombase_tables/new-gene-data-protein-coding.tsv', sep='\t', index=False
)
added_genes.loc[~added_is_cds, :].drop(columns='feature_type').to_csv(
    'results/pombase_tables/new-gene-data-RNA.tsv', sep='\t', index=False
)

# --- Removed / merged genes ------------------------------------------------
removed_columns = [
    'systematic_id',
    'primary_name',
    'latest_change',
    'latest_coords',
    'comment_removal',
    'reference_removal',
    'merged_into',
    'feature_type',
]
category_has_merged = data['category'].str.contains('merged')
category_has_removed = data['category'].str.contains('removed')
removed_genes = data.loc[category_has_merged | category_has_removed, removed_columns]

# Boolean mask of the rows that name a merge target.
merged_genes = removed_genes['merged_into'].ne('')
def formatting_function(r):
    """Return the removal comment of a merged gene, mentioning the merge target.

    Args:
        r: A row-like object exposing ``merged_into`` and ``comment_removal``
            as string attributes.

    Returns:
        The existing comment unchanged when it already contains the
        ``merged_into`` identifier; otherwise the comment followed by
        ``'. Merged into <id>'``, or just ``'Merged into <id>'`` when the
        comment is empty.
    """
    target = r.merged_into
    existing_comment = r.comment_removal

    # The comment already names the target gene: nothing to add.
    if target in existing_comment:
        return existing_comment

    merge_note = f'Merged into {target}'
    return f'{existing_comment}. {merge_note}' if existing_comment else merge_note
# Make sure the comment of every merged gene mentions the gene it was merged
# into. DataFrame.apply(axis=1) returns an empty DataFrame (not a Series) when
# no rows are selected, and that cannot be assigned to a single column, so the
# update is skipped when nothing was merged.
if merged_genes.any():
    removed_genes.loc[merged_genes, 'comment_removal'] = (
        removed_genes[merged_genes].apply(formatting_function, axis=1)
    )

# merged_into has been folded into the comment and is not written out.
removed_genes.drop(columns='merged_into', inplace=True)
removed_genes.rename(
    inplace=True,
    columns={
        'systematic_id': 'Systematic ID',
        'primary_name': 'Primary name',
        'latest_change': 'Date removed',
        'latest_coords': 'Coordinates when removed',
        'comment_removal': 'Comment',
        'reference_removal': 'Reference',
    },
)
removed_genes.sort_values(by='Date removed', ascending=False, inplace=True)

# CDS features go to the protein-coding table, every other feature type to the
# RNA table; feature_type itself is not written out.
removed_is_cds = removed_genes.feature_type == 'CDS'
removed_genes.loc[removed_is_cds, :].drop(columns='feature_type').to_csv(
    'results/pombase_tables/removed-gene-data-protein-coding.tsv', sep='\t', index=False
)
removed_genes.loc[~removed_is_cds, :].drop(columns='feature_type').to_csv(
    'results/pombase_tables/removed-gene-data-RNA.tsv', sep='\t', index=False
)

# Changes table
# na_filter=False keeps empty cells as '' instead of NaN.
data = pandas.read_csv(
    'results/only_modified_coordinates_comments_no_type_change.tsv',
    sep='\t',
    na_filter=False,
)
def formatting_function(r):
    """Combine the ``pombase_reference`` and ``db_xref`` fields of a row.

    NOTE: this rebinds the name ``formatting_function`` that the script used
    earlier for the merge comments.

    Args:
        r: A mapping-like row supporting ``r['db_xref']`` and
            ``r['pombase_reference']``.

    Returns:
        ``'<pombase_reference>,<db_xref>'`` when both are non-empty; the
        ``db_xref`` alone when only it is set; otherwise the
        ``pombase_reference`` value as is (possibly empty).
    """
    xref = r['db_xref']
    curated_reference = r['pombase_reference']

    if not xref:
        return curated_reference
    if not curated_reference:
        return xref
    return f'{curated_reference},{xref}'
# Combine the two reference columns row by row. A list built from iterrows()
# is used instead of DataFrame.apply(axis=1) because apply returns a DataFrame
# (not a Series) for an empty table, which cannot be stored in one column.
data['reference'] = [formatting_function(row) for _, row in data.iterrows()]

# "<chromosome>:<value>" for every row, as a vectorised string concatenation
# (also well-defined when the table has no rows).
data['location'] = data['chromosome'] + ':' + data['value']

data.rename(
    columns={
        'reference': 'Reference',
        'location': 'Coordinates',
        'systematic_id': 'Systematic ID',
        'primary_name': 'Primary name',
        'added_or_removed': 'Before / After change',
        'pombase_comments': 'Comment',
        'date': 'Date',
    },
    inplace=True,
)

# Relabel the column: 'removed' rows become 'before', all other rows 'after'.
# The mask is computed once, before either assignment overwrites the column.
removed = data['Before / After change'] == 'removed'
data.loc[removed, 'Before / After change'] = 'before'
data.loc[~removed, 'Before / After change'] = 'after'

output_columns = [
    'Date',
    'Systematic ID',
    'Primary name',
    'Before / After change',
    'Coordinates',
    'Comment',
    'Reference',
]

# CDS features go to the protein-coding table, every other feature type to the
# RNA table.
is_cds = data.feature_type == 'CDS'
data.loc[is_cds, output_columns].to_csv(
    'results/pombase_tables/gene-coordinate-change-data-protein-coding.tsv', sep='\t', index=False
)
data.loc[~is_cds, output_columns].to_csv(
    'results/pombase_tables/gene-coordinate-change-data-RNA.tsv', sep='\t', index=False
)