-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathprocess_mutations.py
More file actions
executable file
·180 lines (155 loc) · 8.51 KB
/
process_mutations.py
File metadata and controls
executable file
·180 lines (155 loc) · 8.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python
# Load required modules
import sys, os, argparse, json, numpy as np
from collections import defaultdict
# Argument parser
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mutation_file_groups', type=str, required=True,
action='append', nargs='*')
parser.add_argument('-ct', '--cancer_types', type=str, required=True, nargs='*')
parser.add_argument('-pw', '--patient_whitelist', type=str, required=False)
parser.add_argument('-hf', '--hypermutators_file', type=str, required=False, default=None)
parser.add_argument('-ivc', '--ignored_variant_classes', type=str, required=False, nargs='*',
default=["Silent", "Intron", "3'UTR", "5'UTR", "IGR", "lincRNA", "RNA"])
parser.add_argument('-ivt', '--ignored_variant_types', type=str, required=False, nargs='*',
default=['Germline'])
parser.add_argument('-ivs', '--ignored_validation_statuses', type=str, required=False, nargs='*',
default=['Wildtype', 'Invalid'])
parser.add_argument('-o', '--output_file', type=str, required=True)
parser.add_argument('-v', '--verbose', type=int, default=1, required=False, choices=range(5))
return parser
def process_maf( maf_file, patientWhitelist, geneToCases, patientToMutations, vc, vt, vs, ivc, ivt, ivs, verbose ):
if verbose > 1: print '\tLoading MAF:', maf_file
genes, patients = set(), set()
with open(maf_file, 'r') as IN:
seenHeader = False
for i, l in enumerate(IN):
arr = l.rstrip('\n').split('\t')
# Parse the header if we haven't seen it yet
if not seenHeader and arr[0].lower() == 'hugo_symbol':
arr = map(str.lower, arr)
seenHeader = True
gene_index = 0
patient_index = arr.index('tumor_sample_barcode')
var_class_index = arr.index('variant_classification') if 'variant_classification' in arr else None
var_type_index = arr.index('variant_type') if 'variant_type' in arr else None
val_status_index = arr.index('validation_status') if 'validation_status' in arr else None
# Process the mutation
elif seenHeader:
# Record the patients and genes, even if we ignore their mutations
patient, gene = '-'.join(arr[patient_index].split('-')[:3]), arr[gene_index]
if not patientWhitelist[patient]: continue
patients.add(patient)
genes.add(gene)
# Ignore certain mutations. If we aren't given certain information, we
# will include the mutation by default
if var_class_index:
var_class = arr[var_class_index].lower()
vc_check = not (var_class in ivc)
else:
var_class = ''
vc_check = True
if var_type_index:
var_type = arr[var_type_index].lower()
vt_check = not (var_type in ivt)
else:
var_type = ''
vt_check = True
if val_status_index:
val_status = arr[val_status_index].lower()
vs_check = not (val_status in ivs)
else:
val_status = ''
vs_check = True
# Record the mutation
if vc_check and vt_check and vs_check:
vt.add( var_type )
vs.add( val_status )
vc.add( var_class )
geneToCases[gene].add( patient )
patientToMutations[patient].add( gene )
return genes, patients
def process_events_file( events_file, patientWhitelist, geneToCases, patientToMutations, verbose ):
if verbose > 1: print '\tProcessing events file:', events_file
# Parse the events file
events, patients = set(), set()
with open(events_file, 'r') as IN:
arrs = [ l.rstrip('\n').split('\t') for l in IN if not l.startswith('#') ]
for arr in arrs:
# Skip patients that aren't whitelisted
patient, mutations = arr[0], set(arr[1:])
if not patientWhitelist[patient]: continue
# Record the events and mutations
patients.add(patient)
events |= mutations
patientToMutations[patient] |= mutations
for event in mutations:
geneToCases[event].add( patient )
return events, patients
def run( args ):
# Do some additional argument checking
assert( len(args.mutation_file_groups) == len(args.cancer_types) )
ivc = set(map(str.lower, args.ignored_variant_classes))
ivt = set(map(str.lower, args.ignored_variant_types))
ivs = set(map(str.lower, args.ignored_validation_statuses))
# Load the patient whitelist (if supplied)
if args.patient_whitelist:
if args.verbose > 0: print '* Loading patient whitelist...'
patientWhitelist = defaultdict( lambda : False )
with open(args.patient_whitelist, 'r') as IN:
patientWhitelist.update( (l.rstrip('\n').split()[0], True) for l in IN if not l.startswith('#') )
else:
if args.verbose > 0: print '* No patient whitelist provided, including all patients...'
patientWhitelist = defaultdict( lambda : True )
# Load the mutations from each MAF
if args.verbose > 0: print '* Loading and combining {} datasets...'.format(len(args.cancer_types))
geneToCases, patientToMutations = defaultdict( set ), defaultdict( set )
genes, patients = set(), set()
vc, vt, vs = set(), set(), set() # variant classes/types and validation statuses
patientToType = dict()
cancerToFiles = defaultdict(list)
for cancer_type, mutation_file_group in zip(args.cancer_types, args.mutation_file_groups):
for mutation_file in mutation_file_group:
cancerToFiles[cancer_type].append(os.path.abspath(mutation_file))
if mutation_file.lower().endswith('.maf'):
per_type_genes, per_type_patients = process_maf( mutation_file, patientWhitelist, geneToCases, patientToMutations, vc, vt, vs, ivc, ivt, ivs, args.verbose)
else:
per_type_genes, per_type_patients = process_events_file( mutation_file, patientWhitelist, geneToCases, patientToMutations, args.verbose )
patientToType.update( (p, cancer_type) for p in per_type_patients )
patients |= per_type_patients
genes |= per_type_genes
patients = sorted(patients)
genes = sorted(genes)
num_patients = len(patients)
num_genes = len(genes)
# Load the hypermutators/
if args.hypermutators_file:
with open( args.hypermutators_file, 'r') as IN:
hypermutators = set( l.rstrip('\n') for l in IN if not l.startswith('#') )
else:
hypermutators = set()
# Summarize the data
if args.verbose > 0:
print '* Summary of mutation data...'
print '\tGenes: {}'.format(num_genes)
print '\tPatients: {} ({} hypermutators)'.format(num_patients, len(hypermutators))
print '\tUsed variant classes:', ', '.join(sorted(vc))
print '\tUsed variant types:', ', '.join(sorted(vt))
print '\tUsed validation statuses:', ', '.join(sorted(vs))
# Output to file
with open(args.output_file, 'w') as OUT:
params = dict(cancerToFiles=cancerToFiles,
cancer_types=args.cancer_types,
ignored_variant_classes=args.ignored_variant_classes,
ignored_variant_types=args.ignored_variant_types,
ignored_validation_statuses=args.ignored_validation_statuses,
patient_whitelist_file=os.path.abspath(args.patient_whitelist) if args.patient_whitelist else None,
hypermutators_file=os.path.abspath(args.hypermutators_file) if args.hypermutators_file else None)
output = dict(params=params, patients=patients, genes=genes, hypermutators=list(hypermutators),
geneToCases=dict( (g, list(cases)) for g, cases in geneToCases.items()),
patientToType=patientToType,
patientToMutations=dict( (p, list(muts)) for p, muts in patientToMutations.items()),
num_genes=num_genes, num_patients=num_patients)
json.dump( output, OUT )
if __name__ == '__main__': run( get_parser().parse_args( sys.argv[1:]) )