-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathsubsample_alignment_beta.py
More file actions
executable file
·172 lines (141 loc) · 6.91 KB
/
subsample_alignment_beta.py
File metadata and controls
executable file
·172 lines (141 loc) · 6.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
"""subsample an alignment using beta distributions (with hard-coded parameters) to model sampling probabilities"""
if __name__ == '__main__':
import argparse, copy, operator, os, random, re, sys
import numpy as np
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-a", "--alignment-file", required=True, type=open, help="The name of the alignment in relaxed phylip format, to be subsampled.")
parser.add_argument("-p", "--partitions-file", required=True, type=open, help="The name of the partitions file, in raxml format, to be used.")
parser.add_argument("-r", "--random-seed", type=int, help="A random seed to use.")
parser.add_argument("-l", "--label", help="A label to be used for output files.")
parser.add_argument("-o", "--order", help="A list of the partitions in decreasing order of rate, used to ensure that faster evolving ones are subsampled less.")
args = parser.parse_args()
if 'random_seed' in args:
random.seed(args.random_seed)
else:
random.seed()
output_label = args.output_label if 'output_label' in args else args.alignment_file.name.rsplit('.',1)[0]
order = args.order.split(',') if 'order' in args else None
# map the partitions to a dict
part_start_map = {}
parts = {}
for line in args.partitions_file:
toks = [t.strip() for t in re.split(r'[,=]+',line)]
ptype = toks[0]
name = toks[1]
bounds = [b.strip() for b in toks[2].split("-")]
start = int(bounds[0])
end = int(bounds[1])
# parts[start] = { 'name': name, 'type': ptype, 'start': start, 'end': end, 'taxa_sampled': 0, 'data': {}, }
parts[name] = { 'name': name, 'type': ptype, 'start': start, 'end': end, 'taxa_sampled': 0, 'data': {}, }
part_start_map[start] = name
part_starts = sorted(part_start_map.keys())
args.partitions_file.close()
# read in the alignment, recording the taxon names in a separate dict
taxa = {}
ntax = 0
ncols = 0
on_first_line = True
for line in args.alignment_file:
if len(line.strip()) < 3:
continue
toks = [l.strip() for l in line.split()]
if len(toks) > 1:
if on_first_line == True:
on_first_line = False
ntax = toks[0]
ncols = toks[1]
print ntax, ncols
if parts is None:
parts = {'all': { 'name': 'all', 'type': 'DNA', 'start': 1, 'end': int(ncols), 'taxa_sampled': 0, 'data': {}, }}
part_starts = {1: 'all'}
else:
taxname = toks[0]
seq = toks[1]
taxa[taxname] = {'parts_sampled': 0}
for start in part_starts:
n = part_start_map[start]
end = parts[n]['end']
parts[n]['data'][taxname] = seq[start-1:end]
else:
raise IndexError("too many items on line '" + line + "' in alignment")
args.alignment_file.close()
# assign sampling probs for loci from beta distribution a=3, b=5.
# chance of drawing a sampling prob 0.1 < S < 0.71 is 95%
# p = np.random.beta(3,5,len(parts))
# for i, s in enumerate(parts.keys()):
# parts[s]['p'] = p[i]
# for 10% of loci (minimum 1), assign sampling probs from beta dist a=4, b=3.
# chance of drawing a sampling prob 0.22 < S < 0.88 is 95%
# lucky_parts = random.sample(parts.keys(), max(len(parts)/10,1))
# p = np.random.beta(4,3,len(lucky_parts))
# for i, s in enumerate(lucky_parts):
# parts[s]['note'] = 'p increased from ' + str(parts[s]['p']) + ' to ' + str(p[i])
# parts[s]['p'] = p[i]
n_lucky_parts = max(len(parts)/10,1)
probs = list(np.random.beta(4,3,n_lucky_parts))
for p in np.random.beta(3,5,len(parts) - n_lucky_parts):
probs.append(p)
probs.sort(reverse=True)
for i, n in enumerate(order if order is not None else parts):
print i, n
parts[n]['p'] = probs[i]
# assign sampling probs for taxa from beta distribution a=3, b=5.
# chance of drawing a sampling prob 0.1 < S < 0.71 is 95%
p = np.random.beta(3,5,len(taxa))
for i, t in enumerate(taxa.keys()):
taxa[t]['p'] = p[i]
# for 5% of taxa (minimum 1), assign sampling probs from beta dist a=8, b=1.
# chance of drawing a sampling prob MORE than 0.68 is 95%
lucky_taxa = random.sample(taxa.keys(), max(len(taxa)/20,1))
p = np.random.beta(8,1,len(lucky_taxa))
for i, t in enumerate(lucky_taxa):
taxa[t]['note'] = 'p increased from ' + str(taxa[t]['p']) + ' to ' + str(p[i])
taxa[t]['p'] = p[i]
# precalculate proposal values for all cells in the sampling matrix
p = np.random.uniform(0, 1, len(taxa) * len(parts))
sample_bitmap = {}
i = 0
k = 0 # count of sampled sites
for t in taxa:
if t not in sample_bitmap:
sample_bitmap[t] = {}
for n in parts:
if n not in sample_bitmap[t]:
sample_bitmap[t][n] = {}
if parts[n]['p'] * taxa[t]['p'] > p[i]:
sample_bitmap[t][n] = True
parts[n]['taxa_sampled'] += 1
taxa[t]['parts_sampled'] += 1
k += 1
else:
sample_bitmap[t][n] = False
i += 1
# open a random site, to be sure we have at least one sampled partition for all taxa
r = parts.keys()[random.randrange(len(parts))]
if sample_bitmap[t][r] != True:
taxa[t]['parts_sampled'] += 1
sample_bitmap[t][r] = True
# create iterators over the dicts to return data in sorted order
taxa_sorted = sorted(taxa.items(), cmp=lambda p, q: cmp(p[1]['parts_sampled'], q[1]['parts_sampled']), reverse=True)
parts_sorted = sorted(parts.items(), cmp=lambda p, q: cmp(p[1]['taxa_sampled'], q[1]['taxa_sampled']), reverse=True)
with open(output_label + '.sampling_matrix.txt', 'w') as sampling_matrix:
for t in taxa_sorted:
for p in parts_sorted:
if sample_bitmap[t[0]][p[0]]:
sampling_matrix.write('1')
else:
sampling_matrix.write('0')
sampling_matrix.write('\n')
with open(output_label + '.subsampled.phy', 'w') as subsampled_aln:
subsampled_aln.write(ntax + ' ' + ncols + "\n")
for t in taxa_sorted:
subsampled_aln.write(t[0] + ' ')
for p in parts_sorted:
if sample_bitmap[t[0]][p[0]]:
subsampled_aln.write(parts[p[0]]['data'][t[0]])
else:
subsampled_aln.write('-' * len(parts[p[0]]['data'][t[0]]))
subsampled_aln.write("\n")
print('files have been written to: ' + output_label + '.sampling_matrix.txt, ' + output_label + '.subsampled.phy\n' \
'sampling proportion is ' + str(float(k) / (len(parts) * len(taxa))))