-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathCheckCoverage.py
More file actions
executable file
·103 lines (88 loc) · 2.93 KB
/
CheckCoverage.py
File metadata and controls
executable file
·103 lines (88 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
from __future__ import division
from collections import defaultdict
import pysam
import re
import sys
from glob import glob
from os import path
from scipy import stats
from numpy import array, log, exp
import progressbar as pbar
if len(sys.argv) < 3:
print """Usage: python CheckCoverage.py <GTF-File> BAMfile [BAMfile ...]"""
sys.exit(1)
gtf_fname = sys.argv[1] # 'Reference/dmel-all-r5.42.gtf'
analysis_dir = 'analysis'
starts = set()
curr_len = -1
coverage = 0
cutoff = 0
def analyze_bamfile(bam_fname):
print bam_fname,
bam_file = pysam.Samfile(bam_fname, 'rb')
coverages = defaultdict(lambda: [0, 0, set()])
parent = ''
curr_len = 0
coverage = 0
starts = set()
f = open(gtf_fname)
for line in f:
pass
pb = pbar.ProgressBar(widgets=[bam_fname, pbar.Bar(), pbar.ETA()],
maxval=f.tell()).start()
f.seek(0)
for line in f:
pb.update(f.tell())
if line.startswith('#'):
continue
if line.startswith('>'):
break
data = line.split()
chrom = data[0]
kind = data[2]
start = int(data[3]) - 1
stop = int(data[4])
fbtr_finder = re.compile('FBtr[0-9]*')
# parent = fbtr_finder.findall(line)[0]
if kind == 'exon':
fbtrs = fbtr_finder.findall(line)
if not fbtrs:
continue
fbtr = fbtrs[0]
coverages[fbtr][1] += (stop - start)
starts = set()
coverage = 0
for read in bam_file.fetch(chrom, start, stop):
coverage += 1
starts.add(read.pos)
coverages[fbtr][0] += coverage
coverages[fbtr][2].update(starts)
pb.finish()
curr_lens, rpks, uniques = zip(*coverages.itervalues())
dir, fname = path.split(bam_fname)
return dir, rpks, uniques, curr_lens
if __name__ == "__main__":
import multiprocessing as mp
POOL = mp.Pool(20)
res = POOL.map(analyze_bamfile,
[f for f in sys.argv[2:] if f.endswith('.bam')])
all_dirs, all_rpks, all_pct_uniques, all_lens = zip(*res)
import cPickle as pickle
out_fh = open('checkcoverage.pkl', 'w')
pickle.dump({'dirs': all_dirs, 'rpks': all_rpks,
'pct_uniques': all_pct_uniques, 'lens': all_lens},
out_fh)
for fname, rpks, uniques, curr_lens in res:
print fname
try:
xs = array(rpks)
ys = array([len(u)/(curr_len + 1)
for u, curr_len in zip(uniques, curr_lens)])
cutoff = max(xs[ys < .1])
reg = stats.linregress(log(xs[(xs < cutoff)*(xs > 0)*(ys > 0)]),
log(ys[(xs < cutoff)*(xs > 0)*(ys > 0)]))
print "exp(%f) * x ** %f" % (reg[1], reg[0])
print "Duplicate badness score: ", exp(-reg[1]-.38)
except Exception as exc:
print exc