-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquantiles.py
More file actions
executable file
·111 lines (96 loc) · 3.95 KB
/
quantiles.py
File metadata and controls
executable file
·111 lines (96 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import pandas as pd
import numpy as np
class Quantile(object):
    """
    Quantile information for a table of query frequencies.

    ``freqs`` is a pandas DataFrame with a ``query`` column and a numeric
    ``instances`` column.  Rows are processed in the order given, and the
    "top" quantile is made of the rows that come first, so callers
    presumably pass rows sorted by descending ``instances`` (this is not
    enforced here).

    The DataFrame is stored by reference, not copied: calling any method
    adds (or overwrites) the ``inst_frac`` and ``quantile`` columns on it.
    """

    def __init__(self, freqs):
        self.freqs = freqs

    def quantiles(self, quantiles=2):
        """
        Count the rows that fall into each quantile.

        Each row is assigned to the quantile in which its cumulative share
        of ``instances`` lands, so a quantile can be empty when a single
        row spans it entirely.

        Returns a list of exactly ``quantiles`` ints; entry ``i`` is the
        number of rows in quantile ``i`` (0 is the top quantile).

        Raises ValueError if ``quantiles`` is smaller than 1.
        """
        if quantiles < 1:
            raise ValueError("quantiles must be at least 1")

        instances = self.freqs['instances']
        self.freqs['inst_frac'] = instances.cumsum() / instances.sum()

        cuts = np.linspace(0.0, 1.0, num=quantiles + 1)
        self.freqs['quantile'] = pd.cut(self.freqs['inst_frac'],
                                        cuts,
                                        include_lowest=True,
                                        labels=False)

        # Count per quantile *index*.  Sorting the counts by size (as
        # value_counts does) would scramble the positions that head(),
        # tail() and middle() rely on, and would drop empty quantiles.
        # Rows pd.cut could not place (NaN) are not counted.
        codes = self.freqs['quantile'].dropna().astype(int)
        return np.bincount(codes.values, minlength=quantiles).tolist()

    def head(self, quantiles=2):
        """
        List the queries comprising the top quantile
        """
        counts = self.quantiles(quantiles)
        return self.freqs['query'].head(counts[0]).tolist()

    def tail(self, quantiles=2):
        """
        List the queries comprising the bottom quantile
        """
        counts = self.quantiles(quantiles)
        # Skip every row that belongs to an earlier quantile.
        return self.freqs['query'].tolist()[sum(counts[:-1]):]

    def middle(self, quantiles=3):
        """
        List the queries comprising the quantiles between
        the top and bottom quantiles

        Raises ValueError if ``quantiles`` is smaller than 3, since there
        is no middle without at least three quantiles.
        """
        if quantiles < 3:
            raise ValueError("quantiles must be greater than 2")
        counts = self.quantiles(quantiles)
        return self.freqs['query'].tolist()[counts[0]:sum(counts[:-1])]
def main():
    """
    Command-line entry point.

    Parses one of the sub-commands ``quantiles``, ``head``, ``tail`` or
    ``middle``, reads a CSV with ``query`` and ``instances`` columns from
    the given file (or from stdin when no file is given), and writes the
    result to stdout: one ``<quantile number>\\t<row count>`` line per
    quantile for ``quantiles``, one query per line for the other commands.
    """
    import sys
    import argparse

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command',
                                       help='available commands')
    # On Python 3 sub-commands are optional by default; without this an
    # empty command line would fail later on the missing ``filename``
    # attribute instead of printing a usage error.
    subparsers.required = True

    # (command, default number of quantiles); ``middle`` needs at least 3.
    commands = (('quantiles', 2), ('head', 2), ('tail', 2), ('middle', 3))
    for command, default in commands:
        sub = subparsers.add_parser(command)
        sub.add_argument('filename', nargs='?',
                         help='file to read, if empty, stdin is used')
        sub.add_argument('-q', '--quantiles',
                         type=int, default=default,
                         help='number of quantiles to split the '
                              'frequencies into (default: %(default)s)')

    args = parser.parse_args()
    freq_file = args.filename or sys.stdin
    freqs = pd.read_csv(freq_file, usecols=['query', 'instances'])
    quantile = Quantile(freqs)

    if args.command == 'quantiles':
        counts = quantile.quantiles(args.quantiles)
        for number, count in enumerate(counts, 1):
            sys.stdout.write(str(number) + '\t' + str(count) + '\n')
    else:
        # The parser only accepts the four commands registered above.
        selectors = {'head': quantile.head,
                     'tail': quantile.tail,
                     'middle': quantile.middle}
        for query in selectors[args.command](args.quantiles):
            # str() so that non-string queries (numbers, NaN) still print.
            sys.stdout.write(str(query) + '\n')


if __name__ == "__main__":
    main()