From e84e18f86a8d439b3d21ff67a2d81438ac1e8cfc Mon Sep 17 00:00:00 2001 From: im-rodrigo Date: Tue, 21 Jul 2015 23:28:28 -0400 Subject: [PATCH 1/6] added new error for cases when attempting to find mean of non-numerical values; the stats fns are now executed against 'elements' and not 'values' --- statscounter/statscounter.py | 37 ++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/statscounter/statscounter.py b/statscounter/statscounter.py index 2d4501c..c1e22d1 100644 --- a/statscounter/statscounter.py +++ b/statscounter/statscounter.py @@ -25,56 +25,73 @@ import statscounter.stats as stats +class WrongVariableTypeError(ValueError): + """You cannot find the 'expected value' (mean) of a distribution + of categorical (nominal) random variables (for example, a + distribution of words is equivalent to a categorical variable). + It makes no sense to find the average word. + """ + pass + + class StatsCounter(Counter): def mean(self): + """ AKA Expectation + """ + try: + return stats.mean(self.elements()) + except (TypeError): + raise WrongVariableTypeError("Distribution is not a numerical type.") + + def expectation(self): """ """ - return stats.mean(self.values()) + return self.mean() def median(self, ): """ """ - return stats.median(self.values()) + return stats.median(self.elements()) def median_low(self): """ """ - return stats.median_low(self.values()) + return stats.median_low(self.elements()) def median_high(self): """ """ - return stats.median_high(self.values()) + return stats.median_high(self.elements()) def median_grouped(self): """ """ - return stats.median_grouped(self.values()) + return stats.median_grouped(self.elements()) def mode(self): """ """ - return stats.mode(self.values()) + return stats.mode(self.elements()) def variance(self): """ """ - return stats.variance(self.values()) + return stats.variance(self.elements()) def pvariance(self): """ """ - return stats.pvariance(self.values()) + return 
stats.pvariance(self.elements()) def stdev(self, ): """ """ - return stats.stdev(self.values()) + return stats.stdev(self.elements()) def pstdev(self): """ """ - return stats.pstdev(self.values()) + return stats.pstdev(self.elements()) def best_pair(self): return self.most_common(1)[0] From 7ad446d14de6712b9a55b9282272887d256bf215 Mon Sep 17 00:00:00 2001 From: im-rodrigo Date: Tue, 21 Jul 2015 23:28:46 -0400 Subject: [PATCH 2/6] upped version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 995ef2f..cb023d8 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def readme(): setup( name='statscounter', - version='0.0.010', + version='0.0.011', url='https://github.com/datalib/statscounter', license='MIT', description="Python's missing statistical Swiss Army knife", From 30bbd905114f98f144deacd0865eea845517c8c0 Mon Sep 17 00:00:00 2001 From: im-rodrigo Date: Mon, 10 Aug 2015 20:26:58 -0700 Subject: [PATCH 3/6] adding errors for median fns when dealing with non-numeric types --- statscounter/statscounter.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/statscounter/statscounter.py b/statscounter/statscounter.py index c1e22d1..5e0055d 100644 --- a/statscounter/statscounter.py +++ b/statscounter/statscounter.py @@ -51,23 +51,37 @@ def expectation(self): def median(self, ): """ """ - return stats.median(self.elements()) - + try: + return stats.median(self.elements()) + except (TypeError): + raise WrongVariableTypeError("Distribution is not a numerical type.") + def median_low(self): """ """ - return stats.median_low(self.elements()) + try: + return stats.median_low(self.elements()) + except (TypeError): + raise WrongVariableTypeError("Distribution is not a numerical type.") + def median_high(self): """ """ - return stats.median_high(self.elements()) + try: + return stats.median_high(self.elements()) + except (TypeError): + raise 
WrongVariableTypeError("Distribution is not a numerical type.") + def median_grouped(self): """ """ - return stats.median_grouped(self.elements()) - + try: + return stats.median_grouped(self.elements()) + except (TypeError): + raise WrongVariableTypeError("Distribution is not a numerical type.") + def mode(self): """ """ From 5dd81b2b75e4b898d1920764b2ea4f160ee51366 Mon Sep 17 00:00:00 2001 From: im-rodrigo Date: Mon, 24 Aug 2015 20:39:56 -0700 Subject: [PATCH 4/6] adding MultipleMostCommonValuesError; max, argmax and best_pair should only return when a single value is most common --- statscounter/statscounter.py | 10 +++++++++- tests/test_statscounter.py | 30 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/statscounter/statscounter.py b/statscounter/statscounter.py index 5e0055d..a07a71b 100644 --- a/statscounter/statscounter.py +++ b/statscounter/statscounter.py @@ -33,6 +33,10 @@ class WrongVariableTypeError(ValueError): """ pass +class MultipleMostCommonValuesError(ValueError): + """""" + pass + class StatsCounter(Counter): def mean(self): @@ -108,7 +112,11 @@ def pstdev(self): return stats.pstdev(self.elements()) def best_pair(self): - return self.most_common(1)[0] + best_two_pairs = self.most_common(2)[0] + try: + self.mode() + except (stats.StatisticsError): + raise MultipleMostCommonValuesError("Two or more values appear more than once.") def argmax(self): """ diff --git a/tests/test_statscounter.py b/tests/test_statscounter.py index fa551cd..af81557 100644 --- a/tests/test_statscounter.py +++ b/tests/test_statscounter.py @@ -1,27 +1,27 @@ from __future__ import division from pytest import raises from statscounter import StatsCounter, stats - +from statscounter.statscounter import MultipleMostCommonValuesError class TestStatsCounter: - counter_ints = StatsCounter({str(s):s for s in range(1000)}) + counter_ints = StatsCounter([1,1,2,3,4,4]) def test_mean_int(self): m = self.counter_ints.mean() - d = 499500/1000 
+ d = 15/6 assert m == d def test_median_low(self): m = self.counter_ints.median_low() - assert m == 499 + assert m == 2 def test_median_high(self, ): m = self.counter_ints.median_high() - assert m == 500 + assert m == 3 def test_median_grouped(self, ): m = self.counter_ints.median_grouped() - assert m == 499.5 + assert m == 2.5 def test_mode(self): with raises(stats.StatisticsError): @@ -29,28 +29,28 @@ def test_mode(self): def test_variance(self): m = self.counter_ints.variance() - assert m == 83416.66666666667 + assert m == 1.9 def test_stdev(self, ): m = self.counter_ints.stdev() - assert m == 288.8194360957494 + assert m == 1.378404875209022 def test_pvariance(self): m = self.counter_ints.pvariance() - assert m == 83333.25 + assert m == 1.5833333333333333 def test_pstdev(self, ): m = self.counter_ints.pstdev() - assert m == 288.6749902572095 + assert m == 1.2583057392117916 def test_argmax(self): - m = self.counter_ints.argmax() - assert m == '999' + with raises(MultipleMostCommonValuesError): + m = self.counter_ints.argmax() def test_max(self): - m = self.counter_ints.max() - assert m == 999 - + with raises(MultipleMostCommonValuesError): + m = self.counter_ints.max() + def test_normalize(self): pdist = StatsCounter({1: 1, 2: 2, 3: 1}).normalize() assert pdist == { From e4b1d8c5e7d55e73d035f901357c121fd2db5164 Mon Sep 17 00:00:00 2001 From: im-rodrigo Date: Mon, 24 Aug 2015 21:15:09 -0700 Subject: [PATCH 5/6] formatted test functions to match conventions regarding exception testing --- statscounter/statscounter.py | 7 ++++--- tests/test_statscounter.py | 35 ++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/statscounter/statscounter.py b/statscounter/statscounter.py index a07a71b..6154a62 100644 --- a/statscounter/statscounter.py +++ b/statscounter/statscounter.py @@ -26,7 +26,7 @@ class WrongVariableTypeError(ValueError): - """You cannot find the 'expected value' (mean) of a distribution + """You cannot find 
the 'expectation' (mean) of a distribution of categorical (nominal) random variables (for example, a distribution of words is equivalent to a categorical variable). It makes no sense to find the average word. @@ -112,12 +112,13 @@ def pstdev(self): return stats.pstdev(self.elements()) def best_pair(self): - best_two_pairs = self.most_common(2)[0] try: self.mode() except (stats.StatisticsError): raise MultipleMostCommonValuesError("Two or more values appear more than once.") - + else: + return self.most_common(1)[0] + def argmax(self): """ """ diff --git a/tests/test_statscounter.py b/tests/test_statscounter.py index af81557..bddd908 100644 --- a/tests/test_statscounter.py +++ b/tests/test_statscounter.py @@ -4,52 +4,61 @@ from statscounter.statscounter import MultipleMostCommonValuesError class TestStatsCounter: - counter_ints = StatsCounter([1,1,2,3,4,4]) + counter_ints = StatsCounter([1,1,2,3,4]) + counter_ints_with_two_modes = StatsCounter([1,1,2,3,4,4]) def test_mean_int(self): - m = self.counter_ints.mean() + m = self.counter_ints_with_two_modes.mean() d = 15/6 assert m == d def test_median_low(self): - m = self.counter_ints.median_low() + m = self.counter_ints_with_two_modes.median_low() assert m == 2 def test_median_high(self, ): - m = self.counter_ints.median_high() + m = self.counter_ints_with_two_modes.median_high() assert m == 3 def test_median_grouped(self, ): - m = self.counter_ints.median_grouped() + m = self.counter_ints_with_two_modes.median_grouped() assert m == 2.5 def test_mode(self): with raises(stats.StatisticsError): - self.counter_ints.mode() + self.counter_ints_with_two_modes.mode() def test_variance(self): - m = self.counter_ints.variance() + m = self.counter_ints_with_two_modes.variance() assert m == 1.9 def test_stdev(self, ): - m = self.counter_ints.stdev() + m = self.counter_ints_with_two_modes.stdev() assert m == 1.378404875209022 def test_pvariance(self): - m = self.counter_ints.pvariance() + m = 
self.counter_ints_with_two_modes.pvariance() assert m == 1.5833333333333333 def test_pstdev(self, ): - m = self.counter_ints.pstdev() + m = self.counter_ints_with_two_modes.pstdev() assert m == 1.2583057392117916 - + def test_argmax(self): + m = self.counter_ints.argmax() + assert m == 1 + + def test_argmax_throws_exception(self): with raises(MultipleMostCommonValuesError): - m = self.counter_ints.argmax() + m = self.counter_ints_with_two_modes.argmax() def test_max(self): + m = self.counter_ints.max() + assert m == 2 + + def test_max_throws_exception(self): with raises(MultipleMostCommonValuesError): - m = self.counter_ints.max() + m = self.counter_ints_with_two_modes.max() def test_normalize(self): pdist = StatsCounter({1: 1, 2: 2, 3: 1}).normalize() From 7b9bc61ea57e15fb8d770bc3b6e7f63f0122babc Mon Sep 17 00:00:00 2001 From: im-rodrigo Date: Tue, 25 Aug 2015 00:07:59 -0700 Subject: [PATCH 6/6] added key_types_distribution, which creates a prob. dist. of types of the elements of the referencing class; used for checking if dist is 'discrete random variable' --- statscounter/statscounter.py | 48 ++++++++++++++++++--------------- tests/test_statscounter.py | 52 +++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/statscounter/statscounter.py b/statscounter/statscounter.py index 6154a62..3a2e16c 100644 --- a/statscounter/statscounter.py +++ b/statscounter/statscounter.py @@ -25,13 +25,8 @@ import statscounter.stats as stats -class WrongVariableTypeError(ValueError): - """You cannot find the 'expectation' (mean) of a distribution - of categorical (nominal) random variables (for example, a - distribution of words is equivalent to a categorical variable). - It makes no sense to find the average word. 
- """ - pass +NUMBER_TYPES = set(['float', 'int', 'Decimal', 'Fraction']) + class MultipleMostCommonValuesError(ValueError): """""" @@ -39,13 +34,19 @@ class MultipleMostCommonValuesError(ValueError): class StatsCounter(Counter): + + def key_types_distribution(self): + """Return a p. distribution of the elements' types""" + return StatsCounter([type(element).__name__ + for element in self.elements()]).normalize() + def mean(self): """ AKA Expectation """ try: return stats.mean(self.elements()) except (TypeError): - raise WrongVariableTypeError("Distribution is not a numerical type.") + raise TypeError("Distribution is not a numerical type.") def expectation(self): """ @@ -55,36 +56,39 @@ def expectation(self): def median(self, ): """ """ - try: + key_type = self.key_types_distribution().most_common(1)[0] + print(key_type) + if key_type[0] not in NUMBER_TYPES or key_type[1] != 1.0: + raise TypeError("Distribution is not a numerical type.") + else: return stats.median(self.elements()) - except (TypeError): - raise WrongVariableTypeError("Distribution is not a numerical type.") def median_low(self): """ """ - try: + key_type = self.key_types_distribution().most_common(1)[0] + if key_type[0] not in NUMBER_TYPES or key_type[1] != 1.0: + raise TypeError("Distribution is not a numerical type.") + else: return stats.median_low(self.elements()) - except (TypeError): - raise WrongVariableTypeError("Distribution is not a numerical type.") - def median_high(self): """ """ - try: + key_type = self.key_types_distribution().most_common(1)[0] + if key_type[0] not in NUMBER_TYPES or key_type[1] != 1.0: + raise TypeError("Distribution is not a numerical type.") + else: return stats.median_high(self.elements()) - except (TypeError): - raise WrongVariableTypeError("Distribution is not a numerical type.") - def median_grouped(self): """ """ - try: + key_type = self.key_types_distribution().most_common(1)[0] + if key_type[0] not in NUMBER_TYPES or key_type[1] != 1.0: + raise 
TypeError("Distribution is not a numerical type.") + else: return stats.median_grouped(self.elements()) - except (TypeError): - raise WrongVariableTypeError("Distribution is not a numerical type.") def mode(self): """ diff --git a/tests/test_statscounter.py b/tests/test_statscounter.py index bddd908..dd028e7 100644 --- a/tests/test_statscounter.py +++ b/tests/test_statscounter.py @@ -6,25 +6,63 @@ class TestStatsCounter: counter_ints = StatsCounter([1,1,2,3,4]) counter_ints_with_two_modes = StatsCounter([1,1,2,3,4,4]) - - def test_mean_int(self): + counter_chars = StatsCounter('aabccd') + + def test_key_types_distribution(self): + ci = self.counter_ints.key_types_distribution() + ci2 = self.counter_ints_with_two_modes.key_types_distribution() + cc = self.counter_chars.key_types_distribution() + + assert ci == StatsCounter(['int']) + assert ci2 == StatsCounter(['int']) + assert cc == StatsCounter(['str']) + + def test_mean(self): m = self.counter_ints_with_two_modes.mean() d = 15/6 assert m == d - + + def test_mean_throws_exception(self): + with raises(TypeError): + self.counter_chars.mean() + + def test_median(self): + m = self.counter_ints_with_two_modes.median() + assert m == 2.5 + + def test_median_throws_exception(self): + with raises(TypeError): + self.counter_chars.median() + def test_median_low(self): m = self.counter_ints_with_two_modes.median_low() - assert m == 2 - + assert m == 2 + + def test_median_low_throws_exception(self): + with raises(TypeError): + self.counter_chars.median_low() + def test_median_high(self, ): m = self.counter_ints_with_two_modes.median_high() assert m == 3 + + def test_median_high_throws_exception(self): + with raises(TypeError): + self.counter_chars.median_high() def test_median_grouped(self, ): m = self.counter_ints_with_two_modes.median_grouped() assert m == 2.5 - - def test_mode(self): + + def test_median_grouped_throws_exception(self): + with raises(TypeError): + self.counter_chars.median_grouped() + + def test_mode(self, ): 
+ m = self.counter_ints.mode() + assert m == 1 + + def test_mode_throws_exception(self): with raises(stats.StatisticsError): self.counter_ints_with_two_modes.mode()