ยปCore Development>Code coverage>Lib/statistics.py

# Python code coverage for Lib/statistics.py

#countcontent
1n/a"""
2n/aBasic statistics module.
3n/a
4n/aThis module provides functions for calculating statistics of data, including
5n/aaverages, variance, and standard deviation.
6n/a
7n/aCalculating averages
8n/a--------------------
9n/a
10n/a================== =============================================
11n/aFunction Description
12n/a================== =============================================
13n/amean Arithmetic mean (average) of data.
14n/aharmonic_mean Harmonic mean of data.
15n/amedian Median (middle value) of data.
16n/amedian_low Low median of data.
17n/amedian_high High median of data.
18n/amedian_grouped Median, or 50th percentile, of grouped data.
19n/amode Mode (most common value) of data.
20n/a================== =============================================
21n/a
22n/aCalculate the arithmetic mean ("the average") of data:
23n/a
24n/a>>> mean([-1.0, 2.5, 3.25, 5.75])
25n/a2.625
26n/a
27n/a
28n/aCalculate the standard median of discrete data:
29n/a
30n/a>>> median([2, 3, 4, 5])
31n/a3.5
32n/a
33n/a
34n/aCalculate the median, or 50th percentile, of data grouped into class intervals
35n/acentred on the data values provided. E.g. if your data points are rounded to
36n/athe nearest whole number:
37n/a
38n/a>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
39n/a2.8333333333...
40n/a
41n/aThis should be interpreted in this way: you have two data points in the class
42n/ainterval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
43n/athe class interval 3.5-4.5. The median of these data points is 2.8333...
44n/a
45n/a
47n/a---------------------------------
48n/a
49n/a================== =============================================
50n/aFunction Description
51n/a================== =============================================
52n/apvariance Population variance of data.
53n/avariance Sample variance of data.
54n/apstdev Population standard deviation of data.
55n/astdev Sample standard deviation of data.
56n/a================== =============================================
57n/a
58n/aCalculate the standard deviation of sample data:
59n/a
60n/a>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
61n/a4.38961843444...
62n/a
63n/aIf you have previously calculated the mean, you can pass it as the optional
64n/asecond argument to the four "spread" functions to avoid recalculating it:
65n/a
66n/a>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
67n/a>>> mu = mean(data)
68n/a>>> pvariance(data, mu)
69n/a2.5
70n/a
71n/a
72n/aExceptions
73n/a----------
74n/a
75n/aA single exception is defined: StatisticsError is a subclass of ValueError.
76n/a
77n/a"""
78n/a
79n/a__all__ = [ 'StatisticsError',
80n/a 'pstdev', 'pvariance', 'stdev', 'variance',
81n/a 'median', 'median_low', 'median_high', 'median_grouped',
82n/a 'mean', 'mode', 'harmonic_mean',
83n/a ]
84n/a
85n/aimport collections
86n/aimport decimal
87n/aimport math
88n/aimport numbers
89n/a
90n/afrom fractions import Fraction
91n/afrom decimal import Decimal
92n/afrom itertools import groupby, chain
93n/afrom bisect import bisect_left, bisect_right
94n/a
95n/a
96n/a
97n/a# === Exceptions ===
98n/a
99n/aclass StatisticsError(ValueError):
100n/a pass
101n/a
102n/a
103n/a# === Private utilities ===
104n/a
106n/a """_sum(data [, start]) -> (type, sum, count)
107n/a
108n/a Return a high-precision sum of the given numeric data as a fraction,
109n/a together with the type to be converted to and the count of items.
110n/a
111n/a If optional argument ``start`` is given, it is added to the total.
112n/a If ``data`` is empty, ``start`` (defaulting to 0) is returned.
113n/a
114n/a
115n/a Examples
116n/a --------
117n/a
118n/a >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
119n/a (<class 'float'>, Fraction(11, 1), 5)
120n/a
121n/a Some sources of round-off error will be avoided:
122n/a
123n/a # Built-in sum returns zero.
124n/a >>> _sum([1e50, 1, -1e50] * 1000)
125n/a (<class 'float'>, Fraction(1000, 1), 3000)
126n/a
127n/a Fractions and Decimals are also supported:
128n/a
129n/a >>> from fractions import Fraction as F
130n/a >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
131n/a (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
132n/a
133n/a >>> from decimal import Decimal as D
134n/a >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
135n/a >>> _sum(data)
136n/a (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
137n/a
138n/a Mixed types are currently treated as an error, except that int is
139n/a allowed.
140n/a """
141n/a count = 0
142n/a n, d = _exact_ratio(start)
143n/a partials = {d: n}
144n/a partials_get = partials.get
145n/a T = _coerce(int, type(start))
146n/a for typ, values in groupby(data, type):
147n/a T = _coerce(T, typ) # or raise TypeError
148n/a for n,d in map(_exact_ratio, values):
149n/a count += 1
150n/a partials[d] = partials_get(d, 0) + n
151n/a if None in partials:
152n/a # The sum will be a NAN or INF. We can ignore all the finite
153n/a # partials, and just look at this special one.
154n/a total = partials[None]
155n/a assert not _isfinite(total)
156n/a else:
157n/a # Sum all the partial sums using builtin sum.
158n/a # FIXME is this faster if we sum them in order of the denominator?
159n/a total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
160n/a return (T, total, count)
161n/a
162n/a
164n/a try:
165n/a return x.is_finite() # Likely a Decimal.
166n/a except AttributeError:
167n/a return math.isfinite(x) # Coerces to float first.
168n/a
169n/a
171n/a """Coerce types T and S to a common type, or raise TypeError.
172n/a
173n/a Coercion rules are currently an implementation detail. See the CoerceTest
174n/a test class in test_statistics for details.
175n/a """
176n/a # See http://bugs.python.org/issue24068.
177n/a assert T is not bool, "initial type T is bool"
178n/a # If the types are the same, no need to coerce anything. Put this
179n/a # first, so that the usual case (no coercion needed) happens as soon
180n/a # as possible.
181n/a if T is S: return T
182n/a # Mixed int & other coerce to the other type.
183n/a if S is int or S is bool: return T
184n/a if T is int: return S
185n/a # If one is a (strict) subclass of the other, coerce to the subclass.
186n/a if issubclass(S, T): return S
187n/a if issubclass(T, S): return T
188n/a # Ints coerce to the other type.
189n/a if issubclass(T, int): return S
190n/a if issubclass(S, int): return T
191n/a # Mixed fraction & float coerces to float (or float subclass).
192n/a if issubclass(T, Fraction) and issubclass(S, float):
193n/a return S
194n/a if issubclass(T, float) and issubclass(S, Fraction):
195n/a return T
196n/a # Any other combination is disallowed.
197n/a msg = "don't know how to coerce %s and %s"
198n/a raise TypeError(msg % (T.__name__, S.__name__))
199n/a
200n/a
202n/a """Return Real number x to exact (numerator, denominator) pair.
203n/a
204n/a >>> _exact_ratio(0.25)
205n/a (1, 4)
206n/a
207n/a x is expected to be an int, Fraction, Decimal or float.
208n/a """
209n/a try:
210n/a # Optimise the common case of floats. We expect that the most often
211n/a # used numeric type will be builtin floats, so try to make this as
212n/a # fast as possible.
213n/a if type(x) is float or type(x) is Decimal:
214n/a return x.as_integer_ratio()
215n/a try:
216n/a # x may be an int, Fraction, or Integral ABC.
217n/a return (x.numerator, x.denominator)
218n/a except AttributeError:
219n/a try:
220n/a # x may be a float or Decimal subclass.
221n/a return x.as_integer_ratio()
222n/a except AttributeError:
223n/a # Just give up?
224n/a pass
225n/a except (OverflowError, ValueError):
226n/a # float NAN or INF.
227n/a assert not _isfinite(x)
228n/a return (x, None)
229n/a msg = "can't convert type '{}' to numerator/denominator"
230n/a raise TypeError(msg.format(type(x).__name__))
231n/a
232n/a
234n/a """Convert value to given numeric type T."""
235n/a if type(value) is T:
236n/a # This covers the cases where T is Fraction, or where value is
237n/a # a NAN or INF (Decimal or float).
238n/a return value
239n/a if issubclass(T, int) and value.denominator != 1:
240n/a T = float
241n/a try:
242n/a # FIXME: what do we do if this overflows?
243n/a return T(value)
244n/a except TypeError:
245n/a if issubclass(T, Decimal):
246n/a return T(value.numerator)/T(value.denominator)
247n/a else:
248n/a raise
249n/a
250n/a
252n/a # Generate a table of sorted (value, frequency) pairs.
253n/a table = collections.Counter(iter(data)).most_common()
254n/a if not table:
255n/a return table
256n/a # Extract the values with the highest frequency.
257n/a maxfreq = table[0][1]
258n/a for i in range(1, len(table)):
259n/a if table[i][1] != maxfreq:
260n/a table = table[:i]
261n/a break
262n/a return table
263n/a
264n/a
266n/a 'Locate the leftmost value exactly equal to x'
267n/a i = bisect_left(a, x)
268n/a if i != len(a) and a[i] == x:
269n/a return i
270n/a raise ValueError
271n/a
272n/a
274n/a 'Locate the rightmost value exactly equal to x'
275n/a i = bisect_right(a, x, lo=l)
276n/a if i != (len(a)+1) and a[i-1] == x:
277n/a return i-1
278n/a raise ValueError
279n/a
280n/a
282n/a """Iterate over values, failing if any are less than zero."""
283n/a for x in values:
284n/a if x < 0:
285n/a raise StatisticsError(errmsg)
286n/a yield x
287n/a
288n/a
289n/a# === Measures of central tendency (averages) ===
290n/a
292n/a """Return the sample arithmetic mean of data.
293n/a
294n/a >>> mean([1, 2, 3, 4, 4])
295n/a 2.8
296n/a
297n/a >>> from fractions import Fraction as F
298n/a >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
299n/a Fraction(13, 21)
300n/a
301n/a >>> from decimal import Decimal as D
302n/a >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
303n/a Decimal('0.5625')
304n/a
305n/a If ``data`` is empty, StatisticsError will be raised.
306n/a """
307n/a if iter(data) is data:
308n/a data = list(data)
309n/a n = len(data)
310n/a if n < 1:
311n/a raise StatisticsError('mean requires at least one data point')
312n/a T, total, count = _sum(data)
313n/a assert count == n
314n/a return _convert(total/n, T)
315n/a
316n/a
318n/a """Return the harmonic mean of data.
319n/a
320n/a The harmonic mean, sometimes called the subcontrary mean, is the
321n/a reciprocal of the arithmetic mean of the reciprocals of the data,
322n/a and is often appropriate when averaging quantities which are rates
323n/a or ratios, for example speeds. Example:
324n/a
325n/a Suppose an investor purchases an equal value of shares in each of
326n/a three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
327n/a What is the average P/E ratio for the investor's portfolio?
328n/a
329n/a >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
330n/a 3.6
331n/a
332n/a Using the arithmetic mean would give an average of about 5.167, which
333n/a is too high.
334n/a
335n/a If ``data`` is empty, or any element is less than zero,
336n/a ``harmonic_mean`` will raise ``StatisticsError``.
337n/a """
338n/a # For a justification for using harmonic mean for P/E ratios, see
339n/a # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
340n/a # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
341n/a if iter(data) is data:
342n/a data = list(data)
343n/a errmsg = 'harmonic mean does not support negative values'
344n/a n = len(data)
345n/a if n < 1:
346n/a raise StatisticsError('harmonic_mean requires at least one data point')
347n/a elif n == 1:
348n/a x = data[0]
349n/a if isinstance(x, (numbers.Real, Decimal)):
350n/a if x < 0:
351n/a raise StatisticsError(errmsg)
352n/a return x
353n/a else:
354n/a raise TypeError('unsupported type')
355n/a try:
356n/a T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
357n/a except ZeroDivisionError:
358n/a return 0
359n/a assert count == n
360n/a return _convert(n/total, T)
361n/a
362n/a
363n/a# FIXME: investigate ways to calculate medians without sorting? Quickselect?
365n/a """Return the median (middle value) of numeric data.
366n/a
367n/a When the number of data points is odd, return the middle data point.
368n/a When the number of data points is even, the median is interpolated by
369n/a taking the average of the two middle values:
370n/a
371n/a >>> median([1, 3, 5])
372n/a 3
373n/a >>> median([1, 3, 5, 7])
374n/a 4.0
375n/a
376n/a """
377n/a data = sorted(data)
378n/a n = len(data)
379n/a if n == 0:
380n/a raise StatisticsError("no median for empty data")
381n/a if n%2 == 1:
382n/a return data[n//2]
383n/a else:
384n/a i = n//2
385n/a return (data[i - 1] + data[i])/2
386n/a
387n/a
389n/a """Return the low median of numeric data.
390n/a
391n/a When the number of data points is odd, the middle value is returned.
392n/a When it is even, the smaller of the two middle values is returned.
393n/a
394n/a >>> median_low([1, 3, 5])
395n/a 3
396n/a >>> median_low([1, 3, 5, 7])
397n/a 3
398n/a
399n/a """
400n/a data = sorted(data)
401n/a n = len(data)
402n/a if n == 0:
403n/a raise StatisticsError("no median for empty data")
404n/a if n%2 == 1:
405n/a return data[n//2]
406n/a else:
407n/a return data[n//2 - 1]
408n/a
409n/a
411n/a """Return the high median of data.
412n/a
413n/a When the number of data points is odd, the middle value is returned.
414n/a When it is even, the larger of the two middle values is returned.
415n/a
416n/a >>> median_high([1, 3, 5])
417n/a 3
418n/a >>> median_high([1, 3, 5, 7])
419n/a 5
420n/a
421n/a """
422n/a data = sorted(data)
423n/a n = len(data)
424n/a if n == 0:
425n/a raise StatisticsError("no median for empty data")
426n/a return data[n//2]
427n/a
428n/a
430n/a """Return the 50th percentile (median) of grouped continuous data.
431n/a
432n/a >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
433n/a 3.7
434n/a >>> median_grouped([52, 52, 53, 54])
435n/a 52.5
436n/a
437n/a This calculates the median as the 50th percentile, and should be
438n/a used when your data is continuous and grouped. In the above example,
439n/a the values 1, 2, 3, etc. actually represent the midpoint of classes
440n/a 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
441n/a class 3.5-4.5, and interpolation is used to estimate it.
442n/a
443n/a Optional argument ``interval`` represents the class interval, and
444n/a defaults to 1. Changing the class interval naturally will change the
445n/a interpolated 50th percentile value:
446n/a
447n/a >>> median_grouped([1, 3, 3, 5, 7], interval=1)
448n/a 3.25
449n/a >>> median_grouped([1, 3, 3, 5, 7], interval=2)
450n/a 3.5
451n/a
452n/a This function does not check whether the data points are at least
453n/a ``interval`` apart.
454n/a """
455n/a data = sorted(data)
456n/a n = len(data)
457n/a if n == 0:
458n/a raise StatisticsError("no median for empty data")
459n/a elif n == 1:
460n/a return data[0]
461n/a # Find the value at the midpoint. Remember this corresponds to the
462n/a # centre of the class interval.
463n/a x = data[n//2]
464n/a for obj in (x, interval):
465n/a if isinstance(obj, (str, bytes)):
466n/a raise TypeError('expected number but got %r' % obj)
467n/a try:
468n/a L = x - interval/2 # The lower limit of the median interval.
469n/a except TypeError:
470n/a # Mixed type. For now we just coerce to float.
471n/a L = float(x) - float(interval)/2
472n/a
473n/a # Uses bisection search to search for x in data with log(n) time complexity
474n/a # Find the position of leftmost occurrence of x in data
475n/a l1 = _find_lteq(data, x)
476n/a # Find the position of rightmost occurrence of x in data[l1...len(data)]
477n/a # Assuming always l1 <= l2
478n/a l2 = _find_rteq(data, l1, x)
479n/a cf = l1
480n/a f = l2 - l1 + 1
481n/a return L + interval*(n/2 - cf)/f
482n/a
483n/a
485n/a """Return the most common data point from discrete or nominal data.
486n/a
487n/a ``mode`` assumes discrete data, and returns a single value. This is the
488n/a standard treatment of the mode as commonly taught in schools:
489n/a
490n/a >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
491n/a 3
492n/a
493n/a This also works with nominal (non-numeric) data:
494n/a
495n/a >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
496n/a 'red'
497n/a
498n/a If there is not exactly one most common value, ``mode`` will raise
499n/a StatisticsError.
500n/a """
501n/a # Generate a table of sorted (value, frequency) pairs.
502n/a table = _counts(data)
503n/a if len(table) == 1:
504n/a return table[0][0]
505n/a elif table:
506n/a raise StatisticsError(
507n/a 'no unique mode; found %d equally common values' % len(table)
508n/a )
509n/a else:
510n/a raise StatisticsError('no mode for empty data')
511n/a
512n/a
513n/a# === Measures of spread ===
514n/a
515n/a# See http://mathworld.wolfram.com/Variance.html
516n/a# http://mathworld.wolfram.com/SampleVariance.html
517n/a# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
518n/a#
519n/a# Under no circumstances use the so-called "computational formula for
520n/a# variance", as that is only suitable for hand calculations with a small
521n/a# amount of low-precision data. It has terrible numeric properties.
522n/a#
523n/a# See a comparison of three computational methods here:
524n/a# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
525n/a
527n/a """Return sum of square deviations of sequence data.
528n/a
529n/a If ``c`` is None, the mean is calculated in one pass, and the deviations
530n/a from the mean are calculated in a second pass. Otherwise, deviations are
531n/a calculated from ``c`` as given. Use the second case with care, as it can
533n/a """
534n/a if c is None:
535n/a c = mean(data)
536n/a T, total, count = _sum((x-c)**2 for x in data)
537n/a # The following sum should mathematically equal zero, but due to rounding
538n/a # error may not.
539n/a U, total2, count2 = _sum((x-c) for x in data)
540n/a assert T == U and count == count2
541n/a total -= total2**2/len(data)
542n/a assert not total < 0, 'negative sum of square deviations: %f' % total
543n/a return (T, total)
544n/a
545n/a
547n/a """Return the sample variance of data.
548n/a
549n/a data should be an iterable of Real-valued numbers, with at least two
550n/a values. The optional argument xbar, if given, should be the mean of
551n/a the data. If it is missing or None, the mean is automatically calculated.
552n/a
553n/a Use this function when your data is a sample from a population. To
554n/a calculate the variance from the entire population, see ``pvariance``.
555n/a
556n/a Examples:
557n/a
558n/a >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
559n/a >>> variance(data)
560n/a 1.3720238095238095
561n/a
562n/a If you have already calculated the mean of your data, you can pass it as
563n/a the optional second argument ``xbar`` to avoid recalculating it:
564n/a
565n/a >>> m = mean(data)
566n/a >>> variance(data, m)
567n/a 1.3720238095238095
568n/a
569n/a This function does not check that ``xbar`` is actually the mean of
570n/a ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
571n/a impossible results.
572n/a
573n/a Decimals and Fractions are supported:
574n/a
575n/a >>> from decimal import Decimal as D
576n/a >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
577n/a Decimal('31.01875')
578n/a
579n/a >>> from fractions import Fraction as F
580n/a >>> variance([F(1, 6), F(1, 2), F(5, 3)])
581n/a Fraction(67, 108)
582n/a
583n/a """
584n/a if iter(data) is data:
585n/a data = list(data)
586n/a n = len(data)
587n/a if n < 2:
588n/a raise StatisticsError('variance requires at least two data points')
589n/a T, ss = _ss(data, xbar)
590n/a return _convert(ss/(n-1), T)
591n/a
592n/a
594n/a """Return the population variance of ``data``.
595n/a
596n/a data should be an iterable of Real-valued numbers, with at least one
597n/a value. The optional argument mu, if given, should be the mean of
598n/a the data. If it is missing or None, the mean is automatically calculated.
599n/a
600n/a Use this function to calculate the variance from the entire population.
601n/a To estimate the variance from a sample, the ``variance`` function is
602n/a usually a better choice.
603n/a
604n/a Examples:
605n/a
606n/a >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
607n/a >>> pvariance(data)
608n/a 1.25
609n/a
610n/a If you have already calculated the mean of the data, you can pass it as
611n/a the optional second argument to avoid recalculating it:
612n/a
613n/a >>> mu = mean(data)
614n/a >>> pvariance(data, mu)
615n/a 1.25
616n/a
617n/a This function does not check that ``mu`` is actually the mean of ``data``.
618n/a Giving arbitrary values for ``mu`` may lead to invalid or impossible
619n/a results.
620n/a
621n/a Decimals and Fractions are supported:
622n/a
623n/a >>> from decimal import Decimal as D
624n/a >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
625n/a Decimal('24.815')
626n/a
627n/a >>> from fractions import Fraction as F
628n/a >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
629n/a Fraction(13, 72)
630n/a
631n/a """
632n/a if iter(data) is data:
633n/a data = list(data)
634n/a n = len(data)
635n/a if n < 1:
636n/a raise StatisticsError('pvariance requires at least one data point')
637n/a T, ss = _ss(data, mu)
638n/a return _convert(ss/n, T)
639n/a
640n/a
642n/a """Return the square root of the sample variance.
643n/a
644n/a See ``variance`` for arguments and other details.
645n/a
646n/a >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
647n/a 1.0810874155219827
648n/a
649n/a """
650n/a var = variance(data, xbar)
651n/a try:
652n/a return var.sqrt()
653n/a except AttributeError:
654n/a return math.sqrt(var)
655n/a
656n/a