1 #![allow(missing_docs)]
2 #![allow(deprecated)] // Float
9 fn local_sort(v: &mut [f64]) {
10 v.sort_by(|x: &f64, y: &f64| x.total_cmp(y));
13 /// Trait that provides simple descriptive statistics on a univariate set of numeric samples.
15 /// Sum of the samples.
17 /// Note: this method sacrifices performance at the altar of accuracy
18 /// Depends on IEEE-754 arithmetic guarantees. See proof of correctness at:
19 /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric
20 /// Predicates"][paper]
22 /// [paper]: https://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps
25 /// Minimum value of the samples.
28 /// Maximum value of the samples.
31 /// Arithmetic mean (average) of the samples: sum divided by sample-count.
33 /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean>
34 fn mean(&self) -> f64;
36 /// Median of the samples: value separating the lower half of the samples from the higher half.
37 /// Equal to `self.percentile(50.0)`.
39 /// See: <https://en.wikipedia.org/wiki/Median>
40 fn median(&self) -> f64;
42 /// Variance of the samples: bias-corrected mean of the squares of the differences of each
43 /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the
44 /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n`
45 /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather
48 /// See: <https://en.wikipedia.org/wiki/Variance>
51 /// Standard deviation: the square root of the sample variance.
53 /// Note: this is not a robust statistic for non-normal distributions. Prefer the
54 /// `median_abs_dev` for unknown distributions.
56 /// See: <https://en.wikipedia.org/wiki/Standard_deviation>
57 fn std_dev(&self) -> f64;
59 /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`.
61 /// Note: this is not a robust statistic for non-normal distributions. Prefer the
62 /// `median_abs_dev_pct` for unknown distributions.
63 fn std_dev_pct(&self) -> f64;
65 /// Scaled median of the absolute deviations of each sample from the sample median. This is a
66 /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to
67 /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled
68 /// by the constant `1.4826` to allow its use as a consistent estimator for the standard
71 /// See: <https://en.wikipedia.org/wiki/Median_absolute_deviation>
72 fn median_abs_dev(&self) -> f64;
74 /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`.
75 fn median_abs_dev_pct(&self) -> f64;
77 /// Percentile: the value below which `pct` percent of the values in `self` fall. For example,
78 /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self`
81 /// Calculated by linear interpolation between closest ranks.
83 /// See: <https://en.wikipedia.org/wiki/Percentile>
84 fn percentile(&self, pct: f64) -> f64;
86 /// Quartiles of the sample: three values that divide the sample into four equal groups, each
87 /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This
88 /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but
89 /// is otherwise equivalent.
91 /// See also: <https://en.wikipedia.org/wiki/Quartile>
92 fn quartiles(&self) -> (f64, f64, f64);
94 /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th
95 /// percentile (3rd quartile). See `quartiles`.
97 /// See also: <https://en.wikipedia.org/wiki/Interquartile_range>
101 /// Extracted collection of all the summary statistics of a sample set.
102 #[derive(Debug, Clone, PartialEq, Copy)]
103 #[allow(missing_docs)]
112 pub std_dev_pct: f64,
113 pub median_abs_dev: f64,
114 pub median_abs_dev_pct: f64,
115 pub quartiles: (f64, f64, f64),
120 /// Construct a new summary of a sample set.
121 pub fn new(samples: &[f64]) -> Summary {
126 mean: samples.mean(),
127 median: samples.median(),
129 std_dev: samples.std_dev(),
130 std_dev_pct: samples.std_dev_pct(),
131 median_abs_dev: samples.median_abs_dev(),
132 median_abs_dev_pct: samples.median_abs_dev_pct(),
133 quartiles: samples.quartiles(),
139 impl Stats for [f64] {
140 // FIXME #11059 handle NaN, inf and overflow
141 fn sum(&self) -> f64 {
142 let mut partials = vec![];
147 // This inner loop applies `hi`/`lo` summation to each
148 // partial so that the list of partial sums remains exact.
149 for i in 0..partials.len() {
150 let mut y: f64 = partials[i];
151 if x.abs() < y.abs() {
152 mem::swap(&mut x, &mut y);
154 // Rounded `x+y` is stored in `hi` with round-off stored in
155 // `lo`. Together `hi+lo` are exactly equal to `x+y`.
157 let lo = y - (hi - x);
164 if j >= partials.len() {
168 partials.truncate(j + 1);
172 partials.iter().fold(zero, |p, q| p + *q)
175 fn min(&self) -> f64 {
176 assert!(!self.is_empty());
177 self.iter().fold(self[0], |p, q| p.min(*q))
180 fn max(&self) -> f64 {
181 assert!(!self.is_empty());
182 self.iter().fold(self[0], |p, q| p.max(*q))
185 fn mean(&self) -> f64 {
186 assert!(!self.is_empty());
187 self.sum() / (self.len() as f64)
190 fn median(&self) -> f64 {
191 self.percentile(50_f64)
194 fn var(&self) -> f64 {
198 let mean = self.mean();
199 let mut v: f64 = 0.0;
204 // N.B., this is _supposed to be_ len-1, not len. If you
205 // change it back to len, you will be calculating a
206 // population variance, not a sample variance.
207 let denom = (self.len() - 1) as f64;
212 fn std_dev(&self) -> f64 {
216 fn std_dev_pct(&self) -> f64 {
217 let hundred = 100_f64;
218 (self.std_dev() / self.mean()) * hundred
221 fn median_abs_dev(&self) -> f64 {
222 let med = self.median();
223 let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect();
224 // This constant is derived by smarter statistics brains than me, but it is
225 // consistent with how R and other packages treat the MAD.
227 abs_devs.median() * number
230 fn median_abs_dev_pct(&self) -> f64 {
231 let hundred = 100_f64;
232 (self.median_abs_dev() / self.median()) * hundred
235 fn percentile(&self, pct: f64) -> f64 {
236 let mut tmp = self.to_vec();
237 local_sort(&mut tmp);
238 percentile_of_sorted(&tmp, pct)
241 fn quartiles(&self) -> (f64, f64, f64) {
242 let mut tmp = self.to_vec();
243 local_sort(&mut tmp);
245 let a = percentile_of_sorted(&tmp, first);
247 let b = percentile_of_sorted(&tmp, second);
249 let c = percentile_of_sorted(&tmp, third);
253 fn iqr(&self) -> f64 {
254 let (a, _, c) = self.quartiles();
259 // Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using
260 // linear interpolation. If samples are not sorted, return nonsensical value.
261 fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 {
262 assert!(!sorted_samples.is_empty());
263 if sorted_samples.len() == 1 {
264 return sorted_samples[0];
267 assert!(zero <= pct);
268 let hundred = 100_f64;
269 assert!(pct <= hundred);
271 return sorted_samples[sorted_samples.len() - 1];
273 let length = (sorted_samples.len() - 1) as f64;
274 let rank = (pct / hundred) * length;
275 let lrank = rank.floor();
276 let d = rank - lrank;
277 let n = lrank as usize;
278 let lo = sorted_samples[n];
279 let hi = sorted_samples[n + 1];
283 /// Winsorize a set of samples, replacing values above the `100-pct` percentile
284 /// and below the `pct` percentile with those percentiles themselves. This is a
285 /// way of minimizing the effect of outliers, at the cost of biasing the sample.
286 /// It differs from trimming in that it does not change the number of samples,
287 /// just changes the values of those that are outliers.
289 /// See: <https://en.wikipedia.org/wiki/Winsorising>
290 pub fn winsorize(samples: &mut [f64], pct: f64) {
291 let mut tmp = samples.to_vec();
292 local_sort(&mut tmp);
293 let lo = percentile_of_sorted(&tmp, pct);
294 let hundred = 100_f64;
295 let hi = percentile_of_sorted(&tmp, hundred - pct);
296 for samp in samples {
299 } else if *samp < lo {