stats/
unsorted.rs

1use std::default::Default;
2use std::iter::{FromIterator, IntoIterator};
3use num_traits::ToPrimitive;
4
5use {Commute, Partial};
6
7/// Compute the exact median on a stream of data.
8///
9/// (This has time complexity `O(nlogn)` and space complexity `O(n)`.)
10pub fn median<I>(it: I) -> Option<f64>
11        where I: Iterator, <I as Iterator>::Item: PartialOrd + ToPrimitive {
12    it.collect::<Unsorted<_>>().median()
13}
14
15/// Compute the exact mode on a stream of data.
16///
17/// (This has time complexity `O(nlogn)` and space complexity `O(n)`.)
18///
19/// If the data does not have a mode, then `None` is returned.
20pub fn mode<T, I>(it: I) -> Option<T>
21       where T: PartialOrd + Clone, I: Iterator<Item=T> {
22    it.collect::<Unsorted<T>>().mode()
23}
24
25/// Compute the modes on a stream of data.
26/// 
27/// If there is a single mode, then only that value is returned in the `Vec`
28/// however, if there multiple values tied for occuring the most amount of times
29/// those values are returned.
30/// 
31/// ## Example
32/// ```
33/// use stats;
34/// 
35/// let vals = vec![1, 1, 2, 2, 3];
36/// 
37/// assert_eq!(stats::modes(vals.into_iter()), vec![1, 2]);
38/// ```
39/// This has time complexity `O(n)`
40///
41/// If the data does not have a mode, then an empty `Vec` is returned.
42pub fn modes<T, I>(it: I) -> Vec<T>
43       where T: PartialOrd + Clone, I: Iterator<Item=T> {
44    it.collect::<Unsorted<T>>().modes()
45}
46
47fn median_on_sorted<T>(data: &[T]) -> Option<f64>
48        where T: PartialOrd + ToPrimitive {
49    Some(match data.len() {
50        0 => return None,
51        1 => data[0].to_f64().unwrap(),
52        len if len % 2 == 0 => {
53            let v1 = data[(len / 2) - 1].to_f64().unwrap();
54            let v2 = data[len / 2].to_f64().unwrap();
55            (v1 + v2) / 2.0
56        }
57        len => {
58            data[len / 2].to_f64().unwrap()
59        }
60    })
61}
62
63fn mode_on_sorted<T, I>(it: I) -> Option<T>
64        where T: PartialOrd, I: Iterator<Item=T> {
65    // This approach to computing the mode works very nicely when the
66    // number of samples is large and is close to its cardinality.
67    // In other cases, a hashmap would be much better.
68    // But really, how can we know this when given an arbitrary stream?
69    // Might just switch to a hashmap to track frequencies. That would also
70    // be generally useful for discovering the cardinality of a sample.
71    let (mut mode, mut next) = (None, None);
72    let (mut mode_count, mut next_count) = (0usize, 0usize);
73    for x in it {
74        if mode.as_ref().map(|y| y == &x).unwrap_or(false) {
75            mode_count += 1;
76        } else if next.as_ref().map(|y| y == &x).unwrap_or(false) {
77            next_count += 1;
78        } else {
79            next = Some(x);
80            next_count = 0;
81        }
82
83        if next_count > mode_count {
84            mode = next;
85            mode_count = next_count;
86            next = None;
87            next_count = 0;
88        } else if next_count == mode_count {
89            mode = None;
90            mode_count = 0usize;
91        }
92    }
93    mode
94}
95
96fn modes_on_sorted<T, I>(it: I) -> Vec<T>
97        where T: PartialOrd, I: Iterator<Item=T> {
98
99    let mut highest_mode = 1_u32;
100    let mut modes: Vec<u32> = vec![];
101    let mut values = vec![];
102    let mut count = 0;
103    for x in it {
104        if values.len() == 0 {
105            values.push(x);
106            modes.push(1);
107            continue
108        }
109        if x == values[count] {
110            modes[count] += 1;
111            if highest_mode < modes[count] {
112                highest_mode = modes[count];
113            }
114        } else {
115            values.push(x);
116            modes.push(1);
117            count += 1;
118        }
119    }
120    modes.into_iter()
121        .zip(values)
122        .filter(|(cnt, _val)| *cnt == highest_mode && highest_mode > 1)
123        .map(|(_, val)| val)
124        .collect()
125}
126
127/// A commutative data structure for lazily sorted sequences of data.
128///
129/// The sort does not occur until statistics need to be computed.
130///
131/// Note that this works on types that do not define a total ordering like
132/// `f32` and `f64`. When an ordering is not defined, an arbitrary order
133/// is returned.
134#[derive(Clone)]
135pub struct Unsorted<T> {
136    data: Vec<Partial<T>>,
137    sorted: bool,
138}
139
140impl<T: PartialOrd> Unsorted<T> {
141    /// Create initial empty state.
142    pub fn new() -> Unsorted<T> {
143        Default::default()
144    }
145
146    /// Add a new element to the set.
147    pub fn add(&mut self, v: T) {
148        self.dirtied();
149        self.data.push(Partial(v))
150    }
151
152    /// Return the number of data points.
153    pub fn len(&self) -> usize {
154        self.data.len()
155    }
156
157    fn sort(&mut self) {
158        if !self.sorted {
159            self.data.sort();
160        }
161    }
162
163    fn dirtied(&mut self) {
164        self.sorted = false;
165    }
166}
167
168impl<T: PartialOrd + Eq + Clone> Unsorted<T> {
169    pub fn cardinality(&mut self) -> usize {
170        self.sort();
171        let mut set = self.data.clone();
172        set.dedup();
173        set.len()
174    }
175}
176
177impl<T: PartialOrd + Clone> Unsorted<T> {
178    /// Returns the mode of the data.
179    pub fn mode(&mut self) -> Option<T> {
180        self.sort();
181        mode_on_sorted(self.data.iter()).map(|p| p.0.clone())
182    }
183
184    /// Returns the modes of the data.
185    pub fn modes(&mut self) -> Vec<T> {
186        self.sort();
187        modes_on_sorted(self.data.iter())
188            .into_iter()
189            .map(|p| p.0.clone())
190            .collect()
191    }
192}
193
194impl<T: PartialOrd + ToPrimitive> Unsorted<T> {
195    /// Returns the median of the data.
196    pub fn median(&mut self) -> Option<f64> {
197        self.sort();
198        median_on_sorted(&*self.data)
199    }
200}
201
202impl<T: PartialOrd> Commute for Unsorted<T> {
203    fn merge(&mut self, v: Unsorted<T>) {
204        self.dirtied();
205        self.data.extend(v.data.into_iter());
206    }
207}
208
209impl<T: PartialOrd> Default for Unsorted<T> {
210    fn default() -> Unsorted<T> {
211        Unsorted {
212            data: Vec::with_capacity(1000),
213            sorted: true,
214        }
215    }
216}
217
218impl<T: PartialOrd> FromIterator<T> for Unsorted<T> {
219    fn from_iter<I: IntoIterator<Item=T>>(it: I) -> Unsorted<T> {
220        let mut v = Unsorted::new();
221        v.extend(it);
222        v
223    }
224}
225
226impl<T: PartialOrd> Extend<T> for Unsorted<T> {
227    fn extend<I: IntoIterator<Item=T>>(&mut self, it: I) {
228        self.dirtied();
229        self.data.extend(it.into_iter().map(Partial))
230    }
231}
232
233#[cfg(test)]
234mod test {
235    use super::{median, mode, modes};
236
237    #[test]
238    fn median_stream() {
239        assert_eq!(median(vec![3usize, 5, 7, 9].into_iter()), Some(6.0));
240        assert_eq!(median(vec![3usize, 5, 7].into_iter()), Some(5.0));
241    }
242
243    #[test]
244    fn mode_stream() {
245        assert_eq!(mode(vec![3usize, 5, 7, 9].into_iter()), None);
246        assert_eq!(mode(vec![3usize, 3, 3, 3].into_iter()), Some(3));
247        assert_eq!(mode(vec![3usize, 3, 3, 4].into_iter()), Some(3));
248        assert_eq!(mode(vec![4usize, 3, 3, 3].into_iter()), Some(3));
249        assert_eq!(mode(vec![1usize, 1, 2, 3, 3].into_iter()), None);
250    }
251
252    #[test]
253    fn median_floats() {
254        assert_eq!(median(vec![3.0f64, 5.0, 7.0, 9.0].into_iter()), Some(6.0));
255        assert_eq!(median(vec![3.0f64, 5.0, 7.0].into_iter()), Some(5.0));
256        assert_eq!(median(vec![1.0f64, 2.5, 3.0].into_iter()), Some(2.5));
257    }
258
259    #[test]
260    fn mode_floats() {
261        assert_eq!(mode(vec![3.0f64, 5.0, 7.0, 9.0].into_iter()), None);
262        assert_eq!(mode(vec![3.0f64, 3.0, 3.0, 3.0].into_iter()), Some(3.0));
263        assert_eq!(mode(vec![3.0f64, 3.0, 3.0, 4.0].into_iter()), Some(3.0));
264        assert_eq!(mode(vec![4.0f64, 3.0, 3.0, 3.0].into_iter()), Some(3.0));
265        assert_eq!(mode(vec![1.0f64, 1.0, 2.0, 3.0, 3.0].into_iter()), None);
266    }
267
268    #[test]
269    fn modes_stream() {
270        assert_eq!(modes(vec![3usize, 5, 7, 9].into_iter()), vec![]);
271        assert_eq!(modes(vec![3usize, 3, 3, 3].into_iter()), vec![3]);
272        assert_eq!(modes(vec![3usize, 3, 4, 4].into_iter()), vec![3, 4]);
273        assert_eq!(modes(vec![4usize, 3, 3, 3].into_iter()), vec![3]);
274        assert_eq!(modes(vec![1usize, 1, 2, 2].into_iter()), vec![1, 2]);
275        let vec: Vec<u32> = vec![];
276        assert_eq!(modes(vec.into_iter()), vec![]);
277    }
278
279    #[test]
280    fn modes_floats() {
281        assert_eq!(modes(vec![3_f64, 5.0, 7.0, 9.0].into_iter()), vec![]);
282        assert_eq!(modes(vec![3_f64, 3.0, 3.0, 3.0].into_iter()), vec![3.0]);
283        assert_eq!(modes(vec![3_f64, 3.0, 4.0, 4.0].into_iter()), vec![3.0, 4.0]);
284        assert_eq!(modes(vec![1_f64, 1.0, 2.0, 3.0, 3.0].into_iter()), vec![1.0, 3.0]);
285    }
286}