stats/
frequency.rs

1use std::collections::hash_map::{HashMap, Entry};
2use std::fmt;
3use std::hash::Hash;
4use std::iter::{FromIterator, IntoIterator};
5use std::default::Default;
6
7use Commute;
8
/// An exact frequency table.
///
/// Each distinct sample is stored once together with the number of times
/// it has been observed. The structure is commutative: tables can be
/// combined through the `Commute` implementation.
#[derive(Clone)]
pub struct Frequencies<T> {
    /// Occurrence count for every distinct sample seen so far.
    data: HashMap<T, u64>,
}
14
15impl<T: fmt::Debug + Eq + Hash> fmt::Debug for Frequencies<T> {
16    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
17        write!(f, "{:?}", self.data)
18    }
19}
20
21impl<T: Eq + Hash> Frequencies<T> {
22    /// Create a new frequency table with no samples.
23    pub fn new() -> Frequencies<T> {
24        Default::default()
25    }
26
27    /// Add a sample to the frequency table.
28    pub fn add(&mut self, v: T) {
29        match self.data.entry(v) {
30            Entry::Vacant(count) => { count.insert(1); },
31            Entry::Occupied(mut count) => { *count.get_mut() += 1; },
32        }
33    }
34
35    /// Return the number of occurrences of `v` in the data.
36    pub fn count(&self, v: &T) -> u64 {
37        self.data.get(v).map(|&v| v).unwrap_or(0)
38    }
39
40    /// Return the cardinality (number of unique elements) in the data.
41    pub fn cardinality(&self) -> u64 {
42        self.len() as u64
43    }
44
45    /// Returns the mode if one exists.
46    pub fn mode(&self) -> Option<&T> {
47        let counts = self.most_frequent();
48        if counts.is_empty() {
49            None
50        } else if counts.len() >= 2 && counts[0].1 == counts[1].1 {
51            None
52        } else {
53            Some(counts[0].0)
54        }
55    }
56
57    /// Return a `Vec` of elements and their corresponding counts in
58    /// descending order.
59    pub fn most_frequent(&self) -> Vec<(&T, u64)> {
60        let mut counts: Vec<_> = self.data.iter()
61                                          .map(|(k, &v)| (k, v))
62                                          .collect();
63        counts.sort_by(|&(_, c1), &(_, c2)| c2.cmp(&c1));
64        counts
65    }
66
67    /// Return a `Vec` of elements and their corresponding counts in
68    /// ascending order.
69    pub fn least_frequent(&self) -> Vec<(&T, u64)> {
70        let mut counts: Vec<_> = self.data.iter()
71                                          .map(|(k, &v)| (k, v))
72                                          .collect();
73        counts.sort_by(|&(_, c1), &(_, c2)| c1.cmp(&c2));
74        counts
75    }
76
77    /// Returns the cardinality of the data.
78    pub fn len(&self) -> usize {
79        self.data.len()
80    }
81}
82
83impl<T: Eq + Hash> Commute for Frequencies<T> {
84    fn merge(&mut self, v: Frequencies<T>) {
85        for (k, v2) in v.data.into_iter() {
86            match self.data.entry(k) {
87                Entry::Vacant(v1) => { v1.insert(v2); }
88                Entry::Occupied(mut v1) => { *v1.get_mut() += v2; }
89            }
90        }
91    }
92}
93
94impl<T: Eq + Hash> Default for Frequencies<T> {
95    fn default() -> Frequencies<T> {
96        Frequencies { data: HashMap::with_capacity(100000) }
97    }
98}
99
100impl<T: Eq + Hash> FromIterator<T> for Frequencies<T> {
101    fn from_iter<I: IntoIterator<Item=T>>(it: I) -> Frequencies<T> {
102        let mut v = Frequencies::new();
103        v.extend(it);
104        v
105    }
106}
107
108impl<T: Eq + Hash> Extend<T> for Frequencies<T> {
109    fn extend<I: IntoIterator<Item=T>>(&mut self, it: I) {
110        for sample in it {
111            self.add(sample);
112        }
113    }
114}
115
#[cfg(test)]
mod test {
    use super::Frequencies;

    /// The head of each ranking is the element with the highest (resp.
    /// lowest) count; both are unique in this sample set.
    #[test]
    fn ranked() {
        let samples = vec![1usize, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4];

        let mut freqs = Frequencies::new();
        freqs.extend(samples.into_iter());

        let descending = freqs.most_frequent();
        let ascending = freqs.least_frequent();
        assert_eq!(descending[0], (&2, 5));
        assert_eq!(ascending[0], (&3, 1));
    }
}