opendp/metrics/
mod.rs

1//! Various implementations of Metrics (and associated Distance).
2//!
3//! A Metric is used to measure the distance between data.
4//! Metrics are paired with a **domain** on which the metric can measure distance.
5//! The distance is expressed in terms of an **associated type**.
6//!
7//! # Example
8//!
9//! [`SymmetricDistance`] can be paired with a domain: `VectorDomain(AtomDomain(T))`.
10//! In this context, the `SymmetricDistance` is used to measure the distance between any two vectors of elements of type `T`.
11//! The `SymmetricDistance` has an associated distance type of [`u32`].
12//! This means that the symmetric distance between vectors is expressed in terms of a [`u32`].
13
14#[cfg(feature = "ffi")]
15pub(crate) mod ffi;
16
17use std::hash::Hash;
18use std::marker::PhantomData;
19
20use crate::{
21    core::{Domain, Metric, MetricSpace},
22    domains::{type_name, AtomDomain, BitVectorDomain, MapDomain, VectorDomain},
23    error::Fallible,
24    traits::{CheckAtom, InfAdd},
25};
26#[cfg(feature = "contrib")]
27use crate::{traits::Hashable, transformations::DataFrameDomain};
28use std::fmt::{Debug, Formatter};
29
30/// The type that represents the distance between datasets.
31/// It is used as the associated [`Metric`]::Distance type for e.g. [`SymmetricDistance`], [`InsertDeleteDistance`], etc.
32pub type IntDistance = u32;
33
34/// The smallest number of additions or removals to make two datasets equivalent.
35///
36/// This metric is not sensitive to data ordering.
37/// Because this metric counts additions and removals,
38/// it is an unbounded metric (for unbounded DP).
39///
40/// # Proof Definition
41///
42/// ### `d`-closeness
43/// For any two vectors $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
44/// we say that $u, v$ are $d$-close under the symmetric distance metric
45/// (abbreviated as $d_{Sym}$) whenever
46///
47/// ```math
48/// d_{Sym}(u, v) = |u \Delta v| \leq d
49/// ```
50/// # Note
51/// The distance type is hard-coded as [`IntDistance`],
52/// so this metric is not generic over the distance type like many other metrics.
53///
54/// # Compatible Domains
55///
56/// * `VectorDomain<D>` for any valid `D`
57///
58/// When this metric is paired with a `VectorDomain`, we instead consider the multisets corresponding to $u, v \in \texttt{D}$.
59#[derive(Clone)]
60pub struct SymmetricDistance;
61
62impl Default for SymmetricDistance {
63    fn default() -> Self {
64        SymmetricDistance
65    }
66}
67
68impl PartialEq for SymmetricDistance {
69    fn eq(&self, _other: &Self) -> bool {
70        true
71    }
72}
73impl Debug for SymmetricDistance {
74    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
75        write!(f, "SymmetricDistance()")
76    }
77}
78impl Metric for SymmetricDistance {
79    type Distance = IntDistance;
80}
81
82// Symmetric distance is defined in terms of unescaped line-breaks for CSV string datasets
83impl MetricSpace for (AtomDomain<String>, SymmetricDistance) {
84    fn check_space(&self) -> Fallible<()> {
85        Ok(())
86    }
87}
88
89impl<D: Domain> MetricSpace for (VectorDomain<D>, SymmetricDistance) {
90    fn check_space(&self) -> Fallible<()> {
91        Ok(())
92    }
93}
94
95#[cfg(feature = "contrib")]
96impl<K: Hashable> MetricSpace for (DataFrameDomain<K>, SymmetricDistance) {
97    fn check_space(&self) -> Fallible<()> {
98        Ok(())
99    }
100}
101
102/// The smallest number of insertions or deletions to make two datasets equivalent.
103///
104/// An *insertion* to a dataset is an addition of an element at a specific index,
105/// and a *deletion* is the removal of an element at a specific index.
106///
107/// Therefore, this metric is sensitive to data ordering.
108/// Because this metric counts insertions and deletions,
109/// it is an unbounded metric (for unbounded DP).
110///
111/// # Proof Definition
112///
113/// ### `d`-closeness
114/// For any two vectors $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
115/// we say that $u, v$ are $d$-close under the insert-delete distance metric
116/// (abbreviated as $d_{ID}$) whenever
117///
118/// ```math
119/// d_{ID}(u, v) \leq d
120/// ```
121///
122/// # Note
123/// The distance type is hard-coded as [`IntDistance`],
124/// so this metric is not generic over the distance type like many other metrics.
125///
126/// # Compatible Domains
127///
128/// * `VectorDomain<D>` for any valid `D`
129#[derive(Clone)]
130pub struct InsertDeleteDistance;
131
132impl Default for InsertDeleteDistance {
133    fn default() -> Self {
134        InsertDeleteDistance
135    }
136}
137
138impl PartialEq for InsertDeleteDistance {
139    fn eq(&self, _other: &Self) -> bool {
140        true
141    }
142}
143impl Debug for InsertDeleteDistance {
144    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
145        write!(f, "InsertDeleteDistance()")
146    }
147}
148impl Metric for InsertDeleteDistance {
149    type Distance = IntDistance;
150}
151
152impl<D: Domain> MetricSpace for (VectorDomain<D>, InsertDeleteDistance) {
153    fn check_space(&self) -> Fallible<()> {
154        Ok(())
155    }
156}
157
158#[cfg(feature = "contrib")]
159impl<K: Hashable> MetricSpace for (DataFrameDomain<K>, InsertDeleteDistance) {
160    fn check_space(&self) -> Fallible<()> {
161        Ok(())
162    }
163}
164
165/// The smallest number of changes to make two equal-length datasets equivalent.
166///
167/// This metric is not sensitive to data ordering.
168/// Since this metric counts the number of changed rows,
169/// it is a bounded metric (for bounded DP).
170///
171/// Since this metric is bounded, the dataset size must be fixed.
172/// Thus we only consider neighboring datasets with the same fixed size: [`crate::domains::VectorDomain::size`].
173///
174/// # Proof Definition
175///
176/// ### `d`-closeness
177/// For any two datasets $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
178/// we say that $u, v$ are $d$-close under the change-one distance metric (abbreviated as $d_{CO}$) whenever
179///
180/// ```math
181/// d_{CO}(u, v) = d_{Sym}(u, v) / 2 \leq d
182/// ```
183/// $d_{Sym}$ is in reference to the [`SymmetricDistance`].
184///
185/// # Note
186/// Since the dataset size is fixed,
187/// there are always just as many additions as there are removals to reach an adjacent dataset.
188/// Consider an edit as one addition and one removal,
189/// therefore the symmetric distance is always even.
190///
191/// The distance type is hard-coded as [`IntDistance`],
192/// so this metric is not generic over the distance type like many other metrics.
193///
194/// WLOG, most OpenDP interfaces need only consider unbounded metrics.
195/// Use [`crate::transformations::make_metric_unbounded`] and [`crate::transformations::make_metric_bounded`]
196/// to convert to/from the symmetric distance.
197///
198/// # Compatible Domains
199///
200/// * `VectorDomain<D>` for any valid `D`, when `VectorDomain::size.is_some()`.
201#[derive(Clone)]
202pub struct ChangeOneDistance;
203
204impl Default for ChangeOneDistance {
205    fn default() -> Self {
206        ChangeOneDistance
207    }
208}
209
210impl PartialEq for ChangeOneDistance {
211    fn eq(&self, _other: &Self) -> bool {
212        true
213    }
214}
215impl Debug for ChangeOneDistance {
216    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
217        write!(f, "ChangeOneDistance()")
218    }
219}
220impl Metric for ChangeOneDistance {
221    type Distance = IntDistance;
222}
223
224impl<D: Domain> MetricSpace for (VectorDomain<D>, ChangeOneDistance) {
225    fn check_space(&self) -> Fallible<()> {
226        self.0.size.map(|_| ()).ok_or_else(|| {
227            err!(
228                MetricSpace,
229                "change-one distance requires a known dataset size"
230            )
231        })
232    }
233}
234
235/// The number of elements that differ between two equal-length datasets.
236///
237/// This metric is sensitive to data ordering.
238/// Since this metric counts the number of changed rows,
239/// it is a bounded metric (for bounded DP).
240///
241/// Since this metric is bounded, the dataset size must be fixed.
242/// Thus we only consider neighboring datasets with the same fixed size: [`crate::domains::VectorDomain::size`].
243///
244/// # Proof Definition
245///
246/// ### `d`-closeness
247/// For any two datasets $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
248/// we say that $u, v$ are $d$-close under the Hamming distance metric (abbreviated as $d_{Ham}$) whenever
249///
250/// ```math
251/// d_{Ham}(u, v) = \#\{i: u_i \neq v_i\} \leq d
252/// ```
253///
254/// # Note
255///
256/// The distance type is hard-coded as [`IntDistance`],
257/// so this metric is not generic over the distance type like many other metrics.
258///
259/// WLOG, most OpenDP interfaces need only consider unbounded metrics.
260/// Use [`crate::transformations::make_metric_unbounded`] and [`crate::transformations::make_metric_bounded`]
261/// to convert to/from the symmetric distance.
262///
263/// # Compatible Domains
264///
265/// * `VectorDomain<D>` for any valid `D`, when `VectorDomain::size.is_some()`.
266#[derive(Clone)]
267pub struct HammingDistance;
268
269impl Default for HammingDistance {
270    fn default() -> Self {
271        HammingDistance
272    }
273}
274
275impl PartialEq for HammingDistance {
276    fn eq(&self, _other: &Self) -> bool {
277        true
278    }
279}
280impl Debug for HammingDistance {
281    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
282        write!(f, "HammingDistance()")
283    }
284}
285impl Metric for HammingDistance {
286    type Distance = IntDistance;
287}
288impl<D: Domain> MetricSpace for (VectorDomain<D>, HammingDistance) {
289    fn check_space(&self) -> Fallible<()> {
290        self.0.size.map(|_| ()).ok_or_else(|| {
291            err!(
292                MetricSpace,
293                "Hamming distance requires a known dataset size"
294            )
295        })
296    }
297}
298
299/// The $L_p$ distance between two vector-valued aggregates.
300///
301/// # Proof Definition
302///
303/// ### $d$-closeness
304/// For any two vectors $u, v \in \texttt{D}$ and $d$ of generic type $\texttt{Q}$,
305/// we say that $u, v$ are $d$-close under the the $L_p$ distance metric (abbreviated as $d_{LP}$) whenever
306///
307/// ```math
308/// d_{LP}(u, v) = \|u_i - v_i\|_p \leq d
309/// ```
310///
311/// If $u$ and $v$ are different lengths, then
312/// ```math
313/// d_{LP}(u, v) = \infty
314/// ```
315///
316/// # Compatible Domains
317///
318/// * `VectorDomain<D>` for any valid `D`
319/// * `MapDomain<D>` for any valid `D`
320pub struct LpDistance<const P: usize, Q>(PhantomData<fn() -> Q>);
321impl<const P: usize, Q> Default for LpDistance<P, Q> {
322    fn default() -> Self {
323        LpDistance(PhantomData)
324    }
325}
326
327impl<const P: usize, Q> Clone for LpDistance<P, Q> {
328    fn clone(&self) -> Self {
329        Self::default()
330    }
331}
332impl<const P: usize, Q> PartialEq for LpDistance<P, Q> {
333    fn eq(&self, _other: &Self) -> bool {
334        true
335    }
336}
337impl<const P: usize, Q> Debug for LpDistance<P, Q> {
338    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
339        write!(f, "L{}Distance({})", P, type_name!(Q))
340    }
341}
342impl<const P: usize, Q> Metric for LpDistance<P, Q> {
343    type Distance = Q;
344}
345
346impl<T: CheckAtom, const P: usize, Q> MetricSpace
347    for (VectorDomain<AtomDomain<T>>, LpDistance<P, Q>)
348{
349    fn check_space(&self) -> Fallible<()> {
350        if self.0.element_domain.nullable() {
351            fallible!(MetricSpace, "LpDistance requires non-nullable elements")
352        } else {
353            Ok(())
354        }
355    }
356}
357impl<K: CheckAtom, V: CheckAtom, const P: usize, Q> MetricSpace
358    for (MapDomain<AtomDomain<K>, AtomDomain<V>>, LpDistance<P, Q>)
359where
360    K: Eq + Hash,
361{
362    fn check_space(&self) -> Fallible<()> {
363        if self.0.value_domain.nullable() {
364            return fallible!(MetricSpace, "LpDistance requires non-nullable elements");
365        } else {
366            Ok(())
367        }
368    }
369}
370
371/// The $L_1$ distance between two vector-valued aggregates.
372///
373/// Refer to [`LpDistance`] for details.
374pub type L1Distance<Q> = LpDistance<1, Q>;
375
376/// The $L_2$ distance between two vector-valued aggregates.
377///
378/// Refer to [`LpDistance`] for details.
379pub type L2Distance<Q> = LpDistance<2, Q>;
380
381/// The absolute distance between two scalar-valued aggregates.
382///
383/// # Proof Definition
384///
385/// ### `d`-closeness
386/// For any two scalars $u, v \in \texttt{D}$ and $d$ of generic type $\texttt{Q}$,
387/// we say that $u, v$ are $d$-close under the the the absolute distance metric (abbreviated as $d_{Abs}$) whenever
388///
389/// ```math
390/// d_{Abs}(u, v) = |u - v| \leq d
391/// ```
392///
393/// # Compatible Domains
394///
395/// * `AtomDomain<T>` for any valid `T`
396pub struct AbsoluteDistance<Q>(PhantomData<fn() -> Q>);
397impl<Q> Default for AbsoluteDistance<Q> {
398    fn default() -> Self {
399        AbsoluteDistance(PhantomData)
400    }
401}
402
403impl<Q> Clone for AbsoluteDistance<Q> {
404    fn clone(&self) -> Self {
405        Self::default()
406    }
407}
408impl<Q> PartialEq for AbsoluteDistance<Q> {
409    fn eq(&self, _other: &Self) -> bool {
410        true
411    }
412}
413impl<Q> Debug for AbsoluteDistance<Q> {
414    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
415        write!(f, "AbsoluteDistance({})", type_name!(Q))
416    }
417}
418impl<Q> Metric for AbsoluteDistance<Q> {
419    type Distance = Q;
420}
421impl<T: CheckAtom, Q> MetricSpace for (AtomDomain<T>, AbsoluteDistance<Q>) {
422    fn check_space(&self) -> Fallible<()> {
423        if self.0.nullable() {
424            fallible!(
425                MetricSpace,
426                "AbsoluteDistance requires non-nullable elements"
427            )
428        } else {
429            Ok(())
430        }
431    }
432}
433
434/// The $L^0$, $L\infty$ norms of the per-partition distances between data sets.
435///
436/// The $L^0$ norm counts the number of partitions that have changed.
437/// The $L\infty$ norm is the greatest change in any one partition.
438///
439/// # Proof Definition
440///
441/// ### $d$-closeness
442/// For any two partitionings $x, x' \in \texttt{D}$ and $d$ of type `(u32, M::Distance)`,
443/// we say that $x, x'$ are $d = (l0, li)$-close under the the partition distance metric whenever
444///
445/// ```math
446/// d(x, x') = (|d_M(x, x')|_0, |d_M(x, x')|_\infty) \leq (l0, li) = d
447/// ```
448///
449/// Both numbers in the 2-tuple must be less than their respective values to be $d$-close.
450///
451#[derive(Clone, PartialEq)]
452pub struct Parallel<M: Metric>(pub M);
453impl<M: Metric> Default for Parallel<M> {
454    fn default() -> Self {
455        Parallel(M::default())
456    }
457}
458impl<M: Metric> Debug for Parallel<M> {
459    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
460        write!(f, "Parallel({:?})", self.0)
461    }
462}
463
464impl<M: Metric> Metric for Parallel<M> {
465    //               L^0          L^\infty
466    type Distance = (IntDistance, M::Distance);
467}
468
469/// The $L^0$, $L^1$, $L\infty$ norms of the per-partition distances between data sets.
470///
471/// The $L^0$ norm counts the number of partitions that have changed.
472/// The $L^1$ norm is the total change.
473/// The $L\infty$ norm is the greatest change in any one partition.
474///
475/// # Proof Definition
476///
477/// ### $d$-closeness
478/// For any two partitionings $u, v \in \texttt{D}$ and $d$ of type `(usize, M::Distance, M::Distance)`,
479/// we say that $u, v$ are $d$-close under the the partition distance metric whenever
480///
481/// ```math
482/// d(x, x') = |d_M(x, x')|_0, |d_M(x, x')|_1, |d_M(x, x')|_\infty \leq d
483/// ```
484///
485/// All three numbers in the triple must be less than their respective values in $d$ to be $d$-close.
486///
487#[derive(Clone, PartialEq)]
488pub struct PartitionDistance<M: Metric>(pub M);
489impl<M: Metric> Default for PartitionDistance<M> {
490    fn default() -> Self {
491        PartitionDistance(M::default())
492    }
493}
494
495impl<M: Metric> Debug for PartitionDistance<M> {
496    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
497        write!(f, "PartitionDistance({:?})", self.0)
498    }
499}
500
501impl<M: Metric> Metric for PartitionDistance<M> {
502    //               L^0          L^1          L^\infty
503    type Distance = (IntDistance, M::Distance, M::Distance);
504}
505
506impl<T: CheckAtom> MetricSpace
507    for (
508        VectorDomain<AtomDomain<T>>,
509        PartitionDistance<AbsoluteDistance<T>>,
510    )
511{
512    fn check_space(&self) -> Fallible<()> {
513        if self.0.element_domain.nullable() {
514            fallible!(
515                MetricSpace,
516                "PartitionDistance requires non-nullable elements"
517            )
518        } else {
519            Ok(())
520        }
521    }
522}
523
524/// Indicates if two elements are equal to each other.
525///
526/// This is used in the context of randomized response,
527/// to capture the distance between adjacent inputs (they are either equivalent or not).
528///
529/// # Proof Definition
530///
531/// ### `d`-closeness
532/// For any two datasets $u, v \in$ `AtomDomain<T>` and any $d$ of type [`IntDistance`],
533/// we say that $u, v$ are $d$-close under the discrete metric (abbreviated as $d_{Eq}$) whenever
534///
535/// ```math
536/// d_{Eq}(u, v) = \mathbb{1}[u = v] \leq d
537/// ```
538///
539/// # Notes
540/// Clearly, `d` is bounded above by 1.
541/// 1 is the expected argument on measurements that use this distance.
542///
543/// # Compatible Domains
544/// * `AtomDomain<T>` for any valid `T`.
545#[derive(Clone, Default, PartialEq)]
546pub struct DiscreteDistance;
547
548impl Debug for DiscreteDistance {
549    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
550        write!(f, "DiscreteDistance()")
551    }
552}
553impl Metric for DiscreteDistance {
554    type Distance = IntDistance;
555}
556
557impl<T: CheckAtom> MetricSpace for (AtomDomain<T>, DiscreteDistance) {
558    fn check_space(&self) -> Fallible<()> {
559        Ok(())
560    }
561}
562
563impl MetricSpace for (BitVectorDomain, DiscreteDistance) {
564    fn check_space(&self) -> Fallible<()> {
565        Ok(())
566    }
567}
568
569/// Distance between score vectors with monotonicity indicator.
570///
571/// # Proof Definition
572///
573/// ### `d`-closeness
574/// For any two datasets $u, v \in$ `VectorDomain<AtomDomain<T>>` and any $d$ of type `T`,
575/// we say that $u, v$ are $d$-close under the l-infinity metric (abbreviated as $d_{\infty}$) whenever
576///
577/// ```math
578/// d_{\infty}(u, v) = max_{i} |u_i - v_i|
579/// ```
580///
581/// If `monotonic` is `true`, then the distance is infinity if any of the differences have opposing signs.
582pub struct LInfDistance<Q> {
583    pub monotonic: bool,
584    _marker: PhantomData<fn() -> Q>,
585}
586
587impl<Q> LInfDistance<Q> {
588    pub fn new(monotonic: bool) -> Self {
589        LInfDistance {
590            monotonic,
591            _marker: PhantomData,
592        }
593    }
594}
595
596impl<Q: InfAdd> LInfDistance<Q> {
597    /// Translate a distance bound `d_in` wrt the $L_\infty$ metric to a distance bound wrt the range metric.
598    ///
599    /// ```math
600    /// d_{\text{Range}}(u, v) = max_{ij} |(u_i - v_i) - (u_j - v_j)|
601    /// ```
602    pub fn range_distance(&self, d_in: Q) -> Fallible<Q> {
603        if self.monotonic {
604            Ok(d_in)
605        } else {
606            d_in.inf_add(&d_in)
607        }
608    }
609}
610
611impl<Q> Default for LInfDistance<Q> {
612    fn default() -> Self {
613        LInfDistance {
614            monotonic: false,
615            _marker: PhantomData,
616        }
617    }
618}
619
620impl<Q> Clone for LInfDistance<Q> {
621    fn clone(&self) -> Self {
622        LInfDistance {
623            monotonic: self.monotonic,
624            _marker: PhantomData,
625        }
626    }
627}
628impl<Q> PartialEq for LInfDistance<Q> {
629    fn eq(&self, other: &Self) -> bool {
630        self.monotonic == other.monotonic
631    }
632}
633impl<Q> Debug for LInfDistance<Q> {
634    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
635        let monotonic = self.monotonic.then_some("monotonic, ").unwrap_or_default();
636        write!(f, "LInfDistance({monotonic}T={})", type_name!(Q))
637    }
638}
639
640impl<Q> Metric for LInfDistance<Q> {
641    type Distance = Q;
642}
643
644impl<T: CheckAtom> MetricSpace for (VectorDomain<AtomDomain<T>>, LInfDistance<T>) {
645    fn check_space(&self) -> Fallible<()> {
646        if self.0.element_domain.nullable() {
647            fallible!(MetricSpace, "LInfDistance requires non-nullable elements")
648        } else {
649            Ok(())
650        }
651    }
652}