opendp/metrics/mod.rs
1//! Various implementations of Metrics (and associated Distance).
2//!
3//! A Metric is used to measure the distance between data.
4//! Metrics are paired with a **domain** on which the metric can measure distance.
5//! The distance is expressed in terms of an **associated type**.
6//!
7//! # Example
8//!
9//! [`SymmetricDistance`] can be paired with a domain: `VectorDomain(AtomDomain(T))`.
10//! In this context, the `SymmetricDistance` is used to measure the distance between any two vectors of elements of type `T`.
11//! The `SymmetricDistance` has an associated distance type of [`u32`].
12//! This means that the symmetric distance between vectors is expressed in terms of a [`u32`].
13
14#[cfg(feature = "ffi")]
15pub(crate) mod ffi;
16
17use std::hash::Hash;
18use std::marker::PhantomData;
19
20use crate::{
21 core::{Domain, Metric, MetricSpace},
22 domains::{type_name, AtomDomain, BitVectorDomain, MapDomain, VectorDomain},
23 error::Fallible,
24 traits::{CheckAtom, InfAdd},
25};
26#[cfg(feature = "contrib")]
27use crate::{traits::Hashable, transformations::DataFrameDomain};
28use std::fmt::{Debug, Formatter};
29
30/// The type that represents the distance between datasets.
31/// It is used as the associated [`Metric`]::Distance type for e.g. [`SymmetricDistance`], [`InsertDeleteDistance`], etc.
32pub type IntDistance = u32;
33
34/// The smallest number of additions or removals to make two datasets equivalent.
35///
36/// This metric is not sensitive to data ordering.
37/// Because this metric counts additions and removals,
38/// it is an unbounded metric (for unbounded DP).
39///
40/// # Proof Definition
41///
42/// ### `d`-closeness
43/// For any two vectors $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
44/// we say that $u, v$ are $d$-close under the symmetric distance metric
45/// (abbreviated as $d_{Sym}$) whenever
46///
47/// ```math
48/// d_{Sym}(u, v) = |u \Delta v| \leq d
49/// ```
50/// # Note
51/// The distance type is hard-coded as [`IntDistance`],
52/// so this metric is not generic over the distance type like many other metrics.
53///
54/// # Compatible Domains
55///
56/// * `VectorDomain<D>` for any valid `D`
57///
58/// When this metric is paired with a `VectorDomain`, we instead consider the multisets corresponding to $u, v \in \texttt{D}$.
59#[derive(Clone)]
60pub struct SymmetricDistance;
61
62impl Default for SymmetricDistance {
63 fn default() -> Self {
64 SymmetricDistance
65 }
66}
67
68impl PartialEq for SymmetricDistance {
69 fn eq(&self, _other: &Self) -> bool {
70 true
71 }
72}
73impl Debug for SymmetricDistance {
74 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
75 write!(f, "SymmetricDistance()")
76 }
77}
78impl Metric for SymmetricDistance {
79 type Distance = IntDistance;
80}
81
82// Symmetric distance is defined in terms of unescaped line-breaks for CSV string datasets
83impl MetricSpace for (AtomDomain<String>, SymmetricDistance) {
84 fn check_space(&self) -> Fallible<()> {
85 Ok(())
86 }
87}
88
89impl<D: Domain> MetricSpace for (VectorDomain<D>, SymmetricDistance) {
90 fn check_space(&self) -> Fallible<()> {
91 Ok(())
92 }
93}
94
95#[cfg(feature = "contrib")]
96impl<K: Hashable> MetricSpace for (DataFrameDomain<K>, SymmetricDistance) {
97 fn check_space(&self) -> Fallible<()> {
98 Ok(())
99 }
100}
101
102/// The smallest number of insertions or deletions to make two datasets equivalent.
103///
104/// An *insertion* to a dataset is an addition of an element at a specific index,
105/// and a *deletion* is the removal of an element at a specific index.
106///
107/// Therefore, this metric is sensitive to data ordering.
108/// Because this metric counts insertions and deletions,
109/// it is an unbounded metric (for unbounded DP).
110///
111/// # Proof Definition
112///
113/// ### `d`-closeness
114/// For any two vectors $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
115/// we say that $u, v$ are $d$-close under the insert-delete distance metric
116/// (abbreviated as $d_{ID}$) whenever
117///
118/// ```math
119/// d_{ID}(u, v) \leq d
120/// ```
121///
122/// # Note
123/// The distance type is hard-coded as [`IntDistance`],
124/// so this metric is not generic over the distance type like many other metrics.
125///
126/// # Compatible Domains
127///
128/// * `VectorDomain<D>` for any valid `D`
129#[derive(Clone)]
130pub struct InsertDeleteDistance;
131
132impl Default for InsertDeleteDistance {
133 fn default() -> Self {
134 InsertDeleteDistance
135 }
136}
137
138impl PartialEq for InsertDeleteDistance {
139 fn eq(&self, _other: &Self) -> bool {
140 true
141 }
142}
143impl Debug for InsertDeleteDistance {
144 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
145 write!(f, "InsertDeleteDistance()")
146 }
147}
148impl Metric for InsertDeleteDistance {
149 type Distance = IntDistance;
150}
151
152impl<D: Domain> MetricSpace for (VectorDomain<D>, InsertDeleteDistance) {
153 fn check_space(&self) -> Fallible<()> {
154 Ok(())
155 }
156}
157
158#[cfg(feature = "contrib")]
159impl<K: Hashable> MetricSpace for (DataFrameDomain<K>, InsertDeleteDistance) {
160 fn check_space(&self) -> Fallible<()> {
161 Ok(())
162 }
163}
164
165/// The smallest number of changes to make two equal-length datasets equivalent.
166///
167/// This metric is not sensitive to data ordering.
168/// Since this metric counts the number of changed rows,
169/// it is a bounded metric (for bounded DP).
170///
171/// Since this metric is bounded, the dataset size must be fixed.
172/// Thus we only consider neighboring datasets with the same fixed size: [`crate::domains::VectorDomain::size`].
173///
174/// # Proof Definition
175///
176/// ### `d`-closeness
177/// For any two datasets $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
178/// we say that $u, v$ are $d$-close under the change-one distance metric (abbreviated as $d_{CO}$) whenever
179///
180/// ```math
181/// d_{CO}(u, v) = d_{Sym}(u, v) / 2 \leq d
182/// ```
183/// $d_{Sym}$ is in reference to the [`SymmetricDistance`].
184///
185/// # Note
186/// Since the dataset size is fixed,
187/// there are always just as many additions as there are removals to reach an adjacent dataset.
188/// Consider an edit as one addition and one removal,
189/// therefore the symmetric distance is always even.
190///
191/// The distance type is hard-coded as [`IntDistance`],
192/// so this metric is not generic over the distance type like many other metrics.
193///
194/// WLOG, most OpenDP interfaces need only consider unbounded metrics.
195/// Use [`crate::transformations::make_metric_unbounded`] and [`crate::transformations::make_metric_bounded`]
196/// to convert to/from the symmetric distance.
197///
198/// # Compatible Domains
199///
200/// * `VectorDomain<D>` for any valid `D`, when `VectorDomain::size.is_some()`.
201#[derive(Clone)]
202pub struct ChangeOneDistance;
203
204impl Default for ChangeOneDistance {
205 fn default() -> Self {
206 ChangeOneDistance
207 }
208}
209
210impl PartialEq for ChangeOneDistance {
211 fn eq(&self, _other: &Self) -> bool {
212 true
213 }
214}
215impl Debug for ChangeOneDistance {
216 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
217 write!(f, "ChangeOneDistance()")
218 }
219}
220impl Metric for ChangeOneDistance {
221 type Distance = IntDistance;
222}
223
224impl<D: Domain> MetricSpace for (VectorDomain<D>, ChangeOneDistance) {
225 fn check_space(&self) -> Fallible<()> {
226 self.0.size.map(|_| ()).ok_or_else(|| {
227 err!(
228 MetricSpace,
229 "change-one distance requires a known dataset size"
230 )
231 })
232 }
233}
234
235/// The number of elements that differ between two equal-length datasets.
236///
237/// This metric is sensitive to data ordering.
238/// Since this metric counts the number of changed rows,
239/// it is a bounded metric (for bounded DP).
240///
241/// Since this metric is bounded, the dataset size must be fixed.
242/// Thus we only consider neighboring datasets with the same fixed size: [`crate::domains::VectorDomain::size`].
243///
244/// # Proof Definition
245///
246/// ### `d`-closeness
247/// For any two datasets $u, v \in \texttt{D}$ and any $d$ of type [`IntDistance`],
248/// we say that $u, v$ are $d$-close under the Hamming distance metric (abbreviated as $d_{Ham}$) whenever
249///
250/// ```math
251/// d_{Ham}(u, v) = \#\{i: u_i \neq v_i\} \leq d
252/// ```
253///
254/// # Note
255///
256/// The distance type is hard-coded as [`IntDistance`],
257/// so this metric is not generic over the distance type like many other metrics.
258///
259/// WLOG, most OpenDP interfaces need only consider unbounded metrics.
260/// Use [`crate::transformations::make_metric_unbounded`] and [`crate::transformations::make_metric_bounded`]
261/// to convert to/from the symmetric distance.
262///
263/// # Compatible Domains
264///
265/// * `VectorDomain<D>` for any valid `D`, when `VectorDomain::size.is_some()`.
266#[derive(Clone)]
267pub struct HammingDistance;
268
269impl Default for HammingDistance {
270 fn default() -> Self {
271 HammingDistance
272 }
273}
274
275impl PartialEq for HammingDistance {
276 fn eq(&self, _other: &Self) -> bool {
277 true
278 }
279}
280impl Debug for HammingDistance {
281 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
282 write!(f, "HammingDistance()")
283 }
284}
285impl Metric for HammingDistance {
286 type Distance = IntDistance;
287}
288impl<D: Domain> MetricSpace for (VectorDomain<D>, HammingDistance) {
289 fn check_space(&self) -> Fallible<()> {
290 self.0.size.map(|_| ()).ok_or_else(|| {
291 err!(
292 MetricSpace,
293 "Hamming distance requires a known dataset size"
294 )
295 })
296 }
297}
298
299/// The $L_p$ distance between two vector-valued aggregates.
300///
301/// # Proof Definition
302///
303/// ### $d$-closeness
304/// For any two vectors $u, v \in \texttt{D}$ and $d$ of generic type $\texttt{Q}$,
305/// we say that $u, v$ are $d$-close under the the $L_p$ distance metric (abbreviated as $d_{LP}$) whenever
306///
307/// ```math
308/// d_{LP}(u, v) = \|u_i - v_i\|_p \leq d
309/// ```
310///
311/// If $u$ and $v$ are different lengths, then
312/// ```math
313/// d_{LP}(u, v) = \infty
314/// ```
315///
316/// # Compatible Domains
317///
318/// * `VectorDomain<D>` for any valid `D`
319/// * `MapDomain<D>` for any valid `D`
320pub struct LpDistance<const P: usize, Q>(PhantomData<fn() -> Q>);
321impl<const P: usize, Q> Default for LpDistance<P, Q> {
322 fn default() -> Self {
323 LpDistance(PhantomData)
324 }
325}
326
327impl<const P: usize, Q> Clone for LpDistance<P, Q> {
328 fn clone(&self) -> Self {
329 Self::default()
330 }
331}
332impl<const P: usize, Q> PartialEq for LpDistance<P, Q> {
333 fn eq(&self, _other: &Self) -> bool {
334 true
335 }
336}
337impl<const P: usize, Q> Debug for LpDistance<P, Q> {
338 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
339 write!(f, "L{}Distance({})", P, type_name!(Q))
340 }
341}
342impl<const P: usize, Q> Metric for LpDistance<P, Q> {
343 type Distance = Q;
344}
345
346impl<T: CheckAtom, const P: usize, Q> MetricSpace
347 for (VectorDomain<AtomDomain<T>>, LpDistance<P, Q>)
348{
349 fn check_space(&self) -> Fallible<()> {
350 if self.0.element_domain.nullable() {
351 fallible!(MetricSpace, "LpDistance requires non-nullable elements")
352 } else {
353 Ok(())
354 }
355 }
356}
357impl<K: CheckAtom, V: CheckAtom, const P: usize, Q> MetricSpace
358 for (MapDomain<AtomDomain<K>, AtomDomain<V>>, LpDistance<P, Q>)
359where
360 K: Eq + Hash,
361{
362 fn check_space(&self) -> Fallible<()> {
363 if self.0.value_domain.nullable() {
364 return fallible!(MetricSpace, "LpDistance requires non-nullable elements");
365 } else {
366 Ok(())
367 }
368 }
369}
370
371/// The $L_1$ distance between two vector-valued aggregates.
372///
373/// Refer to [`LpDistance`] for details.
374pub type L1Distance<Q> = LpDistance<1, Q>;
375
376/// The $L_2$ distance between two vector-valued aggregates.
377///
378/// Refer to [`LpDistance`] for details.
379pub type L2Distance<Q> = LpDistance<2, Q>;
380
381/// The absolute distance between two scalar-valued aggregates.
382///
383/// # Proof Definition
384///
385/// ### `d`-closeness
386/// For any two scalars $u, v \in \texttt{D}$ and $d$ of generic type $\texttt{Q}$,
387/// we say that $u, v$ are $d$-close under the the the absolute distance metric (abbreviated as $d_{Abs}$) whenever
388///
389/// ```math
390/// d_{Abs}(u, v) = |u - v| \leq d
391/// ```
392///
393/// # Compatible Domains
394///
395/// * `AtomDomain<T>` for any valid `T`
396pub struct AbsoluteDistance<Q>(PhantomData<fn() -> Q>);
397impl<Q> Default for AbsoluteDistance<Q> {
398 fn default() -> Self {
399 AbsoluteDistance(PhantomData)
400 }
401}
402
403impl<Q> Clone for AbsoluteDistance<Q> {
404 fn clone(&self) -> Self {
405 Self::default()
406 }
407}
408impl<Q> PartialEq for AbsoluteDistance<Q> {
409 fn eq(&self, _other: &Self) -> bool {
410 true
411 }
412}
413impl<Q> Debug for AbsoluteDistance<Q> {
414 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
415 write!(f, "AbsoluteDistance({})", type_name!(Q))
416 }
417}
418impl<Q> Metric for AbsoluteDistance<Q> {
419 type Distance = Q;
420}
421impl<T: CheckAtom, Q> MetricSpace for (AtomDomain<T>, AbsoluteDistance<Q>) {
422 fn check_space(&self) -> Fallible<()> {
423 if self.0.nullable() {
424 fallible!(
425 MetricSpace,
426 "AbsoluteDistance requires non-nullable elements"
427 )
428 } else {
429 Ok(())
430 }
431 }
432}
433
434/// The $L^0$, $L\infty$ norms of the per-partition distances between data sets.
435///
436/// The $L^0$ norm counts the number of partitions that have changed.
437/// The $L\infty$ norm is the greatest change in any one partition.
438///
439/// # Proof Definition
440///
441/// ### $d$-closeness
442/// For any two partitionings $x, x' \in \texttt{D}$ and $d$ of type `(u32, M::Distance)`,
443/// we say that $x, x'$ are $d = (l0, li)$-close under the the partition distance metric whenever
444///
445/// ```math
446/// d(x, x') = (|d_M(x, x')|_0, |d_M(x, x')|_\infty) \leq (l0, li) = d
447/// ```
448///
449/// Both numbers in the 2-tuple must be less than their respective values to be $d$-close.
450///
451#[derive(Clone, PartialEq)]
452pub struct Parallel<M: Metric>(pub M);
453impl<M: Metric> Default for Parallel<M> {
454 fn default() -> Self {
455 Parallel(M::default())
456 }
457}
458impl<M: Metric> Debug for Parallel<M> {
459 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
460 write!(f, "Parallel({:?})", self.0)
461 }
462}
463
464impl<M: Metric> Metric for Parallel<M> {
465 // L^0 L^\infty
466 type Distance = (IntDistance, M::Distance);
467}
468
469/// The $L^0$, $L^1$, $L\infty$ norms of the per-partition distances between data sets.
470///
471/// The $L^0$ norm counts the number of partitions that have changed.
472/// The $L^1$ norm is the total change.
473/// The $L\infty$ norm is the greatest change in any one partition.
474///
475/// # Proof Definition
476///
477/// ### $d$-closeness
478/// For any two partitionings $u, v \in \texttt{D}$ and $d$ of type `(usize, M::Distance, M::Distance)`,
479/// we say that $u, v$ are $d$-close under the the partition distance metric whenever
480///
481/// ```math
482/// d(x, x') = |d_M(x, x')|_0, |d_M(x, x')|_1, |d_M(x, x')|_\infty \leq d
483/// ```
484///
485/// All three numbers in the triple must be less than their respective values in $d$ to be $d$-close.
486///
487#[derive(Clone, PartialEq)]
488pub struct PartitionDistance<M: Metric>(pub M);
489impl<M: Metric> Default for PartitionDistance<M> {
490 fn default() -> Self {
491 PartitionDistance(M::default())
492 }
493}
494
495impl<M: Metric> Debug for PartitionDistance<M> {
496 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
497 write!(f, "PartitionDistance({:?})", self.0)
498 }
499}
500
501impl<M: Metric> Metric for PartitionDistance<M> {
502 // L^0 L^1 L^\infty
503 type Distance = (IntDistance, M::Distance, M::Distance);
504}
505
506impl<T: CheckAtom> MetricSpace
507 for (
508 VectorDomain<AtomDomain<T>>,
509 PartitionDistance<AbsoluteDistance<T>>,
510 )
511{
512 fn check_space(&self) -> Fallible<()> {
513 if self.0.element_domain.nullable() {
514 fallible!(
515 MetricSpace,
516 "PartitionDistance requires non-nullable elements"
517 )
518 } else {
519 Ok(())
520 }
521 }
522}
523
524/// Indicates if two elements are equal to each other.
525///
526/// This is used in the context of randomized response,
527/// to capture the distance between adjacent inputs (they are either equivalent or not).
528///
529/// # Proof Definition
530///
531/// ### `d`-closeness
532/// For any two datasets $u, v \in$ `AtomDomain<T>` and any $d$ of type [`IntDistance`],
533/// we say that $u, v$ are $d$-close under the discrete metric (abbreviated as $d_{Eq}$) whenever
534///
535/// ```math
536/// d_{Eq}(u, v) = \mathbb{1}[u = v] \leq d
537/// ```
538///
539/// # Notes
540/// Clearly, `d` is bounded above by 1.
541/// 1 is the expected argument on measurements that use this distance.
542///
543/// # Compatible Domains
544/// * `AtomDomain<T>` for any valid `T`.
545#[derive(Clone, Default, PartialEq)]
546pub struct DiscreteDistance;
547
548impl Debug for DiscreteDistance {
549 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
550 write!(f, "DiscreteDistance()")
551 }
552}
553impl Metric for DiscreteDistance {
554 type Distance = IntDistance;
555}
556
557impl<T: CheckAtom> MetricSpace for (AtomDomain<T>, DiscreteDistance) {
558 fn check_space(&self) -> Fallible<()> {
559 Ok(())
560 }
561}
562
563impl MetricSpace for (BitVectorDomain, DiscreteDistance) {
564 fn check_space(&self) -> Fallible<()> {
565 Ok(())
566 }
567}
568
569/// Distance between score vectors with monotonicity indicator.
570///
571/// # Proof Definition
572///
573/// ### `d`-closeness
574/// For any two datasets $u, v \in$ `VectorDomain<AtomDomain<T>>` and any $d$ of type `T`,
575/// we say that $u, v$ are $d$-close under the l-infinity metric (abbreviated as $d_{\infty}$) whenever
576///
577/// ```math
578/// d_{\infty}(u, v) = max_{i} |u_i - v_i|
579/// ```
580///
581/// If `monotonic` is `true`, then the distance is infinity if any of the differences have opposing signs.
582pub struct LInfDistance<Q> {
583 pub monotonic: bool,
584 _marker: PhantomData<fn() -> Q>,
585}
586
587impl<Q> LInfDistance<Q> {
588 pub fn new(monotonic: bool) -> Self {
589 LInfDistance {
590 monotonic,
591 _marker: PhantomData,
592 }
593 }
594}
595
596impl<Q: InfAdd> LInfDistance<Q> {
597 /// Translate a distance bound `d_in` wrt the $L_\infty$ metric to a distance bound wrt the range metric.
598 ///
599 /// ```math
600 /// d_{\text{Range}}(u, v) = max_{ij} |(u_i - v_i) - (u_j - v_j)|
601 /// ```
602 pub fn range_distance(&self, d_in: Q) -> Fallible<Q> {
603 if self.monotonic {
604 Ok(d_in)
605 } else {
606 d_in.inf_add(&d_in)
607 }
608 }
609}
610
611impl<Q> Default for LInfDistance<Q> {
612 fn default() -> Self {
613 LInfDistance {
614 monotonic: false,
615 _marker: PhantomData,
616 }
617 }
618}
619
620impl<Q> Clone for LInfDistance<Q> {
621 fn clone(&self) -> Self {
622 LInfDistance {
623 monotonic: self.monotonic,
624 _marker: PhantomData,
625 }
626 }
627}
628impl<Q> PartialEq for LInfDistance<Q> {
629 fn eq(&self, other: &Self) -> bool {
630 self.monotonic == other.monotonic
631 }
632}
633impl<Q> Debug for LInfDistance<Q> {
634 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
635 let monotonic = self.monotonic.then_some("monotonic, ").unwrap_or_default();
636 write!(f, "LInfDistance({monotonic}T={})", type_name!(Q))
637 }
638}
639
640impl<Q> Metric for LInfDistance<Q> {
641 type Distance = Q;
642}
643
644impl<T: CheckAtom> MetricSpace for (VectorDomain<AtomDomain<T>>, LInfDistance<T>) {
645 fn check_space(&self) -> Fallible<()> {
646 if self.0.element_domain.nullable() {
647 fallible!(MetricSpace, "LInfDistance requires non-nullable elements")
648 } else {
649 Ok(())
650 }
651 }
652}