datafusion_physical_expr/utils/
guarantee.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19//! constant.
20
21use crate::utils::split_disjunction;
22use crate::{split_conjunction, PhysicalExpr};
23use datafusion_common::{Column, HashMap, ScalarValue};
24use datafusion_expr::Operator;
25use std::collections::HashSet;
26use std::fmt::{self, Display, Formatter};
27use std::sync::Arc;
28
29/// Represents a guarantee that must be true for a boolean expression to
30/// evaluate to `true`.
31///
32/// The guarantee takes the form of a column and a set of literal (constant)
33/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34/// satisfy* the guarantee(s).
35///
36/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37/// column must either:
38///
39/// 1. be ONLY one of that set
40/// 2. NOT be ANY of that set
41///
42/// # Uses `LiteralGuarantee`s
43///
44/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45/// files (e.g. row groups in parquet files) by proving expressions can not
46/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48/// partition where we know that `a` never has the value of `1`.
49///
50/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53/// that does **not** guarantee that the predicate will actually evaluate to
54/// `true`: it may still evaluate to `true`, `false` or `null`.
55///
56/// # Creating `LiteralGuarantee`s
57///
58/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59/// filter predicate.
60///
61/// # Details
62/// A guarantee can be one of two forms:
63///
64/// 1. The column must be one the values for the predicate to be `true`. If the
65///    column takes on any other value, the predicate can not evaluate to `true`.
66///    For example,
67///    `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68///
69/// 2. The column must NOT be one of the values for the predicate to be `true`.
70///    If the column can ONLY take one of these values, the predicate can not
71///    evaluate to `true`. For example,
72///    `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73#[derive(Debug, Clone, PartialEq)]
74pub struct LiteralGuarantee {
75    pub column: Column,
76    pub guarantee: Guarantee,
77    pub literals: HashSet<ScalarValue>,
78}
79
80/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Guarantee {
83    /// Guarantee that the expression is `true` if `column` is one of the values. If
84    /// `column` is not one of the values, the expression can not be `true`.
85    In,
86    /// Guarantee that the expression is `true` if `column` is not ANY of the
87    /// values. If `column` only takes one of these values, the expression can
88    /// not be `true`.
89    NotIn,
90}
91
92impl LiteralGuarantee {
93    /// Create a new instance of the guarantee if the provided operator is
94    /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95    /// create these structures from an predicate (boolean expression).
96    fn new<'a>(
97        column_name: impl Into<String>,
98        guarantee: Guarantee,
99        literals: impl IntoIterator<Item = &'a ScalarValue>,
100    ) -> Self {
101        let literals: HashSet<_> = literals.into_iter().cloned().collect();
102
103        Self {
104            column: Column::from_name(column_name),
105            guarantee,
106            literals,
107        }
108    }
109
110    /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
111    /// to evaluate to `true`.
112    ///
113    /// If more than one `LiteralGuarantee` is returned, they must **all** hold
114    /// for the expression to possibly be `true`. If any is not satisfied, the
115    /// expression is guaranteed to be `null` or `false`.
116    ///
117    /// # Notes:
118    /// 1. `expr` must be a boolean expression or inlist expression.
119    /// 2. `expr` is not simplified prior to analysis.
120    pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
121        // split conjunction: <expr> AND <expr> AND ...
122        split_conjunction(expr)
123            .into_iter()
124            // for an `AND` conjunction to be true, all terms individually must be true
125            .fold(GuaranteeBuilder::new(), |builder, expr| {
126                if let Some(cel) = ColOpLit::try_new(expr) {
127                    builder.aggregate_conjunct(cel)
128                } else if let Some(inlist) = expr
129                    .as_any()
130                    .downcast_ref::<crate::expressions::InListExpr>()
131                {
132                    // Only support single-column inlist currently, multi-column inlist is not supported
133                    let col = inlist
134                        .expr()
135                        .as_any()
136                        .downcast_ref::<crate::expressions::Column>();
137                    let Some(col) = col else {
138                        return builder;
139                    };
140
141                    let literals = inlist
142                        .list()
143                        .iter()
144                        .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
145                        .collect::<Option<Vec<_>>>();
146                    let Some(literals) = literals else {
147                        return builder;
148                    };
149
150                    let guarantee = if inlist.negated() {
151                        Guarantee::NotIn
152                    } else {
153                        Guarantee::In
154                    };
155
156                    builder.aggregate_multi_conjunct(
157                        col,
158                        guarantee,
159                        literals.iter().map(|e| e.value()),
160                    )
161                } else {
162                    // split disjunction: <expr> OR <expr> OR ...
163                    let disjunctions = split_disjunction(expr);
164
165                    // We are trying to add a guarantee that a column must be
166                    // in/not in a particular set of values for the expression
167                    // to evaluate to true.
168                    //
169                    // A disjunction is true, if at least one of the terms is be
170                    // true.
171                    //
172                    // Thus, we can infer a guarantee if all terms are of the
173                    // form `(col <op> literal) OR (col <op> literal) OR ...`.
174                    //
175                    // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
176                    // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
177                    //
178                    // However, for something like  `a = 1 OR a = 2 OR a < 0` we
179                    // **can't** guarantee that the predicate is only true if a
180                    // is in (`1`, `2`), as it could also be true if `a` were less
181                    // than zero.
182                    let terms = disjunctions
183                        .iter()
184                        .filter_map(|expr| ColOpLit::try_new(expr))
185                        .collect::<Vec<_>>();
186
187                    if terms.is_empty() {
188                        return builder;
189                    }
190
191                    // if not all terms are of the form (col <op> literal),
192                    // can't infer any guarantees
193                    if terms.len() != disjunctions.len() {
194                        return builder;
195                    }
196
197                    // if all terms are 'col <op> literal' with the same column
198                    // and operation we can infer any guarantees
199                    //
200                    // For those like (a != foo AND (a != bar OR a != baz)).
201                    // We can't combine the (a != bar OR a != baz) part, but
202                    // it also doesn't invalidate our knowledge that a !=
203                    // foo is required for the expression to be true.
204                    // So we can only create a multi value guarantee for `=`
205                    // (or a single value). (e.g. ignore `a != foo OR a != bar`)
206                    let first_term = &terms[0];
207                    if terms.iter().all(|term| {
208                        term.col.name() == first_term.col.name()
209                            && term.guarantee == Guarantee::In
210                    }) {
211                        builder.aggregate_multi_conjunct(
212                            first_term.col,
213                            Guarantee::In,
214                            terms.iter().map(|term| term.lit.value()),
215                        )
216                    } else {
217                        // can't infer anything
218                        builder
219                    }
220                }
221            })
222            .build()
223    }
224}
225
226impl Display for LiteralGuarantee {
227    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
228        let mut sorted_literals: Vec<_> =
229            self.literals.iter().map(|lit| lit.to_string()).collect();
230        sorted_literals.sort();
231        match self.guarantee {
232            Guarantee::In => write!(
233                f,
234                "{} in ({})",
235                self.column.name,
236                sorted_literals.join(", ")
237            ),
238            Guarantee::NotIn => write!(
239                f,
240                "{} not in ({})",
241                self.column.name,
242                sorted_literals.join(", ")
243            ),
244        }
245    }
246}
247
248/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
249/// preserving insert order
250#[derive(Debug, Default)]
251struct GuaranteeBuilder<'a> {
252    /// List of guarantees that have been created so far
253    /// if we have determined a subsequent conjunct invalidates a guarantee
254    /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
255    guarantees: Vec<Option<LiteralGuarantee>>,
256
257    /// Key is the (column name, guarantee type)
258    /// Value is the index into `guarantees`
259    map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
260}
261
262impl<'a> GuaranteeBuilder<'a> {
263    fn new() -> Self {
264        Default::default()
265    }
266
267    /// Aggregate a new single `AND col <op> literal` term to this builder
268    /// combining with existing guarantees if possible.
269    ///
270    /// # Examples
271    /// * `AND (a = 1)`: `a` is guaranteed to be 1
272    /// * `AND (a != 1)`: a is guaranteed to not be 1
273    fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self {
274        self.aggregate_multi_conjunct(
275            col_op_lit.col,
276            col_op_lit.guarantee,
277            [col_op_lit.lit.value()],
278        )
279    }
280
281    /// Aggregates a new single column, multi literal term to this builder
282    /// combining with previously known guarantees if possible.
283    ///
284    /// # Examples
285    /// For the following examples, we can guarantee the expression is `true` if:
286    /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
287    /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
288    /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
289    /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
290    fn aggregate_multi_conjunct(
291        mut self,
292        col: &'a crate::expressions::Column,
293        guarantee: Guarantee,
294        new_values: impl IntoIterator<Item = &'a ScalarValue>,
295    ) -> Self {
296        let key = (col, guarantee);
297        if let Some(index) = self.map.get(&key) {
298            // already have a guarantee for this column
299            let entry = &mut self.guarantees[*index];
300
301            let Some(existing) = entry else {
302                // determined the previous guarantee for this column has been
303                // invalidated, nothing to do
304                return self;
305            };
306
307            // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
308            // AND a = bar` doesn't make logical sense so we don't optimize this
309            // case
310            match existing.guarantee {
311                // knew that the column could not be a set of values
312                //
313                // For example, if we previously had `a != 5` and now we see
314                // another `AND a != 6` we know that a must not be either 5 or 6
315                // for the expression to be true
316                Guarantee::NotIn => {
317                    let new_values: HashSet<_> = new_values.into_iter().collect();
318                    existing.literals.extend(new_values.into_iter().cloned());
319                }
320                Guarantee::In => {
321                    let intersection = new_values
322                        .into_iter()
323                        .filter(|new_value| existing.literals.contains(*new_value))
324                        .collect::<Vec<_>>();
325                    // for an In guarantee, if the intersection is not empty,  we can extend the guarantee
326                    // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
327                    // otherwise, we invalidate the guarantee
328                    // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
329                    if !intersection.is_empty() {
330                        existing.literals = intersection.into_iter().cloned().collect();
331                    } else {
332                        // at least one was not, so invalidate the guarantee
333                        *entry = None;
334                    }
335                }
336            }
337        } else {
338            // This is a new guarantee
339            let new_values: HashSet<_> = new_values.into_iter().collect();
340
341            let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
342            // add it to the list of guarantees
343            self.guarantees.push(Some(guarantee));
344            self.map.insert(key, self.guarantees.len() - 1);
345        }
346
347        self
348    }
349
350    /// Return all guarantees that have been created so far
351    fn build(self) -> Vec<LiteralGuarantee> {
352        // filter out any guarantees that have been invalidated
353        self.guarantees.into_iter().flatten().collect()
354    }
355}
356
357/// Represents a single `col [not]in literal` expression
358struct ColOpLit<'a> {
359    col: &'a crate::expressions::Column,
360    guarantee: Guarantee,
361    lit: &'a crate::expressions::Literal,
362}
363
364impl<'a> ColOpLit<'a> {
365    /// Returns Some(ColEqLit) if the expression is either:
366    /// 1. `col <op> literal`
367    /// 2. `literal <op> col`
368    /// 3. operator is `=` or `!=`
369    ///
370    /// Returns None otherwise
371    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
372        let binary_expr = expr
373            .as_any()
374            .downcast_ref::<crate::expressions::BinaryExpr>()?;
375
376        let (left, op, right) = (
377            binary_expr.left().as_any(),
378            binary_expr.op(),
379            binary_expr.right().as_any(),
380        );
381        let guarantee = match op {
382            Operator::Eq => Guarantee::In,
383            Operator::NotEq => Guarantee::NotIn,
384            _ => return None,
385        };
386        // col <op> literal
387        if let (Some(col), Some(lit)) = (
388            left.downcast_ref::<crate::expressions::Column>(),
389            right.downcast_ref::<crate::expressions::Literal>(),
390        ) {
391            Some(Self {
392                col,
393                guarantee,
394                lit,
395            })
396        }
397        // literal <op> col
398        else if let (Some(lit), Some(col)) = (
399            left.downcast_ref::<crate::expressions::Literal>(),
400            right.downcast_ref::<crate::expressions::Column>(),
401        ) {
402            Some(Self {
403                col,
404                guarantee,
405                lit,
406            })
407        } else {
408            None
409        }
410    }
411}
412
413#[cfg(test)]
414mod test {
415    use std::sync::LazyLock;
416
417    use super::*;
418    use crate::planner::logical2physical;
419
420    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
421    use datafusion_expr::expr_fn::*;
422    use datafusion_expr::{lit, Expr};
423
424    use itertools::Itertools;
425
426    #[test]
427    fn test_literal() {
428        // a single literal offers no guarantee
429        test_analyze(lit(true), vec![])
430    }
431
432    #[test]
433    fn test_single() {
434        // a = "foo"
435        test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
436        // "foo" = a
437        test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
438        // a != "foo"
439        test_analyze(
440            col("a").not_eq(lit("foo")),
441            vec![not_in_guarantee("a", ["foo"])],
442        );
443        // "foo" != a
444        test_analyze(
445            lit("foo").not_eq(col("a")),
446            vec![not_in_guarantee("a", ["foo"])],
447        );
448    }
449
450    #[test]
451    fn test_conjunction_single_column() {
452        // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
453        test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
454        // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
455        test_analyze(
456            col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
457            vec![
458                // can only be true of b is 1 and b is not 2 (even though it is redundant)
459                in_guarantee("b", [1]),
460                not_in_guarantee("b", [2]),
461            ],
462        );
463        // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
464        test_analyze(
465            col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
466            vec![
467                // can only be true of b is not 1 and b is 2 (even though it is redundant)
468                not_in_guarantee("b", [1]),
469                in_guarantee("b", [2]),
470            ],
471        );
472        // b != 1 AND b != 2
473        test_analyze(
474            col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
475            vec![not_in_guarantee("b", [1, 2])],
476        );
477        // b != 1 AND b != 2 and b != 3
478        test_analyze(
479            col("b")
480                .not_eq(lit(1))
481                .and(col("b").not_eq(lit(2)))
482                .and(col("b").not_eq(lit(3))),
483            vec![not_in_guarantee("b", [1, 2, 3])],
484        );
485        // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
486        test_analyze(
487            col("b")
488                .not_eq(lit(1))
489                .and(col("b").eq(lit(2)))
490                .and(col("b").not_eq(lit(3))),
491            vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
492        );
493        // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
494        test_analyze(
495            col("b")
496                .not_eq(lit(1))
497                .and(col("b").not_eq(lit(2)))
498                .and(col("b").eq(lit(3))),
499            vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
500        );
501        // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
502        test_analyze(
503            col("b")
504                .not_eq(lit(1))
505                .and(col("b").not_eq(lit(2)))
506                .and(col("b").gt(lit(3))),
507            vec![not_in_guarantee("b", [1, 2])],
508        );
509    }
510
511    #[test]
512    fn test_conjunction_multi_column() {
513        // a = "foo" AND b = 1
514        test_analyze(
515            col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
516            vec![
517                // should find both column guarantees
518                in_guarantee("a", ["foo"]),
519                in_guarantee("b", [1]),
520            ],
521        );
522        // a != "foo" AND b != 1
523        test_analyze(
524            col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
525            // should find both column guarantees
526            vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
527        );
528        // a = "foo" AND a = "bar"
529        test_analyze(
530            col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
531            // this predicate is impossible ( can't be both foo and bar),
532            vec![],
533        );
534        // a = "foo" AND b != "bar"
535        test_analyze(
536            col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
537            vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
538        );
539        // a != "foo" AND a != "bar"
540        test_analyze(
541            col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
542            // know it isn't "foo" or "bar"
543            vec![not_in_guarantee("a", ["foo", "bar"])],
544        );
545        // a != "foo" AND a != "bar" and a != "baz"
546        test_analyze(
547            col("a")
548                .not_eq(lit("foo"))
549                .and(col("a").not_eq(lit("bar")))
550                .and(col("a").not_eq(lit("baz"))),
551            // know it isn't "foo" or "bar" or "baz"
552            vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
553        );
554        // a = "foo" AND a = "foo"
555        let expr = col("a").eq(lit("foo"));
556        test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
557        // b > 5 AND b = 10 (should get an b = 10 guarantee)
558        test_analyze(
559            col("b").gt(lit(5)).and(col("b").eq(lit(10))),
560            vec![in_guarantee("b", [10])],
561        );
562        // b > 10 AND b = 10 (this is impossible)
563        test_analyze(
564            col("b").gt(lit(10)).and(col("b").eq(lit(10))),
565            vec![
566                //  if b isn't 10, it can not be true (though the expression actually can never be true)
567                in_guarantee("b", [10]),
568            ],
569        );
570        // a != "foo" and (a != "bar" OR a != "baz")
571        test_analyze(
572            col("a")
573                .not_eq(lit("foo"))
574                .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
575            // a is not foo (we can't represent other knowledge about a)
576            vec![not_in_guarantee("a", ["foo"])],
577        );
578    }
579
580    #[test]
581    fn test_conjunction_and_disjunction_single_column() {
582        // b != 1 AND (b > 2)
583        test_analyze(
584            col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
585            vec![
586                // for the expression to be true, b can not be one
587                not_in_guarantee("b", [1]),
588            ],
589        );
590
591        // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
592        test_analyze(
593            col("b")
594                .eq(lit(1))
595                .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
596            vec![
597                // in theory, b must be 1 and one of 2,3 for this expression to be true
598                // which is a logical contradiction
599            ],
600        );
601    }
602
603    #[test]
604    fn test_disjunction_single_column() {
605        // b = 1 OR b = 2
606        test_analyze(
607            col("b").eq(lit(1)).or(col("b").eq(lit(2))),
608            vec![in_guarantee("b", [1, 2])],
609        );
610        // b != 1 OR b = 2
611        test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
612        // b = 1 OR b != 2
613        test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
614        // b != 1 OR b != 2
615        test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
616        // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
617        test_analyze(
618            col("b")
619                .not_eq(lit(1))
620                .or(col("b").not_eq(lit(2)))
621                .or(lit("b").eq(lit(3))),
622            vec![],
623        );
624        // b = 1 OR b = 2 OR b = 3
625        test_analyze(
626            col("b")
627                .eq(lit(1))
628                .or(col("b").eq(lit(2)))
629                .or(col("b").eq(lit(3))),
630            vec![in_guarantee("b", [1, 2, 3])],
631        );
632        // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
633        test_analyze(
634            col("b")
635                .eq(lit(1))
636                .or(col("b").eq(lit(2)))
637                .or(lit("b").eq(lit(3))),
638            vec![],
639        );
640    }
641
642    #[test]
643    fn test_disjunction_multi_column() {
644        // a = "foo" OR b = 1
645        test_analyze(
646            col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
647            // no can't have a single column guarantee (if a = "foo" then b != 1) etc
648            vec![],
649        );
650        // a != "foo" OR b != 1
651        test_analyze(
652            col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
653            // No single column guarantee
654            vec![],
655        );
656        // a = "foo" OR a = "bar"
657        test_analyze(
658            col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
659            vec![in_guarantee("a", ["foo", "bar"])],
660        );
661        // a = "foo" OR a = "foo"
662        test_analyze(
663            col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
664            vec![in_guarantee("a", ["foo"])],
665        );
666        // a != "foo" OR a != "bar"
667        test_analyze(
668            col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
669            // can't represent knowledge about a in this case
670            vec![],
671        );
672        // a = "foo" OR a = "bar" OR a = "baz"
673        test_analyze(
674            col("a")
675                .eq(lit("foo"))
676                .or(col("a").eq(lit("bar")))
677                .or(col("a").eq(lit("baz"))),
678            vec![in_guarantee("a", ["foo", "bar", "baz"])],
679        );
680        // (a = "foo" OR a = "bar") AND (a = "baz)"
681        test_analyze(
682            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
683                .and(col("a").eq(lit("baz"))),
684            // this could potentially be represented as 2 constraints with a more
685            // sophisticated analysis
686            vec![],
687        );
688        // (a = "foo" OR a = "bar") AND (b = 1)
689        test_analyze(
690            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
691                .and(col("b").eq(lit(1))),
692            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
693        );
694        // (a = "foo" OR a = "bar") OR (b = 1)
695        test_analyze(
696            col("a")
697                .eq(lit("foo"))
698                .or(col("a").eq(lit("bar")))
699                .or(col("b").eq(lit(1))),
700            // can't represent knowledge about a or b in this case
701            vec![],
702        );
703    }
704
705    #[test]
706    fn test_single_inlist() {
707        // b IN (1, 2, 3)
708        test_analyze(
709            col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
710            vec![in_guarantee("b", [1, 2, 3])],
711        );
712        // b NOT IN (1, 2, 3)
713        test_analyze(
714            col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
715            vec![not_in_guarantee("b", [1, 2, 3])],
716        );
717        // b IN (1,2,3,4...24)
718        test_analyze(
719            col("b").in_list((1..25).map(lit).collect_vec(), false),
720            vec![in_guarantee("b", 1..25)],
721        );
722    }
723
724    #[test]
725    fn test_inlist_conjunction() {
726        // b IN (1, 2, 3) AND b IN (2, 3, 4)
727        test_analyze(
728            col("b")
729                .in_list(vec![lit(1), lit(2), lit(3)], false)
730                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
731            vec![in_guarantee("b", [2, 3])],
732        );
733        // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
734        test_analyze(
735            col("b")
736                .in_list(vec![lit(1), lit(2), lit(3)], true)
737                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
738            vec![
739                not_in_guarantee("b", [1, 2, 3]),
740                in_guarantee("b", [2, 3, 4]),
741            ],
742        );
743        // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
744        test_analyze(
745            col("b")
746                .in_list(vec![lit(1), lit(2), lit(3)], true)
747                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
748            vec![not_in_guarantee("b", [1, 2, 3, 4])],
749        );
750        // b IN (1, 2, 3) AND b = 4
751        test_analyze(
752            col("b")
753                .in_list(vec![lit(1), lit(2), lit(3)], false)
754                .and(col("b").eq(lit(4))),
755            vec![],
756        );
757        // b IN (1, 2, 3) AND b = 2
758        test_analyze(
759            col("b")
760                .in_list(vec![lit(1), lit(2), lit(3)], false)
761                .and(col("b").eq(lit(2))),
762            vec![in_guarantee("b", [2])],
763        );
764        // b IN (1, 2, 3) AND b != 2
765        test_analyze(
766            col("b")
767                .in_list(vec![lit(1), lit(2), lit(3)], false)
768                .and(col("b").not_eq(lit(2))),
769            vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
770        );
771        // b NOT IN (1, 2, 3) AND b != 4
772        test_analyze(
773            col("b")
774                .in_list(vec![lit(1), lit(2), lit(3)], true)
775                .and(col("b").not_eq(lit(4))),
776            vec![not_in_guarantee("b", [1, 2, 3, 4])],
777        );
778        // b NOT IN (1, 2, 3) AND b != 2
779        test_analyze(
780            col("b")
781                .in_list(vec![lit(1), lit(2), lit(3)], true)
782                .and(col("b").not_eq(lit(2))),
783            vec![not_in_guarantee("b", [1, 2, 3])],
784        );
785    }
786
787    #[test]
788    fn test_inlist_with_disjunction() {
789        // b IN (1, 2, 3) AND (b = 3 OR b = 4)
790        test_analyze(
791            col("b")
792                .in_list(vec![lit(1), lit(2), lit(3)], false)
793                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
794            vec![in_guarantee("b", [3])],
795        );
796        // b IN (1, 2, 3) AND (b = 4 OR b = 5)
797        test_analyze(
798            col("b")
799                .in_list(vec![lit(1), lit(2), lit(3)], false)
800                .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
801            vec![],
802        );
803        // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
804        test_analyze(
805            col("b")
806                .in_list(vec![lit(1), lit(2), lit(3)], true)
807                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
808            vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
809        );
810        // b IN (1, 2, 3) OR b = 2
811        // TODO this should be in_guarantee("b", [1, 2, 3]) but currently we don't support to analyze this kind of disjunction. Only `ColOpLit OR ColOpLit` is supported.
812        test_analyze(
813            col("b")
814                .in_list(vec![lit(1), lit(2), lit(3)], false)
815                .or(col("b").eq(lit(2))),
816            vec![],
817        );
818        // b IN (1, 2, 3) OR b != 3
819        test_analyze(
820            col("b")
821                .in_list(vec![lit(1), lit(2), lit(3)], false)
822                .or(col("b").not_eq(lit(3))),
823            vec![],
824        );
825    }
826
827    /// Tests that analyzing expr results in the expected guarantees
828    fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
829        println!("Begin analyze of {expr}");
830        let schema = schema();
831        let physical_expr = logical2physical(&expr, &schema);
832
833        let actual = LiteralGuarantee::analyze(&physical_expr);
834        assert_eq!(
835            expected, actual,
836            "expr: {expr}\
837               \n\nexpected: {expected:#?}\
838               \n\nactual: {actual:#?}\
839               \n\nexpr: {expr:#?}\
840               \n\nphysical_expr: {physical_expr:#?}"
841        );
842    }
843
844    /// Guarantee that the expression is true if the column is one of the specified values
845    fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
846    where
847        I: IntoIterator<Item = S>,
848        S: Into<ScalarValue> + 'a,
849    {
850        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
851        LiteralGuarantee::new(column, Guarantee::In, literals.iter())
852    }
853
854    /// Guarantee that the expression is true if the column is NOT any of the specified values
855    fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
856    where
857        I: IntoIterator<Item = S>,
858        S: Into<ScalarValue> + 'a,
859    {
860        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
861        LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
862    }
863
864    // Schema for testing
865    fn schema() -> SchemaRef {
866        static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
867            Arc::new(Schema::new(vec![
868                Field::new("a", DataType::Utf8, false),
869                Field::new("b", DataType::Int32, false),
870            ]))
871        });
872        Arc::clone(&SCHEMA)
873    }
874}