datafusion_physical_expr/utils/guarantee.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19//! constant.
20
21use crate::utils::split_disjunction;
22use crate::{split_conjunction, PhysicalExpr};
23use datafusion_common::{Column, HashMap, ScalarValue};
24use datafusion_expr::Operator;
25use std::collections::HashSet;
26use std::fmt::{self, Display, Formatter};
27use std::sync::Arc;
28
29/// Represents a guarantee that must be true for a boolean expression to
30/// evaluate to `true`.
31///
32/// The guarantee takes the form of a column and a set of literal (constant)
33/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34/// satisfy* the guarantee(s).
35///
36/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37/// column must either:
38///
39/// 1. be ONLY one of that set
40/// 2. NOT be ANY of that set
41///
42/// # Uses `LiteralGuarantee`s
43///
44/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45/// files (e.g. row groups in parquet files) by proving expressions can not
46/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48/// partition where we know that `a` never has the value of `1`.
49///
50/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53/// that does **not** guarantee that the predicate will actually evaluate to
54/// `true`: it may still evaluate to `true`, `false` or `null`.
55///
56/// # Creating `LiteralGuarantee`s
57///
58/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59/// filter predicate.
60///
61/// # Details
62/// A guarantee can be one of two forms:
63///
64/// 1. The column must be one the values for the predicate to be `true`. If the
65/// column takes on any other value, the predicate can not evaluate to `true`.
66/// For example,
67/// `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68///
69/// 2. The column must NOT be one of the values for the predicate to be `true`.
70/// If the column can ONLY take one of these values, the predicate can not
71/// evaluate to `true`. For example,
72/// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73#[derive(Debug, Clone, PartialEq)]
74pub struct LiteralGuarantee {
75 pub column: Column,
76 pub guarantee: Guarantee,
77 pub literals: HashSet<ScalarValue>,
78}
79
80/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Guarantee {
83 /// Guarantee that the expression is `true` if `column` is one of the values. If
84 /// `column` is not one of the values, the expression can not be `true`.
85 In,
86 /// Guarantee that the expression is `true` if `column` is not ANY of the
87 /// values. If `column` only takes one of these values, the expression can
88 /// not be `true`.
89 NotIn,
90}
91
92impl LiteralGuarantee {
93 /// Create a new instance of the guarantee if the provided operator is
94 /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95 /// create these structures from an predicate (boolean expression).
96 fn new<'a>(
97 column_name: impl Into<String>,
98 guarantee: Guarantee,
99 literals: impl IntoIterator<Item = &'a ScalarValue>,
100 ) -> Self {
101 let literals: HashSet<_> = literals.into_iter().cloned().collect();
102
103 Self {
104 column: Column::from_name(column_name),
105 guarantee,
106 literals,
107 }
108 }
109
110 /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
111 /// to evaluate to `true`.
112 ///
113 /// If more than one `LiteralGuarantee` is returned, they must **all** hold
114 /// for the expression to possibly be `true`. If any is not satisfied, the
115 /// expression is guaranteed to be `null` or `false`.
116 ///
117 /// # Notes:
118 /// 1. `expr` must be a boolean expression or inlist expression.
119 /// 2. `expr` is not simplified prior to analysis.
120 pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
121 // split conjunction: <expr> AND <expr> AND ...
122 split_conjunction(expr)
123 .into_iter()
124 // for an `AND` conjunction to be true, all terms individually must be true
125 .fold(GuaranteeBuilder::new(), |builder, expr| {
126 if let Some(cel) = ColOpLit::try_new(expr) {
127 builder.aggregate_conjunct(cel)
128 } else if let Some(inlist) = expr
129 .as_any()
130 .downcast_ref::<crate::expressions::InListExpr>()
131 {
132 // Only support single-column inlist currently, multi-column inlist is not supported
133 let col = inlist
134 .expr()
135 .as_any()
136 .downcast_ref::<crate::expressions::Column>();
137 let Some(col) = col else {
138 return builder;
139 };
140
141 let literals = inlist
142 .list()
143 .iter()
144 .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
145 .collect::<Option<Vec<_>>>();
146 let Some(literals) = literals else {
147 return builder;
148 };
149
150 let guarantee = if inlist.negated() {
151 Guarantee::NotIn
152 } else {
153 Guarantee::In
154 };
155
156 builder.aggregate_multi_conjunct(
157 col,
158 guarantee,
159 literals.iter().map(|e| e.value()),
160 )
161 } else {
162 // split disjunction: <expr> OR <expr> OR ...
163 let disjunctions = split_disjunction(expr);
164
165 // We are trying to add a guarantee that a column must be
166 // in/not in a particular set of values for the expression
167 // to evaluate to true.
168 //
169 // A disjunction is true, if at least one of the terms is be
170 // true.
171 //
172 // Thus, we can infer a guarantee if all terms are of the
173 // form `(col <op> literal) OR (col <op> literal) OR ...`.
174 //
175 // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
176 // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
177 //
178 // However, for something like `a = 1 OR a = 2 OR a < 0` we
179 // **can't** guarantee that the predicate is only true if a
180 // is in (`1`, `2`), as it could also be true if `a` were less
181 // than zero.
182 let terms = disjunctions
183 .iter()
184 .filter_map(|expr| ColOpLit::try_new(expr))
185 .collect::<Vec<_>>();
186
187 if terms.is_empty() {
188 return builder;
189 }
190
191 // if not all terms are of the form (col <op> literal),
192 // can't infer any guarantees
193 if terms.len() != disjunctions.len() {
194 return builder;
195 }
196
197 // if all terms are 'col <op> literal' with the same column
198 // and operation we can infer any guarantees
199 //
200 // For those like (a != foo AND (a != bar OR a != baz)).
201 // We can't combine the (a != bar OR a != baz) part, but
202 // it also doesn't invalidate our knowledge that a !=
203 // foo is required for the expression to be true.
204 // So we can only create a multi value guarantee for `=`
205 // (or a single value). (e.g. ignore `a != foo OR a != bar`)
206 let first_term = &terms[0];
207 if terms.iter().all(|term| {
208 term.col.name() == first_term.col.name()
209 && term.guarantee == Guarantee::In
210 }) {
211 builder.aggregate_multi_conjunct(
212 first_term.col,
213 Guarantee::In,
214 terms.iter().map(|term| term.lit.value()),
215 )
216 } else {
217 // can't infer anything
218 builder
219 }
220 }
221 })
222 .build()
223 }
224}
225
226impl Display for LiteralGuarantee {
227 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
228 let mut sorted_literals: Vec<_> =
229 self.literals.iter().map(|lit| lit.to_string()).collect();
230 sorted_literals.sort();
231 match self.guarantee {
232 Guarantee::In => write!(
233 f,
234 "{} in ({})",
235 self.column.name,
236 sorted_literals.join(", ")
237 ),
238 Guarantee::NotIn => write!(
239 f,
240 "{} not in ({})",
241 self.column.name,
242 sorted_literals.join(", ")
243 ),
244 }
245 }
246}
247
248/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
249/// preserving insert order
250#[derive(Debug, Default)]
251struct GuaranteeBuilder<'a> {
252 /// List of guarantees that have been created so far
253 /// if we have determined a subsequent conjunct invalidates a guarantee
254 /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
255 guarantees: Vec<Option<LiteralGuarantee>>,
256
257 /// Key is the (column name, guarantee type)
258 /// Value is the index into `guarantees`
259 map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
260}
261
262impl<'a> GuaranteeBuilder<'a> {
263 fn new() -> Self {
264 Default::default()
265 }
266
267 /// Aggregate a new single `AND col <op> literal` term to this builder
268 /// combining with existing guarantees if possible.
269 ///
270 /// # Examples
271 /// * `AND (a = 1)`: `a` is guaranteed to be 1
272 /// * `AND (a != 1)`: a is guaranteed to not be 1
273 fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self {
274 self.aggregate_multi_conjunct(
275 col_op_lit.col,
276 col_op_lit.guarantee,
277 [col_op_lit.lit.value()],
278 )
279 }
280
281 /// Aggregates a new single column, multi literal term to this builder
282 /// combining with previously known guarantees if possible.
283 ///
284 /// # Examples
285 /// For the following examples, we can guarantee the expression is `true` if:
286 /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
287 /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
288 /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
289 /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
290 fn aggregate_multi_conjunct(
291 mut self,
292 col: &'a crate::expressions::Column,
293 guarantee: Guarantee,
294 new_values: impl IntoIterator<Item = &'a ScalarValue>,
295 ) -> Self {
296 let key = (col, guarantee);
297 if let Some(index) = self.map.get(&key) {
298 // already have a guarantee for this column
299 let entry = &mut self.guarantees[*index];
300
301 let Some(existing) = entry else {
302 // determined the previous guarantee for this column has been
303 // invalidated, nothing to do
304 return self;
305 };
306
307 // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
308 // AND a = bar` doesn't make logical sense so we don't optimize this
309 // case
310 match existing.guarantee {
311 // knew that the column could not be a set of values
312 //
313 // For example, if we previously had `a != 5` and now we see
314 // another `AND a != 6` we know that a must not be either 5 or 6
315 // for the expression to be true
316 Guarantee::NotIn => {
317 let new_values: HashSet<_> = new_values.into_iter().collect();
318 existing.literals.extend(new_values.into_iter().cloned());
319 }
320 Guarantee::In => {
321 let intersection = new_values
322 .into_iter()
323 .filter(|new_value| existing.literals.contains(*new_value))
324 .collect::<Vec<_>>();
325 // for an In guarantee, if the intersection is not empty, we can extend the guarantee
326 // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
327 // otherwise, we invalidate the guarantee
328 // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
329 if !intersection.is_empty() {
330 existing.literals = intersection.into_iter().cloned().collect();
331 } else {
332 // at least one was not, so invalidate the guarantee
333 *entry = None;
334 }
335 }
336 }
337 } else {
338 // This is a new guarantee
339 let new_values: HashSet<_> = new_values.into_iter().collect();
340
341 let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
342 // add it to the list of guarantees
343 self.guarantees.push(Some(guarantee));
344 self.map.insert(key, self.guarantees.len() - 1);
345 }
346
347 self
348 }
349
350 /// Return all guarantees that have been created so far
351 fn build(self) -> Vec<LiteralGuarantee> {
352 // filter out any guarantees that have been invalidated
353 self.guarantees.into_iter().flatten().collect()
354 }
355}
356
357/// Represents a single `col [not]in literal` expression
358struct ColOpLit<'a> {
359 col: &'a crate::expressions::Column,
360 guarantee: Guarantee,
361 lit: &'a crate::expressions::Literal,
362}
363
364impl<'a> ColOpLit<'a> {
365 /// Returns Some(ColEqLit) if the expression is either:
366 /// 1. `col <op> literal`
367 /// 2. `literal <op> col`
368 /// 3. operator is `=` or `!=`
369 ///
370 /// Returns None otherwise
371 fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
372 let binary_expr = expr
373 .as_any()
374 .downcast_ref::<crate::expressions::BinaryExpr>()?;
375
376 let (left, op, right) = (
377 binary_expr.left().as_any(),
378 binary_expr.op(),
379 binary_expr.right().as_any(),
380 );
381 let guarantee = match op {
382 Operator::Eq => Guarantee::In,
383 Operator::NotEq => Guarantee::NotIn,
384 _ => return None,
385 };
386 // col <op> literal
387 if let (Some(col), Some(lit)) = (
388 left.downcast_ref::<crate::expressions::Column>(),
389 right.downcast_ref::<crate::expressions::Literal>(),
390 ) {
391 Some(Self {
392 col,
393 guarantee,
394 lit,
395 })
396 }
397 // literal <op> col
398 else if let (Some(lit), Some(col)) = (
399 left.downcast_ref::<crate::expressions::Literal>(),
400 right.downcast_ref::<crate::expressions::Column>(),
401 ) {
402 Some(Self {
403 col,
404 guarantee,
405 lit,
406 })
407 } else {
408 None
409 }
410 }
411}
412
413#[cfg(test)]
414mod test {
415 use std::sync::LazyLock;
416
417 use super::*;
418 use crate::planner::logical2physical;
419
420 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
421 use datafusion_expr::expr_fn::*;
422 use datafusion_expr::{lit, Expr};
423
424 use itertools::Itertools;
425
426 #[test]
427 fn test_literal() {
428 // a single literal offers no guarantee
429 test_analyze(lit(true), vec![])
430 }
431
432 #[test]
433 fn test_single() {
434 // a = "foo"
435 test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
436 // "foo" = a
437 test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
438 // a != "foo"
439 test_analyze(
440 col("a").not_eq(lit("foo")),
441 vec![not_in_guarantee("a", ["foo"])],
442 );
443 // "foo" != a
444 test_analyze(
445 lit("foo").not_eq(col("a")),
446 vec![not_in_guarantee("a", ["foo"])],
447 );
448 }
449
450 #[test]
451 fn test_conjunction_single_column() {
452 // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
453 test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
454 // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
455 test_analyze(
456 col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
457 vec![
458 // can only be true of b is 1 and b is not 2 (even though it is redundant)
459 in_guarantee("b", [1]),
460 not_in_guarantee("b", [2]),
461 ],
462 );
463 // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
464 test_analyze(
465 col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
466 vec![
467 // can only be true of b is not 1 and b is 2 (even though it is redundant)
468 not_in_guarantee("b", [1]),
469 in_guarantee("b", [2]),
470 ],
471 );
472 // b != 1 AND b != 2
473 test_analyze(
474 col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
475 vec![not_in_guarantee("b", [1, 2])],
476 );
477 // b != 1 AND b != 2 and b != 3
478 test_analyze(
479 col("b")
480 .not_eq(lit(1))
481 .and(col("b").not_eq(lit(2)))
482 .and(col("b").not_eq(lit(3))),
483 vec![not_in_guarantee("b", [1, 2, 3])],
484 );
485 // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
486 test_analyze(
487 col("b")
488 .not_eq(lit(1))
489 .and(col("b").eq(lit(2)))
490 .and(col("b").not_eq(lit(3))),
491 vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
492 );
493 // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
494 test_analyze(
495 col("b")
496 .not_eq(lit(1))
497 .and(col("b").not_eq(lit(2)))
498 .and(col("b").eq(lit(3))),
499 vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
500 );
501 // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
502 test_analyze(
503 col("b")
504 .not_eq(lit(1))
505 .and(col("b").not_eq(lit(2)))
506 .and(col("b").gt(lit(3))),
507 vec![not_in_guarantee("b", [1, 2])],
508 );
509 }
510
511 #[test]
512 fn test_conjunction_multi_column() {
513 // a = "foo" AND b = 1
514 test_analyze(
515 col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
516 vec![
517 // should find both column guarantees
518 in_guarantee("a", ["foo"]),
519 in_guarantee("b", [1]),
520 ],
521 );
522 // a != "foo" AND b != 1
523 test_analyze(
524 col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
525 // should find both column guarantees
526 vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
527 );
528 // a = "foo" AND a = "bar"
529 test_analyze(
530 col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
531 // this predicate is impossible ( can't be both foo and bar),
532 vec![],
533 );
534 // a = "foo" AND b != "bar"
535 test_analyze(
536 col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
537 vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
538 );
539 // a != "foo" AND a != "bar"
540 test_analyze(
541 col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
542 // know it isn't "foo" or "bar"
543 vec![not_in_guarantee("a", ["foo", "bar"])],
544 );
545 // a != "foo" AND a != "bar" and a != "baz"
546 test_analyze(
547 col("a")
548 .not_eq(lit("foo"))
549 .and(col("a").not_eq(lit("bar")))
550 .and(col("a").not_eq(lit("baz"))),
551 // know it isn't "foo" or "bar" or "baz"
552 vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
553 );
554 // a = "foo" AND a = "foo"
555 let expr = col("a").eq(lit("foo"));
556 test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
557 // b > 5 AND b = 10 (should get an b = 10 guarantee)
558 test_analyze(
559 col("b").gt(lit(5)).and(col("b").eq(lit(10))),
560 vec![in_guarantee("b", [10])],
561 );
562 // b > 10 AND b = 10 (this is impossible)
563 test_analyze(
564 col("b").gt(lit(10)).and(col("b").eq(lit(10))),
565 vec![
566 // if b isn't 10, it can not be true (though the expression actually can never be true)
567 in_guarantee("b", [10]),
568 ],
569 );
570 // a != "foo" and (a != "bar" OR a != "baz")
571 test_analyze(
572 col("a")
573 .not_eq(lit("foo"))
574 .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
575 // a is not foo (we can't represent other knowledge about a)
576 vec![not_in_guarantee("a", ["foo"])],
577 );
578 }
579
580 #[test]
581 fn test_conjunction_and_disjunction_single_column() {
582 // b != 1 AND (b > 2)
583 test_analyze(
584 col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
585 vec![
586 // for the expression to be true, b can not be one
587 not_in_guarantee("b", [1]),
588 ],
589 );
590
591 // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
592 test_analyze(
593 col("b")
594 .eq(lit(1))
595 .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
596 vec![
597 // in theory, b must be 1 and one of 2,3 for this expression to be true
598 // which is a logical contradiction
599 ],
600 );
601 }
602
603 #[test]
604 fn test_disjunction_single_column() {
605 // b = 1 OR b = 2
606 test_analyze(
607 col("b").eq(lit(1)).or(col("b").eq(lit(2))),
608 vec![in_guarantee("b", [1, 2])],
609 );
610 // b != 1 OR b = 2
611 test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
612 // b = 1 OR b != 2
613 test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
614 // b != 1 OR b != 2
615 test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
616 // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
617 test_analyze(
618 col("b")
619 .not_eq(lit(1))
620 .or(col("b").not_eq(lit(2)))
621 .or(lit("b").eq(lit(3))),
622 vec![],
623 );
624 // b = 1 OR b = 2 OR b = 3
625 test_analyze(
626 col("b")
627 .eq(lit(1))
628 .or(col("b").eq(lit(2)))
629 .or(col("b").eq(lit(3))),
630 vec![in_guarantee("b", [1, 2, 3])],
631 );
632 // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
633 test_analyze(
634 col("b")
635 .eq(lit(1))
636 .or(col("b").eq(lit(2)))
637 .or(lit("b").eq(lit(3))),
638 vec![],
639 );
640 }
641
642 #[test]
643 fn test_disjunction_multi_column() {
644 // a = "foo" OR b = 1
645 test_analyze(
646 col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
647 // no can't have a single column guarantee (if a = "foo" then b != 1) etc
648 vec![],
649 );
650 // a != "foo" OR b != 1
651 test_analyze(
652 col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
653 // No single column guarantee
654 vec![],
655 );
656 // a = "foo" OR a = "bar"
657 test_analyze(
658 col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
659 vec![in_guarantee("a", ["foo", "bar"])],
660 );
661 // a = "foo" OR a = "foo"
662 test_analyze(
663 col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
664 vec![in_guarantee("a", ["foo"])],
665 );
666 // a != "foo" OR a != "bar"
667 test_analyze(
668 col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
669 // can't represent knowledge about a in this case
670 vec![],
671 );
672 // a = "foo" OR a = "bar" OR a = "baz"
673 test_analyze(
674 col("a")
675 .eq(lit("foo"))
676 .or(col("a").eq(lit("bar")))
677 .or(col("a").eq(lit("baz"))),
678 vec![in_guarantee("a", ["foo", "bar", "baz"])],
679 );
680 // (a = "foo" OR a = "bar") AND (a = "baz)"
681 test_analyze(
682 (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
683 .and(col("a").eq(lit("baz"))),
684 // this could potentially be represented as 2 constraints with a more
685 // sophisticated analysis
686 vec![],
687 );
688 // (a = "foo" OR a = "bar") AND (b = 1)
689 test_analyze(
690 (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
691 .and(col("b").eq(lit(1))),
692 vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
693 );
694 // (a = "foo" OR a = "bar") OR (b = 1)
695 test_analyze(
696 col("a")
697 .eq(lit("foo"))
698 .or(col("a").eq(lit("bar")))
699 .or(col("b").eq(lit(1))),
700 // can't represent knowledge about a or b in this case
701 vec![],
702 );
703 }
704
705 #[test]
706 fn test_single_inlist() {
707 // b IN (1, 2, 3)
708 test_analyze(
709 col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
710 vec![in_guarantee("b", [1, 2, 3])],
711 );
712 // b NOT IN (1, 2, 3)
713 test_analyze(
714 col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
715 vec![not_in_guarantee("b", [1, 2, 3])],
716 );
717 // b IN (1,2,3,4...24)
718 test_analyze(
719 col("b").in_list((1..25).map(lit).collect_vec(), false),
720 vec![in_guarantee("b", 1..25)],
721 );
722 }
723
724 #[test]
725 fn test_inlist_conjunction() {
726 // b IN (1, 2, 3) AND b IN (2, 3, 4)
727 test_analyze(
728 col("b")
729 .in_list(vec![lit(1), lit(2), lit(3)], false)
730 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
731 vec![in_guarantee("b", [2, 3])],
732 );
733 // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
734 test_analyze(
735 col("b")
736 .in_list(vec![lit(1), lit(2), lit(3)], true)
737 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
738 vec![
739 not_in_guarantee("b", [1, 2, 3]),
740 in_guarantee("b", [2, 3, 4]),
741 ],
742 );
743 // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
744 test_analyze(
745 col("b")
746 .in_list(vec![lit(1), lit(2), lit(3)], true)
747 .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
748 vec![not_in_guarantee("b", [1, 2, 3, 4])],
749 );
750 // b IN (1, 2, 3) AND b = 4
751 test_analyze(
752 col("b")
753 .in_list(vec![lit(1), lit(2), lit(3)], false)
754 .and(col("b").eq(lit(4))),
755 vec![],
756 );
757 // b IN (1, 2, 3) AND b = 2
758 test_analyze(
759 col("b")
760 .in_list(vec![lit(1), lit(2), lit(3)], false)
761 .and(col("b").eq(lit(2))),
762 vec![in_guarantee("b", [2])],
763 );
764 // b IN (1, 2, 3) AND b != 2
765 test_analyze(
766 col("b")
767 .in_list(vec![lit(1), lit(2), lit(3)], false)
768 .and(col("b").not_eq(lit(2))),
769 vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
770 );
771 // b NOT IN (1, 2, 3) AND b != 4
772 test_analyze(
773 col("b")
774 .in_list(vec![lit(1), lit(2), lit(3)], true)
775 .and(col("b").not_eq(lit(4))),
776 vec![not_in_guarantee("b", [1, 2, 3, 4])],
777 );
778 // b NOT IN (1, 2, 3) AND b != 2
779 test_analyze(
780 col("b")
781 .in_list(vec![lit(1), lit(2), lit(3)], true)
782 .and(col("b").not_eq(lit(2))),
783 vec![not_in_guarantee("b", [1, 2, 3])],
784 );
785 }
786
787 #[test]
788 fn test_inlist_with_disjunction() {
789 // b IN (1, 2, 3) AND (b = 3 OR b = 4)
790 test_analyze(
791 col("b")
792 .in_list(vec![lit(1), lit(2), lit(3)], false)
793 .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
794 vec![in_guarantee("b", [3])],
795 );
796 // b IN (1, 2, 3) AND (b = 4 OR b = 5)
797 test_analyze(
798 col("b")
799 .in_list(vec![lit(1), lit(2), lit(3)], false)
800 .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
801 vec![],
802 );
803 // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
804 test_analyze(
805 col("b")
806 .in_list(vec![lit(1), lit(2), lit(3)], true)
807 .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
808 vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
809 );
810 // b IN (1, 2, 3) OR b = 2
811 // TODO this should be in_guarantee("b", [1, 2, 3]) but currently we don't support to analyze this kind of disjunction. Only `ColOpLit OR ColOpLit` is supported.
812 test_analyze(
813 col("b")
814 .in_list(vec![lit(1), lit(2), lit(3)], false)
815 .or(col("b").eq(lit(2))),
816 vec![],
817 );
818 // b IN (1, 2, 3) OR b != 3
819 test_analyze(
820 col("b")
821 .in_list(vec![lit(1), lit(2), lit(3)], false)
822 .or(col("b").not_eq(lit(3))),
823 vec![],
824 );
825 }
826
827 /// Tests that analyzing expr results in the expected guarantees
828 fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
829 println!("Begin analyze of {expr}");
830 let schema = schema();
831 let physical_expr = logical2physical(&expr, &schema);
832
833 let actual = LiteralGuarantee::analyze(&physical_expr);
834 assert_eq!(
835 expected, actual,
836 "expr: {expr}\
837 \n\nexpected: {expected:#?}\
838 \n\nactual: {actual:#?}\
839 \n\nexpr: {expr:#?}\
840 \n\nphysical_expr: {physical_expr:#?}"
841 );
842 }
843
844 /// Guarantee that the expression is true if the column is one of the specified values
845 fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
846 where
847 I: IntoIterator<Item = S>,
848 S: Into<ScalarValue> + 'a,
849 {
850 let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
851 LiteralGuarantee::new(column, Guarantee::In, literals.iter())
852 }
853
854 /// Guarantee that the expression is true if the column is NOT any of the specified values
855 fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
856 where
857 I: IntoIterator<Item = S>,
858 S: Into<ScalarValue> + 'a,
859 {
860 let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
861 LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
862 }
863
864 // Schema for testing
865 fn schema() -> SchemaRef {
866 static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
867 Arc::new(Schema::new(vec![
868 Field::new("a", DataType::Utf8, false),
869 Field::new("b", DataType::Int32, false),
870 ]))
871 });
872 Arc::clone(&SCHEMA)
873 }
874}