polars_plan/dsl/
string.rs

1use super::*;
/// Specialized expressions for [`Series`] of [`DataType::String`].
///
/// Wraps the [`Expr`] that the methods of this namespace apply their string function to.
pub struct StringNameSpace(pub(crate) Expr);
4
5impl StringNameSpace {
6    /// Check if a string value contains a literal substring.
7    #[cfg(feature = "regex")]
8    pub fn contains_literal(self, pat: Expr) -> Expr {
9        self.0.map_many_private(
10            FunctionExpr::StringExpr(StringFunction::Contains {
11                literal: true,
12                strict: false,
13            }),
14            &[pat],
15            false,
16            Some(Default::default()),
17        )
18    }
19
20    /// Check if this column of strings contains a Regex. If `strict` is `true`, then it is an error if any `pat` is
21    /// an invalid regex, whereas if `strict` is `false`, an invalid regex will simply evaluate to `false`.
22    #[cfg(feature = "regex")]
23    pub fn contains(self, pat: Expr, strict: bool) -> Expr {
24        self.0.map_many_private(
25            FunctionExpr::StringExpr(StringFunction::Contains {
26                literal: false,
27                strict,
28            }),
29            &[pat],
30            false,
31            Some(Default::default()),
32        )
33    }
34
35    /// Uses aho-corasick to find many patterns.
36    ///
37    /// # Arguments
38    /// - `patterns`: an expression that evaluates to an String column
39    /// - `ascii_case_insensitive`: Enable ASCII-aware case insensitive matching.
40    ///   When this option is enabled, searching will be performed without respect to case for
41    ///   ASCII letters (a-z and A-Z) only.
42    #[cfg(feature = "find_many")]
43    pub fn contains_any(self, patterns: Expr, ascii_case_insensitive: bool) -> Expr {
44        self.0.map_many_private(
45            FunctionExpr::StringExpr(StringFunction::ContainsAny {
46                ascii_case_insensitive,
47            }),
48            &[patterns],
49            false,
50            None,
51        )
52    }
53
54    /// Uses aho-corasick to replace many patterns.
55    /// # Arguments
56    /// - `patterns`: an expression that evaluates to a String column
57    /// - `replace_with`: an expression that evaluates to a String column
58    /// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
59    ///   When this option is enabled, searching will be performed without respect to case for
60    ///   ASCII letters (a-z and A-Z) only.
61    #[cfg(feature = "find_many")]
62    pub fn replace_many(
63        self,
64        patterns: Expr,
65        replace_with: Expr,
66        ascii_case_insensitive: bool,
67    ) -> Expr {
68        self.0.map_many_private(
69            FunctionExpr::StringExpr(StringFunction::ReplaceMany {
70                ascii_case_insensitive,
71            }),
72            &[patterns, replace_with],
73            false,
74            None,
75        )
76    }
77
78    /// Uses aho-corasick to replace many patterns.
79    /// # Arguments
80    /// - `patterns`: an expression that evaluates to a String column
81    /// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
82    ///   When this option is enabled, searching will be performed without respect to case for
83    ///   ASCII letters (a-z and A-Z) only.
84    /// - `overlapping`: Whether matches may overlap.
85    #[cfg(feature = "find_many")]
86    pub fn extract_many(
87        self,
88        patterns: Expr,
89        ascii_case_insensitive: bool,
90        overlapping: bool,
91    ) -> Expr {
92        self.0.map_many_private(
93            FunctionExpr::StringExpr(StringFunction::ExtractMany {
94                ascii_case_insensitive,
95                overlapping,
96            }),
97            &[patterns],
98            false,
99            None,
100        )
101    }
102
103    /// Uses aho-corasick to find many patterns.
104    /// # Arguments
105    /// - `patterns`: an expression that evaluates to a String column
106    /// - `ascii_case_insensitive`: Enable ASCII-aware case-insensitive matching.
107    ///   When this option is enabled, searching will be performed without respect to case for
108    ///   ASCII letters (a-z and A-Z) only.
109    /// - `overlapping`: Whether matches may overlap.
110    #[cfg(feature = "find_many")]
111    pub fn find_many(
112        self,
113        patterns: Expr,
114        ascii_case_insensitive: bool,
115        overlapping: bool,
116    ) -> Expr {
117        self.0.map_many_private(
118            FunctionExpr::StringExpr(StringFunction::FindMany {
119                ascii_case_insensitive,
120                overlapping,
121            }),
122            &[patterns],
123            false,
124            None,
125        )
126    }
127
128    /// Check if a string value ends with the `sub` string.
129    pub fn ends_with(self, sub: Expr) -> Expr {
130        self.0.map_many_private(
131            FunctionExpr::StringExpr(StringFunction::EndsWith),
132            &[sub],
133            false,
134            Some(Default::default()),
135        )
136    }
137
138    /// Check if a string value starts with the `sub` string.
139    pub fn starts_with(self, sub: Expr) -> Expr {
140        self.0.map_many_private(
141            FunctionExpr::StringExpr(StringFunction::StartsWith),
142            &[sub],
143            false,
144            Some(Default::default()),
145        )
146    }
147
148    #[cfg(feature = "string_encoding")]
149    pub fn hex_encode(self) -> Expr {
150        self.0
151            .map_private(FunctionExpr::StringExpr(StringFunction::HexEncode))
152    }
153
154    #[cfg(feature = "binary_encoding")]
155    pub fn hex_decode(self, strict: bool) -> Expr {
156        self.0
157            .map_private(FunctionExpr::StringExpr(StringFunction::HexDecode(strict)))
158    }
159
160    #[cfg(feature = "string_encoding")]
161    pub fn base64_encode(self) -> Expr {
162        self.0
163            .map_private(FunctionExpr::StringExpr(StringFunction::Base64Encode))
164    }
165
166    #[cfg(feature = "binary_encoding")]
167    pub fn base64_decode(self, strict: bool) -> Expr {
168        self.0
169            .map_private(FunctionExpr::StringExpr(StringFunction::Base64Decode(
170                strict,
171            )))
172    }
173
174    /// Extract a regex pattern from the a string value. If `group_index` is out of bounds, null is returned.
175    pub fn extract(self, pat: Expr, group_index: usize) -> Expr {
176        self.0.map_many_private(
177            StringFunction::Extract(group_index).into(),
178            &[pat],
179            false,
180            Some(Default::default()),
181        )
182    }
183
184    #[cfg(feature = "extract_groups")]
185    // Extract all captures groups from a regex pattern as a struct
186    pub fn extract_groups(self, pat: &str) -> PolarsResult<Expr> {
187        // regex will be compiled twice, because it doesn't support serde
188        // and we need to compile it here to determine the output datatype
189
190        use polars_utils::format_pl_smallstr;
191        let reg = regex::Regex::new(pat)?;
192        let names = reg
193            .capture_names()
194            .enumerate()
195            .skip(1)
196            .map(|(idx, opt_name)| {
197                opt_name
198                    .map(PlSmallStr::from_str)
199                    .unwrap_or_else(|| format_pl_smallstr!("{idx}"))
200            })
201            .collect::<Vec<_>>();
202
203        let dtype = DataType::Struct(
204            names
205                .iter()
206                .map(|name| Field::new(name.clone(), DataType::String))
207                .collect(),
208        );
209
210        Ok(self.0.map_private(
211            StringFunction::ExtractGroups {
212                dtype,
213                pat: pat.into(),
214            }
215            .into(),
216        ))
217    }
218
219    /// Pad the start of the string until it reaches the given length.
220    ///
221    /// Padding is done using the specified `fill_char`.
222    /// Strings with length equal to or greater than the given length are
223    /// returned as-is.
224    #[cfg(feature = "string_pad")]
225    pub fn pad_start(self, length: usize, fill_char: char) -> Expr {
226        self.0
227            .map_private(StringFunction::PadStart { length, fill_char }.into())
228    }
229
230    /// Pad the end of the string until it reaches the given length.
231    ///
232    /// Padding is done using the specified `fill_char`.
233    /// Strings with length equal to or greater than the given length are
234    /// returned as-is.
235    #[cfg(feature = "string_pad")]
236    pub fn pad_end(self, length: usize, fill_char: char) -> Expr {
237        self.0
238            .map_private(StringFunction::PadEnd { length, fill_char }.into())
239    }
240
241    /// Pad the start of the string with zeros until it reaches the given length.
242    ///
243    /// A sign prefix (`-`) is handled by inserting the padding after the sign
244    /// character rather than before.
245    /// Strings with length equal to or greater than the given length are
246    /// returned as-is.
247    #[cfg(feature = "string_pad")]
248    pub fn zfill(self, length: Expr) -> Expr {
249        self.0
250            .map_many_private(StringFunction::ZFill.into(), &[length], false, None)
251    }
252
253    /// Find the index of a literal substring within another string value.
254    #[cfg(feature = "regex")]
255    pub fn find_literal(self, pat: Expr) -> Expr {
256        self.0.map_many_private(
257            FunctionExpr::StringExpr(StringFunction::Find {
258                literal: true,
259                strict: false,
260            }),
261            &[pat],
262            false,
263            Some(Default::default()),
264        )
265    }
266
267    /// Find the index of a substring defined by a regular expressions within another string value.
268    #[cfg(feature = "regex")]
269    pub fn find(self, pat: Expr, strict: bool) -> Expr {
270        self.0.map_many_private(
271            FunctionExpr::StringExpr(StringFunction::Find {
272                literal: false,
273                strict,
274            }),
275            &[pat],
276            false,
277            Some(Default::default()),
278        )
279    }
280
281    /// Extract each successive non-overlapping match in an individual string as an array
282    pub fn extract_all(self, pat: Expr) -> Expr {
283        self.0
284            .map_many_private(StringFunction::ExtractAll.into(), &[pat], false, None)
285    }
286
287    /// Count all successive non-overlapping regex matches.
288    pub fn count_matches(self, pat: Expr, literal: bool) -> Expr {
289        self.0.map_many_private(
290            StringFunction::CountMatches(literal).into(),
291            &[pat],
292            false,
293            None,
294        )
295    }
296
297    /// Convert a String column into a Date/Datetime/Time column.
298    #[cfg(feature = "temporal")]
299    pub fn strptime(self, dtype: DataType, options: StrptimeOptions, ambiguous: Expr) -> Expr {
300        self.0.map_many_private(
301            StringFunction::Strptime(dtype, options).into(),
302            &[ambiguous],
303            false,
304            None,
305        )
306    }
307
308    /// Convert a String column into a Date column.
309    #[cfg(feature = "dtype-date")]
310    pub fn to_date(self, options: StrptimeOptions) -> Expr {
311        self.strptime(DataType::Date, options, lit("raise"))
312    }
313
314    /// Convert a String column into a Datetime column.
315    #[cfg(feature = "dtype-datetime")]
316    pub fn to_datetime(
317        self,
318        time_unit: Option<TimeUnit>,
319        time_zone: Option<TimeZone>,
320        options: StrptimeOptions,
321        ambiguous: Expr,
322    ) -> Expr {
323        // If time_unit is None, try to infer it from the format or set a default
324        let time_unit = match (&options.format, time_unit) {
325            (_, Some(time_unit)) => time_unit,
326            (Some(format), None) => {
327                if format.contains("%.9f") || format.contains("%9f") {
328                    TimeUnit::Nanoseconds
329                } else if format.contains("%.3f") || format.contains("%3f") {
330                    TimeUnit::Milliseconds
331                } else {
332                    TimeUnit::Microseconds
333                }
334            },
335            (None, None) => TimeUnit::Microseconds,
336        };
337
338        self.strptime(DataType::Datetime(time_unit, time_zone), options, ambiguous)
339    }
340
341    /// Convert a String column into a Time column.
342    #[cfg(feature = "dtype-time")]
343    pub fn to_time(self, options: StrptimeOptions) -> Expr {
344        self.strptime(DataType::Time, options, lit("raise"))
345    }
346
347    /// Convert a String column into a Decimal column.
348    #[cfg(feature = "dtype-decimal")]
349    pub fn to_decimal(self, infer_length: usize) -> Expr {
350        self.0
351            .map_private(StringFunction::ToDecimal(infer_length).into())
352    }
353
354    /// Concat the values into a string array.
355    /// # Arguments
356    ///
357    /// * `delimiter` - A string that will act as delimiter between values.
358    #[cfg(feature = "concat_str")]
359    pub fn join(self, delimiter: &str, ignore_nulls: bool) -> Expr {
360        self.0
361            .apply_private(
362                StringFunction::ConcatVertical {
363                    delimiter: delimiter.into(),
364                    ignore_nulls,
365                }
366                .into(),
367            )
368            .with_function_options(|mut options| {
369                options.flags |= FunctionFlags::RETURNS_SCALAR;
370                options.collect_groups = ApplyOptions::GroupWise;
371                options
372            })
373    }
374
375    /// Split the string by a substring. The resulting dtype is `List<String>`.
376    pub fn split(self, by: Expr) -> Expr {
377        self.0
378            .map_many_private(StringFunction::Split(false).into(), &[by], false, None)
379    }
380
381    /// Split the string by a substring and keep the substring. The resulting dtype is `List<String>`.
382    pub fn split_inclusive(self, by: Expr) -> Expr {
383        self.0
384            .map_many_private(StringFunction::Split(true).into(), &[by], false, None)
385    }
386
387    #[cfg(feature = "dtype-struct")]
388    /// Split exactly `n` times by a given substring. The resulting dtype is [`DataType::Struct`].
389    pub fn split_exact(self, by: Expr, n: usize) -> Expr {
390        self.0.map_many_private(
391            StringFunction::SplitExact {
392                n,
393                inclusive: false,
394            }
395            .into(),
396            &[by],
397            false,
398            None,
399        )
400    }
401
402    #[cfg(feature = "dtype-struct")]
403    /// Split exactly `n` times by a given substring and keep the substring.
404    /// The resulting dtype is [`DataType::Struct`].
405    pub fn split_exact_inclusive(self, by: Expr, n: usize) -> Expr {
406        self.0.map_many_private(
407            StringFunction::SplitExact { n, inclusive: true }.into(),
408            &[by],
409            false,
410            None,
411        )
412    }
413
414    #[cfg(feature = "dtype-struct")]
415    /// Split by a given substring, returning exactly `n` items. If there are more possible splits,
416    /// keeps the remainder of the string intact. The resulting dtype is [`DataType::Struct`].
417    pub fn splitn(self, by: Expr, n: usize) -> Expr {
418        self.0
419            .map_many_private(StringFunction::SplitN(n).into(), &[by], false, None)
420    }
421
422    #[cfg(feature = "regex")]
423    /// Replace values that match a regex `pat` with a `value`.
424    pub fn replace(self, pat: Expr, value: Expr, literal: bool) -> Expr {
425        self.0.map_many_private(
426            FunctionExpr::StringExpr(StringFunction::Replace { n: 1, literal }),
427            &[pat, value],
428            false,
429            Some(Default::default()),
430        )
431    }
432
433    #[cfg(feature = "regex")]
434    /// Replace values that match a regex `pat` with a `value`.
435    pub fn replace_n(self, pat: Expr, value: Expr, literal: bool, n: i64) -> Expr {
436        self.0.map_many_private(
437            FunctionExpr::StringExpr(StringFunction::Replace { n, literal }),
438            &[pat, value],
439            false,
440            Some(Default::default()),
441        )
442    }
443
444    #[cfg(feature = "regex")]
445    /// Replace all values that match a regex `pat` with a `value`.
446    pub fn replace_all(self, pat: Expr, value: Expr, literal: bool) -> Expr {
447        self.0.map_many_private(
448            FunctionExpr::StringExpr(StringFunction::Replace { n: -1, literal }),
449            &[pat, value],
450            false,
451            Some(Default::default()),
452        )
453    }
454
455    #[cfg(feature = "string_normalize")]
456    /// Normalize each string
457    pub fn normalize(self, form: UnicodeForm) -> Expr {
458        self.0.map_many_private(
459            FunctionExpr::StringExpr(StringFunction::Normalize { form }),
460            &[],
461            false,
462            None,
463        )
464    }
465
466    #[cfg(feature = "string_reverse")]
467    /// Reverse each string
468    pub fn reverse(self) -> Expr {
469        self.0.map_many_private(
470            FunctionExpr::StringExpr(StringFunction::Reverse),
471            &[],
472            false,
473            None,
474        )
475    }
476
477    /// Remove leading and trailing characters, or whitespace if matches is None.
478    pub fn strip_chars(self, matches: Expr) -> Expr {
479        self.0.map_many_private(
480            FunctionExpr::StringExpr(StringFunction::StripChars),
481            &[matches],
482            false,
483            None,
484        )
485    }
486
487    /// Remove leading characters, or whitespace if matches is None.
488    pub fn strip_chars_start(self, matches: Expr) -> Expr {
489        self.0.map_many_private(
490            FunctionExpr::StringExpr(StringFunction::StripCharsStart),
491            &[matches],
492            false,
493            None,
494        )
495    }
496
497    /// Remove trailing characters, or whitespace if matches is None.
498    pub fn strip_chars_end(self, matches: Expr) -> Expr {
499        self.0.map_many_private(
500            FunctionExpr::StringExpr(StringFunction::StripCharsEnd),
501            &[matches],
502            false,
503            None,
504        )
505    }
506
507    /// Remove prefix.
508    pub fn strip_prefix(self, prefix: Expr) -> Expr {
509        self.0.map_many_private(
510            FunctionExpr::StringExpr(StringFunction::StripPrefix),
511            &[prefix],
512            false,
513            None,
514        )
515    }
516
517    /// Remove suffix.
518    pub fn strip_suffix(self, suffix: Expr) -> Expr {
519        self.0.map_many_private(
520            FunctionExpr::StringExpr(StringFunction::StripSuffix),
521            &[suffix],
522            false,
523            None,
524        )
525    }
526
527    /// Convert all characters to lowercase.
528    pub fn to_lowercase(self) -> Expr {
529        self.0
530            .map_private(FunctionExpr::StringExpr(StringFunction::Lowercase))
531    }
532
533    /// Convert all characters to uppercase.
534    pub fn to_uppercase(self) -> Expr {
535        self.0
536            .map_private(FunctionExpr::StringExpr(StringFunction::Uppercase))
537    }
538
539    /// Convert all characters to titlecase.
540    #[cfg(feature = "nightly")]
541    pub fn to_titlecase(self) -> Expr {
542        self.0
543            .map_private(FunctionExpr::StringExpr(StringFunction::Titlecase))
544    }
545
546    #[cfg(feature = "string_to_integer")]
547    /// Parse string in base radix into decimal.
548    pub fn to_integer(self, base: Expr, strict: bool) -> Expr {
549        self.0.map_many_private(
550            FunctionExpr::StringExpr(StringFunction::ToInteger(strict)),
551            &[base],
552            false,
553            None,
554        )
555    }
556
557    /// Return the length of each string as the number of bytes.
558    ///
559    /// When working with non-ASCII text, the length in bytes is not the same
560    /// as the length in characters. You may want to use
561    /// [`len_chars`] instead. Note that `len_bytes` is much more
562    /// performant (_O(1)_) than [`len_chars`] (_O(n)_).
563    ///
564    /// [`len_chars`]: StringNameSpace::len_chars
565    pub fn len_bytes(self) -> Expr {
566        self.0
567            .map_private(FunctionExpr::StringExpr(StringFunction::LenBytes))
568    }
569
570    /// Return the length of each string as the number of characters.
571    ///
572    /// When working with ASCII text, use [`len_bytes`] instead to achieve
573    /// equivalent output with much better performance:
574    /// [`len_bytes`] runs in _O(1)_, while `len_chars` runs in _O(n)_.
575    ///
576    /// [`len_bytes`]: StringNameSpace::len_bytes
577    pub fn len_chars(self) -> Expr {
578        self.0
579            .map_private(FunctionExpr::StringExpr(StringFunction::LenChars))
580    }
581
582    /// Slice the string values.
583    pub fn slice(self, offset: Expr, length: Expr) -> Expr {
584        self.0.map_many_private(
585            FunctionExpr::StringExpr(StringFunction::Slice),
586            &[offset, length],
587            false,
588            None,
589        )
590    }
591
592    /// Take the first `n` characters of the string values.
593    pub fn head(self, n: Expr) -> Expr {
594        self.0.map_many_private(
595            FunctionExpr::StringExpr(StringFunction::Head),
596            &[n],
597            false,
598            None,
599        )
600    }
601
602    /// Take the last `n` characters of the string values.
603    pub fn tail(self, n: Expr) -> Expr {
604        self.0.map_many_private(
605            FunctionExpr::StringExpr(StringFunction::Tail),
606            &[n],
607            false,
608            None,
609        )
610    }
611
612    #[cfg(feature = "extract_jsonpath")]
613    pub fn json_decode(self, dtype: Option<DataType>, infer_schema_len: Option<usize>) -> Expr {
614        // Apply, because dtype should be inferred only once and be consistent over chunks/morsels.
615        self.0
616            .apply_private(FunctionExpr::StringExpr(StringFunction::JsonDecode {
617                dtype,
618                infer_schema_len,
619            }))
620    }
621
622    #[cfg(feature = "extract_jsonpath")]
623    pub fn json_path_match(self, pat: Expr) -> Expr {
624        self.0.map_many_private(
625            FunctionExpr::StringExpr(StringFunction::JsonPathMatch),
626            &[pat],
627            false,
628            None,
629        )
630    }
631
632    #[cfg(feature = "regex")]
633    pub fn escape_regex(self) -> Expr {
634        self.0.map_many_private(
635            FunctionExpr::StringExpr(StringFunction::EscapeRegex),
636            &[],
637            false,
638            None,
639        )
640    }
641}