datafusion_common/types/
native.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use super::{
19    LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
20    TypeSignature,
21};
22use crate::error::{Result, _internal_err};
23use arrow::compute::can_cast_types;
24use arrow::datatypes::{
25    DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
26};
27use std::{fmt::Display, sync::Arc};
28
/// Representation of a type that DataFusion can handle natively. It is a subset
/// of the physical variants in Arrow's native [`DataType`].
///
/// Several Arrow physical types collapse into a single variant here: for
/// example `Utf8`, `LargeUtf8` and `Utf8View` are all converted to
/// [`NativeType::String`] by the `From<DataType>` implementation in this file.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum NativeType {
    /// Null type
    Null,
    /// A boolean type representing the values `true` and `false`.
    Boolean,
    /// A signed 8-bit integer.
    Int8,
    /// A signed 16-bit integer.
    Int16,
    /// A signed 32-bit integer.
    Int32,
    /// A signed 64-bit integer.
    Int64,
    /// An unsigned 8-bit integer.
    UInt8,
    /// An unsigned 16-bit integer.
    UInt16,
    /// An unsigned 32-bit integer.
    UInt32,
    /// An unsigned 64-bit integer.
    UInt64,
    /// A 16-bit floating point number.
    Float16,
    /// A 32-bit floating point number.
    Float32,
    /// A 64-bit floating point number.
    Float64,
    /// A timestamp with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a signed 64-bit integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// Timestamps with a non-empty timezone
    /// ------------------------------------
    ///
    /// If a Timestamp column has a non-empty timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
    ///
    /// Therefore, timestamp values with a non-empty timezone correspond to
    /// physical points in time together with some additional information about
    /// how the data was obtained and/or how to display it (the timezone).
    ///
    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
    ///   application may prefer to display it as "January 1st 1970, 01h00" in
    ///   the Europe/Paris timezone (which is the same physical point in time).
    ///
    /// One consequence is that timestamp values with a non-empty timezone
    /// can be compared and ordered directly, since they all share the same
    /// well-known point of reference (the Unix epoch).
    ///
    /// Timestamps with an unset / empty timezone
    /// -----------------------------------------
    ///
    /// If a Timestamp column has no timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
    ///
    /// Therefore, timestamp values without a timezone cannot be meaningfully
    /// interpreted as physical points in time, but only as calendar / clock
    /// indications ("wall clock time") in an unspecified timezone.
    ///
    ///   For example, the timestamp value 0 with an empty timezone string
    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
    ///   is not enough information to interpret it as a well-defined physical
    ///   point in time.
    ///
    /// One consequence is that timestamp values without a timezone cannot
    /// be reliably compared or ordered, since they may have different points of
    /// reference.  In particular, it is *not* possible to interpret an unset
    /// or empty timezone as the same as "UTC".
    ///
    /// Conversion between timezones
    /// ----------------------------
    ///
    /// If a Timestamp column has a non-empty timezone, changing the timezone
    /// to a different non-empty value is a metadata-only operation:
    /// the timestamp values need not change as their point of reference remains
    /// the same (the Unix epoch).
    ///
    /// However, if a Timestamp column has no timezone value, changing it to a
    /// non-empty value requires thinking about the desired semantics.
    /// One possibility is to assume that the original timestamp values are
    /// relative to the epoch of the timezone being set; timestamp values should
    /// then be adjusted to the Unix epoch (for example, changing the timezone
    /// from empty to "Europe/Paris" would require converting the timestamp
    /// values from "Europe/Paris" to "UTC", which seems counter-intuitive but
    /// is nevertheless correct).
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, TimeUnit};
    /// DataType::Timestamp(TimeUnit::Second, None);
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
    /// ```
    Timestamp(TimeUnit, Option<Arc<str>>),
    /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date,
    /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
    Time(TimeUnit),
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
    Duration(TimeUnit),
    /// A "calendar" interval which models types that don't necessarily
    /// have a precise duration without the context of a base timestamp (e.g.
    /// days can differ in length during daylight saving time transitions).
    Interval(IntervalUnit),
    /// Opaque binary data of variable length.
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(i32),
    /// A variable-length string in Unicode with UTF-8 encoding.
    String,
    /// A list of some logical data type with variable length.
    List(LogicalFieldRef),
    /// A list of some logical data type with fixed length.
    FixedSizeList(LogicalFieldRef, i32),
    /// A nested type that contains a number of sub-fields.
    Struct(LogicalFields),
    /// A nested type that can represent slots of differing types.
    Union(LogicalUnionFields),
    /// Decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, the scale can be a negative number. A
    /// negative scale is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal(u8, i8),
    /// A Map is a type that represents an association between keys and values.
    ///
    /// The key and value types are not constrained, but keys should be
    /// hashable and unique.
    ///
    /// In a field with Map type, the field has a child struct field, which in
    /// turn has two children: the first is the key type and the second the
    /// value type. The names of the child fields may be respectively
    /// "entries", "key", and "value", but this is not enforced.
    Map(LogicalFieldRef),
}
185
186impl Display for NativeType {
187    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
188        write!(f, "NativeType::{self:?}")
189    }
190}
191
192impl LogicalType for NativeType {
193    fn native(&self) -> &NativeType {
194        self
195    }
196
197    fn signature(&self) -> TypeSignature<'_> {
198        TypeSignature::Native(self)
199    }
200
201    /// Returns the default casted type for the given arrow type
202    ///
203    /// For types like String or Date, multiple arrow types mapped to the same logical type
204    /// If the given arrow type is one of them, we return the same type
205    /// Otherwise, we define the default casted type for the given arrow type
206    fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
207        use DataType::*;
208
209        fn default_field_cast(to: &LogicalField, from: &Field) -> Result<FieldRef> {
210            Ok(Arc::new(Field::new(
211                to.name.clone(),
212                to.logical_type.default_cast_for(from.data_type())?,
213                to.nullable,
214            )))
215        }
216
217        Ok(match (self, origin) {
218            (Self::Null, _) => Null,
219            (Self::Boolean, _) => Boolean,
220            (Self::Int8, _) => Int8,
221            (Self::Int16, _) => Int16,
222            (Self::Int32, _) => Int32,
223            (Self::Int64, _) => Int64,
224            (Self::UInt8, _) => UInt8,
225            (Self::UInt16, _) => UInt16,
226            (Self::UInt32, _) => UInt32,
227            (Self::UInt64, _) => UInt64,
228            (Self::Float16, _) => Float16,
229            (Self::Float32, _) => Float32,
230            (Self::Float64, _) => Float64,
231            (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s),
232            (Self::Decimal(p, s), _) => Decimal256(*p, *s),
233            (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
234            // If given type is Date, return the same type
235            (Self::Date, origin) if matches!(origin, Date32 | Date64) => {
236                origin.to_owned()
237            }
238            (Self::Date, _) => Date32,
239            (Self::Time(tu), _) => match tu {
240                TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
241                TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
242            },
243            (Self::Duration(tu), _) => Duration(*tu),
244            (Self::Interval(iu), _) => Interval(*iu),
245            (Self::Binary, LargeUtf8) => LargeBinary,
246            (Self::Binary, Utf8View) => BinaryView,
247            (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
248                BinaryView
249            }
250            (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => {
251                LargeBinary
252            }
253            (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary,
254            (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
255            (Self::String, LargeBinary) => LargeUtf8,
256            (Self::String, BinaryView) => Utf8View,
257            // We don't cast to another kind of string type if the origin one is already a string type
258            (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(),
259            (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View,
260            (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => {
261                LargeUtf8
262            }
263            (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8,
264            (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => {
265                List(default_field_cast(to_field, from_field)?)
266            }
267            (Self::List(to_field), LargeList(from_field)) => {
268                LargeList(default_field_cast(to_field, from_field)?)
269            }
270            (Self::List(to_field), ListView(from_field)) => {
271                ListView(default_field_cast(to_field, from_field)?)
272            }
273            (Self::List(to_field), LargeListView(from_field)) => {
274                LargeListView(default_field_cast(to_field, from_field)?)
275            }
276            // List array where each element is a len 1 list of the origin type
277            (Self::List(field), _) => List(Arc::new(Field::new(
278                field.name.clone(),
279                field.logical_type.default_cast_for(origin)?,
280                field.nullable,
281            ))),
282            (
283                Self::FixedSizeList(to_field, to_size),
284                FixedSizeList(from_field, from_size),
285            ) if from_size == to_size => {
286                FixedSizeList(default_field_cast(to_field, from_field)?, *to_size)
287            }
288            (
289                Self::FixedSizeList(to_field, size),
290                List(from_field)
291                | LargeList(from_field)
292                | ListView(from_field)
293                | LargeListView(from_field),
294            ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size),
295            // FixedSizeList array where each element is a len 1 list of the origin type
296            (Self::FixedSizeList(field, size), _) => FixedSizeList(
297                Arc::new(Field::new(
298                    field.name.clone(),
299                    field.logical_type.default_cast_for(origin)?,
300                    field.nullable,
301                )),
302                *size,
303            ),
304            // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
305            (Self::Struct(to_fields), Struct(from_fields))
306                if from_fields.len() == to_fields.len() =>
307            {
308                Struct(
309                    from_fields
310                        .iter()
311                        .zip(to_fields.iter())
312                        .map(|(from, to)| default_field_cast(to, from))
313                        .collect::<Result<Fields>>()?,
314                )
315            }
316            (Self::Struct(to_fields), Null) => Struct(
317                to_fields
318                    .iter()
319                    .map(|field| {
320                        Ok(Arc::new(Field::new(
321                            field.name.clone(),
322                            field.logical_type.default_cast_for(&Null)?,
323                            field.nullable,
324                        )))
325                    })
326                    .collect::<Result<Fields>>()?,
327            ),
328            (Self::Map(to_field), Map(from_field, sorted)) => {
329                Map(default_field_cast(to_field, from_field)?, *sorted)
330            }
331            (Self::Map(field), Null) => Map(
332                Arc::new(Field::new(
333                    field.name.clone(),
334                    field.logical_type.default_cast_for(&Null)?,
335                    field.nullable,
336                )),
337                false,
338            ),
339            (Self::Union(to_fields), Union(from_fields, mode))
340                if from_fields.len() == to_fields.len() =>
341            {
342                Union(
343                    from_fields
344                        .iter()
345                        .zip(to_fields.iter())
346                        .map(|((_, from), (i, to))| {
347                            Ok((*i, default_field_cast(to, from)?))
348                        })
349                        .collect::<Result<UnionFields>>()?,
350                    *mode,
351                )
352            }
353            _ => {
354                return _internal_err!(
355                "Unavailable default cast for native type {:?} from physical type {:?}",
356                self,
357                origin
358            )
359            }
360        })
361    }
362}
363
364// The following From<DataType>, From<Field>, ... implementations are temporary
365// mapping solutions to provide backwards compatibility while transitioning from
366// the purely physical system to a logical / physical system.
367
368impl From<&DataType> for NativeType {
369    fn from(value: &DataType) -> Self {
370        value.clone().into()
371    }
372}
373
374impl From<DataType> for NativeType {
375    fn from(value: DataType) -> Self {
376        use NativeType::*;
377        match value {
378            DataType::Null => Null,
379            DataType::Boolean => Boolean,
380            DataType::Int8 => Int8,
381            DataType::Int16 => Int16,
382            DataType::Int32 => Int32,
383            DataType::Int64 => Int64,
384            DataType::UInt8 => UInt8,
385            DataType::UInt16 => UInt16,
386            DataType::UInt32 => UInt32,
387            DataType::UInt64 => UInt64,
388            DataType::Float16 => Float16,
389            DataType::Float32 => Float32,
390            DataType::Float64 => Float64,
391            DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
392            DataType::Date32 | DataType::Date64 => Date,
393            DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
394            DataType::Duration(tu) => Duration(tu),
395            DataType::Interval(iu) => Interval(iu),
396            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
397            DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
398            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String,
399            DataType::List(field)
400            | DataType::ListView(field)
401            | DataType::LargeList(field)
402            | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
403            DataType::FixedSizeList(field, size) => {
404                FixedSizeList(Arc::new(field.as_ref().into()), size)
405            }
406            DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
407            DataType::Union(union_fields, _) => {
408                Union(LogicalUnionFields::from(&union_fields))
409            }
410            DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
411            DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
412            DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
413            DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
414        }
415    }
416}
417
418impl NativeType {
419    #[inline]
420    pub fn is_numeric(&self) -> bool {
421        use NativeType::*;
422        matches!(
423            self,
424            UInt8
425                | UInt16
426                | UInt32
427                | UInt64
428                | Int8
429                | Int16
430                | Int32
431                | Int64
432                | Float16
433                | Float32
434                | Float64
435                | Decimal(_, _)
436        )
437    }
438
439    #[inline]
440    pub fn is_integer(&self) -> bool {
441        use NativeType::*;
442        matches!(
443            self,
444            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
445        )
446    }
447
448    #[inline]
449    pub fn is_timestamp(&self) -> bool {
450        matches!(self, NativeType::Timestamp(_, _))
451    }
452
453    #[inline]
454    pub fn is_date(&self) -> bool {
455        matches!(self, NativeType::Date)
456    }
457
458    #[inline]
459    pub fn is_time(&self) -> bool {
460        matches!(self, NativeType::Time(_))
461    }
462
463    #[inline]
464    pub fn is_interval(&self) -> bool {
465        matches!(self, NativeType::Interval(_))
466    }
467
468    #[inline]
469    pub fn is_duration(&self) -> bool {
470        matches!(self, NativeType::Duration(_))
471    }
472}