datafusion_common/types/native.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use super::{
19 LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
20 TypeSignature,
21};
22use crate::error::{Result, _internal_err};
23use arrow::compute::can_cast_types;
24use arrow::datatypes::{
25 DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
26};
27use std::{fmt::Display, sync::Arc};
28
/// Representation of a type that DataFusion can handle natively. It is a subset
/// of the physical variants in Arrow's native [`DataType`].
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum NativeType {
    /// Null type
    Null,
    /// A boolean type representing the values `true` and `false`.
    Boolean,
    /// A signed 8-bit integer.
    Int8,
    /// A signed 16-bit integer.
    Int16,
    /// A signed 32-bit integer.
    Int32,
    /// A signed 64-bit integer.
    Int64,
    /// An unsigned 8-bit integer.
    UInt8,
    /// An unsigned 16-bit integer.
    UInt16,
    /// An unsigned 32-bit integer.
    UInt32,
    /// An unsigned 64-bit integer.
    UInt64,
    /// A 16-bit floating point number.
    Float16,
    /// A 32-bit floating point number.
    Float32,
    /// A 64-bit floating point number.
    Float64,
    /// A timestamp with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a signed 64-bit integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// Timestamps with a non-empty timezone
    /// ------------------------------------
    ///
    /// If a Timestamp column has a non-empty timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
    ///
    /// Therefore, timestamp values with a non-empty timezone correspond to
    /// physical points in time together with some additional information about
    /// how the data was obtained and/or how to display it (the timezone).
    ///
    /// For example, the timestamp value 0 with the timezone string "Europe/Paris"
    /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
    /// application may prefer to display it as "January 1st 1970, 01h00" in
    /// the Europe/Paris timezone (which is the same physical point in time).
    ///
    /// One consequence is that timestamp values with a non-empty timezone
    /// can be compared and ordered directly, since they all share the same
    /// well-known point of reference (the Unix epoch).
    ///
    /// Timestamps with an unset / empty timezone
    /// -----------------------------------------
    ///
    /// If a Timestamp column has no timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
    ///
    /// Therefore, timestamp values without a timezone cannot be meaningfully
    /// interpreted as physical points in time, but only as calendar / clock
    /// indications ("wall clock time") in an unspecified timezone.
    ///
    /// For example, the timestamp value 0 with an empty timezone string
    /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
    /// is not enough information to interpret it as a well-defined physical
    /// point in time.
    ///
    /// One consequence is that timestamp values without a timezone cannot
    /// be reliably compared or ordered, since they may have different points of
    /// reference. In particular, it is *not* possible to interpret an unset
    /// or empty timezone as the same as "UTC".
    ///
    /// Conversion between timezones
    /// ----------------------------
    ///
    /// If a Timestamp column has a non-empty timezone, changing the timezone
    /// to a different non-empty value is a metadata-only operation:
    /// the timestamp values need not change as their point of reference remains
    /// the same (the Unix epoch).
    ///
    /// However, if a Timestamp column has no timezone value, changing it to a
    /// non-empty value requires thinking about the desired semantics.
    /// One possibility is to assume that the original timestamp values are
    /// relative to the epoch of the timezone being set; timestamp values should
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
    /// empty to "Europe/Paris" would require converting the timestamp values
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
    /// nevertheless correct).
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, TimeUnit};
    /// DataType::Timestamp(TimeUnit::Second, None);
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
    /// ```
    Timestamp(TimeUnit, Option<Arc<str>>),
    /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date,
    /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
    Time(TimeUnit),
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
    Duration(TimeUnit),
    /// A "calendar" interval which models types that don't necessarily
    /// have a precise duration without the context of a base timestamp (e.g.
    /// days can differ in length during daylight saving time transitions).
    Interval(IntervalUnit),
    /// Opaque binary data of variable length.
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(i32),
    /// A variable-length string in Unicode with UTF-8 encoding.
    String,
    /// A list of some logical data type with variable length.
    List(LogicalFieldRef),
    /// A list of some logical data type with fixed length.
    /// The second parameter is the number of elements in each list.
    FixedSizeList(LogicalFieldRef, i32),
    /// A nested type that contains a number of sub-fields.
    Struct(LogicalFields),
    /// A nested type that can represent slots of differing types.
    Union(LogicalUnionFields),
    /// Decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale could be a negative number. For a
    /// negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// that has precision 3 and scale -2.
    Decimal(u8, i8),
    /// A Map is a type that represents an association between a key and a value.
    ///
    /// The key and value types are not constrained, but keys should be
    /// hashable and unique.
    ///
    /// In a field with Map type, the field has a child Struct field, which then
    /// has two children: the first is the key type and the second the value type.
    /// The names of the child fields may be respectively "entries", "key", and
    /// "value", but this is not enforced.
    Map(LogicalFieldRef),
}
185
186impl Display for NativeType {
187 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
188 write!(f, "NativeType::{self:?}")
189 }
190}
191
192impl LogicalType for NativeType {
193 fn native(&self) -> &NativeType {
194 self
195 }
196
197 fn signature(&self) -> TypeSignature<'_> {
198 TypeSignature::Native(self)
199 }
200
201 /// Returns the default casted type for the given arrow type
202 ///
203 /// For types like String or Date, multiple arrow types mapped to the same logical type
204 /// If the given arrow type is one of them, we return the same type
205 /// Otherwise, we define the default casted type for the given arrow type
206 fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
207 use DataType::*;
208
209 fn default_field_cast(to: &LogicalField, from: &Field) -> Result<FieldRef> {
210 Ok(Arc::new(Field::new(
211 to.name.clone(),
212 to.logical_type.default_cast_for(from.data_type())?,
213 to.nullable,
214 )))
215 }
216
217 Ok(match (self, origin) {
218 (Self::Null, _) => Null,
219 (Self::Boolean, _) => Boolean,
220 (Self::Int8, _) => Int8,
221 (Self::Int16, _) => Int16,
222 (Self::Int32, _) => Int32,
223 (Self::Int64, _) => Int64,
224 (Self::UInt8, _) => UInt8,
225 (Self::UInt16, _) => UInt16,
226 (Self::UInt32, _) => UInt32,
227 (Self::UInt64, _) => UInt64,
228 (Self::Float16, _) => Float16,
229 (Self::Float32, _) => Float32,
230 (Self::Float64, _) => Float64,
231 (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s),
232 (Self::Decimal(p, s), _) => Decimal256(*p, *s),
233 (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
234 // If given type is Date, return the same type
235 (Self::Date, origin) if matches!(origin, Date32 | Date64) => {
236 origin.to_owned()
237 }
238 (Self::Date, _) => Date32,
239 (Self::Time(tu), _) => match tu {
240 TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
241 TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
242 },
243 (Self::Duration(tu), _) => Duration(*tu),
244 (Self::Interval(iu), _) => Interval(*iu),
245 (Self::Binary, LargeUtf8) => LargeBinary,
246 (Self::Binary, Utf8View) => BinaryView,
247 (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
248 BinaryView
249 }
250 (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => {
251 LargeBinary
252 }
253 (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary,
254 (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
255 (Self::String, LargeBinary) => LargeUtf8,
256 (Self::String, BinaryView) => Utf8View,
257 // We don't cast to another kind of string type if the origin one is already a string type
258 (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(),
259 (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View,
260 (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => {
261 LargeUtf8
262 }
263 (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8,
264 (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => {
265 List(default_field_cast(to_field, from_field)?)
266 }
267 (Self::List(to_field), LargeList(from_field)) => {
268 LargeList(default_field_cast(to_field, from_field)?)
269 }
270 (Self::List(to_field), ListView(from_field)) => {
271 ListView(default_field_cast(to_field, from_field)?)
272 }
273 (Self::List(to_field), LargeListView(from_field)) => {
274 LargeListView(default_field_cast(to_field, from_field)?)
275 }
276 // List array where each element is a len 1 list of the origin type
277 (Self::List(field), _) => List(Arc::new(Field::new(
278 field.name.clone(),
279 field.logical_type.default_cast_for(origin)?,
280 field.nullable,
281 ))),
282 (
283 Self::FixedSizeList(to_field, to_size),
284 FixedSizeList(from_field, from_size),
285 ) if from_size == to_size => {
286 FixedSizeList(default_field_cast(to_field, from_field)?, *to_size)
287 }
288 (
289 Self::FixedSizeList(to_field, size),
290 List(from_field)
291 | LargeList(from_field)
292 | ListView(from_field)
293 | LargeListView(from_field),
294 ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size),
295 // FixedSizeList array where each element is a len 1 list of the origin type
296 (Self::FixedSizeList(field, size), _) => FixedSizeList(
297 Arc::new(Field::new(
298 field.name.clone(),
299 field.logical_type.default_cast_for(origin)?,
300 field.nullable,
301 )),
302 *size,
303 ),
304 // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
305 (Self::Struct(to_fields), Struct(from_fields))
306 if from_fields.len() == to_fields.len() =>
307 {
308 Struct(
309 from_fields
310 .iter()
311 .zip(to_fields.iter())
312 .map(|(from, to)| default_field_cast(to, from))
313 .collect::<Result<Fields>>()?,
314 )
315 }
316 (Self::Struct(to_fields), Null) => Struct(
317 to_fields
318 .iter()
319 .map(|field| {
320 Ok(Arc::new(Field::new(
321 field.name.clone(),
322 field.logical_type.default_cast_for(&Null)?,
323 field.nullable,
324 )))
325 })
326 .collect::<Result<Fields>>()?,
327 ),
328 (Self::Map(to_field), Map(from_field, sorted)) => {
329 Map(default_field_cast(to_field, from_field)?, *sorted)
330 }
331 (Self::Map(field), Null) => Map(
332 Arc::new(Field::new(
333 field.name.clone(),
334 field.logical_type.default_cast_for(&Null)?,
335 field.nullable,
336 )),
337 false,
338 ),
339 (Self::Union(to_fields), Union(from_fields, mode))
340 if from_fields.len() == to_fields.len() =>
341 {
342 Union(
343 from_fields
344 .iter()
345 .zip(to_fields.iter())
346 .map(|((_, from), (i, to))| {
347 Ok((*i, default_field_cast(to, from)?))
348 })
349 .collect::<Result<UnionFields>>()?,
350 *mode,
351 )
352 }
353 _ => {
354 return _internal_err!(
355 "Unavailable default cast for native type {:?} from physical type {:?}",
356 self,
357 origin
358 )
359 }
360 })
361 }
362}
363
364// The following From<DataType>, From<Field>, ... implementations are temporary
365// mapping solutions to provide backwards compatibility while transitioning from
366// the purely physical system to a logical / physical system.
367
368impl From<&DataType> for NativeType {
369 fn from(value: &DataType) -> Self {
370 value.clone().into()
371 }
372}
373
374impl From<DataType> for NativeType {
375 fn from(value: DataType) -> Self {
376 use NativeType::*;
377 match value {
378 DataType::Null => Null,
379 DataType::Boolean => Boolean,
380 DataType::Int8 => Int8,
381 DataType::Int16 => Int16,
382 DataType::Int32 => Int32,
383 DataType::Int64 => Int64,
384 DataType::UInt8 => UInt8,
385 DataType::UInt16 => UInt16,
386 DataType::UInt32 => UInt32,
387 DataType::UInt64 => UInt64,
388 DataType::Float16 => Float16,
389 DataType::Float32 => Float32,
390 DataType::Float64 => Float64,
391 DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
392 DataType::Date32 | DataType::Date64 => Date,
393 DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
394 DataType::Duration(tu) => Duration(tu),
395 DataType::Interval(iu) => Interval(iu),
396 DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
397 DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
398 DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String,
399 DataType::List(field)
400 | DataType::ListView(field)
401 | DataType::LargeList(field)
402 | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
403 DataType::FixedSizeList(field, size) => {
404 FixedSizeList(Arc::new(field.as_ref().into()), size)
405 }
406 DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
407 DataType::Union(union_fields, _) => {
408 Union(LogicalUnionFields::from(&union_fields))
409 }
410 DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
411 DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
412 DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
413 DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
414 }
415 }
416}
417
418impl NativeType {
419 #[inline]
420 pub fn is_numeric(&self) -> bool {
421 use NativeType::*;
422 matches!(
423 self,
424 UInt8
425 | UInt16
426 | UInt32
427 | UInt64
428 | Int8
429 | Int16
430 | Int32
431 | Int64
432 | Float16
433 | Float32
434 | Float64
435 | Decimal(_, _)
436 )
437 }
438
439 #[inline]
440 pub fn is_integer(&self) -> bool {
441 use NativeType::*;
442 matches!(
443 self,
444 UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
445 )
446 }
447
448 #[inline]
449 pub fn is_timestamp(&self) -> bool {
450 matches!(self, NativeType::Timestamp(_, _))
451 }
452
453 #[inline]
454 pub fn is_date(&self) -> bool {
455 matches!(self, NativeType::Date)
456 }
457
458 #[inline]
459 pub fn is_time(&self) -> bool {
460 matches!(self, NativeType::Time(_))
461 }
462
463 #[inline]
464 pub fn is_interval(&self) -> bool {
465 matches!(self, NativeType::Interval(_))
466 }
467
468 #[inline]
469 pub fn is_duration(&self) -> bool {
470 matches!(self, NativeType::Duration(_))
471 }
472}