arrow_schema/
schema.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::collections::HashMap;
19use std::fmt;
20use std::hash::Hash;
21use std::sync::Arc;
22
23use crate::error::ArrowError;
24use crate::field::Field;
25use crate::{FieldRef, Fields};
26
27/// A builder to facilitate building a [`Schema`] from iteratively from [`FieldRef`]
28#[derive(Debug, Default)]
29pub struct SchemaBuilder {
30    fields: Vec<FieldRef>,
31    metadata: HashMap<String, String>,
32}
33
34impl SchemaBuilder {
35    /// Creates a new empty [`SchemaBuilder`]
36    pub fn new() -> Self {
37        Self::default()
38    }
39
40    /// Creates a new empty [`SchemaBuilder`] with space for `capacity` fields
41    pub fn with_capacity(capacity: usize) -> Self {
42        Self {
43            fields: Vec::with_capacity(capacity),
44            metadata: Default::default(),
45        }
46    }
47
48    /// Appends a [`FieldRef`] to this [`SchemaBuilder`] without checking for collision
49    pub fn push(&mut self, field: impl Into<FieldRef>) {
50        self.fields.push(field.into())
51    }
52
53    /// Removes and returns the [`FieldRef`] as index `idx`
54    ///
55    /// # Panics
56    ///
57    /// Panics if index out of bounds
58    pub fn remove(&mut self, idx: usize) -> FieldRef {
59        self.fields.remove(idx)
60    }
61
62    /// Returns an immutable reference to the [`FieldRef`] at index `idx`
63    ///
64    /// # Panics
65    ///
66    /// Panics if index out of bounds
67    pub fn field(&mut self, idx: usize) -> &FieldRef {
68        &mut self.fields[idx]
69    }
70
71    /// Returns a mutable reference to the [`FieldRef`] at index `idx`
72    ///
73    /// # Panics
74    ///
75    /// Panics if index out of bounds
76    pub fn field_mut(&mut self, idx: usize) -> &mut FieldRef {
77        &mut self.fields[idx]
78    }
79
80    /// Returns an immutable reference to the Map of custom metadata key-value pairs.
81    pub fn metadata(&mut self) -> &HashMap<String, String> {
82        &self.metadata
83    }
84
85    /// Returns a mutable reference to the Map of custom metadata key-value pairs.
86    pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> {
87        &mut self.metadata
88    }
89
90    /// Reverse the fileds
91    pub fn reverse(&mut self) {
92        self.fields.reverse();
93    }
94
95    /// Appends a [`FieldRef`] to this [`SchemaBuilder`] checking for collision
96    ///
97    /// If an existing field exists with the same name, calls [`Field::try_merge`]
98    pub fn try_merge(&mut self, field: &FieldRef) -> Result<(), ArrowError> {
99        // This could potentially be sped up with a HashMap or similar
100        let existing = self.fields.iter_mut().find(|f| f.name() == field.name());
101        match existing {
102            Some(e) if Arc::ptr_eq(e, field) => {} // Nothing to do
103            Some(e) => match Arc::get_mut(e) {
104                Some(e) => e.try_merge(field.as_ref())?,
105                None => {
106                    let mut t = e.as_ref().clone();
107                    t.try_merge(field)?;
108                    *e = Arc::new(t)
109                }
110            },
111            None => self.fields.push(field.clone()),
112        }
113        Ok(())
114    }
115
116    /// Consume this [`SchemaBuilder`] yielding the final [`Schema`]
117    pub fn finish(self) -> Schema {
118        Schema {
119            fields: self.fields.into(),
120            metadata: self.metadata,
121        }
122    }
123}
124
125impl From<&Fields> for SchemaBuilder {
126    fn from(value: &Fields) -> Self {
127        Self {
128            fields: value.to_vec(),
129            metadata: Default::default(),
130        }
131    }
132}
133
134impl From<Fields> for SchemaBuilder {
135    fn from(value: Fields) -> Self {
136        Self {
137            fields: value.to_vec(),
138            metadata: Default::default(),
139        }
140    }
141}
142
143impl From<&Schema> for SchemaBuilder {
144    fn from(value: &Schema) -> Self {
145        Self::from(value.clone())
146    }
147}
148
149impl From<Schema> for SchemaBuilder {
150    fn from(value: Schema) -> Self {
151        Self {
152            fields: value.fields.to_vec(),
153            metadata: value.metadata,
154        }
155    }
156}
157
158impl Extend<FieldRef> for SchemaBuilder {
159    fn extend<T: IntoIterator<Item = FieldRef>>(&mut self, iter: T) {
160        let iter = iter.into_iter();
161        self.fields.reserve(iter.size_hint().0);
162        for f in iter {
163            self.push(f)
164        }
165    }
166}
167
168impl Extend<Field> for SchemaBuilder {
169    fn extend<T: IntoIterator<Item = Field>>(&mut self, iter: T) {
170        let iter = iter.into_iter();
171        self.fields.reserve(iter.size_hint().0);
172        for f in iter {
173            self.push(f)
174        }
175    }
176}
177
/// A reference-counted reference to a [`Schema`].
///
/// This is an alias for `Arc<Schema>`.
pub type SchemaRef = Arc<Schema>;
180
/// Describes the meta-data of an ordered sequence of relative types.
///
/// A schema consists of its [`Fields`] together with a map of custom key-value
/// metadata. The derived equality compares both.
///
/// Note that this information is only part of the meta-data and not part of the physical
/// memory layout.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Schema {
    /// A sequence of fields that describe the schema.
    pub fields: Fields,
    /// A map of key-value pairs containing additional meta data.
    pub metadata: HashMap<String, String>,
}
193
194impl Schema {
195    /// Creates an empty `Schema`
196    pub fn empty() -> Self {
197        Self {
198            fields: Default::default(),
199            metadata: HashMap::new(),
200        }
201    }
202
203    /// Creates a new [`Schema`] from a sequence of [`Field`] values.
204    ///
205    /// # Example
206    ///
207    /// ```
208    /// # use arrow_schema::*;
209    /// let field_a = Field::new("a", DataType::Int64, false);
210    /// let field_b = Field::new("b", DataType::Boolean, false);
211    ///
212    /// let schema = Schema::new(vec![field_a, field_b]);
213    /// ```
214    pub fn new(fields: impl Into<Fields>) -> Self {
215        Self::new_with_metadata(fields, HashMap::new())
216    }
217
218    /// Creates a new [`Schema`] from a sequence of [`Field`] values
219    /// and adds additional metadata in form of key value pairs.
220    ///
221    /// # Example
222    ///
223    /// ```
224    /// # use arrow_schema::*;
225    /// # use std::collections::HashMap;
226    ///
227    /// let field_a = Field::new("a", DataType::Int64, false);
228    /// let field_b = Field::new("b", DataType::Boolean, false);
229    ///
230    /// let mut metadata: HashMap<String, String> = HashMap::new();
231    /// metadata.insert("row_count".to_string(), "100".to_string());
232    ///
233    /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata);
234    /// ```
235    #[inline]
236    pub fn new_with_metadata(fields: impl Into<Fields>, metadata: HashMap<String, String>) -> Self {
237        Self {
238            fields: fields.into(),
239            metadata,
240        }
241    }
242
243    /// Sets the metadata of this `Schema` to be `metadata` and returns self
244    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
245        self.metadata = metadata;
246        self
247    }
248
249    /// Returns a new schema with only the specified columns in the new schema
250    /// This carries metadata from the parent schema over as well
251    pub fn project(&self, indices: &[usize]) -> Result<Schema, ArrowError> {
252        let new_fields = indices
253            .iter()
254            .map(|i| {
255                self.fields.get(*i).cloned().ok_or_else(|| {
256                    ArrowError::SchemaError(format!(
257                        "project index {} out of bounds, max field {}",
258                        i,
259                        self.fields().len()
260                    ))
261                })
262            })
263            .collect::<Result<Vec<_>, _>>()?;
264        Ok(Self::new_with_metadata(new_fields, self.metadata.clone()))
265    }
266
267    /// Merge schema into self if it is compatible. Struct fields will be merged recursively.
268    ///
269    /// Example:
270    ///
271    /// ```
272    /// # use arrow_schema::*;
273    ///
274    /// let merged = Schema::try_merge(vec![
275    ///     Schema::new(vec![
276    ///         Field::new("c1", DataType::Int64, false),
277    ///         Field::new("c2", DataType::Utf8, false),
278    ///     ]),
279    ///     Schema::new(vec![
280    ///         Field::new("c1", DataType::Int64, true),
281    ///         Field::new("c2", DataType::Utf8, false),
282    ///         Field::new("c3", DataType::Utf8, false),
283    ///     ]),
284    /// ]).unwrap();
285    ///
286    /// assert_eq!(
287    ///     merged,
288    ///     Schema::new(vec![
289    ///         Field::new("c1", DataType::Int64, true),
290    ///         Field::new("c2", DataType::Utf8, false),
291    ///         Field::new("c3", DataType::Utf8, false),
292    ///     ]),
293    /// );
294    /// ```
295    pub fn try_merge(schemas: impl IntoIterator<Item = Self>) -> Result<Self, ArrowError> {
296        let mut out_meta = HashMap::new();
297        let mut out_fields = SchemaBuilder::new();
298        for schema in schemas {
299            let Schema { metadata, fields } = schema;
300
301            // merge metadata
302            for (key, value) in metadata.into_iter() {
303                if let Some(old_val) = out_meta.get(&key) {
304                    if old_val != &value {
305                        return Err(ArrowError::SchemaError(format!(
306                            "Fail to merge schema due to conflicting metadata. \
307                                         Key '{key}' has different values '{old_val}' and '{value}'"
308                        )));
309                    }
310                }
311                out_meta.insert(key, value);
312            }
313
314            // merge fields
315            fields.iter().try_for_each(|x| out_fields.try_merge(x))?
316        }
317
318        Ok(out_fields.finish().with_metadata(out_meta))
319    }
320
321    /// Returns an immutable reference of the vector of `Field` instances.
322    #[inline]
323    pub const fn fields(&self) -> &Fields {
324        &self.fields
325    }
326
327    /// Returns a vector with references to all fields (including nested fields)
328    ///
329    /// # Example
330    ///
331    /// ```
332    /// use std::sync::Arc;
333    /// use arrow_schema::{DataType, Field, Fields, Schema};
334    ///
335    /// let f1 = Arc::new(Field::new("a", DataType::Boolean, false));
336    ///
337    /// let f2_inner = Arc::new(Field::new("b_inner", DataType::Int8, false));
338    /// let f2 = Arc::new(Field::new("b", DataType::List(f2_inner.clone()), false));
339    ///
340    /// let f3_inner1 = Arc::new(Field::new("c_inner1", DataType::Int8, false));
341    /// let f3_inner2 = Arc::new(Field::new("c_inner2", DataType::Int8, false));
342    /// let f3 = Arc::new(Field::new(
343    ///     "c",
344    ///     DataType::Struct(vec![f3_inner1.clone(), f3_inner2.clone()].into()),
345    ///     false
346    /// ));
347    ///
348    /// let mut schema = Schema::new(vec![
349    ///   f1.clone(), f2.clone(), f3.clone()
350    /// ]);
351    /// assert_eq!(
352    ///     schema.flattened_fields(),
353    ///     vec![
354    ///         f1.as_ref(),
355    ///         f2.as_ref(),
356    ///         f2_inner.as_ref(),
357    ///         f3.as_ref(),
358    ///         f3_inner1.as_ref(),
359    ///         f3_inner2.as_ref()
360    ///    ]
361    /// );
362    /// ```
363    #[inline]
364    pub fn flattened_fields(&self) -> Vec<&Field> {
365        self.fields.iter().flat_map(|f| f.fields()).collect()
366    }
367
368    /// Returns a vector with references to all fields (including nested fields)
369    #[deprecated(since = "52.2.0", note = "Use `flattened_fields` instead")]
370    #[inline]
371    pub fn all_fields(&self) -> Vec<&Field> {
372        self.flattened_fields()
373    }
374
375    /// Returns an immutable reference of a specific [`Field`] instance selected using an
376    /// offset within the internal `fields` vector.
377    ///
378    /// # Panics
379    ///
380    /// Panics if index out of bounds
381    pub fn field(&self, i: usize) -> &Field {
382        &self.fields[i]
383    }
384
385    /// Returns an immutable reference of a specific [`Field`] instance selected by name.
386    pub fn field_with_name(&self, name: &str) -> Result<&Field, ArrowError> {
387        Ok(&self.fields[self.index_of(name)?])
388    }
389
390    /// Returns a vector of immutable references to all [`Field`] instances selected by
391    /// the dictionary ID they use.
392    pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> {
393        self.fields
394            .iter()
395            .flat_map(|f| f.fields_with_dict_id(dict_id))
396            .collect()
397    }
398
399    /// Find the index of the column with the given name.
400    pub fn index_of(&self, name: &str) -> Result<usize, ArrowError> {
401        let (idx, _) = self.fields().find(name).ok_or_else(|| {
402            let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect();
403            ArrowError::SchemaError(format!(
404                "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}"
405            ))
406        })?;
407        Ok(idx)
408    }
409
410    /// Returns an immutable reference to the Map of custom metadata key-value pairs.
411    #[inline]
412    pub const fn metadata(&self) -> &HashMap<String, String> {
413        &self.metadata
414    }
415
416    /// Look up a column by name and return a immutable reference to the column along with
417    /// its index.
418    pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> {
419        let (idx, field) = self.fields.find(name)?;
420        Some((idx, field.as_ref()))
421    }
422
423    /// Check to see if `self` is a superset of `other` schema.
424    ///
425    /// In particular returns true if `self.metadata` is a superset of `other.metadata`
426    /// and [`Fields::contains`] for `self.fields` and `other.fields`
427    ///
428    /// In other words, any record that conforms to `other` should also conform to `self`.
429    pub fn contains(&self, other: &Schema) -> bool {
430        // make sure self.metadata is a superset of other.metadata
431        self.fields.contains(&other.fields)
432            && other
433                .metadata
434                .iter()
435                .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default())
436    }
437
    /// Remove field by index and return it. Recommend to use [`SchemaBuilder`]
    /// if you are looking to remove multiple columns, as this will save allocations.
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of bounds.
    ///
    /// # Example
    ///
    /// ```
    /// use arrow_schema::{DataType, Field, Schema};
    /// let mut schema = Schema::new(vec![
    ///   Field::new("a", DataType::Boolean, false),
    ///   Field::new("b", DataType::Int8, false),
    ///   Field::new("c", DataType::Utf8, false),
    /// ]);
    /// assert_eq!(schema.fields.len(), 3);
    /// assert_eq!(schema.remove(1), Field::new("b", DataType::Int8, false).into());
    /// assert_eq!(schema.fields.len(), 2);
    /// ```
    #[deprecated(note = "Use SchemaBuilder::remove")]
    #[doc(hidden)]
    // The call below is itself marked `#[allow(deprecated)]`, so the lint is silenced here.
    #[allow(deprecated)]
    pub fn remove(&mut self, index: usize) -> FieldRef {
        self.fields.remove(index)
    }
464}
465
466impl fmt::Display for Schema {
467    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
468        f.write_str(
469            &self
470                .fields
471                .iter()
472                .map(|c| c.to_string())
473                .collect::<Vec<String>>()
474                .join(", "),
475        )
476    }
477}
478
479// need to implement `Hash` manually because `HashMap` implement Eq but no `Hash`
480#[allow(clippy::derived_hash_with_manual_eq)]
481impl Hash for Schema {
482    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
483        self.fields.hash(state);
484
485        // ensure deterministic key order
486        let mut keys: Vec<&String> = self.metadata.keys().collect();
487        keys.sort();
488        for k in keys {
489            k.hash(state);
490            self.metadata.get(k).expect("key valid").hash(state);
491        }
492    }
493}
494
495#[cfg(test)]
496mod tests {
497    use crate::datatype::DataType;
498    use crate::{TimeUnit, UnionMode};
499
500    use super::*;
501
502    #[test]
503    #[cfg(feature = "serde")]
504    fn test_ser_de_metadata() {
505        // ser/de with empty metadata
506        let schema = Schema::new(vec![
507            Field::new("name", DataType::Utf8, false),
508            Field::new("address", DataType::Utf8, false),
509            Field::new("priority", DataType::UInt8, false),
510        ]);
511
512        let json = serde_json::to_string(&schema).unwrap();
513        let de_schema = serde_json::from_str(&json).unwrap();
514
515        assert_eq!(schema, de_schema);
516
517        // ser/de with non-empty metadata
518        let schema =
519            schema.with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect());
520        let json = serde_json::to_string(&schema).unwrap();
521        let de_schema = serde_json::from_str(&json).unwrap();
522
523        assert_eq!(schema, de_schema);
524    }
525
526    #[test]
527    fn test_projection() {
528        let mut metadata = HashMap::new();
529        metadata.insert("meta".to_string(), "data".to_string());
530
531        let schema = Schema::new(vec![
532            Field::new("name", DataType::Utf8, false),
533            Field::new("address", DataType::Utf8, false),
534            Field::new("priority", DataType::UInt8, false),
535        ])
536        .with_metadata(metadata);
537
538        let projected: Schema = schema.project(&[0, 2]).unwrap();
539
540        assert_eq!(projected.fields().len(), 2);
541        assert_eq!(projected.fields()[0].name(), "name");
542        assert_eq!(projected.fields()[1].name(), "priority");
543        assert_eq!(projected.metadata.get("meta").unwrap(), "data")
544    }
545
546    #[test]
547    fn test_oob_projection() {
548        let mut metadata = HashMap::new();
549        metadata.insert("meta".to_string(), "data".to_string());
550
551        let schema = Schema::new(vec![
552            Field::new("name", DataType::Utf8, false),
553            Field::new("address", DataType::Utf8, false),
554            Field::new("priority", DataType::UInt8, false),
555        ])
556        .with_metadata(metadata);
557
558        let projected = schema.project(&[0, 3]);
559
560        assert!(projected.is_err());
561        if let Err(e) = projected {
562            assert_eq!(
563                e.to_string(),
564                "Schema error: project index 3 out of bounds, max field 3".to_string()
565            )
566        }
567    }
568
569    #[test]
570    fn test_schema_contains() {
571        let mut metadata1 = HashMap::new();
572        metadata1.insert("meta".to_string(), "data".to_string());
573
574        let schema1 = Schema::new(vec![
575            Field::new("name", DataType::Utf8, false),
576            Field::new("address", DataType::Utf8, false),
577            Field::new("priority", DataType::UInt8, false),
578        ])
579        .with_metadata(metadata1.clone());
580
581        let mut metadata2 = HashMap::new();
582        metadata2.insert("meta".to_string(), "data".to_string());
583        metadata2.insert("meta2".to_string(), "data".to_string());
584        let schema2 = Schema::new(vec![
585            Field::new("name", DataType::Utf8, false),
586            Field::new("address", DataType::Utf8, false),
587            Field::new("priority", DataType::UInt8, false),
588        ])
589        .with_metadata(metadata2);
590
591        // reflexivity
592        assert!(schema1.contains(&schema1));
593        assert!(schema2.contains(&schema2));
594
595        assert!(!schema1.contains(&schema2));
596        assert!(schema2.contains(&schema1));
597    }
598
599    #[test]
600    fn schema_equality() {
601        let schema1 = Schema::new(vec![
602            Field::new("c1", DataType::Utf8, false),
603            Field::new("c2", DataType::Float64, true),
604            Field::new("c3", DataType::LargeBinary, true),
605        ]);
606        let schema2 = Schema::new(vec![
607            Field::new("c1", DataType::Utf8, false),
608            Field::new("c2", DataType::Float64, true),
609            Field::new("c3", DataType::LargeBinary, true),
610        ]);
611
612        assert_eq!(schema1, schema2);
613
614        let schema3 = Schema::new(vec![
615            Field::new("c1", DataType::Utf8, false),
616            Field::new("c2", DataType::Float32, true),
617        ]);
618        let schema4 = Schema::new(vec![
619            Field::new("C1", DataType::Utf8, false),
620            Field::new("C2", DataType::Float64, true),
621        ]);
622
623        assert_ne!(schema1, schema3);
624        assert_ne!(schema1, schema4);
625        assert_ne!(schema2, schema3);
626        assert_ne!(schema2, schema4);
627        assert_ne!(schema3, schema4);
628
629        let f = Field::new("c1", DataType::Utf8, false).with_metadata(
630            [("foo".to_string(), "bar".to_string())]
631                .iter()
632                .cloned()
633                .collect(),
634        );
635        let schema5 = Schema::new(vec![
636            f,
637            Field::new("c2", DataType::Float64, true),
638            Field::new("c3", DataType::LargeBinary, true),
639        ]);
640        assert_ne!(schema1, schema5);
641    }
642
643    #[test]
644    fn create_schema_string() {
645        let schema = person_schema();
646        assert_eq!(schema.to_string(),
647                   "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {\"k\": \"v\"} }, \
648        Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
649        Field { name: \"address\", data_type: Struct([\
650            Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
651            Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }\
652        ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
653        Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: {} }")
654    }
655
656    #[test]
657    fn schema_field_accessors() {
658        let schema = person_schema();
659
660        // test schema accessors
661        assert_eq!(schema.fields().len(), 4);
662
663        // test field accessors
664        let first_name = &schema.fields()[0];
665        assert_eq!(first_name.name(), "first_name");
666        assert_eq!(first_name.data_type(), &DataType::Utf8);
667        assert!(!first_name.is_nullable());
668        assert_eq!(first_name.dict_id(), None);
669        assert_eq!(first_name.dict_is_ordered(), None);
670
671        let metadata = first_name.metadata();
672        assert!(!metadata.is_empty());
673        let md = &metadata;
674        assert_eq!(md.len(), 1);
675        let key = md.get("k");
676        assert!(key.is_some());
677        assert_eq!(key.unwrap(), "v");
678
679        let interests = &schema.fields()[3];
680        assert_eq!(interests.name(), "interests");
681        assert_eq!(
682            interests.data_type(),
683            &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
684        );
685        assert_eq!(interests.dict_id(), Some(123));
686        assert_eq!(interests.dict_is_ordered(), Some(true));
687    }
688
689    #[test]
690    #[should_panic(
691        expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]"
692    )]
693    fn schema_index_of() {
694        let schema = person_schema();
695        assert_eq!(schema.index_of("first_name").unwrap(), 0);
696        assert_eq!(schema.index_of("last_name").unwrap(), 1);
697        schema.index_of("nickname").unwrap();
698    }
699
700    #[test]
701    #[should_panic(
702        expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]"
703    )]
704    fn schema_field_with_name() {
705        let schema = person_schema();
706        assert_eq!(
707            schema.field_with_name("first_name").unwrap().name(),
708            "first_name"
709        );
710        assert_eq!(
711            schema.field_with_name("last_name").unwrap().name(),
712            "last_name"
713        );
714        schema.field_with_name("nickname").unwrap();
715    }
716
717    #[test]
718    fn schema_field_with_dict_id() {
719        let schema = person_schema();
720
721        let fields_dict_123: Vec<_> = schema
722            .fields_with_dict_id(123)
723            .iter()
724            .map(|f| f.name())
725            .collect();
726        assert_eq!(fields_dict_123, vec!["interests"]);
727
728        assert!(schema.fields_with_dict_id(456).is_empty());
729    }
730
731    fn person_schema() -> Schema {
732        let kv_array = [("k".to_string(), "v".to_string())];
733        let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
734        let first_name =
735            Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
736
737        Schema::new(vec![
738            first_name,
739            Field::new("last_name", DataType::Utf8, false),
740            Field::new(
741                "address",
742                DataType::Struct(Fields::from(vec![
743                    Field::new("street", DataType::Utf8, false),
744                    Field::new("zip", DataType::UInt16, false),
745                ])),
746                false,
747            ),
748            Field::new_dict(
749                "interests",
750                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
751                true,
752                123,
753                true,
754            ),
755        ])
756    }
757
758    #[test]
759    fn test_try_merge_field_with_metadata() {
760        // 1. Different values for the same key should cause error.
761        let metadata1: HashMap<String, String> = [("foo".to_string(), "bar".to_string())]
762            .iter()
763            .cloned()
764            .collect();
765        let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1);
766
767        let metadata2: HashMap<String, String> = [("foo".to_string(), "baz".to_string())]
768            .iter()
769            .cloned()
770            .collect();
771        let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);
772
773        assert!(Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err());
774
775        // 2. None + Some
776        let mut f1 = Field::new("first_name", DataType::Utf8, false);
777        let metadata2: HashMap<String, String> = [("missing".to_string(), "value".to_string())]
778            .iter()
779            .cloned()
780            .collect();
781        let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);
782
783        assert!(f1.try_merge(&f2).is_ok());
784        assert!(!f1.metadata().is_empty());
785        assert_eq!(f1.metadata(), f2.metadata());
786
787        // 3. Some + Some
788        let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(
789            [("foo".to_string(), "bar".to_string())]
790                .iter()
791                .cloned()
792                .collect(),
793        );
794        let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(
795            [("foo2".to_string(), "bar2".to_string())]
796                .iter()
797                .cloned()
798                .collect(),
799        );
800
801        assert!(f1.try_merge(&f2).is_ok());
802        assert!(!f1.metadata().is_empty());
803        assert_eq!(
804            f1.metadata().clone(),
805            [
806                ("foo".to_string(), "bar".to_string()),
807                ("foo2".to_string(), "bar2".to_string())
808            ]
809            .iter()
810            .cloned()
811            .collect()
812        );
813
814        // 4. Some + None.
815        let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(
816            [("foo".to_string(), "bar".to_string())]
817                .iter()
818                .cloned()
819                .collect(),
820        );
821        let f2 = Field::new("first_name", DataType::Utf8, false);
822        assert!(f1.try_merge(&f2).is_ok());
823        assert!(!f1.metadata().is_empty());
824        assert_eq!(
825            f1.metadata().clone(),
826            [("foo".to_string(), "bar".to_string())]
827                .iter()
828                .cloned()
829                .collect()
830        );
831
832        // 5. None + None.
833        let mut f1 = Field::new("first_name", DataType::Utf8, false);
834        let f2 = Field::new("first_name", DataType::Utf8, false);
835        assert!(f1.try_merge(&f2).is_ok());
836        assert!(f1.metadata().is_empty());
837    }
838
839    #[test]
840    fn test_schema_merge() {
841        let merged = Schema::try_merge(vec![
842            Schema::new(vec![
843                Field::new("first_name", DataType::Utf8, false),
844                Field::new("last_name", DataType::Utf8, false),
845                Field::new(
846                    "address",
847                    DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)].into()),
848                    false,
849                ),
850            ]),
851            Schema::new_with_metadata(
852                vec![
853                    // nullable merge
854                    Field::new("last_name", DataType::Utf8, true),
855                    Field::new(
856                        "address",
857                        DataType::Struct(Fields::from(vec![
858                            // add new nested field
859                            Field::new("street", DataType::Utf8, false),
860                            // nullable merge on nested field
861                            Field::new("zip", DataType::UInt16, true),
862                        ])),
863                        false,
864                    ),
865                    // new field
866                    Field::new("number", DataType::Utf8, true),
867                ],
868                [("foo".to_string(), "bar".to_string())]
869                    .iter()
870                    .cloned()
871                    .collect::<HashMap<String, String>>(),
872            ),
873        ])
874        .unwrap();
875
876        assert_eq!(
877            merged,
878            Schema::new_with_metadata(
879                vec![
880                    Field::new("first_name", DataType::Utf8, false),
881                    Field::new("last_name", DataType::Utf8, true),
882                    Field::new(
883                        "address",
884                        DataType::Struct(Fields::from(vec![
885                            Field::new("zip", DataType::UInt16, true),
886                            Field::new("street", DataType::Utf8, false),
887                        ])),
888                        false,
889                    ),
890                    Field::new("number", DataType::Utf8, true),
891                ],
892                [("foo".to_string(), "bar".to_string())]
893                    .iter()
894                    .cloned()
895                    .collect::<HashMap<String, String>>()
896            )
897        );
898
899        // support merge union fields
900        assert_eq!(
901            Schema::try_merge(vec![
902                Schema::new(vec![Field::new_union(
903                    "c1",
904                    vec![0, 1],
905                    vec![
906                        Field::new("c11", DataType::Utf8, true),
907                        Field::new("c12", DataType::Utf8, true),
908                    ],
909                    UnionMode::Dense
910                ),]),
911                Schema::new(vec![Field::new_union(
912                    "c1",
913                    vec![1, 2],
914                    vec![
915                        Field::new("c12", DataType::Utf8, true),
916                        Field::new("c13", DataType::Time64(TimeUnit::Second), true),
917                    ],
918                    UnionMode::Dense
919                ),])
920            ])
921            .unwrap(),
922            Schema::new(vec![Field::new_union(
923                "c1",
924                vec![0, 1, 2],
925                vec![
926                    Field::new("c11", DataType::Utf8, true),
927                    Field::new("c12", DataType::Utf8, true),
928                    Field::new("c13", DataType::Time64(TimeUnit::Second), true),
929                ],
930                UnionMode::Dense
931            ),]),
932        );
933
934        // incompatible field should throw error
935        assert!(Schema::try_merge(vec![
936            Schema::new(vec![
937                Field::new("first_name", DataType::Utf8, false),
938                Field::new("last_name", DataType::Utf8, false),
939            ]),
940            Schema::new(vec![Field::new("last_name", DataType::Int64, false),])
941        ])
942        .is_err());
943
944        // incompatible metadata should throw error
945        let res = Schema::try_merge(vec![
946            Schema::new_with_metadata(
947                vec![Field::new("first_name", DataType::Utf8, false)],
948                [("foo".to_string(), "bar".to_string())]
949                    .iter()
950                    .cloned()
951                    .collect::<HashMap<String, String>>(),
952            ),
953            Schema::new_with_metadata(
954                vec![Field::new("last_name", DataType::Utf8, false)],
955                [("foo".to_string(), "baz".to_string())]
956                    .iter()
957                    .cloned()
958                    .collect::<HashMap<String, String>>(),
959            ),
960        ])
961        .unwrap_err();
962
963        let expected = "Fail to merge schema due to conflicting metadata. Key 'foo' has different values 'bar' and 'baz'";
964        assert!(
965            res.to_string().contains(expected),
966            "Could not find expected string '{expected}' in '{res}'"
967        );
968    }
969
970    #[test]
971    fn test_schema_builder_change_field() {
972        let mut builder = SchemaBuilder::new();
973        builder.push(Field::new("a", DataType::Int32, false));
974        builder.push(Field::new("b", DataType::Utf8, false));
975        *builder.field_mut(1) = Arc::new(Field::new("c", DataType::Int32, false));
976        assert_eq!(
977            builder.fields,
978            vec![
979                Arc::new(Field::new("a", DataType::Int32, false)),
980                Arc::new(Field::new("c", DataType::Int32, false))
981            ]
982        );
983    }
984
985    #[test]
986    fn test_schema_builder_reverse() {
987        let mut builder = SchemaBuilder::new();
988        builder.push(Field::new("a", DataType::Int32, false));
989        builder.push(Field::new("b", DataType::Utf8, true));
990        builder.reverse();
991        assert_eq!(
992            builder.fields,
993            vec![
994                Arc::new(Field::new("b", DataType::Utf8, true)),
995                Arc::new(Field::new("a", DataType::Int32, false))
996            ]
997        );
998    }
999
1000    #[test]
1001    fn test_schema_builder_metadata() {
1002        let mut metadata = HashMap::with_capacity(1);
1003        metadata.insert("key".to_string(), "value".to_string());
1004
1005        let fields = vec![Field::new("test", DataType::Int8, true)];
1006        let mut builder: SchemaBuilder = Schema::new(fields).with_metadata(metadata).into();
1007        builder.metadata_mut().insert("k".into(), "v".into());
1008        let out = builder.finish();
1009        assert_eq!(out.metadata.len(), 2);
1010        assert_eq!(out.metadata["k"], "v");
1011        assert_eq!(out.metadata["key"], "value");
1012    }
1013}