lance_arrow/
schema.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Extension to arrow schema
5
6use arrow_schema::{ArrowError, DataType, Field, FieldRef, Schema};
7
/// Controls the padding inserted by the compact string renderers.
pub enum Indentation {
    /// No padding is inserted.
    OneLine,
    /// Padding made of the given number of space characters.
    MultiLine(u8),
}

impl Indentation {
    /// Returns the padding string for this level.
    ///
    /// `OneLine` yields an empty string; `MultiLine(n)` yields `n` spaces.
    /// No newline character is produced here.
    fn value(&self) -> String {
        match self {
            Self::OneLine => String::new(),
            Self::MultiLine(spaces) => " ".repeat(usize::from(*spaces)),
        }
    }

    /// Returns the indentation to use one nesting level deeper.
    ///
    /// `OneLine` stays `OneLine`; `MultiLine` grows by two spaces. The width
    /// saturates at `u8::MAX` so that very deep nesting cannot overflow the
    /// counter (which would panic in debug builds and wrap in release builds).
    fn deepen(&self) -> Self {
        match self {
            Self::OneLine => Self::OneLine,
            Self::MultiLine(spaces) => Self::MultiLine(spaces.saturating_add(2)),
        }
    }
}
28
29/// Extends the functionality of [arrow_schema::Field].
30pub trait FieldExt {
31    /// Create a compact string representation of the field
32    ///
33    /// This is intended for display purposes and not for serialization
34    fn to_compact_string(&self, indent: Indentation) -> String;
35
36    fn is_packed_struct(&self) -> bool;
37}
38
39impl FieldExt for Field {
40    fn to_compact_string(&self, indent: Indentation) -> String {
41        let mut result = format!("{}: ", self.name().clone());
42        match self.data_type() {
43            DataType::Struct(fields) => {
44                result += "{";
45                result += &indent.value();
46                for (field_idx, field) in fields.iter().enumerate() {
47                    result += field.to_compact_string(indent.deepen()).as_str();
48                    if field_idx < fields.len() - 1 {
49                        result += ",";
50                    }
51                    result += indent.value().as_str();
52                }
53                result += "}";
54            }
55            DataType::List(field)
56            | DataType::LargeList(field)
57            | DataType::ListView(field)
58            | DataType::LargeListView(field) => {
59                result += "[";
60                result += field.to_compact_string(indent.deepen()).as_str();
61                result += "]";
62            }
63            DataType::FixedSizeList(child, dimension) => {
64                result += &format!(
65                    "[{}; {}]",
66                    child.to_compact_string(indent.deepen()),
67                    dimension
68                );
69            }
70            DataType::Dictionary(key_type, value_type) => {
71                result += &value_type.to_string();
72                result += "@";
73                result += &key_type.to_string();
74            }
75            _ => {
76                result += &self.data_type().to_string();
77            }
78        }
79        if self.is_nullable() {
80            result += "?";
81        }
82        result
83    }
84
85    // Check if field has metadata `packed` set to true, this check is case insensitive.
86    fn is_packed_struct(&self) -> bool {
87        let field_metadata = self.metadata();
88        field_metadata
89            .get("packed")
90            .map(|v| v.to_lowercase() == "true")
91            .unwrap_or(false)
92    }
93}
94
95/// Extends the functionality of [arrow_schema::Schema].
96pub trait SchemaExt {
97    /// Create a new [`Schema`] with one extra field.
98    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError>;
99
100    fn try_with_column_at(
101        &self,
102        index: usize,
103        field: Field,
104    ) -> std::result::Result<Schema, ArrowError>;
105
106    fn field_names(&self) -> Vec<&String>;
107
108    fn without_column(&self, column_name: &str) -> Schema;
109
110    /// Create a compact string representation of the schema
111    ///
112    /// This is intended for display purposes and not for serialization
113    fn to_compact_string(&self, indent: Indentation) -> String;
114}
115
116impl SchemaExt for Schema {
117    fn try_with_column(&self, field: Field) -> std::result::Result<Schema, ArrowError> {
118        if self.column_with_name(field.name()).is_some() {
119            return Err(ArrowError::SchemaError(format!(
120                "Can not append column {} on schema: {:?}",
121                field.name(),
122                self
123            )));
124        };
125        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
126        fields.push(FieldRef::new(field));
127        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
128    }
129
130    fn try_with_column_at(
131        &self,
132        index: usize,
133        field: Field,
134    ) -> std::result::Result<Schema, ArrowError> {
135        if self.column_with_name(field.name()).is_some() {
136            return Err(ArrowError::SchemaError(format!(
137                "Failed to modify schema: Inserting column {} would create a duplicate column in schema: {:?}",
138                field.name(),
139                self
140            )));
141        };
142        let mut fields: Vec<FieldRef> = self.fields().iter().cloned().collect();
143        fields.insert(index, FieldRef::new(field));
144        Ok(Self::new_with_metadata(fields, self.metadata.clone()))
145    }
146
147    /// Project the schema to remove the given column.
148    ///
149    /// This only works on top-level fields right now. If a field does not exist,
150    /// the schema will be returned as is.
151    fn without_column(&self, column_name: &str) -> Schema {
152        let fields: Vec<FieldRef> = self
153            .fields()
154            .iter()
155            .filter(|f| f.name() != column_name)
156            .cloned()
157            .collect();
158        Self::new_with_metadata(fields, self.metadata.clone())
159    }
160
161    fn field_names(&self) -> Vec<&String> {
162        self.fields().iter().map(|f| f.name()).collect()
163    }
164
165    fn to_compact_string(&self, indent: Indentation) -> String {
166        let mut result = "{".to_string();
167        result += &indent.value();
168        for (field_idx, field) in self.fields.iter().enumerate() {
169            result += field.to_compact_string(indent.deepen()).as_str();
170            if field_idx < self.fields.len() - 1 {
171                result += ",";
172            }
173            result += indent.value().as_str();
174        }
175        result += "}";
176        result
177    }
178}