arrow_array/lib.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! The central types in Apache Arrow are arrays, which are known-length sequences of values
//! all having the same type. This crate provides concrete implementations of each type, as
//! well as an [`Array`] trait that can be used for type-erasure.
21//!
22//! # Building an Array
23//!
//! Most [`Array`] implementations can be constructed directly from iterators or a [`Vec`]:
25//!
26//! ```
27//! # use arrow_array::{Int32Array, ListArray, StringArray};
28//! # use arrow_array::types::Int32Type;
29//! #
30//! Int32Array::from(vec![1, 2]);
31//! Int32Array::from(vec![Some(1), None]);
32//! Int32Array::from_iter([1, 2, 3, 4]);
33//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]);
34//!
35//! StringArray::from(vec!["foo", "bar"]);
36//! StringArray::from(vec![Some("foo"), None]);
37//! StringArray::from_iter([Some("foo"), None]);
38//! StringArray::from_iter_values(["foo", "bar"]);
39//!
//! ListArray::from_iter_primitive::<Int32Type, _, _>([
//!     Some(vec![Some(1), None, Some(3)]),
//!     None,
//!     Some(vec![])
//! ]);
45//! ```
46//!
47//! Additionally [`ArrayBuilder`](builder::ArrayBuilder) implementations can be
48//! used to construct arrays with a push-based interface
49//!
50//! ```
51//! # use arrow_array::Int16Array;
52//! #
53//! // Create a new builder with a capacity of 100
54//! let mut builder = Int16Array::builder(100);
55//!
56//! // Append a single primitive value
57//! builder.append_value(1);
58//! // Append a null value
59//! builder.append_null();
60//! // Append a slice of primitive values
61//! builder.append_slice(&[2, 3, 4]);
62//!
63//! // Build the array
64//! let array = builder.finish();
65//!
66//! assert_eq!(5, array.len());
67//! assert_eq!(2, array.value(2));
68//! assert_eq!(&array.values()[3..5], &[3, 4])
69//! ```
70//!
71//! # Low-level API
72//!
73//! Internally, arrays consist of one or more shared memory regions backed by a [`Buffer`],
74//! the number and meaning of which depend on the array’s data type, as documented in
75//! the [Arrow specification].
76//!
77//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of:
78//!
79//! * An optional [`NullBuffer`] identifying any null values
80//! * A contiguous [`ScalarBuffer<i16>`] of values
81//!
82//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of:
83//!
84//! * An optional [`NullBuffer`] identifying any null values
85//! * An offsets [`OffsetBuffer<i32>`] identifying valid UTF-8 sequences within the values buffer
86//! * A values [`Buffer`] of UTF-8 encoded string data
87//!
88//! Array constructors such as [`PrimitiveArray::try_new`] provide the ability to cheaply
89//! construct an array from these parts, with functions such as [`PrimitiveArray::into_parts`]
90//! providing the reverse operation.
91//!
92//! ```
93//! # use arrow_array::{Array, Int32Array, StringArray};
94//! # use arrow_buffer::OffsetBuffer;
95//! #
//! // Create an Int32Array from a Vec without copying
97//! let array = Int32Array::new(vec![1, 2, 3].into(), None);
98//! assert_eq!(array.values(), &[1, 2, 3]);
99//! assert_eq!(array.null_count(), 0);
100//!
101//! // Create a StringArray from parts
102//! let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
103//! let array = StringArray::new(offsets, b"helloworld".into(), None);
104//! let values: Vec<_> = array.iter().map(|x| x.unwrap()).collect();
105//! assert_eq!(values, &["hello", "world"]);
106//! ```
107//!
108//! As [`Buffer`], and its derivatives, can be created from [`Vec`] without copying, this provides
109//! an efficient way to not only interoperate with other Rust code, but also implement kernels
110//! optimised for the arrow data layout - e.g. by handling buffers instead of values.
111//!
112//! # Zero-Copy Slicing
113//!
114//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this
115//! data. Internally this just increments some ref-counts, and so is incredibly cheap
116//!
117//! ```rust
118//! # use arrow_array::Int32Array;
119//! let array = Int32Array::from_iter([1, 2, 3]);
120//!
121//! // Slice with offset 1 and length 2
122//! let sliced = array.slice(1, 2);
123//! assert_eq!(sliced.values(), &[2, 3]);
124//! ```
125//!
126//! # Downcasting an Array
127//!
128//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`].
129//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`].
130//!
//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc. APIs,
132//! it is often the case that you wish to interact with the concrete arrays directly.
133//!
134//! This requires downcasting to the concrete type of the array:
135//!
136//! ```
137//! # use arrow_array::{Array, Float32Array, Int32Array};
138//!
//! // Safely downcast an `Array` to an `Int32Array` and compute the sum
//! // using native i32 values
//! fn sum_int32(array: &dyn Array) -> i32 {
//!     let integers: &Int32Array = array.as_any().downcast_ref().unwrap();
//!     integers.iter().map(|val| val.unwrap_or_default()).sum()
//! }
//!
//! // Safely downcast the array to a `Float32Array` and return a `&[f32]` view of the data
//! // Note: the values for positions corresponding to nulls will be arbitrary (but still valid f32)
//! fn as_f32_slice(array: &dyn Array) -> &[f32] {
//!     array.as_any().downcast_ref::<Float32Array>().unwrap().values()
//! }
151//! ```
152//!
153//! The [`cast::AsArray`] extension trait can make this more ergonomic
154//!
155//! ```
156//! # use arrow_array::Array;
157//! # use arrow_array::cast::{AsArray, as_primitive_array};
158//! # use arrow_array::types::Float32Type;
159//!
160//! fn as_f32_slice(array: &dyn Array) -> &[f32] {
//!     array.as_primitive::<Float32Type>().values()
162//! }
163//! ```
//!
//! # Alternatives to ChunkedArray Support
165//!
166//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python
167//! and C++ Arrow implementations. The recommended alternative is to use one of the following:
168//! - `Vec<ArrayRef>` a simple, eager version of a `ChunkedArray`
169//! - `impl Iterator<Item=ArrayRef>` a lazy version of a `ChunkedArray`
170//! - `impl Stream<Item=ArrayRef>` a lazy async version of a `ChunkedArray`
171//!
172//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes
173//! extensive use of [RecordBatchStream].
174//!
175//! This approach integrates well into the Rust ecosystem, simplifies the implementation and
//! encourages the use of performant lazy and async patterns.
//!
177//! ```rust
178//! use std::sync::Arc;
179//! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray};
180//! use arrow_array::cast::AsArray;
181//! use arrow_array::types::Float32Type;
182//! use arrow_schema::DataType;
183//!
//! let batches = [
//!     RecordBatch::try_from_iter(vec![
//!         ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef),
//!         ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef),
//!     ]).unwrap(),
//!     RecordBatch::try_from_iter(vec![
//!         ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef),
//!         ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef),
//!     ]).unwrap(),
//! ];
//!
//! let labels: Vec<&str> = batches
//!     .iter()
//!     .flat_map(|batch| batch.column(0).as_string::<i32>())
//!     .map(Option::unwrap)
//!     .collect();
//!
//! let values: Vec<f32> = batches
//!     .iter()
//!     .flat_map(|batch| batch.column(1).as_primitive::<Float32Type>().values())
//!     .copied()
//!     .collect();
206//!
207//! assert_eq!(labels, ["A", "B", "C", "D", "E"]);
208//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]);
//! ```
210//! [`ScalarBuffer<T>`]: arrow_buffer::ScalarBuffer
211//! [`ScalarBuffer<i16>`]: arrow_buffer::ScalarBuffer
212//! [`OffsetBuffer<i32>`]: arrow_buffer::OffsetBuffer
213//! [`NullBuffer`]: arrow_buffer::NullBuffer
214//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html
//! [`&dyn Array`]: Array
217//! [`Buffer`]: arrow_buffer::Buffer
218//! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
219//! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html
220//! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html
221//! [DataFusion]: https://github.com/apache/arrow-datafusion
222//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html
223
224#![deny(rustdoc::broken_intra_doc_links)]
225#![warn(missing_docs)]
226
227pub mod array;
228pub use array::*;
229
230mod record_batch;
231pub use record_batch::{
232 RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter,
233};
234
235mod arithmetic;
236pub use arithmetic::ArrowNativeTypeOp;
237
238mod numeric;
239pub use numeric::*;
240
241mod scalar;
242pub use scalar::*;
243
244pub mod builder;
245pub mod cast;
246mod delta;
247#[cfg(feature = "ffi")]
248pub mod ffi;
249#[cfg(feature = "ffi")]
250pub mod ffi_stream;
251pub mod iterator;
252pub mod run_iterator;
253pub mod temporal_conversions;
254pub mod timezone;
255mod trusted_len;
256pub mod types;
257
#[cfg(test)]
mod tests {
    use crate::builder::*;

    /// Smoke test that each buffer-builder alias named below is reachable through
    /// the `crate::builder` glob import and can be constructed with `new(10)`.
    ///
    /// The builders are never used afterwards; the test passes as long as every
    /// line compiles and `new` does not panic.
    #[test]
    fn test_buffer_builder_availability() {
        // Signed integers
        let _builder = Int8BufferBuilder::new(10);
        let _builder = Int16BufferBuilder::new(10);
        let _builder = Int32BufferBuilder::new(10);
        let _builder = Int64BufferBuilder::new(10);
        // Unsigned integers (all four widths, mirroring the signed list above)
        let _builder = UInt8BufferBuilder::new(10);
        let _builder = UInt16BufferBuilder::new(10);
        let _builder = UInt32BufferBuilder::new(10);
        let _builder = UInt64BufferBuilder::new(10);
        // Floating point
        let _builder = Float32BufferBuilder::new(10);
        let _builder = Float64BufferBuilder::new(10);
        // Timestamps
        let _builder = TimestampSecondBufferBuilder::new(10);
        let _builder = TimestampMillisecondBufferBuilder::new(10);
        let _builder = TimestampMicrosecondBufferBuilder::new(10);
        let _builder = TimestampNanosecondBufferBuilder::new(10);
        // Dates and times of day
        let _builder = Date32BufferBuilder::new(10);
        let _builder = Date64BufferBuilder::new(10);
        let _builder = Time32SecondBufferBuilder::new(10);
        let _builder = Time32MillisecondBufferBuilder::new(10);
        let _builder = Time64MicrosecondBufferBuilder::new(10);
        let _builder = Time64NanosecondBufferBuilder::new(10);
        // Intervals
        let _builder = IntervalYearMonthBufferBuilder::new(10);
        let _builder = IntervalDayTimeBufferBuilder::new(10);
        let _builder = IntervalMonthDayNanoBufferBuilder::new(10);
        // Durations
        let _builder = DurationSecondBufferBuilder::new(10);
        let _builder = DurationMillisecondBufferBuilder::new(10);
        let _builder = DurationMicrosecondBufferBuilder::new(10);
        let _builder = DurationNanosecondBufferBuilder::new(10);
    }
}