arrow_array/builder/
generic_bytes_view_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::any::Any;
19use std::marker::PhantomData;
20use std::sync::Arc;
21
22use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
23use arrow_data::ByteView;
24use arrow_schema::ArrowError;
25use hashbrown::hash_table::Entry;
26use hashbrown::HashTable;
27
28use crate::builder::ArrayBuilder;
29use crate::types::bytes::ByteArrayNativeType;
30use crate::types::{BinaryViewType, ByteViewType, StringViewType};
31use crate::{ArrayRef, GenericByteViewArray};
32
33const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
34const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB
35
36enum BlockSizeGrowthStrategy {
37    Fixed { size: u32 },
38    Exponential { current_size: u32 },
39}
40
41impl BlockSizeGrowthStrategy {
42    fn next_size(&mut self) -> u32 {
43        match self {
44            Self::Fixed { size } => *size,
45            Self::Exponential { current_size } => {
46                if *current_size < MAX_BLOCK_SIZE {
47                    // we have fixed start/end block sizes, so we can't overflow
48                    *current_size = current_size.saturating_mul(2);
49                    *current_size
50                } else {
51                    MAX_BLOCK_SIZE
52                }
53            }
54        }
55    }
56}
57
58/// A builder for [`GenericByteViewArray`]
59///
60/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
61/// and a list of views into those buffers.
62///
63/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`]
64///
65/// This builder can be used in two ways
66///
67/// # Append Values
68///
69/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
70/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
71/// writes values larger than 12 bytes to the current in-progress block, with values smaller
72/// than 12 bytes inlined into the views. If a value is appended that will not fit in the
73/// in-progress block, it will be closed, and a new block of sufficient size allocated
74///
75/// # Append Views
76///
77/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
78/// when parsing data from a parquet data page. In such a case entire blocks can be appended
79/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
80/// using [`GenericByteViewBuilder::try_append_view`]
81pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
82    views_builder: BufferBuilder<u128>,
83    null_buffer_builder: NullBufferBuilder,
84    completed: Vec<Buffer>,
85    in_progress: Vec<u8>,
86    block_size: BlockSizeGrowthStrategy,
87    /// Some if deduplicating strings
88    /// map `<string hash> -> <index to the views>`
89    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90    phantom: PhantomData<T>,
91}
92
93impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
94    /// Creates a new [`GenericByteViewBuilder`].
95    pub fn new() -> Self {
96        Self::with_capacity(1024)
97    }
98
99    /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
100    pub fn with_capacity(capacity: usize) -> Self {
101        Self {
102            views_builder: BufferBuilder::new(capacity),
103            null_buffer_builder: NullBufferBuilder::new(capacity),
104            completed: vec![],
105            in_progress: vec![],
106            block_size: BlockSizeGrowthStrategy::Exponential {
107                current_size: STARTING_BLOCK_SIZE,
108            },
109            string_tracker: None,
110            phantom: Default::default(),
111        }
112    }
113
114    /// Set a fixed buffer size for variable length strings
115    ///
116    /// The block size is the size of the buffer used to store values greater
117    /// than 12 bytes. The builder allocates new buffers when the current
118    /// buffer is full.
119    ///
120    /// By default the builder balances buffer size and buffer count by
121    /// growing buffer size exponentially from 8KB up to 2MB. The
122    /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
123    ///
124    /// If this method is used, any new buffers allocated are  
125    /// exactly this size. This can be useful for advanced users
126    /// that want to control the memory usage and buffer count.
127    ///
128    /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
129    pub fn with_fixed_block_size(self, block_size: u32) -> Self {
130        debug_assert!(block_size > 0, "Block size must be greater than 0");
131        Self {
132            block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
133            ..self
134        }
135    }
136
137    /// Override the size of buffers to allocate for holding string data
138    /// Use `with_fixed_block_size` instead.
139    #[deprecated(since = "53.0.0", note = "Use `with_fixed_block_size` instead")]
140    pub fn with_block_size(self, block_size: u32) -> Self {
141        self.with_fixed_block_size(block_size)
142    }
143
144    /// Deduplicate strings while building the array
145    ///
146    /// This will potentially decrease the memory usage if the array have repeated strings
147    /// It will also increase the time to build the array as it needs to hash the strings
148    pub fn with_deduplicate_strings(self) -> Self {
149        Self {
150            string_tracker: Some((
151                HashTable::with_capacity(self.views_builder.capacity()),
152                Default::default(),
153            )),
154            ..self
155        }
156    }
157
158    /// Append a new data block returning the new block offset
159    ///
160    /// Note: this will first flush any in-progress block
161    ///
162    /// This allows appending views from blocks added using [`Self::append_block`]. See
163    /// [`Self::append_value`] for appending individual values
164    ///
165    /// ```
166    /// # use arrow_array::builder::StringViewBuilder;
167    /// let mut builder = StringViewBuilder::new();
168    ///
169    /// let block = builder.append_block(b"helloworldbingobongo".into());
170    ///
171    /// builder.try_append_view(block, 0, 5).unwrap();
172    /// builder.try_append_view(block, 5, 5).unwrap();
173    /// builder.try_append_view(block, 10, 5).unwrap();
174    /// builder.try_append_view(block, 15, 5).unwrap();
175    /// builder.try_append_view(block, 0, 15).unwrap();
176    /// let array = builder.finish();
177    ///
178    /// let actual: Vec<_> = array.iter().flatten().collect();
179    /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"];
180    /// assert_eq!(actual, expected);
181    /// ```
182    pub fn append_block(&mut self, buffer: Buffer) -> u32 {
183        assert!(buffer.len() < u32::MAX as usize);
184
185        self.flush_in_progress();
186        let offset = self.completed.len();
187        self.push_completed(buffer);
188        offset as u32
189    }
190
191    /// Append a view of the given `block`, `offset` and `length`
192    ///
193    /// # Safety
194    /// (1) The block must have been added using [`Self::append_block`]
195    /// (2) The range `offset..offset+length` must be within the bounds of the block
196    /// (3) The data in the block must be valid of type `T`
197    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
198        let b = self.completed.get_unchecked(block as usize);
199        let start = offset as usize;
200        let end = start.saturating_add(len as usize);
201        let b = b.get_unchecked(start..end);
202
203        let view = make_view(b, block, offset);
204        self.views_builder.append(view);
205        self.null_buffer_builder.append_non_null();
206    }
207
208    /// Try to append a view of the given `block`, `offset` and `length`
209    ///
210    /// See [`Self::append_block`]
211    pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> {
212        let b = self.completed.get(block as usize).ok_or_else(|| {
213            ArrowError::InvalidArgumentError(format!("No block found with index {block}"))
214        })?;
215        let start = offset as usize;
216        let end = start.saturating_add(len as usize);
217
218        let b = b.get(start..end).ok_or_else(|| {
219            ArrowError::InvalidArgumentError(format!(
220                "Range {start}..{end} out of bounds for block of length {}",
221                b.len()
222            ))
223        })?;
224
225        if T::Native::from_bytes_checked(b).is_none() {
226            return Err(ArrowError::InvalidArgumentError(
227                "Invalid view data".to_string(),
228            ));
229        }
230
231        unsafe {
232            self.append_view_unchecked(block, offset, len);
233        }
234        Ok(())
235    }
236
237    /// Flushes the in progress block if any
238    #[inline]
239    fn flush_in_progress(&mut self) {
240        if !self.in_progress.is_empty() {
241            let f = Buffer::from_vec(std::mem::take(&mut self.in_progress));
242            self.push_completed(f)
243        }
244    }
245
246    /// Append a block to `self.completed`, checking for overflow
247    #[inline]
248    fn push_completed(&mut self, block: Buffer) {
249        assert!(block.len() < u32::MAX as usize, "Block too large");
250        assert!(self.completed.len() < u32::MAX as usize, "Too many blocks");
251        self.completed.push(block);
252    }
253
254    /// Returns the value at the given index
255    /// Useful if we want to know what value has been inserted to the builder
256    /// The index has to be smaller than `self.len()`, otherwise it will panic
257    pub fn get_value(&self, index: usize) -> &[u8] {
258        let view = self.views_builder.as_slice().get(index).unwrap();
259        let len = *view as u32;
260        if len <= 12 {
261            // # Safety
262            // The view is valid from the builder
263            unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
264        } else {
265            let view = ByteView::from(*view);
266            if view.buffer_index < self.completed.len() as u32 {
267                let block = &self.completed[view.buffer_index as usize];
268                &block[view.offset as usize..view.offset as usize + view.length as usize]
269            } else {
270                &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize]
271            }
272        }
273    }
274
275    /// Appends a value into the builder
276    ///
277    /// # Panics
278    ///
279    /// Panics if
280    /// - String buffer count exceeds `u32::MAX`
281    /// - String length exceeds `u32::MAX`
282    #[inline]
283    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
284        let v: &[u8] = value.as_ref().as_ref();
285        let length: u32 = v.len().try_into().unwrap();
286        if length <= 12 {
287            let mut view_buffer = [0; 16];
288            view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
289            view_buffer[4..4 + v.len()].copy_from_slice(v);
290            self.views_builder.append(u128::from_le_bytes(view_buffer));
291            self.null_buffer_builder.append_non_null();
292            return;
293        }
294
295        // Deduplication if:
296        // (1) deduplication is enabled.
297        // (2) len > 12
298        if let Some((mut ht, hasher)) = self.string_tracker.take() {
299            let hash_val = hasher.hash_one(v);
300            let hasher_fn = |v: &_| hasher.hash_one(v);
301
302            let entry = ht.entry(
303                hash_val,
304                |idx| {
305                    let stored_value = self.get_value(*idx);
306                    v == stored_value
307                },
308                hasher_fn,
309            );
310            match entry {
311                Entry::Occupied(occupied) => {
312                    // If the string already exists, we will directly use the view
313                    let idx = occupied.get();
314                    self.views_builder
315                        .append(self.views_builder.as_slice()[*idx]);
316                    self.null_buffer_builder.append_non_null();
317                    self.string_tracker = Some((ht, hasher));
318                    return;
319                }
320                Entry::Vacant(vacant) => {
321                    // o.w. we insert the (string hash -> view index)
322                    // the idx is current length of views_builder, as we are inserting a new view
323                    vacant.insert(self.views_builder.len());
324                }
325            }
326            self.string_tracker = Some((ht, hasher));
327        }
328
329        let required_cap = self.in_progress.len() + v.len();
330        if self.in_progress.capacity() < required_cap {
331            self.flush_in_progress();
332            let to_reserve = v.len().max(self.block_size.next_size() as usize);
333            self.in_progress.reserve(to_reserve);
334        };
335        let offset = self.in_progress.len() as u32;
336        self.in_progress.extend_from_slice(v);
337
338        let view = ByteView {
339            length,
340            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
341            buffer_index: self.completed.len() as u32,
342            offset,
343        };
344        self.views_builder.append(view.into());
345        self.null_buffer_builder.append_non_null();
346    }
347
348    /// Append an `Option` value into the builder
349    #[inline]
350    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
351        match value {
352            None => self.append_null(),
353            Some(v) => self.append_value(v),
354        };
355    }
356
357    /// Append a null value into the builder
358    #[inline]
359    pub fn append_null(&mut self) {
360        self.null_buffer_builder.append_null();
361        self.views_builder.append(0);
362    }
363
364    /// Builds the [`GenericByteViewArray`] and reset this builder
365    pub fn finish(&mut self) -> GenericByteViewArray<T> {
366        self.flush_in_progress();
367        let completed = std::mem::take(&mut self.completed);
368        let len = self.views_builder.len();
369        let views = ScalarBuffer::new(self.views_builder.finish(), 0, len);
370        let nulls = self.null_buffer_builder.finish();
371        if let Some((ref mut ht, _)) = self.string_tracker.as_mut() {
372            ht.clear();
373        }
374        // SAFETY: valid by construction
375        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
376    }
377
378    /// Builds the [`GenericByteViewArray`] without resetting the builder
379    pub fn finish_cloned(&self) -> GenericByteViewArray<T> {
380        let mut completed = self.completed.clone();
381        if !self.in_progress.is_empty() {
382            completed.push(Buffer::from_slice_ref(&self.in_progress));
383        }
384        let len = self.views_builder.len();
385        let views = Buffer::from_slice_ref(self.views_builder.as_slice());
386        let views = ScalarBuffer::new(views, 0, len);
387        let nulls = self.null_buffer_builder.finish_cloned();
388        // SAFETY: valid by construction
389        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
390    }
391
392    /// Returns the current null buffer as a slice
393    pub fn validity_slice(&self) -> Option<&[u8]> {
394        self.null_buffer_builder.as_slice()
395    }
396
397    /// Return the allocated size of this builder in bytes, useful for memory accounting.
398    pub fn allocated_size(&self) -> usize {
399        let views = self.views_builder.capacity() * std::mem::size_of::<u128>();
400        let null = self.null_buffer_builder.allocated_size();
401        let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
402        let in_progress = self.in_progress.capacity();
403        let tracker = match &self.string_tracker {
404            Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
405            None => 0,
406        };
407        buffer_size + in_progress + tracker + views + null
408    }
409}
410
411impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
412    fn default() -> Self {
413        Self::new()
414    }
415}
416
417impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
418    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
419        write!(f, "{}ViewBuilder", T::PREFIX)?;
420        f.debug_struct("")
421            .field("views_builder", &self.views_builder)
422            .field("in_progress", &self.in_progress)
423            .field("completed", &self.completed)
424            .field("null_buffer_builder", &self.null_buffer_builder)
425            .finish()
426    }
427}
428
429impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> {
430    fn len(&self) -> usize {
431        self.null_buffer_builder.len()
432    }
433
434    fn finish(&mut self) -> ArrayRef {
435        Arc::new(self.finish())
436    }
437
438    fn finish_cloned(&self) -> ArrayRef {
439        Arc::new(self.finish_cloned())
440    }
441
442    fn as_any(&self) -> &dyn Any {
443        self
444    }
445
446    fn as_any_mut(&mut self) -> &mut dyn Any {
447        self
448    }
449
450    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
451        self
452    }
453}
454
455impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
456    for GenericByteViewBuilder<T>
457{
458    #[inline]
459    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
460        for v in iter {
461            self.append_option(v)
462        }
463    }
464}
465
466/// Array builder for [`StringViewArray`][crate::StringViewArray]
467///
468/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
469/// [`GenericByteViewBuilder::append_null`] as normal.
470///
471/// # Example
472/// ```
473/// # use arrow_array::builder::StringViewBuilder;
474/// # use arrow_array::StringViewArray;
475/// let mut builder = StringViewBuilder::new();
476/// builder.append_value("hello");
477/// builder.append_null();
478/// builder.append_value("world");
479/// let array = builder.finish();
480///
481/// let expected = vec![Some("hello"), None, Some("world")];
482/// let actual: Vec<_> = array.iter().collect();
483/// assert_eq!(expected, actual);
484/// ```
485pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
486
487///  Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
488///
489/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
490/// [`GenericByteViewBuilder::append_null`] as normal.
491///
492/// # Example
493/// ```
494/// # use arrow_array::builder::BinaryViewBuilder;
495/// use arrow_array::BinaryViewArray;
496/// let mut builder = BinaryViewBuilder::new();
497/// builder.append_value("hello");
498/// builder.append_null();
499/// builder.append_value("world");
500/// let array = builder.finish();
501///
502/// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")];
503/// let actual: Vec<_> = array.iter().collect();
504/// assert_eq!(expected, actual);
505/// ```
506///
507pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
508
509/// Creates a view from a fixed length input (the compiler can generate
510/// specialized code for this)
511fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
512    let mut view_buffer = [0; 16];
513    view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes());
514    view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]);
515    u128::from_le_bytes(view_buffer)
516}
517
518/// Create a view based on the given data, block id and offset.
519///
520/// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G>
521/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined),
522/// which slows down things.
523#[inline(never)]
524pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
525    let len = data.len();
526
527    // Generate specialized code for each potential small string length
528    // to improve performance
529    match len {
530        0 => make_inlined_view::<0>(data),
531        1 => make_inlined_view::<1>(data),
532        2 => make_inlined_view::<2>(data),
533        3 => make_inlined_view::<3>(data),
534        4 => make_inlined_view::<4>(data),
535        5 => make_inlined_view::<5>(data),
536        6 => make_inlined_view::<6>(data),
537        7 => make_inlined_view::<7>(data),
538        8 => make_inlined_view::<8>(data),
539        9 => make_inlined_view::<9>(data),
540        10 => make_inlined_view::<10>(data),
541        11 => make_inlined_view::<11>(data),
542        12 => make_inlined_view::<12>(data),
543        // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead.
544        _ => {
545            let view = ByteView {
546                length: len as u32,
547                prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()),
548                buffer_index: block_id,
549                offset,
550            };
551            view.as_u128()
552        }
553    }
554}
555
556#[cfg(test)]
557mod tests {
558    use core::str;
559
560    use super::*;
561    use crate::Array;
562
563    #[test]
564    fn test_string_view_deduplicate() {
565        let value_1 = "long string to test string view";
566        let value_2 = "not so similar string but long";
567
568        let mut builder = StringViewBuilder::new()
569            .with_deduplicate_strings()
570            .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers
571
572        let values = vec![
573            Some(value_1),
574            Some(value_2),
575            Some("short"),
576            Some(value_1),
577            None,
578            Some(value_2),
579            Some(value_1),
580        ];
581        builder.extend(values.clone());
582
583        let array = builder.finish_cloned();
584        array.to_data().validate_full().unwrap();
585        assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers.
586        let actual: Vec<_> = array.iter().collect();
587        assert_eq!(actual, values);
588
589        let view0 = array.views().first().unwrap();
590        let view3 = array.views().get(3).unwrap();
591        let view6 = array.views().get(6).unwrap();
592
593        assert_eq!(view0, view3);
594        assert_eq!(view0, view6);
595
596        assert_eq!(array.views().get(1), array.views().get(5));
597    }
598
599    #[test]
600    fn test_string_view_deduplicate_after_finish() {
601        let mut builder = StringViewBuilder::new().with_deduplicate_strings();
602
603        let value_1 = "long string to test string view";
604        let value_2 = "not so similar string but long";
605        builder.append_value(value_1);
606        let _array = builder.finish();
607        builder.append_value(value_2);
608        let _array = builder.finish();
609        builder.append_value(value_1);
610        let _array = builder.finish();
611    }
612
613    #[test]
614    fn test_string_view() {
615        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
616        let b2 = Buffer::from(b"cupcakes");
617        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");
618
619        let mut v = StringViewBuilder::new();
620        assert_eq!(v.append_block(b1), 0);
621
622        v.append_value("This is a very long string that exceeds the inline length");
623        v.append_value("This is another very long string that exceeds the inline length");
624
625        assert_eq!(v.append_block(b2), 2);
626        assert_eq!(v.append_block(b3), 3);
627
628        // Test short strings
629        v.try_append_view(0, 0, 5).unwrap(); // world
630        v.try_append_view(0, 6, 7).unwrap(); // bananas
631        v.try_append_view(2, 3, 5).unwrap(); // cake
632        v.try_append_view(2, 0, 3).unwrap(); // cup
633        v.try_append_view(2, 0, 8).unwrap(); // cupcakes
634        v.try_append_view(0, 13, 4).unwrap(); // 😁
635        v.try_append_view(0, 13, 0).unwrap(); //
636
637        // Test longer strings
638        v.try_append_view(3, 0, 16).unwrap(); // Many strings are
639        v.try_append_view(1, 0, 19).unwrap(); // This is a very long
640        v.try_append_view(3, 13, 27).unwrap(); // here contained of great length
641
642        v.append_value("I do so like long strings");
643
644        let array = v.finish_cloned();
645        array.to_data().validate_full().unwrap();
646        assert_eq!(array.data_buffers().len(), 5);
647        let actual: Vec<_> = array.iter().flatten().collect();
648        assert_eq!(
649            actual,
650            &[
651                "This is a very long string that exceeds the inline length",
652                "This is another very long string that exceeds the inline length",
653                "world",
654                "bananas",
655                "cakes",
656                "cup",
657                "cupcakes",
658                "😁",
659                "",
660                "Many strings are",
661                "This is a very long",
662                "are here contained of great",
663                "I do so like long strings"
664            ]
665        );
666
667        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
668        assert_eq!(err.to_string(), "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17");
669
670        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
671        assert_eq!(
672            err.to_string(),
673            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
674        );
675
676        let err = v.try_append_view(0, 13, 2).unwrap_err();
677        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");
678
679        let err = v.try_append_view(0, 40, 0).unwrap_err();
680        assert_eq!(
681            err.to_string(),
682            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
683        );
684
685        let err = v.try_append_view(5, 0, 0).unwrap_err();
686        assert_eq!(
687            err.to_string(),
688            "Invalid argument error: No block found with index 5"
689        );
690    }
691
692    #[test]
693    fn test_string_view_with_block_size_growth() {
694        let mut exp_builder = StringViewBuilder::new();
695        let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);
696
697        let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap();
698
699        for i in 0..9 {
700            // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
701            for _ in 0..(2_u32.pow(i)) {
702                exp_builder.append_value(long_string);
703                fixed_builder.append_value(long_string);
704            }
705            exp_builder.flush_in_progress();
706            fixed_builder.flush_in_progress();
707
708            // Every step only add one buffer, but the buffer size is much larger
709            assert_eq!(exp_builder.completed.len(), i as usize + 1);
710            assert_eq!(
711                exp_builder.completed[i as usize].len(),
712                STARTING_BLOCK_SIZE as usize * 2_usize.pow(i)
713            );
714
715            // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
716            assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1);
717
718            // Every buffer is fixed size
719            assert!(fixed_builder
720                .completed
721                .iter()
722                .all(|b| b.len() == STARTING_BLOCK_SIZE as usize));
723        }
724
725        // Add one more value, and the buffer stop growing.
726        exp_builder.append_value(long_string);
727        exp_builder.flush_in_progress();
728        assert_eq!(
729            exp_builder.completed.last().unwrap().capacity(),
730            MAX_BLOCK_SIZE as usize
731        );
732    }
733}