// lance_file/v2/testing.rs
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::sync::Arc;

use arrow_array::{RecordBatch, RecordBatchReader};
use arrow_schema::ArrowError;
use futures::TryStreamExt;
use lance_core::{
    cache::{CapacityMode, FileMetadataCache},
    datatypes::Schema,
};
use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
use lance_io::{
    object_store::ObjectStore,
    scheduler::{ScanScheduler, SchedulerConfig},
    ReadBatchParams,
};
use object_store::path::Path;
use tempfile::TempDir;

use crate::v2::reader::{FileReader, FileReaderOptions};

use super::writer::{FileWriter, FileWriterOptions};

/// Filesystem fixture shared by the read/write helpers in this file.
///
/// Bundles a temporary directory, the path of a file inside it, an object
/// store and a scan scheduler.  See the `Default` impl for how each piece is
/// constructed.
pub struct FsFixture {
    // Never read in this file; kept only so the fixture owns the temporary
    // directory handle.  NOTE(review): presumably `TempDir` removes the
    // directory when dropped, so this must outlive any use of `tmp_path`.
    _tmp_dir: TempDir,
    /// Object-store path of the file that the helpers write to and read from.
    pub tmp_path: Path,
    /// Object store used to create the file at `tmp_path`.
    pub object_store: Arc<ObjectStore>,
    /// Scheduler used to open the file at `tmp_path` for reading.
    pub scheduler: Arc<ScanScheduler>,
}

impl Default for FsFixture {
    /// Builds a fixture rooted in a newly created temporary directory.
    ///
    /// `tmp_path` points at `some_file.lance` inside that directory (the file
    /// itself is not created here), the object store is the local one, and the
    /// scheduler is built over that store with the testing scheduler config.
    ///
    /// # Panics
    ///
    /// Panics if the temporary directory cannot be created, if its path is not
    /// valid UTF-8, or if the path cannot be parsed as an object-store path.
    fn default() -> Self {
        let tmp_dir = tempfile::tempdir().unwrap();

        // Directory path -> object-store path -> path of the file inside it.
        let dir_str = tmp_dir.path().to_str().unwrap();
        let tmp_path = Path::parse(dir_str).unwrap().child("some_file.lance");

        let object_store = Arc::new(ObjectStore::local());
        // The scheduler gets its own handle to the same store.
        let scheduler = ScanScheduler::new(
            Arc::clone(&object_store),
            SchedulerConfig::default_for_testing(),
        );

        Self {
            _tmp_dir: tmp_dir,
            tmp_path,
            object_store,
            scheduler,
        }
    }
}

/// Summary of what `write_lance_file` wrote, returned so that tests can
/// compare it against what is read back.
pub struct WrittenFile {
    /// Lance schema converted from the Arrow schema of the input reader.
    pub schema: Arc<Schema>,
    /// Every batch that was written, in the order it was written.
    pub data: Vec<RecordBatch>,
    /// Copy of the writer's `field_id_to_column_indices()` taken after all
    /// batches were written.
    // NOTE(review): presumably each pair is (field id, column index) — confirm
    // against the writer.
    pub field_id_mapping: Vec<(u32, u32)>,
}

/// Writes every batch produced by `data` to the fixture's file and reports
/// what was written.
///
/// The file is created at `fs.tmp_path` through `fs.object_store`.  Before the
/// writer is finished, the schema metadata entry `"foo" = "bar"` is added.
///
/// Returns the Lance schema derived from the reader's Arrow schema, the
/// collected batches, and the writer's field-id-to-column-index mapping.
///
/// # Panics
///
/// Panics on any failure: creating the file, converting the schema,
/// constructing the writer, reading a batch from `data`, writing a batch, or
/// finishing the file.
pub async fn write_lance_file(
    data: impl RecordBatchReader,
    fs: &FsFixture,
    options: FileWriterOptions,
) -> WrittenFile {
    let sink = fs.object_store.create(&fs.tmp_path).await.unwrap();

    // Convert the reader's Arrow schema into its Lance equivalent.
    let lance_schema = {
        let arrow_schema = data.schema();
        lance_core::datatypes::Schema::try_from(arrow_schema.as_ref()).unwrap()
    };

    // The writer takes its own copy; the original is returned to the caller.
    let mut file_writer = FileWriter::try_new(sink, lance_schema.clone(), options).unwrap();

    // Drain the reader up front so the batches can be handed back afterwards.
    let batches = data
        .collect::<std::result::Result<Vec<_>, ArrowError>>()
        .unwrap();

    for batch in batches.iter() {
        file_writer.write_batch(batch).await.unwrap();
    }

    // Snapshot the mapping once all batches have been written.
    let field_id_mapping = file_writer.field_id_to_column_indices().to_vec();

    file_writer.add_schema_metadata("foo", "bar");
    file_writer.finish().await.unwrap();

    WrittenFile {
        schema: Arc::new(lance_schema),
        data: batches,
        field_id_mapping,
    }
}

/// Creates a new file metadata cache for tests.
///
/// The cache is constructed with a capacity of `128 * 1024 * 1024` in
/// `CapacityMode::Bytes`.  Each call returns a separate cache.
pub fn test_cache() -> Arc<FileMetadataCache> {
    let capacity = 128 * 1024 * 1024;
    let cache = FileMetadataCache::with_capacity(capacity, CapacityMode::Bytes);
    Arc::new(cache)
}

/// Reads back the whole file at `fs.tmp_path` and returns its batches.
///
/// The file is opened through `fs.scheduler` with default reader options and a
/// fresh cache from `test_cache()`.  The full row range is then read with
/// `filter` applied and collected into a vector.
///
/// # Panics
///
/// Panics if the file cannot be opened, if its schema metadata does not
/// contain `"foo" = "bar"`, or if creating or draining the batch stream fails.
pub async fn read_lance_file(
    fs: &FsFixture,
    decoder_middleware: Arc<DecoderPlugins>,
    filter: FilterExpression,
) -> Vec<RecordBatch> {
    let opened = fs.scheduler.open_file(&fs.tmp_path).await.unwrap();

    let reader = FileReader::try_open(
        opened,
        None,
        decoder_middleware,
        &test_cache(),
        FileReaderOptions::default(),
    )
    .await
    .unwrap();

    // Check the schema metadata entry that the file is expected to carry.
    let file_schema = reader.schema();
    assert_eq!(file_schema.metadata.get("foo").unwrap(), "bar");

    reader
        .read_stream(ReadBatchParams::RangeFull, 1024, 16, filter)
        .unwrap()
        .try_collect()
        .await
        .unwrap()
}

/// Returns the total number of rows across all batches that `read_lance_file`
/// yields for the given decoder plugins and filter.
///
/// # Panics
///
/// Panics under the same conditions as `read_lance_file`.
pub async fn count_lance_file(
    fs: &FsFixture,
    decoder_middleware: Arc<DecoderPlugins>,
    filter: FilterExpression,
) -> usize {
    let batches = read_lance_file(fs, decoder_middleware, filter).await;

    let mut total_rows = 0;
    for batch in &batches {
        total_rows += batch.num_rows();
    }
    total_rows
}