polars_parquet/parquet/read/page/
reader.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
use std::io::Seek;
use std::sync::OnceLock;

use polars_parquet_format::thrift::protocol::TCompactInputProtocol;
use polars_utils::mmap::{MemReader, MemSlice};

use super::PageIterator;
use crate::parquet::compression::Compression;
use crate::parquet::error::{ParquetError, ParquetResult};
use crate::parquet::metadata::{ColumnChunkMetadata, Descriptor};
use crate::parquet::page::{
    CompressedDataPage, CompressedDictPage, CompressedPage, DataPageHeader, PageType,
    ParquetPageHeader,
};
use crate::parquet::CowBuffer;
use crate::write::Encoding;

/// This meta is a small part of [`ColumnChunkMetadata`].
///
/// It carries just the fields a [`PageReader`] needs to decode the pages of one
/// column chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PageMetaData {
    /// The start offset of this column chunk in file.
    pub column_start: u64,
    /// The number of values in this column chunk.
    pub num_values: i64,
    /// Compression type used by the pages of this column chunk.
    pub compression: Compression,
    /// The descriptor of this parquet column.
    pub descriptor: Descriptor,
}

impl PageMetaData {
    /// Returns a new [`PageMetaData`].
    pub fn new(
        column_start: u64,
        num_values: i64,
        compression: Compression,
        descriptor: Descriptor,
    ) -> Self {
        Self {
            column_start,
            num_values,
            compression,
            descriptor,
        }
    }
}

impl From<&ColumnChunkMetadata> for PageMetaData {
    fn from(column: &ColumnChunkMetadata) -> Self {
        Self {
            column_start: column.byte_range().start,
            num_values: column.num_values(),
            compression: column.compression(),
            descriptor: column.descriptor().descriptor.clone(),
        }
    }
}

/// A fallible [`Iterator`] of [`CompressedDataPage`]. This iterator reads pages back
/// to back until all pages have been consumed.
///
/// The pages from this iterator always have [`None`] [`crate::parquet::page::CompressedDataPage::selected_rows()`] since
/// filter pushdown is not supported without a
/// pre-computed [page index](https://github.com/apache/parquet-format/blob/master/PageIndex.md).
pub struct PageReader {
    // The source
    reader: MemReader,

    // Compression codec shared by all pages of this column chunk.
    compression: Compression,

    // The number of values we have seen so far.
    seen_num_values: i64,

    // The number of total values in this column chunk.
    total_num_values: i64,

    // Descriptor of the column being read; cloned into each produced data page.
    descriptor: Descriptor,

    // The currently allocated buffer.
    pub(crate) scratch: Vec<u8>,

    // Maximum page size (compressed or uncompressed) to limit allocations
    max_page_size: usize,
}

impl PageReader {
    /// Returns a new [`PageReader`].
    ///
    /// It assumes that the reader has been `sought` (`seek`) to the beginning of `column`.
    /// The parameter `max_page_size` caps the size of any page header or page body this
    /// reader will accept; larger pages fail with [`ParquetError::WouldOverAllocate`].
    pub fn new(
        reader: MemReader,
        column: &ColumnChunkMetadata,
        scratch: Vec<u8>,
        max_page_size: usize,
    ) -> Self {
        Self::new_with_page_meta(reader, column.into(), scratch, max_page_size)
    }

    /// Create a new [`PageReader`] with [`PageMetaData`].
    ///
    /// It assumes that the reader has been `sought` (`seek`) to the beginning of `column`.
    pub fn new_with_page_meta(
        reader: MemReader,
        reader_meta: PageMetaData,
        scratch: Vec<u8>,
        max_page_size: usize,
    ) -> Self {
        Self {
            reader,
            total_num_values: reader_meta.num_values,
            compression: reader_meta.compression,
            seen_num_values: 0,
            descriptor: reader_meta.descriptor,
            scratch,
            max_page_size,
        }
    }

    /// Returns the reader and this Readers' interval buffer
    pub fn into_inner(self) -> (MemReader, Vec<u8>) {
        (self.reader, self.scratch)
    }

    /// The total number of values in this column chunk, as reported by the metadata.
    pub fn total_num_values(&self) -> usize {
        debug_assert!(self.total_num_values >= 0);
        self.total_num_values as usize
    }

    /// Reads the leading dictionary page of this column chunk, if there is one.
    ///
    /// Returns `Ok(None)` when the chunk is empty or its first page is not a dictionary
    /// page; in the latter case the reader is rewound so the page iterator still sees
    /// that first page.
    pub fn read_dict(&mut self) -> ParquetResult<Option<CompressedDictPage>> {
        // If there are no pages, we cannot check if the first page is a dictionary page. Just
        // return the fact there is no dictionary page.
        if self.reader.remaining_len() == 0 {
            return Ok(None);
        }

        // a dictionary page exists iff the first data page is not at the start of
        // the column
        let seek_offset = self.reader.position();
        let page_header = read_page_header(&mut self.reader, self.max_page_size)?;
        let page_type = page_header.type_.try_into()?;

        if !matches!(page_type, PageType::DictionaryPage) {
            // Not a dictionary page: undo the header read so `next()` starts from it.
            self.reader
                .seek(std::io::SeekFrom::Start(seek_offset as u64))?;
            return Ok(None);
        }

        let read_size: usize = page_header.compressed_page_size.try_into()?;

        // Guard against malformed headers requesting an oversized allocation.
        if read_size > self.max_page_size {
            return Err(ParquetError::WouldOverAllocate);
        }

        let buffer = self.reader.read_slice(read_size);

        if buffer.len() != read_size {
            return Err(ParquetError::oos(
                "The page header reported the wrong page size",
            ));
        }

        finish_page(page_header, buffer, self.compression, &self.descriptor).map(|p| {
            // `finish_page` returns a Dict page whenever the header type is a
            // dictionary page, which was checked above.
            if let CompressedPage::Dict(d) = p {
                Some(d)
            } else {
                unreachable!()
            }
        })
    }
}

impl PageIterator for PageReader {
    /// Exchanges this reader's internal scratch buffer with the caller's buffer.
    fn swap_buffer(&mut self, scratch: &mut Vec<u8>) {
        std::mem::swap(scratch, &mut self.scratch)
    }
}

impl Iterator for PageReader {
    type Item = ParquetResult<CompressedPage>;

    /// Yields the next [`CompressedPage`], or `None` once all values have been read.
    fn next(&mut self) -> Option<Self::Item> {
        // Move the scratch buffer out for the duration of the read.
        let buffer = std::mem::take(&mut self.scratch);
        let maybe_maybe_page = next_page(self).transpose();
        if maybe_maybe_page.is_none() {
            // No page => we take back the buffer so it can be reused (see `swap_buffer`).
            // A plain move suffices; the extra `mem::take` on the local was redundant.
            self.scratch = buffer;
        }
        maybe_maybe_page
    }
}

/// Reads a page header from its Thrift (compact protocol) encoding.
///
/// `max_size` caps how much the Thrift decoder may consume for the header.
pub(super) fn read_page_header(
    reader: &mut MemReader,
    max_size: usize,
) -> ParquetResult<ParquetPageHeader> {
    let mut protocol = TCompactInputProtocol::new(reader, max_size);
    Ok(ParquetPageHeader::read_from_in_protocol(&mut protocol)?)
}

/// This function is lightweight and executes a minimal amount of work so that it is IO bounded.
// Any un-necessary CPU-intensive tasks SHOULD be executed on individual pages.
fn next_page(reader: &mut PageReader) -> ParquetResult<Option<CompressedPage>> {
    // Keep reading until every value in the column chunk has been accounted for.
    if reader.seen_num_values < reader.total_num_values {
        build_page(reader)
    } else {
        Ok(None)
    }
}

/// Reads one page (header followed by its compressed body) from the reader's
/// current position and converts it into a [`CompressedPage`].
///
/// Also advances `reader.seen_num_values` by the value count reported in the page
/// header, which is what drives iterator termination in `next_page`.
pub(super) fn build_page(reader: &mut PageReader) -> ParquetResult<Option<CompressedPage>> {
    let page_header = read_page_header(&mut reader.reader, reader.max_page_size)?;

    reader.seen_num_values += get_page_num_values(&page_header)? as i64;

    let read_size: usize = page_header.compressed_page_size.try_into()?;

    // Refuse pages larger than the configured cap to bound allocations.
    if read_size > reader.max_page_size {
        return Err(ParquetError::WouldOverAllocate);
    }

    let buffer = reader.reader.read_slice(read_size);

    if buffer.len() != read_size {
        return Err(ParquetError::oos(
            "The page header reported the wrong page size",
        ));
    }

    finish_page(page_header, buffer, reader.compression, &reader.descriptor).map(Some)
}

/// Converts a raw page (decoded header + compressed body bytes) into a
/// [`CompressedPage`], dispatching on the page type recorded in the header.
///
/// - [`PageType::DictionaryPage`] becomes [`CompressedPage::Dict`].
/// - [`PageType::DataPage`] / [`PageType::DataPageV2`] become [`CompressedPage::Data`].
///
/// # Errors
/// Returns an out-of-spec error when the header struct matching the page type is
/// missing, and propagates failures converting the header's integer fields.
pub(super) fn finish_page(
    page_header: ParquetPageHeader,
    data: MemSlice,
    compression: Compression,
    descriptor: &Descriptor,
) -> ParquetResult<CompressedPage> {
    let type_ = page_header.type_.try_into()?;
    let uncompressed_page_size = page_header.uncompressed_page_size.try_into()?;

    // Verbose page tracing is opt-in via the `PARQUET_DO_VERBOSE` env var; the
    // lookup happens once per process.
    static DO_VERBOSE: OnceLock<bool> = OnceLock::new();
    let do_verbose = *DO_VERBOSE.get_or_init(|| std::env::var("PARQUET_DO_VERBOSE").is_ok());

    match type_ {
        PageType::DictionaryPage => {
            let dict_header = page_header.dictionary_page_header.as_ref().ok_or_else(|| {
                ParquetError::oos(
                    "The page header type is a dictionary page but the dictionary header is empty",
                )
            })?;

            if do_verbose {
                eprintln!(
                    "Parquet DictPage ( num_values: {}, datatype: {:?} )",
                    dict_header.num_values, descriptor.primitive_type
                );
            }

            let is_sorted = dict_header.is_sorted.unwrap_or(false);

            // move the buffer to `dict_page`
            let page = CompressedDictPage::new(
                CowBuffer::Borrowed(data),
                compression,
                uncompressed_page_size,
                dict_header.num_values.try_into()?,
                is_sorted,
            );

            Ok(CompressedPage::Dict(page))
        },
        PageType::DataPage => {
            let header = page_header.data_page_header.ok_or_else(|| {
                ParquetError::oos(
                    "The page header type is a v1 data page but the v1 data header is empty",
                )
            })?;

            if do_verbose {
                eprintln!(
                    "Parquet DataPageV1 ( num_values: {}, datatype: {:?}, encoding: {:?} )",
                    header.num_values,
                    descriptor.primitive_type,
                    Encoding::try_from(header.encoding).ok()
                );
            }

            Ok(CompressedPage::Data(CompressedDataPage::new_read(
                DataPageHeader::V1(header),
                CowBuffer::Borrowed(data),
                compression,
                uncompressed_page_size,
                descriptor.clone(),
            )))
        },
        PageType::DataPageV2 => {
            let header = page_header.data_page_header_v2.ok_or_else(|| {
                ParquetError::oos(
                    "The page header type is a v2 data page but the v2 data header is empty",
                )
            })?;

            if do_verbose {
                // Use `eprintln!` like the other page types so all verbose
                // diagnostics go to stderr (this branch used `println!` before).
                eprintln!(
                    "Parquet DataPageV2 ( num_values: {}, datatype: {:?}, encoding: {:?} )",
                    header.num_values,
                    descriptor.primitive_type,
                    Encoding::try_from(header.encoding).ok()
                );
            }

            Ok(CompressedPage::Data(CompressedDataPage::new_read(
                DataPageHeader::V2(header),
                CowBuffer::Borrowed(data),
                compression,
                uncompressed_page_size,
                descriptor.clone(),
            )))
        },
    }
}

/// Returns the number of values stored in the data page described by `header`.
///
/// Non-data pages (e.g. dictionary pages) report 0 values.
///
/// # Errors
/// Errors when the type field is not a known [`PageType`], or when the header is a
/// data page but its matching (v1/v2) sub-header is missing.
pub(super) fn get_page_num_values(header: &ParquetPageHeader) -> ParquetResult<i32> {
    let type_ = header.type_.try_into()?;
    Ok(match type_ {
        PageType::DataPage => {
            header
                .data_page_header
                .as_ref()
                .ok_or_else(|| {
                    ParquetError::oos(
                        "The page header type is a v1 data page but the v1 header is empty",
                    )
                })?
                .num_values
        },
        PageType::DataPageV2 => {
            header
                .data_page_header_v2
                .as_ref()
                .ok_or_else(|| {
                    // This previously (incorrectly) reported a "v1 data page".
                    ParquetError::oos(
                        "The page header type is a v2 data page but the v2 header is empty",
                    )
                })?
                .num_values
        },
        // Dictionary/index pages carry no row values.
        _ => 0,
    })
}