// polars_io/cloud/polars_object_store.rs
use std::ops::Range;
use std::sync::Arc;

use bytes::Bytes;
use futures::StreamExt;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use polars_error::{to_compute_err, PolarsResult};
use tokio::io::AsyncWriteExt;

use crate::pl_async::{
    self, tune_with_concurrency_budget, with_concurrency_budget, MAX_BUDGET_PER_REQUEST,
};

/// Polars specific wrapper for `Arc<dyn ObjectStore>` that limits the number of
/// concurrent requests for the entire application.
#[derive(Debug, Clone)]
pub struct PolarsObjectStore(Arc<dyn ObjectStore>);
pub type ObjectStorePath = object_store::path::Path;

impl PolarsObjectStore {
    /// Wraps an existing object store handle.
    pub fn new(store: Arc<dyn ObjectStore>) -> Self {
        Self(store)
    }

    /// Fetches the entire object at `path` and returns its contents.
    ///
    /// The request runs through `tune_with_concurrency_budget` with a budget of 1.
    ///
    /// # Errors
    /// Failures from issuing the request or from reading the response body are
    /// converted with `to_compute_err`.
    pub async fn get(&self, path: &Path) -> PolarsResult<Bytes> {
        tune_with_concurrency_budget(1, || async {
            let response = self.0.get(path).await.map_err(to_compute_err)?;
            response.bytes().await.map_err(to_compute_err)
        })
        .await
    }

    /// Fetches the bytes of `path` that fall inside `range`.
    ///
    /// The request runs through `tune_with_concurrency_budget` with a budget of 1.
    ///
    /// # Errors
    /// Store errors are converted with `to_compute_err`.
    pub async fn get_range(&self, path: &Path, range: Range<usize>) -> PolarsResult<Bytes> {
        let fetched = tune_with_concurrency_budget(1, || self.0.get_range(path, range)).await;
        fetched.map_err(to_compute_err)
    }

    /// Fetches several byte ranges of `path`, returning one `Bytes` per entry of
    /// `ranges`.
    ///
    /// The requested concurrency budget is the number of ranges, capped at
    /// `MAX_BUDGET_PER_REQUEST`.
    ///
    /// # Errors
    /// Store errors are converted with `to_compute_err`.
    pub async fn get_ranges(
        &self,
        path: &Path,
        ranges: &[Range<usize>],
    ) -> PolarsResult<Vec<Bytes>> {
        // Saturate instead of wrapping when the count does not fit in `u32`; a
        // plain `as u32` cast could turn a huge count into a tiny budget.
        let budget = u32::try_from(ranges.len())
            .unwrap_or(u32::MAX)
            .min(MAX_BUDGET_PER_REQUEST as u32);

        tune_with_concurrency_budget(budget, || self.0.get_ranges(path, ranges))
            .await
            .map_err(to_compute_err)
    }

    /// Streams the object at `path` into `file`, chunk by chunk, then flushes
    /// `file`.
    ///
    /// The streaming part runs through `tune_with_concurrency_budget` with a
    /// budget of 1 and reports the number of bytes written as a `pl_async::Size`.
    ///
    /// # Errors
    /// Store errors, stream errors, write errors and flush errors are all
    /// converted with `to_compute_err`.
    pub async fn download<F: tokio::io::AsyncWrite + std::marker::Unpin>(
        &self,
        path: &Path,
        file: &mut F,
    ) -> PolarsResult<()> {
        tune_with_concurrency_budget(1, || async {
            let response = self.0.get(path).await.map_err(to_compute_err)?;
            let mut chunks = response.into_stream();

            let mut written = 0usize;
            while let Some(chunk) = chunks.next().await {
                let chunk = chunk.map_err(to_compute_err)?;
                written += chunk.len();
                file.write_all(chunk.as_ref())
                    .await
                    .map_err(to_compute_err)?;
            }

            PolarsResult::Ok(pl_async::Size::from(written as u64))
        })
        .await?;

        // `write_all` may only hand data to an internal buffer or a background
        // operation; flushing makes a failure of the final write surface here
        // rather than being lost when the writer is dropped. The writer is not
        // shut down, so the caller can keep using it.
        file.flush().await.map_err(to_compute_err)
    }

    /// Fetches the metadata of the object at `path`; the result is not memoized.
    ///
    /// A `head` request is tried first. If it fails, a one-byte ranged GET is
    /// attempted and its metadata is returned on success. If both fail, the
    /// error of the original `head` request is reported.
    ///
    /// # Errors
    /// The `head` error is converted with `to_compute_err`.
    pub async fn head(&self, path: &Path) -> PolarsResult<ObjectMeta> {
        with_concurrency_budget(1, || async {
            let head_err = match self.0.head(path).await {
                Ok(meta) => return Ok(meta),
                Err(err) => err,
            };

            // Pre-signed URLs forbid the HEAD method, but the header information
            // can still be retrieved with a ranged GET covering only the first
            // byte (`0..1`).
            let first_byte_only = object_store::GetOptions {
                range: Some((0..1).into()),
                ..Default::default()
            };

            match self.0.get_opts(path, first_byte_only).await {
                Ok(response) => Ok(response.meta),
                // Prefer the original HEAD failure over the fallback's error.
                Err(_) => Err(head_err),
            }
        })
        .await
        .map_err(to_compute_err)
    }
}