crates_index/
sparse.rs

1use std::io;
2use std::path::{Path, PathBuf};
3
4use crate::dirs::{
5    crate_name_to_relative_path, local_path_and_canonical_url_with_hash_kind, HashKind, DEFAULT_HASHER_KIND,
6};
7use crate::{path_max_byte_len, Crate, Error, IndexConfig, SparseIndex};
8
/// The default URL of the crates.io HTTP index, see [`SparseIndex::from_url`] and [`SparseIndex::new_cargo_default`]
///
/// The `sparse+` prefix is intentionally part of the value: the
/// [`SparseIndex`] constructors taking a URL reject anything that does not
/// start with `sparse+http`.
pub const URL: &str = "sparse+https://index.crates.io/";
11
12impl SparseIndex {
13    /// Creates a view over the sparse HTTP index from a provided URL, opening
14    /// the same location on disk that Cargo uses for that registry index's
15    /// metadata and cache.
16    ///
17    /// Note this function takes the `CARGO_HOME` environment variable into account
18    #[inline]
19    pub fn from_url(url: &str) -> Result<Self, Error> {
20        Self::from_url_with_hash_kind(url, &DEFAULT_HASHER_KIND)
21    }
22
23    /// Like [`Self::from_url`] but accepts an explicit [`HashKind`] for determining the crates index path.
24    #[inline]
25    pub fn from_url_with_hash_kind(url: &str, hash_kind: &HashKind) -> Result<Self, Error> {
26        Self::with_path_and_hash_kind(home::cargo_home()?, url, hash_kind)
27    }
28
29    /// Creates an index for the default crates.io registry, using the same
30    /// disk location as Cargo itself.
31    ///
32    /// This is the recommended way to access the crates.io sparse index.
33    ///
34    /// Note this function takes the `CARGO_HOME` environment variable into account
35    #[inline]
36    pub fn new_cargo_default() -> Result<Self, Error> {
37        Self::from_url(URL)
38    }
39
40    /// Creates a view over the sparse HTTP index from the provided URL, rooted
41    /// at the specified location
42    #[inline]
43    pub fn with_path(cargo_home: impl AsRef<Path>, url: impl AsRef<str>) -> Result<Self, Error> {
44        Self::with_path_and_hash_kind(cargo_home, url, &DEFAULT_HASHER_KIND)
45    }
46
47    /// Like [`Self::with_path`] but accepts an explicit [`HashKind`] for determining the crates index path.
48    #[inline]
49    pub fn with_path_and_hash_kind(
50        cargo_home: impl AsRef<Path>,
51        url: impl AsRef<str>,
52        hash_kind: &HashKind,
53    ) -> Result<Self, Error> {
54        let url = url.as_ref();
55        // It is required to have the sparse+ scheme modifier for sparse urls as
56        // they are part of the short ident hash calculation done by cargo
57        if !url.starts_with("sparse+http") {
58            return Err(Error::Url(url.to_owned()));
59        }
60
61        let (path, url) = local_path_and_canonical_url_with_hash_kind(url, Some(cargo_home.as_ref()), hash_kind)?;
62        Ok(Self::at_path(path, url))
63    }
64
65    /// Creates a view over the sparse HTTP index at the exact specified path
66    #[inline]
67    #[must_use]
68    pub fn at_path(path: PathBuf, mut url: String) -> Self {
69        if !url.ends_with('/') {
70            url.push('/');
71        }
72        Self { path, url }
73    }
74
75    /// Get the global configuration of the index. There are no guarantees around freshness,
76    /// and if the config is not available, no fetch will be performed.
77    pub fn index_config(&self) -> Result<IndexConfig, Error> {
78        let path = self.path.join("config.json");
79        let bytes = std::fs::read(path).map_err(Error::Io)?;
80
81        serde_json::from_slice(&bytes).map_err(Error::Json)
82    }
83
84    /// Reads a crate from the local cache of the index. There are no guarantees around freshness,
85    /// and if the crate is not known in the cache, no fetch will be performed.
86    pub fn crate_from_cache(&self, name: &str) -> Result<Crate, Error> {
87        let cache_path = self
88            .cache_path(name)
89            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "bad name"))?;
90
91        let cache_bytes = std::fs::read(&cache_path)
92            .map_err(|e| io::Error::new(e.kind(), format!("{}: `{}`", e, cache_path.display())))?;
93        Ok(Crate::from_cache_slice(&cache_bytes, None)?)
94    }
95
96    /// The HTTP url of the index
97    #[inline]
98    #[must_use]
99    pub fn url(&self) -> &str {
100        self.url.strip_prefix("sparse+").unwrap_or(&self.url)
101    }
102
103    /// Get the URL that can be used to fetch the index entry for the specified
104    /// crate
105    ///
106    /// The body of a successful response for the returned URL can be parsed
107    /// via [`Crate::from_slice`]
108    #[inline]
109    #[must_use]
110    pub fn crate_url(&self, name: &str) -> Option<String> {
111        let rel_path = crate_name_to_relative_path(name, Some('/'))?;
112        Some(format!("{}{rel_path}", self.url()))
113    }
114
115    /// Gets the full path to the cache file for the specified crate
116    fn cache_path(&self, name: &str) -> Option<PathBuf> {
117        let rel_path = crate_name_to_relative_path(name, None)?;
118
119        // avoid realloc on each push
120        let mut cache_path = PathBuf::with_capacity(path_max_byte_len(&self.path) + 8 + rel_path.len());
121        cache_path.push(&self.path);
122        cache_path.push(".cache");
123        cache_path.push(rel_path);
124
125        Some(cache_path)
126    }
127
128    /// Reads the version of the cache entry for the specified crate, if it exists
129    ///
130    /// The version is of the form `key:value`, where, currently, the key is either
131    /// `etag` or `last-modified`
132    #[cfg(feature = "sparse")]
133    fn read_cache_version(&self, name: &str) -> Option<String> {
134        let cache_path = self.cache_path(name)?;
135        let bytes = std::fs::read(cache_path).ok()?;
136
137        const CURRENT_CACHE_VERSION: u8 = 3;
138        const CURRENT_INDEX_FORMAT_VERSION: u32 = 2;
139
140        let (&first_byte, rest) = bytes.split_first()?;
141
142        if first_byte != CURRENT_CACHE_VERSION {
143            return None;
144        }
145
146        let index_v_bytes = rest.get(..4)?;
147        let index_v = u32::from_le_bytes(index_v_bytes.try_into().unwrap());
148        if index_v != CURRENT_INDEX_FORMAT_VERSION {
149            return None;
150        }
151        let rest = &rest[4..];
152
153        let version = crate::split(rest, 0)
154            .next()
155            .and_then(|version| std::str::from_utf8(version).ok().map(String::from));
156
157        version
158    }
159
160    #[cfg(feature = "sparse")]
161    fn make_request(&self, url: &str, cache_version: Option<&str>) -> Result<http::request::Builder, Error> {
162        use http::header;
163
164        let mut req = http::Request::get(url).version(http::Version::HTTP_2);
165
166        {
167            let headers = req.headers_mut().unwrap();
168
169            // AFAICT this does not affect responses at the moment, but could in the future
170            // if there are changes
171            headers.insert("cargo-protocol", header::HeaderValue::from_static("version=1"));
172            // All index entries are just files with lines of JSON
173            headers.insert(header::ACCEPT, header::HeaderValue::from_static("text/plain"));
174            // We need to accept both identity and gzip, as otherwise cloudfront will
175            // always respond to requests with strong etag's, which will differ from
176            // cache entries generated by cargo
177            headers.insert(
178                header::ACCEPT_ENCODING,
179                header::HeaderValue::from_static("gzip,identity"),
180            );
181
182            // If we have a local cache entry, include its version with the
183            // appropriate header, this allows the server to respond with a
184            // cached, or even better, empty response if its version matches
185            // the local one making the request/response loop basically free
186            if let Some(cache_version) = cache_version {
187                if let Some((key, value)) = cache_version.split_once(':') {
188                    if let Ok(value) = header::HeaderValue::from_str(value.trim()) {
189                        if key == header::ETAG {
190                            headers.insert(header::IF_NONE_MATCH, value);
191                        } else if key == header::LAST_MODIFIED {
192                            headers.insert(header::IF_MODIFIED_SINCE, value);
193                        } else {
194                            // We could error here, but that's kind of pointless
195                            // since the response will be sent in full if we haven't
196                            // specified one of the above headers. Though it does
197                            // potentially indicate something weird is going on
198                        }
199                    }
200                }
201            }
202        }
203
204        Ok(req)
205    }
206
207    /// Creates an HTTP request that can be sent via your HTTP client of choice
208    /// to retrieve the config for this index.
209    ///
210    /// See [`Self::parse_config_response()`] processing the response from the remote
211    /// index.
212    ///
213    /// It is highly recommended to assume HTTP/2 when making requests to remote
214    /// indices, at least crates.io.
215    #[cfg(feature = "sparse")]
216    pub fn make_config_request(&self) -> Result<http::request::Builder, Error> {
217        self.make_request(&format!("{}config.json", self.url()), None)
218    }
219
220    /// Creates an HTTP request that can be sent via your HTTP client of choice
221    /// to retrieve the current metadata for the specified crate `namw`.
222    ///
223    /// See [`Self::parse_cache_response()`] processing the response from the remote
224    /// index.
225    ///
226    /// It is highly recommended to assume HTTP/2 when making requests to remote
227    /// indices, at least crates.io.
228    #[cfg(feature = "sparse")]
229    pub fn make_cache_request(&self, name: &str) -> Result<http::request::Builder, Error> {
230        self.make_request(
231            &self
232                .crate_url(name)
233                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "crate name is invalid"))?,
234            self.read_cache_version(name).as_deref(),
235        )
236    }
237
238    /// Process the response to a request created by [`Self::make_config_request()`].
239    ///
240    /// If `write_config` is `true`, write the configuration to disk after parsing it.
241    /// Note that the write operation may fail, and as opposed to the similar parameter
242    /// in [`Self::parse_cache_response()`], write errors will not be ignored.
243    ///
244    /// Note that the `response` from sparse HTTP indices, at least crates.io, may
245    /// send responses with `gzip` compression, it is your responsibility to
246    /// decompress it before sending to this function.
247    #[cfg(feature = "sparse")]
248    pub fn parse_config_response(
249        &self,
250        response: http::Response<Vec<u8>>,
251        write_config: bool,
252    ) -> Result<IndexConfig, Error> {
253        use http::StatusCode;
254        let (parts, body) = response.into_parts();
255
256        match parts.status {
257            StatusCode::OK => {
258                let res = serde_json::from_slice(&body).map_err(Error::Json);
259                if write_config {
260                    let path = self.path.join("config.json");
261                    std::fs::create_dir_all(path.parent().unwrap())?;
262                    std::fs::write(&path, &body)?;
263                }
264                res
265            }
266            StatusCode::UNAUTHORIZED => {
267                Err(io::Error::new(io::ErrorKind::PermissionDenied, "the request was not authorized").into())
268            }
269            StatusCode::NOT_FOUND => {
270                Err(io::Error::new(io::ErrorKind::NotFound, "config.json not found in registry").into())
271            }
272            other => Err(io::Error::new(
273                io::ErrorKind::Unsupported,
274                format!(
275                    "the server responded with status code '{other}', which is not supported in the current protocol"
276                ),
277            )
278            .into()),
279        }
280    }
281
282    /// Process the response to a request created by [`Self::make_cache_request`]
283    ///
284    /// This handles both the scenario where the local cache is missing the specified
285    /// crate, or it is out of date, as well as the local entry being up to date
286    /// and can just be read from disk
287    ///
288    /// You may specify whether an updated index entry is written locally to the
289    /// cache or not
290    ///
291    /// Note that responses from sparse HTTP indices, at least crates.io, may
292    /// send responses with `gzip` compression, it is your responsibility to
293    /// decompress it before sending to this function
294    #[cfg(feature = "sparse")]
295    pub fn parse_cache_response(
296        &self,
297        name: &str,
298        response: http::Response<Vec<u8>>,
299        write_cache_entry: bool,
300    ) -> Result<Option<Crate>, Error> {
301        use http::{header, StatusCode};
302        let (parts, body) = response.into_parts();
303
304        match parts.status {
305            // The server responded with the full contents of the index entry
306            StatusCode::OK => {
307                let krate = Crate::from_slice(&body)?;
308
309                if write_cache_entry {
310                    // The same as cargo, prefer etag over last-modified
311                    let version = if let Some(etag) = parts.headers.get(header::ETAG) {
312                        etag.to_str().ok().map(|etag| format!("{}: {etag}", header::ETAG))
313                    } else if let Some(lm) = parts.headers.get(header::LAST_MODIFIED) {
314                        lm.to_str().ok().map(|lm| format!("{}: {lm}", header::LAST_MODIFIED))
315                    } else {
316                        None
317                    };
318
319                    let version = version.unwrap_or_else(|| "Unknown".to_owned());
320
321                    // This should always succeed, but no need to panic or fail
322                    if let Some(cache_path) = self.cache_path(name) {
323                        if std::fs::create_dir_all(cache_path.parent().unwrap()).is_ok() {
324                            // It's unfortunate if this fails for some reason, but
325                            // not writing the cache entry shouldn't stop the user
326                            // from getting the crate's metadata
327                            let _ = krate.write_cache_entry(&cache_path, &version);
328                        }
329                    }
330                }
331
332                Ok(Some(krate))
333            }
334            // The local cache entry is up to date with the latest entry on the
335            // server, we can just return the local one
336            StatusCode::NOT_MODIFIED => self.crate_from_cache(name).map(Option::Some),
337            // The server requires authorization but the user didn't provide it
338            StatusCode::UNAUTHORIZED => {
339                Err(io::Error::new(io::ErrorKind::PermissionDenied, "the request was not authorized").into())
340            }
341            // The crate does not exist, or has been removed
342            StatusCode::NOT_FOUND | StatusCode::GONE | StatusCode::UNAVAILABLE_FOR_LEGAL_REASONS => Ok(None),
343            other => Err(io::Error::new(
344                io::ErrorKind::Unsupported,
345                format!(
346                    "the server responded with status code '{other}', which is not supported in the current protocol"
347                ),
348            )
349            .into()),
350        }
351    }
352}
353
#[cfg(test)]
#[cfg(feature = "sparse")]
mod tests {
    use crate::SparseIndex;
    use http::header;

    // curl -v -H 'accept-encoding: gzip,identity' https://index.crates.io/cr/at/crates-index
    const CRATES_INDEX_INDEX_ENTRY: &[u8] = include_bytes!("../tests/fixtures/crates-index.txt");

    /// Opens the crates.io sparse index rooted at the fixture cargo home
    /// shipped with the test suite.
    #[inline]
    fn crates_io() -> SparseIndex {
        let manifest_dir = std::env::var_os("CARGO_MANIFEST_DIR").unwrap();
        let cargo_home = std::path::Path::new(&manifest_dir).join("tests/fixtures/sparse_registry_cache/cargo_home");

        SparseIndex::with_path(cargo_home, crate::sparse::URL).unwrap()
    }

    // Validates that a valid cache entry is written if the index entry has been
    // modified
    #[test]
    fn writes_cache_entry() {
        let index = crates_io();

        // Start from a clean slate so the assertion below proves the entry
        // was written by this test run
        let cache_path = index.cache_path("crates-index").unwrap();
        if cache_path.exists() {
            std::fs::remove_file(&cache_path).expect("failed to remove existing crates-index cache file");
        }

        let builder = http::Response::builder()
            .status(http::StatusCode::OK)
            .header(header::ETAG, "W/\"7fbfc422231ec53a9283f2eb2fb4f459\"");
        let response = builder.body(CRATES_INDEX_INDEX_ENTRY.to_vec()).unwrap();

        let parsed = index.parse_cache_response("crates-index", response, true /* write cache entry */);
        let http_krate = parsed.unwrap().unwrap();
        assert!(cache_path.is_file(), "the cache entry was indeed written");

        let cache_krate = index.crate_from_cache("crates-index").unwrap();

        let http_versions = http_krate.versions().iter();
        let cache_versions = cache_krate.versions().iter();
        for (http, cache) in http_versions.zip(cache_versions) {
            assert_eq!(http.version(), cache.version());
        }
    }
}
401}