tame_index/
utils.rs

1//! Provides several useful functions for determining the disk location of a
2//! remote registry index
3
4use crate::{Error, InvalidUrl, InvalidUrlError, PathBuf};
5
6pub mod flock;
7#[cfg(feature = "__git")]
8pub mod git;
9
10/// Returns the storage directory (in utf-8) used by Cargo, often known as
11/// `.cargo` or `CARGO_HOME`
12#[inline]
13pub fn cargo_home() -> Result<crate::PathBuf, crate::Error> {
14    Ok(crate::PathBuf::from_path_buf(home::cargo_home()?)?)
15}
16
/// Writes the lowercase hexadecimal form of `input` into `output` and returns
/// the written bytes as a string slice borrowed from `output`
///
/// Each input byte becomes two output characters, high nibble first.
///
/// # Panics
///
/// Panics if `output` is not exactly twice as long as `input`
pub(crate) fn encode_hex<'out, const I: usize, const O: usize>(
    input: &[u8; I],
    output: &'out mut [u8; O],
) -> &'out str {
    const CHARS: &[u8] = b"0123456789abcdef";

    assert_eq!(I * 2, O);

    // Walk the output two bytes at a time, pairing each slot with the input
    // byte that it encodes
    for (pair, &byte) in output.chunks_exact_mut(2).zip(input) {
        pair[0] = CHARS[usize::from(byte >> 4)];
        pair[1] = CHARS[usize::from(byte & 0xf)];
    }

    // SAFETY: every byte of `output` was just overwritten with an entry of
    // `CHARS`, all of which are ASCII hex digits
    #[allow(unsafe_code)]
    unsafe {
        std::str::from_utf8_unchecked(output)
    }
}
38
/// The details for a remote url, as computed by [`url_to_local_dir`]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlDir {
    /// The unique directory name for the url
    pub dir_name: String,
    /// The canonical url for the remote url
    pub canonical: String,
}
46
47/// Canonicalizes a `git+` url the same as cargo
48pub fn canonicalize_url(url: &str) -> Result<String, Error> {
49    let url = url.strip_prefix("git+").unwrap_or(url);
50
51    let scheme_ind = url.find("://").map(|i| i + 3).ok_or_else(|| InvalidUrl {
52        url: url.to_owned(),
53        source: InvalidUrlError::MissingScheme,
54    })?;
55
56    // Could use the Url crate for this, but it's simple enough and we don't
57    // need to deal with every possible url (I hope...)
58    let host = match url[scheme_ind..].find('/') {
59        Some(end) => &url[scheme_ind..scheme_ind + end],
60        None => &url[scheme_ind..],
61    };
62
63    // trim port
64    let host = host.split(':').next().unwrap();
65
66    // cargo special cases github.com for reasons, so do the same
67    let mut canonical = if host == "github.com" {
68        url.to_lowercase()
69    } else {
70        url.to_owned()
71    };
72
73    // Chop off any query params/fragments
74    if let Some(hash) = canonical.rfind('#') {
75        canonical.truncate(hash);
76    }
77
78    if let Some(query) = canonical.rfind('?') {
79        canonical.truncate(query);
80    }
81
82    if canonical.ends_with('/') {
83        canonical.pop();
84    }
85
86    if canonical.ends_with(".git") {
87        canonical.truncate(canonical.len() - 4);
88    }
89
90    Ok(canonical)
91}
92
93/// Converts a url into a relative path and its canonical form
94///
95/// Cargo uses a small algorithm to create unique directory names for any url
96/// so that they can be located in the same root without clashing
97///
98/// This function currently only supports 3 different URL kinds.
99///
100/// * `(?:registry+)?<git registry url>`
101/// * `sparse+<sparse registry url>`
102/// * `git+<git repo url>`
103#[allow(deprecated)]
104pub fn url_to_local_dir(url: &str, stable: bool) -> Result<UrlDir, Error> {
105    use std::hash::{Hash, Hasher, SipHasher};
106
107    // This is extremely irritating, but we need to use usize for the kind, which
108    // impacts the hash calculation, making it different based on pointer size.
109    //
110    // The reason for this is that cargo just uses #[derive(Hash)] for the SourceKind
111    // https://github.com/rust-lang/cargo/blob/88b4b3bcd3bbb66873734d97ae412a6bcf9b75ee/crates/cargo-util-schemas/src/core/source_kind.rs#L4-L5,
112    // which then uses https://doc.rust-lang.org/core/intrinsics/fn.discriminant_value.html
113    // to get the discriminant and add to the hash...and that is pointer width :(
114    //
115    // Note that these are isize instead of usize because contrary to what one
116    // would expect from the automatic discriminant assigned by rustc starting
117    // at 0 and incrementing by 1 each time...it's actually signed, which can
118    // be seen by overriding `Hasher::write_isize` and hashing a discriminant
119    //
120    // This is unfortunately a hard requirement because of https://github.com/rust-lang/rustc-stable-hash/blob/24e9848c89917abca155c8f854118e6d00ad4a30/src/stable_hasher.rs#L263-L299
121    // where it specializes _only_ isize to only write a u8 if the value is less
122    // than 0xff, something that doesn't happen for usize, which of course affects
123    // the calculated hash
124    const GIT_REPO: isize = 0;
125    const GIT_REGISTRY: isize = 2;
126    const SPARSE_REGISTRY: isize = 3;
127
128    // Ensure we have a registry or bare url
129    let (url, scheme_ind, kind) = {
130        let mut scheme_ind = url.find("://").ok_or_else(|| InvalidUrl {
131            url: url.to_owned(),
132            source: InvalidUrlError::MissingScheme,
133        })?;
134
135        let scheme_str = &url[..scheme_ind];
136
137        let (url, kind) = match scheme_str.split_once('+') {
138            Some(("sparse", _)) => (url, SPARSE_REGISTRY),
139            // If there is no scheme modifier, assume git registry, same as cargo
140            None => (url, GIT_REGISTRY),
141            Some(("registry", _)) => {
142                scheme_ind -= 9;
143                (&url[9..], GIT_REGISTRY)
144            }
145            Some(("git", _)) => {
146                scheme_ind -= 4;
147                (&url[4..], GIT_REPO)
148            }
149            Some((_, _)) => {
150                return Err(InvalidUrl {
151                    url: url.to_owned(),
152                    source: InvalidUrlError::UnknownSchemeModifier,
153                }
154                .into());
155            }
156        };
157
158        (url, scheme_ind + 3, kind)
159    };
160
161    let (dir_name, url) = if kind == GIT_REPO {
162        let canonical = canonicalize_url(url)?;
163
164        // For git repo sources, the ident is made up of the last path component
165        // which for most git hosting providers is the name of the repo itself
166        // rather than the other parts of the path that indicate user/org, but
167        // the hash is the hash of the full canonical url so still unique even
168        // for repos with the same name, but different org/user owners
169        let mut dir_name = canonical
170            .split('/')
171            .next_back()
172            .unwrap_or("_empty")
173            .to_owned();
174
175        let hash = if stable {
176            let mut hasher = rustc_stable_hash::StableSipHasher128::new();
177            canonical.hash(&mut hasher);
178            Hasher::finish(&hasher)
179        } else {
180            let mut hasher = SipHasher::new();
181            canonical.hash(&mut hasher);
182            hasher.finish()
183        };
184        let mut raw_ident = [0u8; 16];
185        let ident = encode_hex(&hash.to_le_bytes(), &mut raw_ident);
186
187        dir_name.push('-');
188        dir_name.push_str(ident);
189
190        (dir_name, canonical)
191    } else {
192        let hash = if stable {
193            let mut hasher = rustc_stable_hash::StableSipHasher128::new();
194            kind.hash(&mut hasher);
195            url.hash(&mut hasher);
196            Hasher::finish(&hasher)
197        } else {
198            let mut hasher = SipHasher::new();
199            kind.hash(&mut hasher);
200            url.hash(&mut hasher);
201            hasher.finish()
202        };
203        let mut raw_ident = [0u8; 16];
204        let ident = encode_hex(&hash.to_le_bytes(), &mut raw_ident);
205
206        // Could use the Url crate for this, but it's simple enough and we don't
207        // need to deal with every possible url (I hope...)
208        let host = match url[scheme_ind..].find('/') {
209            Some(end) => &url[scheme_ind..scheme_ind + end],
210            None => &url[scheme_ind..],
211        };
212
213        // trim port
214        let host = host.split(':').next().unwrap();
215        let host = host.split_once('@').map_or(host, |(_user, host)| host);
216
217        (format!("{host}-{ident}"), url.to_owned())
218    };
219
220    Ok(UrlDir {
221        dir_name,
222        canonical: url,
223    })
224}
225
226/// Get the disk location of the specified url, as well as its canonical form
227///
228/// If not specified, the root directory is the user's default cargo home
229pub fn get_index_details(
230    url: &str,
231    root: Option<PathBuf>,
232    stable: bool,
233) -> Result<(PathBuf, String), Error> {
234    let url_dir = url_to_local_dir(url, stable)?;
235
236    let mut path = match root {
237        Some(path) => path,
238        None => cargo_home()?,
239    };
240
241    path.push("registry");
242    path.push("index");
243    path.push(url_dir.dir_name);
244
245    Ok((path, url_dir.canonical))
246}
247
248use std::io;
249
250/// Parses the output of `cargo -V` to get the semver
251///
252/// This handles the 2? cases that I am aware of
253///
254/// 1. Official cargo prints `cargo <semver>(?:-<channel>)? (<sha1[..7]> <date>)`
255/// 2. Non-official builds may drop the additional metadata and just print `cargo <semver>`
256#[inline]
257fn parse_cargo_semver(s: &str) -> Result<semver::Version, Error> {
258    let semver = s.trim().split(' ').nth(1).ok_or_else(|| {
259        io::Error::new(
260            io::ErrorKind::InvalidData,
261            "cargo version information was in an invalid format",
262        )
263    })?;
264
265    Ok(semver.parse()?)
266}
267
268/// Retrieves the current version of cargo being used
269pub fn cargo_version(working_dir: Option<&crate::Path>) -> Result<crate::Version, Error> {
270    let mut cargo = std::process::Command::new(
271        std::env::var_os("CARGO")
272            .as_deref()
273            .unwrap_or(std::ffi::OsStr::new("cargo")),
274    );
275
276    cargo.arg("-V");
277
278    if let Some(wd) = working_dir {
279        cargo.current_dir(wd);
280    }
281
282    cargo.stdout(std::process::Stdio::piped());
283
284    let output = cargo.output()?;
285    if !output.status.success() {
286        return Err(io::Error::new(
287            io::ErrorKind::Other,
288            "failed to request cargo version information",
289        )
290        .into());
291    }
292
293    let stdout = String::from_utf8(output.stdout)
294        .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?;
295
296    parse_cargo_semver(&stdout)
297}
298
#[cfg(test)]
mod test {
    use super::{get_index_details, url_to_local_dir};
    use crate::PathBuf;

    /// Asserts that the index for `url`, rooted in an empty path, is located
    /// at `expected_path` and that the url is returned unchanged
    fn assert_index_location(url: &str, stable: bool, expected_path: &str) {
        let (path, canonical) = get_index_details(url, Some(PathBuf::new()), stable).unwrap();

        assert_eq!(path, PathBuf::from(expected_path));
        assert_eq!(canonical, url);
    }

    #[test]
    #[cfg(all(target_pointer_width = "64", target_endian = "little"))]
    fn canonicalizes_git_urls() {
        // (input url, expected canonical url, expected directory name)
        const CASES: &[(&str, &str, &str)] = &[
            (
                "git+https://github.com/EmbarkStudios/cpal.git?rev=d59b4de#d59b4decf72a96932a1482cc27fe4c0b50c40d32",
                "https://github.com/embarkstudios/cpal",
                "cpal-a7ffd7cabefac714",
            ),
            (
                "git+https://github.com/gfx-rs/genmesh?rev=71abe4d",
                "https://github.com/gfx-rs/genmesh",
                "genmesh-401fe503e87439cc",
            ),
            // For registry urls, even if they come from github, they are _not_ canonicalized
            // and their exact url (other than the registry+ scheme modifier) is used
            // for the hash calculation, as technically URLs are case sensitive, but
            // in practice doesn't matter for connection purposes
            (
                "registry+https://github.com/Rust-Lang/crates.io-index",
                "https://github.com/Rust-Lang/crates.io-index",
                "github.com-016fae53232cc64d",
            ),
            // cargo treats github.com specially (eg lowercasing), but it _always_
            // strips the .git extension if it exists
            (
                "git+https://gitlab.com/gilrs-project/gilrs.git?rev=1bbec17",
                "https://gitlab.com/gilrs-project/gilrs",
                "gilrs-7804d1d6a17891c9",
            ),
            (
                "ssh://git@github.com/rust-lang/crates.io-index.git",
                "ssh://git@github.com/rust-lang/crates.io-index.git",
                "github.com-01dba724c7458575",
            ),
        ];

        for &(url, expected_canonical, expected_dir_name) in CASES {
            let dir = url_to_local_dir(url, false).unwrap();

            assert_eq!(expected_canonical, dir.canonical);
            assert_eq!(expected_dir_name, dir.dir_name);
        }
    }

    #[test]
    #[cfg(all(target_pointer_width = "64", target_endian = "little"))]
    fn matches_cargo() {
        // (index url, expected location relative to the root)
        let cases = [
            (
                crate::CRATES_IO_INDEX,
                "registry/index/github.com-1ecc6299db9ec823",
            ),
            (
                crate::CRATES_IO_HTTP_INDEX,
                "registry/index/index.crates.io-6f17d22bba15001f",
            ),
            // A github hosted registry that is not crates.io
            (
                "https://github.com/EmbarkStudios/cargo-test-index",
                "registry/index/github.com-655148e0a865c9e0",
            ),
            // A registry that is not hosted on github
            (
                "https://dl.cloudsmith.io/public/embark/deny/cargo/index.git",
                "registry/index/dl.cloudsmith.io-955e041deb7d37e6",
            ),
        ];

        for (url, expected_path) in cases {
            assert_index_location(url, false, expected_path);
        }

        // Just verifies that any non git+ or sparse+ url is treated as a git
        // registry for purposes of hashing
        let fake_registry = url_to_local_dir("https://github.com/RustSec/advisory-db", false);

        assert_eq!(
            fake_registry.unwrap().dir_name,
            "github.com-a946fc29ac602819"
        );
    }

    #[test]
    fn matches_cargo_1850() {
        assert_index_location(
            crate::CRATES_IO_HTTP_INDEX,
            true,
            "registry/index/index.crates.io-1949cf8c6b5b557f",
        );
    }

    #[test]
    #[cfg(all(target_pointer_width = "32", target_endian = "little"))]
    fn matches_cargo_32bit() {
        assert_index_location(
            crate::CRATES_IO_HTTP_INDEX,
            false,
            "registry/index/index.crates.io-1cd66030c949c28d",
        );
    }

    #[test]
    fn gets_cargo_version() {
        let minimum = semver::Version::new(1, 70, 0);
        let current = super::cargo_version(None).unwrap();

        assert!(current >= minimum);
    }

    #[test]
    fn parses_cargo_semver() {
        // (output of `cargo -V`, expected version)
        let cases = [
            (
                "cargo 1.71.0 (cfd3bbd8f 2023-06-08)\n",
                semver::Version::new(1, 71, 0),
            ),
            (
                "cargo 1.73.0-nightly (7ac9416d8 2023-07-24)\n",
                "1.73.0-nightly".parse().unwrap(),
            ),
            ("cargo 1.70.0\n", semver::Version::new(1, 70, 0)),
        ];

        for (output, expected) in cases {
            assert_eq!(super::parse_cargo_semver(output).unwrap(), expected);
        }
    }
}