gix_url/
parse.rs

1use std::convert::Infallible;
2
3use crate::Scheme;
4use bstr::{BStr, BString, ByteSlice};
5use percent_encoding::percent_decode_str;
6
/// The error returned by [parse()](crate::parse()).
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
    /// The input, or a percent-decoded component of it, is not valid UTF-8.
    ///
    /// `url` holds the offending bytes and `kind` names the flavor of input that was being
    /// parsed when the failure occurred.
    #[error("{} \"{url}\" is not valid UTF-8", kind.as_str())]
    Utf8 {
        url: BString,
        kind: UrlKind,
        source: std::str::Utf8Error,
    },
    /// The `url` crate rejected the input (or, for SCP-like input, the `ssh://` URL
    /// synthesized from its host portion).
    #[error("{} {url:?} can not be parsed as valid URL", kind.as_str())]
    Url {
        url: String,
        kind: UrlKind,
        source: url::ParseError,
    },

    /// The scheme or the portion leading up to the path exceeds the length limit enforced
    /// before the URL is handed to the parser.
    ///
    /// `truncated_url` is a prefix of the input and `len` is the length of the entire input
    /// in bytes.
    #[error("The host portion of the following URL is too long ({} bytes, {len} bytes total): {truncated_url:?}", truncated_url.len())]
    TooLong { truncated_url: BString, len: usize },
    /// The input does not contain a path to a repository.
    #[error("{} \"{url}\" does not specify a path to a repository", kind.as_str())]
    MissingRepositoryPath { url: BString, kind: UrlKind },
    /// The input parsed as a URL that cannot serve as a base, i.e. it is relative.
    #[error("URL {url:?} is relative which is not allowed in this context")]
    RelativeUrl { url: String },
}
31
32impl From<Infallible> for Error {
33    fn from(_: Infallible) -> Self {
34        unreachable!("Cannot actually happen, but it seems there can't be a blanket impl for this")
35    }
36}
37
/// The flavor of input that was being parsed, used to give error messages a fitting subject.
#[derive(Debug, Clone, Copy)]
pub enum UrlKind {
    /// Input containing a `://` separator, i.e. a URL with an explicit scheme.
    Url,
    /// Input of the SCP-like form, with host and path separated by a single `:`.
    Scp,
    /// Input that is treated as a path on the local file system.
    Local,
}

impl UrlKind {
    /// Return the human-readable name of this kind as it appears in error messages.
    fn as_str(&self) -> &'static str {
        match *self {
            Self::Url => "URL",
            Self::Scp => "SCP-like target",
            Self::Local => "local path",
        }
    }
}
58
/// The syntactic category of an input, as determined by [`find_scheme()`].
pub(crate) enum InputScheme {
    /// The input contains `://`; `protocol_end` is the byte offset at which that separator starts.
    Url { protocol_end: usize },
    /// The input is SCP-like; `colon` is the byte offset of the first `:` in the input.
    Scp { colon: usize },
    /// The input is neither of the above and is treated as a local path.
    Local,
}
64
65pub(crate) fn find_scheme(input: &BStr) -> InputScheme {
66    // TODO: url's may only contain `:/`, we should additionally check if the characters used for
67    //       protocol are all valid
68    if let Some(protocol_end) = input.find("://") {
69        return InputScheme::Url { protocol_end };
70    }
71
72    if let Some(colon) = input.find_byte(b':') {
73        // allow user to select files containing a `:` by passing them as absolute or relative path
74        // this is behavior explicitly mentioned by the scp and git manuals
75        let explicitly_local = &input[..colon].contains(&b'/');
76        let dos_driver_letter = cfg!(windows) && input[..colon].len() == 1;
77
78        if !explicitly_local && !dos_driver_letter {
79            return InputScheme::Scp { colon };
80        }
81    }
82
83    InputScheme::Local
84}
85
86pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error> {
87    const MAX_LEN: usize = 1024;
88    let bytes_to_path = input[protocol_end + "://".len()..]
89        .iter()
90        .filter(|b| !b.is_ascii_whitespace())
91        .skip_while(|b| **b == b'/' || **b == b'\\')
92        .position(|b| *b == b'/')
93        .unwrap_or(input.len() - protocol_end);
94    if bytes_to_path > MAX_LEN || protocol_end > MAX_LEN {
95        return Err(Error::TooLong {
96            truncated_url: input[..(protocol_end + "://".len() + MAX_LEN).min(input.len())].into(),
97            len: input.len(),
98        });
99    }
100    let (input, url) = input_to_utf8_and_url(input, UrlKind::Url)?;
101    let scheme = url.scheme().into();
102
103    if matches!(scheme, Scheme::Git | Scheme::Ssh) && url.path().is_empty() {
104        return Err(Error::MissingRepositoryPath {
105            url: input.into(),
106            kind: UrlKind::Url,
107        });
108    }
109
110    if url.cannot_be_a_base() {
111        return Err(Error::RelativeUrl { url: input.to_owned() });
112    }
113
114    Ok(crate::Url {
115        serialize_alternative_form: false,
116        scheme,
117        user: url_user(&url, UrlKind::Url)?,
118        password: url
119            .password()
120            .map(|s| percent_decoded_utf8(s, UrlKind::Url))
121            .transpose()?,
122        host: url.host_str().map(Into::into),
123        port: url.port(),
124        path: url.path().into(),
125    })
126}
127
128fn percent_decoded_utf8(s: &str, kind: UrlKind) -> Result<String, Error> {
129    Ok(percent_decode_str(s)
130        .decode_utf8()
131        .map_err(|err| Error::Utf8 {
132            url: s.into(),
133            kind,
134            source: err,
135        })?
136        .into_owned())
137}
138
139pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
140    let input = input_to_utf8(input, UrlKind::Scp)?;
141
142    // TODO: this incorrectly splits at IPv6 addresses, check for `[]` before splitting
143    let (host, path) = input.split_at(colon);
144    debug_assert_eq!(path.get(..1), Some(":"), "{path} should start with :");
145    let path = &path[1..];
146
147    if path.is_empty() {
148        return Err(Error::MissingRepositoryPath {
149            url: input.to_owned().into(),
150            kind: UrlKind::Scp,
151        });
152    }
153
154    // The path returned by the parsed url often has the wrong number of leading `/` characters but
155    // should never differ in any other way (ssh URLs should not contain a query or fragment part).
156    // To avoid the various off-by-one errors caused by the `/` characters, we keep using the path
157    // determined above and can therefore skip parsing it here as well.
158    let url = url::Url::parse(&format!("ssh://{host}")).map_err(|source| Error::Url {
159        url: input.to_owned(),
160        kind: UrlKind::Scp,
161        source,
162    })?;
163
164    Ok(crate::Url {
165        serialize_alternative_form: true,
166        scheme: url.scheme().into(),
167        user: url_user(&url, UrlKind::Scp)?,
168        password: url
169            .password()
170            .map(|s| percent_decoded_utf8(s, UrlKind::Scp))
171            .transpose()?,
172        host: url.host_str().map(Into::into),
173        port: url.port(),
174        path: path.into(),
175    })
176}
177
178fn url_user(url: &url::Url, kind: UrlKind) -> Result<Option<String>, Error> {
179    if url.username().is_empty() && url.password().is_none() {
180        Ok(None)
181    } else {
182        Ok(Some(percent_decoded_utf8(url.username(), kind)?))
183    }
184}
185
186pub(crate) fn file_url(input: &BStr, protocol_colon: usize) -> Result<crate::Url, Error> {
187    let input = input_to_utf8(input, UrlKind::Url)?;
188    let input_after_protocol = &input[protocol_colon + "://".len()..];
189
190    let Some(first_slash) = input_after_protocol
191        .find('/')
192        .or_else(|| cfg!(windows).then(|| input_after_protocol.find('\\')).flatten())
193    else {
194        return Err(Error::MissingRepositoryPath {
195            url: input.to_owned().into(),
196            kind: UrlKind::Url,
197        });
198    };
199
200    // We cannot use the url crate to parse host and path because it special cases Windows
201    // driver letters. With the url crate an input of `file://x:/path/to/git` is parsed as empty
202    // host and with `x:/path/to/git` as path. This behavior is wrong for Git which only follows
203    // that rule on Windows and parses `x:` as host on Unix platforms. Additionally, the url crate
204    // does not account for Windows special UNC path support.
205
206    // TODO: implement UNC path special case
207    let windows_special_path = if cfg!(windows) {
208        // Inputs created via url::Url::from_file_path contain an additional `/` between the
209        // protocol and the absolute path. Make sure we ignore that first slash character to avoid
210        // producing invalid paths.
211        let input_after_protocol = if first_slash == 0 {
212            &input_after_protocol[1..]
213        } else {
214            input_after_protocol
215        };
216        // parse `file://x:/path/to/git` as explained above
217        if input_after_protocol.chars().nth(1) == Some(':') {
218            Some(input_after_protocol)
219        } else {
220            None
221        }
222    } else {
223        None
224    };
225
226    let host = if windows_special_path.is_some() || first_slash == 0 {
227        // `file:///path/to/git` or a windows special case was triggered
228        None
229    } else {
230        // `file://host/path/to/git`
231        Some(&input_after_protocol[..first_slash])
232    };
233
234    // default behavior on Unix platforms and if no Windows special case was triggered
235    let path = windows_special_path.unwrap_or(&input_after_protocol[first_slash..]);
236
237    Ok(crate::Url {
238        serialize_alternative_form: false,
239        host: host.map(Into::into),
240        ..local(path.into())?
241    })
242}
243
244pub(crate) fn local(input: &BStr) -> Result<crate::Url, Error> {
245    if input.is_empty() {
246        return Err(Error::MissingRepositoryPath {
247            url: input.to_owned(),
248            kind: UrlKind::Local,
249        });
250    }
251
252    Ok(crate::Url {
253        serialize_alternative_form: true,
254        scheme: Scheme::File,
255        password: None,
256        user: None,
257        host: None,
258        port: None,
259        path: input.to_owned(),
260    })
261}
262
263fn input_to_utf8(input: &BStr, kind: UrlKind) -> Result<&str, Error> {
264    std::str::from_utf8(input).map_err(|source| Error::Utf8 {
265        url: input.to_owned(),
266        kind,
267        source,
268    })
269}
270
271fn input_to_utf8_and_url(input: &BStr, kind: UrlKind) -> Result<(&str, url::Url), Error> {
272    let input = input_to_utf8(input, kind)?;
273    url::Url::parse(input)
274        .map(|url| (input, url))
275        .map_err(|source| Error::Url {
276            url: input.to_owned(),
277            kind,
278            source,
279        })
280}