unic_idna/
process.rs

// Copyright 2016 The rust-url developers.
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
11
12use unic_normal::StrNormalForm;
13use unic_ucd_bidi::{bidi_class, BidiClass};
14use unic_ucd_normal::is_combining_mark;
15
16use crate::mapping::Mapping;
17use crate::punycode;
18
/// ASCII prefix (`xn--`) that marks a label as Punycode-encoded.
pub static PUNYCODE_PREFIX: &str = "xn--";
21
22fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
23    match Mapping::of(codepoint) {
24        Mapping::Valid => output.push(codepoint),
25        Mapping::Ignored => {}
26        Mapping::Mapped(slice) => output.push_str(slice),
27        Mapping::Deviation(slice) => {
28            if flags.transitional_processing {
29                output.push_str(slice)
30            } else {
31                output.push(codepoint)
32            }
33        }
34        Mapping::Disallowed => {
35            errors.push(Error::DissallowedCharacter);
36            output.push(codepoint);
37        }
38        Mapping::DisallowedStd3Valid => {
39            if flags.use_std3_ascii_rules {
40                errors.push(Error::DissallowedByStd3AsciiRules);
41            }
42            output.push(codepoint)
43        }
44        Mapping::DisallowedStd3Mapped(slice) => {
45            if flags.use_std3_ascii_rules {
46                errors.push(Error::DissallowedMappedInStd3);
47            }
48            output.push_str(slice)
49        }
50    }
51}
52
53// http://tools.ietf.org/html/rfc5893#section-2
54fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
55    use self::bidi_class::abbr_names::*;
56
57    // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label.  A label
58    // is RTL if it contains at least one character of bidi class R, AL or AN.
59    if !is_bidi_domain {
60        return true;
61    }
62
63    let mut chars = label.chars();
64    let first_char_class = match chars.next() {
65        Some(c) => BidiClass::of(c),
66        None => return true, // empty string
67    };
68
69    match first_char_class {
70        // LTR label
71        L => {
72            // Rule 5
73            while let Some(c) = chars.next() {
74                if !matches!(BidiClass::of(c), L | EN | ES | CS | ET | ON | BN | NSM) {
75                    return false;
76                }
77            }
78
79            // Rule 6
80            // must end in L or EN followed by 0 or more NSM
81            let mut rev_chars = label.chars().rev();
82            let mut last_non_nsm = rev_chars.next();
83            loop {
84                match last_non_nsm {
85                    Some(c) if BidiClass::of(c) == NSM => {
86                        last_non_nsm = rev_chars.next();
87                        continue;
88                    }
89                    _ => {
90                        break;
91                    }
92                }
93            }
94            match last_non_nsm {
95                Some(c) if BidiClass::of(c) == L || BidiClass::of(c) == EN => {}
96                Some(_) => {
97                    return false;
98                }
99                _ => {}
100            }
101        }
102
103        // RTL label
104        R | AL => {
105            let mut found_en = false;
106            let mut found_an = false;
107
108            // Rule 2
109            for c in chars {
110                let char_class = BidiClass::of(c);
111
112                if char_class == EN {
113                    found_en = true;
114                }
115                if char_class == AN {
116                    found_an = true;
117                }
118
119                if !matches!(char_class, R | AL | AN | EN | ES | CS | ET | ON | BN | NSM) {
120                    return false;
121                }
122            }
123
124            // Rule 3
125            let mut rev_chars = label.chars().rev();
126            let mut last = rev_chars.next();
127            loop {
128                // must end in L or EN followed by 0 or more NSM
129                match last {
130                    Some(c) if BidiClass::of(c) == NSM => {
131                        last = rev_chars.next();
132                        continue;
133                    }
134                    _ => {
135                        break;
136                    }
137                }
138            }
139            match last {
140                Some(c) if matches!(BidiClass::of(c), R | AL | EN | AN) => {}
141                _ => {
142                    return false;
143                }
144            }
145
146            // Rule 4
147            if found_an && found_en {
148                return false;
149            }
150        }
151
152        // Rule 1: Should start with L or R/AL
153        _ => {
154            return false;
155        }
156    }
157
158    true
159}
160
161// https://www.unicode.org/reports/tr46/#Validity_Criteria
162#[cfg_attr(feature = "cargo-clippy", allow(if_same_then_else))]
163fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
164    let first_char = label.chars().next();
165
166    if first_char == None {
167        // Empty string, pass
168    }
169    // V1: Must be in NFC form.
170    else if label.nfc().ne(label.chars()) {
171        errors.push(Error::ValidityCriteria);
172    }
173    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
174    //
175    // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
176    // third and fourth positions. But nobody follows this criteria. See the spec issue below:
177    // https://github.com/whatwg/url/issues/53
178    //
179    // TODO: Add *CheckHyphens* flag.
180
181    // V3: neither begin nor end with a U+002D HYPHEN-MINUS
182    else if label.starts_with('-') || label.ends_with('-') {
183        errors.push(Error::ValidityCriteria);
184    }
185    // V4: not contain a U+002E FULL STOP
186    //
187    // Here, label can't contain '.' since the input is from .split('.')
188
189    // V5: not begin with a GC=Mark
190    else if is_combining_mark(first_char.unwrap()) {
191        errors.push(Error::ValidityCriteria);
192    }
193    // V6: Check against Mapping Table
194    else if label.chars().any(|c| match Mapping::of(c) {
195        Mapping::Valid => false,
196        Mapping::Deviation(_) => flags.transitional_processing,
197        Mapping::DisallowedStd3Valid => flags.use_std3_ascii_rules,
198        _ => true,
199    }) {
200        errors.push(Error::ValidityCriteria);
201    }
202    // V7: ContextJ rules
203    //
204    // TODO: Implement rules and add *CheckJoiners* flag.
205
206    // V8: Bidi rules
207    //
208    // TODO: Add *CheckBidi* flag
209    else if !passes_bidi(label, is_bidi_domain) {
210        errors.push(Error::ValidityCriteria);
211    }
212}
213
214// https://www.unicode.org/reports/tr46/#Processing
215fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
216    use self::bidi_class::abbr_names::*;
217
218    let mut mapped = String::new();
219    for c in domain.chars() {
220        map_char(c, flags, &mut mapped, errors)
221    }
222    let normalized: String = mapped.nfc().collect();
223
224    // Find out if it's a Bidi Domain Name
225    //
226    // First, check for literal bidi chars
227    let mut is_bidi_domain = domain
228        .chars()
229        .any(|c| matches!(BidiClass::of(c), R | AL | AN));
230    if !is_bidi_domain {
231        // Then check for punycode-encoded bidi chars
232        for label in normalized.split('.') {
233            if label.starts_with(PUNYCODE_PREFIX) {
234                match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
235                    Some(decoded_label) => {
236                        if decoded_label
237                            .chars()
238                            .any(|c| matches!(BidiClass::of(c), R | AL | AN))
239                        {
240                            is_bidi_domain = true;
241                        }
242                    }
243                    None => {
244                        is_bidi_domain = true;
245                    }
246                }
247            }
248        }
249    }
250
251    let mut validated = String::new();
252    let mut first = true;
253    for label in normalized.split('.') {
254        if !first {
255            validated.push('.');
256        }
257        first = false;
258        if label.starts_with(PUNYCODE_PREFIX) {
259            match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
260                Some(decoded_label) => {
261                    let flags = Flags {
262                        transitional_processing: false,
263                        ..flags
264                    };
265                    validate(&decoded_label, is_bidi_domain, flags, errors);
266                    validated.push_str(&decoded_label)
267                }
268                None => errors.push(Error::PunycodeError),
269            }
270        } else {
271            validate(label, is_bidi_domain, flags, errors);
272            validated.push_str(label)
273        }
274    }
275    validated
276}
277
/// Caller-selected options for the processing and conversion algorithms.
#[derive(Clone, Copy, Debug)]
pub struct Flags {
    /// The *UseSTD3ASCIIRules* flag.
    ///
    /// <https://www.unicode.org/reports/tr46/#UseSTD3ASCIIRules>
    pub use_std3_ascii_rules: bool,

    /// Selects *Transitional_Processing* (`true`) or *Nontransitional Processing* (`false`),
    /// which controls how *Deviation* characters are handled.
    ///
    /// <https://www.unicode.org/reports/tr46/#Conformance>
    pub transitional_processing: bool,

    /// The *VerifyDnsLength* flag: when set, DNS length restrictions are verified.
    ///
    /// <https://www.unicode.org/reports/tr46/#ToASCII>
    pub verify_dns_length: bool,
}
297
/// Individual error kinds recorded during UTS #46 processing.
#[cfg_attr(feature = "cargo-clippy", allow(enum_variant_names))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Error {
    /// A label could not be Punycode-decoded or Punycode-encoded.
    PunycodeError,
    /// A label violated one of the validity criteria.
    ValidityCriteria,
    /// A code point is valid only without STD3 ASCII rules, and those rules are enabled.
    DissallowedByStd3AsciiRules,
    /// A code point is mapped only without STD3 ASCII rules, and those rules are enabled.
    DissallowedMappedInStd3,
    /// A code point has the disallowed mapping status.
    DissallowedCharacter,
    /// The name or one of its labels exceeds the DNS length limits.
    TooLongForDns,
    /// The name or one of its labels is empty.
    TooShortForDns,
}
310
311/// Errors recorded during UTS #46 processing.
312///
313/// This is opaque for now, only indicating the presence of at least one error.
314/// More details may be exposed in the future.
315#[derive(Debug, Eq, PartialEq)]
316pub struct Errors(Vec<Error>);
317
318/// <https://www.unicode.org/reports/tr46/#ToASCII>
319pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> {
320    let mut errors = Vec::new();
321    let mut result = String::new();
322    let mut first = true;
323    for label in processing(domain, flags, &mut errors).split('.') {
324        if !first {
325            result.push('.');
326        }
327        first = false;
328        if label.is_ascii() {
329            result.push_str(label);
330        } else {
331            match punycode::encode_str(label) {
332                Some(x) => {
333                    result.push_str(PUNYCODE_PREFIX);
334                    result.push_str(&x);
335                }
336                None => errors.push(Error::PunycodeError),
337            }
338        }
339    }
340
341    if flags.verify_dns_length {
342        let domain = if result.ends_with('.') {
343            &result[..result.len() - 1]
344        } else {
345            &*result
346        };
347        if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
348            errors.push(Error::TooShortForDns)
349        }
350        if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
351            errors.push(Error::TooLongForDns)
352        }
353    }
354    if errors.is_empty() {
355        Ok(result)
356    } else {
357        Err(Errors(errors))
358    }
359}
360
361/// <https://www.unicode.org/reports/tr46/#ToUnicode>
362///
363/// Only `use_std3_ascii_rules` is used in `flags`.
364pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) {
365    flags.transitional_processing = false;
366    let mut errors = Vec::new();
367    let domain = processing(domain, flags, &mut errors);
368    let errors = if errors.is_empty() {
369        Ok(())
370    } else {
371        Err(Errors(errors))
372    };
373    (domain, errors)
374}
375
#[cfg(test)]
mod tests {
    use super::*;

    /// Calls `to_ascii` with nontransitional processing and STD3 ASCII rules enabled; only the
    /// DNS length verification varies between tests.
    fn strict_to_ascii(domain: &str, verify_dns_length: bool) -> Result<String, Errors> {
        let flags = Flags {
            transitional_processing: false,
            use_std3_ascii_rules: true,
            verify_dns_length,
        };
        to_ascii(domain, flags)
    }

    /// https://github.com/servo/rust-url/issues/373
    #[test]
    fn test_punycode_prefix_with_length_check() {
        let rejected = [
            "xn--",
            "xn---",
            "xn-----",
            "xn--.",
            "xn--...",
            ".xn--",
            "...xn--",
            "xn--.xn--",
            "xn--.example.org",
        ];

        for &domain in rejected.iter() {
            assert!(
                strict_to_ascii(domain, true).is_err(),
                "expected an error for {:?}",
                domain
            );
        }
    }

    /// https://github.com/servo/rust-url/issues/373
    #[test]
    fn test_punycode_prefix_without_length_check() {
        // `Some(ascii)` is the expected successful output; `None` means an error is expected.
        let cases: [(&str, Option<&str>); 9] = [
            ("xn--", Some("")),
            ("xn---", None),
            ("xn-----", None),
            ("xn--.", Some(".")),
            ("xn--...", Some("...")),
            (".xn--", Some(".")),
            ("...xn--", Some("...")),
            ("xn--.xn--", Some(".")),
            ("xn--.example.org", Some(".example.org")),
        ];

        for &(domain, expected) in cases.iter() {
            let actual = strict_to_ascii(domain, false);
            match expected {
                Some(ascii) => assert_eq!(actual, Ok(ascii.to_owned())),
                None => assert!(actual.is_err(), "expected an error for {:?}", domain),
            }
        }
    }

    #[test]
    fn test_v5() {
        // IdnaTest:784 蔏。𑰺
        assert!(is_combining_mark('\u{11C3A}'));

        let rejected = [
            "\u{11C3A}",
            "\u{850f}.\u{11C3A}",
            "\u{850f}\u{ff61}\u{11C3A}",
        ];
        for &domain in rejected.iter() {
            assert!(
                strict_to_ascii(domain, true).is_err(),
                "expected an error for {:?}",
                domain
            );
        }
    }

    #[test]
    fn test_v8_bidi_rules() {
        let accepted = [
            ("abc", "abc"),
            ("123", "123"),
            ("אבּג", "xn--kdb3bdf"),
            ("ابج", "xn--mgbcm"),
            ("abc.ابج", "abc.xn--mgbcm"),
            ("אבּג.ابج", "xn--kdb3bdf.xn--mgbcm"),
        ];
        for &(domain, ascii) in accepted.iter() {
            assert_eq!(strict_to_ascii(domain, true), Ok(ascii.to_owned()));
        }

        // Bidi domain names cannot start with digits
        assert!(strict_to_ascii("0a.\u{05D0}", true).is_err());
        assert!(strict_to_ascii("0à.\u{05D0}", true).is_err());

        // Bidi chars may be punycode-encoded
        assert!(strict_to_ascii("xn--0ca24w", true).is_err());
    }
}