fluent_syntax/
unicode.rs

1//! A set of helper functions for unescaping Fluent unicode escape sequences.
2//!
3//! # Unicode
4//!
5//! Fluent supports UTF-8 in all FTL resources, but it also allows
6//! unicode sequences to be escaped in [`String
7//! Literals`](super::ast::InlineExpression::StringLiteral).
8//!
//! Code points are escaped as `\u` followed by exactly four hexadecimal
//! digits, or as `\U` followed by exactly six hexadecimal digits.
11//! ## Example
12//!
13//! ```
14//! use fluent_syntax::unicode::unescape_unicode_to_string;
15//!
16//! assert_eq!(
17//!     unescape_unicode_to_string("Foo \\u5bd2 Bar"),
18//!     "Foo 寒 Bar"
19//! );
20//!
21//! assert_eq!(
22//!     unescape_unicode_to_string("Foo \\U01F68A Bar"),
23//!     "Foo 🚊 Bar"
24//! );
25//! ```
26//!
27//! # Other unescapes
28//!
29//! This also allows for a char `"` to be present inside an FTL string literal,
30//! and for `\` itself to be escaped.
31//!
32//! ## Example
33//!
34//! ```
35//! use fluent_syntax::unicode::unescape_unicode_to_string;
36//!
37//! assert_eq!(
38//!     unescape_unicode_to_string("Foo \\\" Bar"),
39//!     "Foo \" Bar"
40//! );
41//! assert_eq!(
42//!     unescape_unicode_to_string("Foo \\\\ Bar"),
43//!     "Foo \\ Bar"
44//! );
45//! ```
46use std::borrow::Cow;
47use std::char;
48use std::fmt;
49
/// Character emitted in place of any escape sequence that cannot be decoded
/// (U+FFFD REPLACEMENT CHARACTER).
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Decodes the hexadecimal digits of a unicode escape into a `char`.
///
/// `s` is the run of digits that followed `\u` / `\U`, or `None` when that run
/// could not be sliced out of the input.
///
/// Returns [`UNKNOWN_CHAR`] when `s` is `None`, when it contains anything other
/// than ASCII hexadecimal digits, or when the decoded value is not a Unicode
/// scalar value (a surrogate, or a value above `0x10FFFF`).
fn encode_unicode(s: Option<&str>) -> char {
    // `from_str_radix` tolerates a leading `+`, which is not part of an escape
    // sequence, so the digits are validated before they are parsed.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}
56
57/// Unescapes to a writer without allocating.
58///
59/// ## Example
60///
61/// ```
62/// use fluent_syntax::unicode::unescape_unicode;
63///
64/// let mut s = String::new();
65/// unescape_unicode(&mut s, "Foo \\U01F60A Bar");
66/// assert_eq!(s, "Foo 😊 Bar");
67/// ```
68pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result
69where
70    W: fmt::Write,
71{
72    let bytes = input.as_bytes();
73
74    let mut start = 0;
75    let mut ptr = 0;
76
77    while let Some(b) = bytes.get(ptr) {
78        if b != &b'\\' {
79            ptr += 1;
80            continue;
81        }
82        if start != ptr {
83            w.write_str(&input[start..ptr])?;
84        }
85
86        ptr += 1;
87
88        let new_char = match bytes.get(ptr) {
89            Some(b'\\') => '\\',
90            Some(b'"') => '"',
91            Some(u @ b'u') | Some(u @ b'U') => {
92                let seq_start = ptr + 1;
93                let len = if u == &b'u' { 4 } else { 6 };
94                ptr += len;
95                encode_unicode(input.get(seq_start..seq_start + len))
96            }
97            _ => UNKNOWN_CHAR,
98        };
99        ptr += 1;
100        w.write_char(new_char)?;
101        start = ptr;
102    }
103    if start != ptr {
104        w.write_str(&input[start..ptr])?;
105    }
106    Ok(())
107}
108
109/// Unescapes to a `Cow<str>` optionally allocating.
110///
111/// ## Example
112///
113/// ```
114/// use fluent_syntax::unicode::unescape_unicode_to_string;
115///
116/// assert_eq!(
117///     unescape_unicode_to_string("Foo \\U01F60A Bar"),
118///     "Foo 😊 Bar"
119/// );
120/// ```
121pub fn unescape_unicode_to_string(input: &str) -> Cow<str> {
122    let bytes = input.as_bytes();
123    let mut result = Cow::from(input);
124
125    let mut ptr = 0;
126
127    while let Some(b) = bytes.get(ptr) {
128        if b != &b'\\' {
129            if let Cow::Owned(ref mut s) = result {
130                s.push(*b as char);
131            }
132            ptr += 1;
133            continue;
134        }
135
136        if let Cow::Borrowed(_) = result {
137            result = Cow::from(&input[0..ptr]);
138        }
139
140        ptr += 1;
141
142        let new_char = match bytes.get(ptr) {
143            Some(b'\\') => '\\',
144            Some(b'"') => '"',
145            Some(u @ b'u') | Some(u @ b'U') => {
146                let start = ptr + 1;
147                let len = if u == &b'u' { 4 } else { 6 };
148                ptr += len;
149                input
150                    .get(start..(start + len))
151                    .map_or(UNKNOWN_CHAR, |slice| encode_unicode(Some(slice)))
152            }
153            _ => UNKNOWN_CHAR,
154        };
155        result.to_mut().push(new_char);
156        ptr += 1;
157    }
158    result
159}