sonic_number/
lib.rs

1mod arch;
2mod common;
3mod decimal;
4mod float;
5mod lemire;
6mod slow;
7mod table;
8
9use self::{common::BiasedFp, float::RawFloat, table::POWER_OF_FIVE_128};
10use crate::arch::simd_str2int;
11
12const FLOATING_LONGEST_DIGITS: usize = 17;
13const F64_BITS: u32 = 64;
14const F64_SIG_BITS: u32 = 52;
15const F64_SIG_FULL_BITS: u32 = 53;
16const F64_EXP_BIAS: i32 = 1023;
17const F64_SIG_MASK: u64 = 0x000F_FFFF_FFFF_FFFF;
18
19#[derive(Debug)]
20pub enum ParserNumber {
21    Unsigned(u64),
22    /// Always less than zero.
23    Signed(i64),
24    /// Always finite.
25    Float(f64),
26}
27
28#[derive(Debug)]
29pub enum Error {
30    InvalidNumber,
31    FloatMustBeFinite,
32}
33
34macro_rules! match_digit {
35    ($data:expr, $i:expr, $pattern:pat) => {
36        $i < $data.len() && matches!($data[$i], $pattern)
37    };
38}
39
40macro_rules! is_digit {
41    ($data:expr, $i:expr) => {
42        $i < $data.len() && $data[$i].is_ascii_digit()
43    };
44}
45
46macro_rules! digit {
47    ($data:expr, $i:expr) => {
48        ($data[$i] - b'0') as u64
49    };
50}
51
52macro_rules! check_digit {
53    ($data:expr, $i:expr) => {
54        if !($i < $data.len() && $data[$i].is_ascii_digit()) {
55            return Err(Error::InvalidNumber);
56        }
57    };
58}
59
60#[inline(always)]
61fn parse_exponent(data: &[u8], index: &mut usize) -> Result<i32, Error> {
62    let mut exponent: i32 = 0;
63    let mut negative = false;
64
65    if *index >= data.len() {
66        return Err(Error::InvalidNumber);
67    }
68
69    match data[*index] {
70        b'+' => *index += 1,
71        b'-' => {
72            negative = true;
73            *index += 1;
74        }
75        _ => {}
76    }
77
78    check_digit!(data, *index);
79    while exponent < 1000 && is_digit!(data, *index) {
80        exponent = digit!(data, *index) as i32 + exponent * 10;
81        *index += 1;
82    }
83    while is_digit!(data, *index) {
84        *index += 1;
85    }
86    if negative {
87        exponent = -exponent;
88    }
89    Ok(exponent)
90}
91
92const POW10_UINT: [u64; 18] = [
93    1,
94    10,
95    100,
96    1000,
97    10000,
98    100000,
99    1000000,
100    10000000,
101    100000000,
102    1000000000,
103    10000000000,
104    100000000000,
105    1000000000000,
106    10000000000000,
107    100000000000000,
108    1000000000000000,
109    10000000000000000,
110    100000000000000000,
111];
112
113// parse at most 16 digits for fraction, record the exponent.
114// because we calcaute at least the first significant digit when both normal or subnormal float
115// points
116#[inline(always)]
117fn parse_number_fraction(
118    data: &[u8],
119    index: &mut usize,
120    significant: &mut u64,
121    exponent: &mut i32,
122    mut need: isize,
123    dot_pos: usize,
124) -> Result<bool, Error> {
125    debug_assert!(need < FLOATING_LONGEST_DIGITS as isize);
126
127    // native implement:
128    // while need > 0 && is_digit!(data, *index) {
129    //     *significant = *significant * 10 + digit!(data, *index);
130    //     *index += 1;
131    //     need -= 1;
132    // }
133    if need > 0 {
134        if data.len() - *index >= 16 {
135            let (frac, ndigits) = unsafe { simd_str2int(&data[*index..], need as usize) };
136            *significant = *significant * POW10_UINT[ndigits] + frac;
137            *index += ndigits;
138        } else {
139            while need > 0 && is_digit!(data, *index) {
140                *significant = *significant * 10 + digit!(data, *index);
141                *index += 1;
142                need -= 1;
143            }
144        }
145    }
146
147    *exponent -= *index as i32 - dot_pos as i32;
148    let mut trunc = false;
149    while is_digit!(data, *index) {
150        trunc = true;
151        *index += 1;
152    }
153
154    if match_digit!(data, *index, b'e' | b'E') {
155        *index += 1;
156        *exponent += parse_exponent(data, &mut *index)?;
157    }
158    Ok(trunc)
159}
160
161#[inline(always)]
162pub fn parse_number(data: &[u8], index: &mut usize, negative: bool) -> Result<ParserNumber, Error> {
163    let mut significant: u64 = 0;
164    let mut exponent: i32 = 0;
165    let mut trunc = false;
166    let raw_num = &data[*index..];
167
168    if match_digit!(data, *index, b'0') {
169        *index += 1;
170
171        if *index >= data.len() || !matches!(data[*index], b'.' | b'e' | b'E') {
172            // view -0 as float number
173            if negative {
174                return Ok(ParserNumber::Float(0.0));
175            }
176            return Ok(ParserNumber::Unsigned(0));
177        }
178
179        // deal with 0e123 or 0.000e123
180        match data[*index] {
181            b'.' => {
182                *index += 1;
183                let dot_pos = *index;
184                check_digit!(data, *index);
185                while match_digit!(data, *index, b'0') {
186                    *index += 1;
187                }
188                // special case: 0.000e123
189                if match_digit!(data, *index, b'e' | b'E') {
190                    *index += 1;
191                    if match_digit!(data, *index, b'-' | b'+') {
192                        *index += 1;
193                    }
194                    check_digit!(data, *index);
195                    while is_digit!(data, *index) {
196                        *index += 1;
197                    }
198                    return Ok(ParserNumber::Float(0.0));
199                }
200
201                // we calculate the first digit here for two reasons:
202                // 1. fastpath for small float number
203                // 2. we only need parse at most 16 digits in parse_number_fraction
204                // and it is friendly for simd
205                if !is_digit!(data, *index) {
206                    return Ok(ParserNumber::Float(0.0));
207                }
208
209                significant = digit!(data, *index);
210                *index += 1;
211
212                if is_digit!(data, *index) {
213                    let need = FLOATING_LONGEST_DIGITS as isize - 1;
214                    trunc = parse_number_fraction(
215                        data,
216                        index,
217                        &mut significant,
218                        &mut exponent,
219                        need,
220                        dot_pos,
221                    )?;
222                } else {
223                    exponent -= *index as i32 - dot_pos as i32;
224                    if match_digit!(data, *index, b'e' | b'E') {
225                        *index += 1;
226                        exponent += parse_exponent(data, &mut *index)?;
227                    }
228                }
229            }
230            b'e' | b'E' => {
231                *index += 1;
232                if match_digit!(data, *index, b'-' | b'+') {
233                    *index += 1;
234                }
235                check_digit!(data, *index);
236                while is_digit!(data, *index) {
237                    *index += 1;
238                }
239                return Ok(ParserNumber::Float(0.0));
240            }
241            _ => unreachable!("unreachable branch in parse_number_unchecked"),
242        }
243    } else {
244        // parse significant digits
245        let digit_start = *index;
246        while is_digit!(data, *index) {
247            // assume most number is not overflow here. When it overflow, we will check digits count
248            // and fallback into the slow path.
249            significant = significant
250                .wrapping_mul(10)
251                .wrapping_add(digit!(data, *index));
252            *index += 1;
253        }
254        let mut digits_cnt = *index - digit_start;
255        if digits_cnt == 0 {
256            return Err(Error::InvalidNumber);
257        }
258
259        // slow path for too long integer
260        if digits_cnt > 19 {
261            *index = digit_start;
262            significant = 0;
263            digits_cnt = 0;
264            while is_digit!(data, *index) && digits_cnt < 19 {
265                significant = significant * 10 + digit!(data, *index);
266                digits_cnt += 1;
267                *index += 1;
268            }
269
270            // overflow for u64 sig, mark as truncated
271            while is_digit!(data, *index) {
272                exponent += 1;
273                *index += 1;
274                trunc = true;
275            }
276        }
277
278        // TODO: fix special case like `43332000001000000003888e-4`.
279        // it should parse as `4.3332000001000003e18`.
280        if match_digit!(data, *index, b'e' | b'E') {
281            // parse exponent
282            *index += 1;
283            exponent += parse_exponent(data, index)?;
284        } else if match_digit!(data, *index, b'.') {
285            *index += 1;
286            check_digit!(data, *index);
287            let dot_pos = *index;
288
289            // parse fraction
290            let need = FLOATING_LONGEST_DIGITS as isize - digits_cnt as isize;
291            trunc =
292                parse_number_fraction(data, index, &mut significant, &mut exponent, need, dot_pos)?;
293        } else {
294            // parse integer, all parse has finished.
295            if exponent == 0 {
296                if negative {
297                    if significant > (1u64 << 63) {
298                        return Ok(ParserNumber::Float(-(significant as f64)));
299                    } else {
300                        // if significant is 0x8000_0000_0000_0000, it will overflow here.
301                        // so, we must use wrapping_sub here.
302                        return Ok(ParserNumber::Signed(0_i64.wrapping_sub(significant as i64)));
303                    }
304                } else {
305                    return Ok(ParserNumber::Unsigned(significant));
306                }
307            } else if exponent == 1 {
308                // now we get 20 digits, it maybe overflow for uint64
309                let last = digit!(data, *index - 1);
310                let (out, ov0) = significant.overflowing_mul(10);
311                let (out, ov1) = out.overflowing_add(last);
312                if !ov0 && !ov1 {
313                    // negative must be overflow here.
314                    significant = out;
315                    if negative {
316                        return Ok(ParserNumber::Float(-(significant as f64)));
317                    } else {
318                        return Ok(ParserNumber::Unsigned(significant));
319                    }
320                }
321            }
322            trunc = true;
323        }
324    }
325
326    // raw_num is pass-through for fallback parsing logic
327    parse_float(significant, exponent, negative, trunc, raw_num)
328}
329
330#[inline(always)]
331fn parse_float(
332    significant: u64,
333    exponent: i32,
334    negative: bool,
335    trunc: bool,
336    raw_num: &[u8],
337) -> Result<ParserNumber, Error> {
338    // parse double fast
339    if significant >> 52 == 0 && (-22..=(22 + 15)).contains(&exponent) {
340        if let Some(mut float) = parse_float_fast(exponent, significant) {
341            if negative {
342                float = -float;
343            }
344            return Ok(ParserNumber::Float(float));
345        }
346    }
347
348    if !trunc && exponent > (-308 + 1) && exponent < (308 - 20) {
349        if let Some(raw) = parse_floating_normal_fast(exponent, significant) {
350            let mut float = f64::from_u64_bits(raw);
351            if negative {
352                float = -float;
353            }
354            return Ok(ParserNumber::Float(float));
355        }
356    }
357
358    // If significant digits were truncated, then we can have rounding error
359    // only if `mantissa + 1` produces a different result. We also avoid
360    // redundantly using the Eisel-Lemire algorithm if it was unable to
361    // correctly round on the first pass.
362    let exponent = exponent as i64;
363    let mut fp = lemire::compute_float::<f64>(exponent, significant);
364    if trunc && fp.e >= 0 && fp != lemire::compute_float::<f64>(exponent, significant + 1) {
365        fp.e = -1;
366    }
367
368    // Unable to correctly round the float using the Eisel-Lemire algorithm.
369    // Fallback to a slower, but always correct algorithm.
370    if fp.e < 0 {
371        fp = slow::parse_long_mantissa::<f64>(raw_num);
372    }
373
374    let mut float = biased_fp_to_float::<f64>(fp);
375    if negative {
376        float = -float;
377    }
378
379    // check inf for float
380    if float.is_infinite() {
381        return Err(Error::FloatMustBeFinite);
382    }
383    Ok(ParserNumber::Float(float))
384}
385
386// This function is modified from yyjson
387#[inline(always)]
388fn parse_floating_normal_fast(exp10: i32, man: u64) -> Option<u64> {
389    let (mut hi, lo, hi2, add, bits);
390    let mut exp2: i32;
391    let mut exact = false;
392    let idx = exp10 + 342;
393    let sig2_ext = POWER_OF_FIVE_128[idx as usize].1;
394    let sig2 = POWER_OF_FIVE_128[idx as usize].0;
395
396    let mut lz = man.leading_zeros();
397    let sig1 = man << lz;
398    exp2 = ((217706 * exp10 - 4128768) >> 16) - lz as i32;
399
400    (lo, hi) = lemire::full_multiplication(sig1, sig2);
401
402    bits = hi & ((1u64 << (64 - 54 - 1)) - 1);
403    if bits.wrapping_sub(1) < ((1u64 << (64 - 54 - 1)) - 2) {
404        exact = true;
405    } else {
406        (_, hi2) = lemire::full_multiplication(sig1, sig2_ext);
407        // not need warring overflow here
408        add = lo.wrapping_add(hi2);
409        if add + 1 > 1u64 {
410            let carry = add < lo || add < hi2;
411            hi += carry as u64;
412            exact = true;
413        }
414    }
415
416    if exact {
417        lz = if hi < (1u64 << 63) { 1 } else { 0 };
418        hi <<= lz;
419        exp2 -= lz as i32;
420        exp2 += 64;
421
422        let round_up = (hi & (1u64 << (64 - 54))) > 0;
423        hi = hi.wrapping_add(if round_up { 1u64 << (64 - 54) } else { 0 });
424
425        if hi < (1u64 << (64 - 54)) {
426            hi = 1u64 << 63;
427            exp2 += 1;
428        }
429
430        hi >>= F64_BITS - F64_SIG_FULL_BITS;
431        exp2 += F64_BITS as i32 - F64_SIG_FULL_BITS as i32 + F64_SIG_BITS as i32;
432        exp2 += F64_EXP_BIAS;
433        let raw = ((exp2 as u64) << F64_SIG_BITS) | (hi & F64_SIG_MASK);
434        return Some(raw);
435    }
436    None
437}
438
439#[inline(always)]
440/// Converts a `BiasedFp` to the closest machine float type.
441fn biased_fp_to_float<T: RawFloat>(x: BiasedFp) -> T {
442    let mut word = x.f;
443    word |= (x.e as u64) << T::MANTISSA_EXPLICIT_BITS;
444    T::from_u64_bits(word)
445}
446
447#[inline(always)]
448fn parse_float_fast(exp10: i32, significant: u64) -> Option<f64> {
449    let mut d = significant as f64;
450    if exp10 > 0 {
451        if exp10 > 22 {
452            d *= POW10_FLOAT[exp10 as usize - 22];
453            if (-1e15..=1e15).contains(&d) {
454                Some(d * POW10_FLOAT[22])
455            } else {
456                None
457            }
458        } else {
459            Some(d * POW10_FLOAT[exp10 as usize])
460        }
461    } else {
462        Some(d / POW10_FLOAT[(-exp10) as usize])
463    }
464}
465
466const POW10_FLOAT: [f64; 23] = [
467    /* <= the connvertion to double is not exact when less than 1 => */ 1e-000, 1e+001,
468    1e+002, 1e+003, 1e+004, 1e+005, 1e+006, 1e+007, 1e+008, 1e+009, 1e+010, 1e+011, 1e+012, 1e+013,
469    1e+014, 1e+015, 1e+016, 1e+017, 1e+018, 1e+019, 1e+020, 1e+021,
470    1e+022, /* <= the connvertion to double is not exact when larger,  => */
471];
472
473#[cfg(test)]
474mod test {
475    use crate::{parse_number, ParserNumber};
476
477    fn test_parse_ok(input: &str, expect: f64) {
478        assert_eq!(input.parse::<f64>().unwrap(), expect);
479
480        let mut data = input.as_bytes().to_vec();
481        data.push(b' ');
482        let mut index = 0;
483        let num = parse_number(&data, &mut index, false).unwrap();
484        assert!(
485            matches!(num, ParserNumber::Float(f) if f == expect),
486            "parsed is {:?} failed num is {}",
487            num,
488            input
489        );
490        assert_eq!(data[index], b' ', "failed num is {}", input);
491    }
492
493    #[test]
494    fn test_parse_float() {
495        test_parse_ok("0.0", 0.0);
496        test_parse_ok("0.01", 0.01);
497        test_parse_ok("0.1", 0.1);
498        test_parse_ok("0.12", 0.12);
499        test_parse_ok("0.123", 0.123);
500        test_parse_ok("0.1234", 0.1234);
501        test_parse_ok("0.12345", 0.12345);
502        test_parse_ok("0.123456", 0.123456);
503        test_parse_ok("0.1234567", 0.1234567);
504        test_parse_ok("0.12345678", 0.12345678);
505        test_parse_ok("0.123456789", 0.123456789);
506        test_parse_ok("0.1234567890", 0.1234567890);
507        test_parse_ok("0.10000000149011612", 0.10000000149011612);
508        test_parse_ok("0.06411743306171047", 0.06411743306171047);
509
510        test_parse_ok("0e-1", 0e-1);
511        test_parse_ok("0e+1000000", 0e+1000000);
512        test_parse_ok("0.001e-1", 0.001e-1);
513        test_parse_ok("0.001e+123", 0.001e+123);
514        test_parse_ok(
515            "0.000000000000000000000000001e+123",
516            0.000000000000000000000000001e+123,
517        );
518
519        test_parse_ok("1.0", 1.0);
520        test_parse_ok("1350.0", 1350.0);
521        test_parse_ok("1.10000000149011612", 1.1000000014901161);
522
523        test_parse_ok("1e0", 1e0);
524        test_parse_ok("1.0e0", 1.0e0);
525        test_parse_ok("1.0e+0", 1.0e+0);
526        test_parse_ok("1.001e-123", 1.001e-123);
527        test_parse_ok("10000000149011610000.0e-123", 1.000_000_014_901_161e-104);
528        test_parse_ok(
529            "10000000149011612123.001e-123",
530            1.000_000_014_901_161_2e-104,
531        );
532        test_parse_ok("33333333333333333333", 3.333333333333333e19);
533        test_parse_ok("135e-12", 135e-12);
534
535        // test truncated float number without dot
536        test_parse_ok("12448139190673828122020e-47", 1.244813919067383e-25);
537        test_parse_ok(
538            "3469446951536141862700000000000000000e-62",
539            3.469446951536142e-26,
540        );
541    }
542}