lexical_util/
div128.rs

1//! Optimized division algorithms for u128.
2//!
3//! # Fast Algorithms
4//!
5//! The more optimized algorithms for calculating the divisor constants are
6//! based off of the paper "Division by Invariant Integers Using
7//! Multiplication", by T. Granlund and P. Montgomery, in "Proc. of the
8//! SIGPLAN94 Conference on Programming Language Design and Implementation",
9//! available online [here](https://gmplib.org/~tege/divcnst-pldi94.pdf).
10//!
11//! This approach is derived from the Rust algorithm for formatting 128-bit
12//! values, and therefore is similarly dual-licensed under MIT/Apache-2.0.
13//!
14//! # Fallback Algorithms
15//!
16//! The slower algorithms in this module are derived off of `dtolnay/itoa`
17//! and Rust's compiler-builtins crate. This copies a specific
18//! path of LLVM's `__udivmodti4` intrinsic, which does division/
19//! modulus for u128 in a single step. Rust implements both division
20//! and modulus in terms of this intrinsic, but calls the intrinsic
21//! twice for subsequent division and modulus operations on the same
22//! dividend/divisor, leading to significant performance overhead.
23//!
24//! This module calculates the optimal divisors for each radix,
25//! and exports a general-purpose division algorithm for u128 where
26//! the divisor can fit in a u64. The moderate algorithm is derived from
27//! dtolnay/itoa, which can be found
28//! [here](https://github.com/dtolnay/itoa/blob/master/src/udiv128.rs), which
29//! in turn is derived from Rust's compiler-builtins crate, which can be found
30//! [here](https://github.com/rust-lang-nursery/compiler-builtins/blob/master/src/int/udiv.rs).
31//!
32//! Licensing for these routines is therefore subject to an MIT/Illinois
33//! dual license (a BSD-like license), while the rest of the module is
34//! subject to an MIT/Apache-2.0 dual-license.
35//!
36//! # Generation
37//!
38//! See [`etc/div128.py`] for the script to generate the divisors and the
39//! constants, and the division algorithm.
40//!
41//! [`etc/div128.py`]: https://github.com/Alexhuszagh/rust-lexical/blob/main/lexical-util/etc/div128.py
42
43#![cfg(not(feature = "compact"))]
44#![cfg(feature = "write")]
45
46use crate::assert::debug_assert_radix;
47use crate::mul::mulhi;
48
49/// Calculate a div/remainder algorithm optimized for power-of-two radixes.
50///
51/// This is trivial: the number of digits we process is `64 / log2(radix)`.
52/// Therefore, the `shr` is `log2(radix) * digits`, and the mask is just the
53/// lower `shr` bits of the digits.
54#[inline(always)]
55#[allow(clippy::many_single_char_names)] // reason="mathematical names"
56pub const fn pow2_u128_divrem(n: u128, mask: u64, shr: u32) -> (u128, u64) {
57    let quot = n >> shr;
58    let rem = mask & n as u64;
59    (quot, rem)
60}
61
62/// Fast division/remainder algorithm for u128, without a fast native
63/// approximation.
64#[inline(always)]
65#[allow(clippy::many_single_char_names)] // reason="mathematical names"
66pub fn fast_u128_divrem(
67    n: u128,
68    d: u64,
69    fast: u128,
70    fast_shr: u32,
71    factor: u128,
72    factor_shr: u32,
73) -> (u128, u64) {
74    let quot = if n < fast {
75        ((n >> fast_shr) as u64 / (d >> fast_shr)) as u128
76    } else {
77        mulhi::<u128, u64>(n, factor) >> factor_shr
78    };
79    let rem = (n - quot * d as u128) as u64;
80    (quot, rem)
81}
82
83/// Fast division/remainder algorithm for u128, without a fast native
84/// approximation.
85#[inline(always)]
86#[allow(clippy::many_single_char_names)] // reason="mathematical names"
87pub fn moderate_u128_divrem(n: u128, d: u64, factor: u128, factor_shr: u32) -> (u128, u64) {
88    let quot = mulhi::<u128, u64>(n, factor) >> factor_shr;
89    let rem = (n - quot * d as u128) as u64;
90    (quot, rem)
91}
92
93/// Optimized fallback division/remainder algorithm for u128.
94///
95/// This is because the code generation for u128 divrem is very inefficient
96/// in Rust, calling both `__udivmodti4` twice internally, rather than a single
97/// time.
98///
99/// This is still a fair bit slower than the optimized algorithms described
100/// in the above paper, but this is a suitable fallback when we cannot use
101/// the faster algorithm.
102#[cfg_attr(not(feature = "compact"), inline(always))]
103#[allow(clippy::many_single_char_names)] // reason="mathematical names"
104pub fn slow_u128_divrem(n: u128, d: u64, d_ctlz: u32) -> (u128, u64) {
105    // Ensure we have the correct number of leading zeros passed.
106    debug_assert_eq!(d_ctlz, d.leading_zeros());
107
108    // Optimize if we can divide using u64 first.
109    let high = (n >> 64) as u64;
110    if high == 0 {
111        let low = n as u64;
112        return ((low / d) as u128, low % d);
113    }
114
115    // sr = 1 + u64::BITS + d.leading_zeros() - high.leading_zeros();
116    let sr = 65 + d_ctlz - high.leading_zeros();
117
118    // 1 <= sr <= u64::BITS - 1
119    let mut q: u128 = n << (128 - sr);
120    let mut r: u128 = n >> sr;
121    let mut carry: u64 = 0;
122
123    // Don't use a range because they may generate references to memcpy in
124    // unoptimized code Loop invariants:  r < d; carry is 0 or 1
125    let mut i = 0;
126    while i < sr {
127        i += 1;
128
129        // r:q = ((r:q) << 1) | carry
130        r = (r << 1) | (q >> 127);
131        q = (q << 1) | carry as u128;
132
133        // carry = 0
134        // if r >= d {
135        //     r -= d;
136        //     carry = 1;
137        // }
138        let s = (d as u128).wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
139        carry = (s & 1) as u64;
140        r -= (d as u128) & s as u128;
141    }
142
143    ((q << 1) | carry as u128, r as u64)
144}
145
146/// Calculate the div/remainder of a value based on the radix.
147///
148/// This uses the largest divisor possible for the given size,
149/// and uses various fast-path approximations for different types.
150///
151/// 1. Powers-of-two can be cleanly split into 2 64-bit products.
152/// 2. Division that can be simulated as if by multiplication by a constant.
153/// 3. Cases of 2. with a power-of-two divisor.
154/// 4. Fallback cases.
155///
156/// This returns the quotient and the remainder.
157/// For the number of digits processed, see
158/// [`min_step`](crate::step::min_step).
159#[inline(always)]
160#[allow(clippy::needless_return)] // reason="required based on radix configuration"
161pub fn u128_divrem(n: u128, radix: u32) -> (u128, u64) {
162    debug_assert_radix(radix);
163
164    // NOTE: to avoid branching when w don't need it, we use the compile logic
165
166    #[cfg(feature = "radix")]
167    {
168        return match radix {
169            2 => u128_divrem_2(n),
170            3 => u128_divrem_3(n),
171            4 => u128_divrem_4(n),
172            5 => u128_divrem_5(n),
173            6 => u128_divrem_6(n),
174            7 => u128_divrem_7(n),
175            8 => u128_divrem_8(n),
176            9 => u128_divrem_9(n),
177            10 => u128_divrem_10(n),
178            11 => u128_divrem_11(n),
179            12 => u128_divrem_12(n),
180            13 => u128_divrem_13(n),
181            14 => u128_divrem_14(n),
182            15 => u128_divrem_15(n),
183            16 => u128_divrem_16(n),
184            17 => u128_divrem_17(n),
185            18 => u128_divrem_18(n),
186            19 => u128_divrem_19(n),
187            20 => u128_divrem_20(n),
188            21 => u128_divrem_21(n),
189            22 => u128_divrem_22(n),
190            23 => u128_divrem_23(n),
191            24 => u128_divrem_24(n),
192            25 => u128_divrem_25(n),
193            26 => u128_divrem_26(n),
194            27 => u128_divrem_27(n),
195            28 => u128_divrem_28(n),
196            29 => u128_divrem_29(n),
197            30 => u128_divrem_30(n),
198            31 => u128_divrem_31(n),
199            32 => u128_divrem_32(n),
200            33 => u128_divrem_33(n),
201            34 => u128_divrem_34(n),
202            35 => u128_divrem_35(n),
203            36 => u128_divrem_36(n),
204            _ => unreachable!(),
205        };
206    }
207
208    #[cfg(all(feature = "power-of-two", not(feature = "radix")))]
209    {
210        return match radix {
211            2 => u128_divrem_2(n),
212            4 => u128_divrem_4(n),
213            8 => u128_divrem_8(n),
214            10 => u128_divrem_10(n),
215            16 => u128_divrem_16(n),
216            32 => u128_divrem_32(n),
217            _ => unreachable!(),
218        };
219    }
220
221    #[cfg(not(feature = "power-of-two"))]
222    {
223        return u128_divrem_10(n);
224    }
225}
226
227// AUTO-GENERATED
228// These functions were auto-generated by `etc/div128.py`.
229// Do not edit them unless there is a good reason to.
230// Preferably, edit the source code to generate the constants.
231//
232// The seemingly magical values are all derived there, and are explained
233// in the function signatures of the functions they call.
234
235#[inline(always)]
236#[cfg_attr(not(feature = "power-of-two"), allow(dead_code))]
237const fn u128_divrem_2(n: u128) -> (u128, u64) {
238    pow2_u128_divrem(n, 18446744073709551615, 64)
239}
240
241#[inline(always)]
242#[cfg_attr(not(feature = "radix"), allow(dead_code))]
243fn u128_divrem_3(n: u128) -> (u128, u64) {
244    slow_u128_divrem(n, 12157665459056928801, 0)
245}
246
247#[inline(always)]
248#[cfg_attr(not(feature = "power-of-two"), allow(dead_code))]
249const fn u128_divrem_4(n: u128) -> (u128, u64) {
250    pow2_u128_divrem(n, 18446744073709551615, 64)
251}
252
253#[inline(always)]
254#[cfg_attr(not(feature = "radix"), allow(dead_code))]
255fn u128_divrem_5(n: u128) -> (u128, u64) {
256    moderate_u128_divrem(n, 7450580596923828125, 105312291668557186697918027683670432319, 61)
257}
258
259#[inline(always)]
260#[cfg_attr(not(feature = "radix"), allow(dead_code))]
261fn u128_divrem_6(n: u128) -> (u128, u64) {
262    fast_u128_divrem(
263        n,
264        4738381338321616896,
265        309485009821345068724781056,
266        24,
267        165591931273573223021296166324748699891,
268        61,
269    )
270}
271
272#[inline(always)]
273#[cfg_attr(not(feature = "radix"), allow(dead_code))]
274fn u128_divrem_7(n: u128) -> (u128, u64) {
275    moderate_u128_divrem(n, 3909821048582988049, 200683792729517998822275406364627986707, 61)
276}
277
278#[inline(always)]
279#[cfg_attr(not(feature = "power-of-two"), allow(dead_code))]
280const fn u128_divrem_8(n: u128) -> (u128, u64) {
281    pow2_u128_divrem(n, 9223372036854775807, 63)
282}
283
284#[inline(always)]
285#[cfg_attr(not(feature = "radix"), allow(dead_code))]
286fn u128_divrem_9(n: u128) -> (u128, u64) {
287    slow_u128_divrem(n, 12157665459056928801, 0)
288}
289
290#[inline(always)]
291fn u128_divrem_10(n: u128) -> (u128, u64) {
292    fast_u128_divrem(
293        n,
294        10000000000000000000,
295        9671406556917033397649408,
296        19,
297        156927543384667019095894735580191660403,
298        62,
299    )
300}
301
302#[inline(always)]
303#[cfg_attr(not(feature = "radix"), allow(dead_code))]
304fn u128_divrem_11(n: u128) -> (u128, u64) {
305    slow_u128_divrem(n, 5559917313492231481, 1)
306}
307
308#[inline(always)]
309#[cfg_attr(not(feature = "radix"), allow(dead_code))]
310fn u128_divrem_12(n: u128) -> (u128, u64) {
311    slow_u128_divrem(n, 2218611106740436992, 3)
312}
313
314#[inline(always)]
315#[cfg_attr(not(feature = "radix"), allow(dead_code))]
316fn u128_divrem_13(n: u128) -> (u128, u64) {
317    moderate_u128_divrem(n, 8650415919381337933, 181410402513790565292660635782582404765, 62)
318}
319
320#[inline(always)]
321#[cfg_attr(not(feature = "radix"), allow(dead_code))]
322fn u128_divrem_14(n: u128) -> (u128, u64) {
323    fast_u128_divrem(
324        n,
325        2177953337809371136,
326        1208925819614629174706176,
327        16,
328        1407280417134467544760816054546363235,
329        53,
330    )
331}
332
333#[inline(always)]
334#[cfg_attr(not(feature = "radix"), allow(dead_code))]
335fn u128_divrem_15(n: u128) -> (u128, u64) {
336    moderate_u128_divrem(n, 6568408355712890625, 1866504587258795246613513364166764993, 55)
337}
338
339#[inline(always)]
340#[cfg_attr(not(feature = "power-of-two"), allow(dead_code))]
341const fn u128_divrem_16(n: u128) -> (u128, u64) {
342    pow2_u128_divrem(n, 18446744073709551615, 64)
343}
344
345#[inline(always)]
346#[cfg_attr(not(feature = "radix"), allow(dead_code))]
347fn u128_divrem_17(n: u128) -> (u128, u64) {
348    moderate_u128_divrem(n, 2862423051509815793, 68529153692836345537218837732158950089, 59)
349}
350
351#[inline(always)]
352#[cfg_attr(not(feature = "radix"), allow(dead_code))]
353fn u128_divrem_18(n: u128) -> (u128, u64) {
354    fast_u128_divrem(
355        n,
356        6746640616477458432,
357        604462909807314587353088,
358        15,
359        232601011830094623283686247347795155951,
360        62,
361    )
362}
363
364#[inline(always)]
365#[cfg_attr(not(feature = "radix"), allow(dead_code))]
366fn u128_divrem_19(n: u128) -> (u128, u64) {
367    moderate_u128_divrem(n, 15181127029874798299, 25842538415601616733690423925257626679, 60)
368}
369
370#[inline(always)]
371#[cfg_attr(not(feature = "radix"), allow(dead_code))]
372fn u128_divrem_20(n: u128) -> (u128, u64) {
373    fast_u128_divrem(
374        n,
375        1638400000000000000,
376        4951760157141521099596496896,
377        28,
378        239452428260295134118491722992235809941,
379        60,
380    )
381}
382
383#[inline(always)]
384#[cfg_attr(not(feature = "radix"), allow(dead_code))]
385fn u128_divrem_21(n: u128) -> (u128, u64) {
386    moderate_u128_divrem(n, 3243919932521508681, 120939747781233590383781714337497669585, 60)
387}
388
389#[inline(always)]
390#[cfg_attr(not(feature = "radix"), allow(dead_code))]
391fn u128_divrem_22(n: u128) -> (u128, u64) {
392    slow_u128_divrem(n, 6221821273427820544, 1)
393}
394
395#[inline(always)]
396#[cfg_attr(not(feature = "radix"), allow(dead_code))]
397fn u128_divrem_23(n: u128) -> (u128, u64) {
398    moderate_u128_divrem(n, 11592836324538749809, 270731922700393644432243678371210997949, 63)
399}
400
401#[inline(always)]
402#[cfg_attr(not(feature = "radix"), allow(dead_code))]
403fn u128_divrem_24(n: u128) -> (u128, u64) {
404    fast_u128_divrem(
405        n,
406        876488338465357824,
407        10141204801825835211973625643008,
408        39,
409        55950381945266105153185943557606235389,
410        57,
411    )
412}
413
414#[inline(always)]
415#[cfg_attr(not(feature = "radix"), allow(dead_code))]
416fn u128_divrem_25(n: u128) -> (u128, u64) {
417    moderate_u128_divrem(n, 1490116119384765625, 131640364585696483372397534604588040399, 59)
418}
419
420#[inline(always)]
421#[cfg_attr(not(feature = "radix"), allow(dead_code))]
422fn u128_divrem_26(n: u128) -> (u128, u64) {
423    fast_u128_divrem(
424        n,
425        2481152873203736576,
426        151115727451828646838272,
427        13,
428        316239166637962178669658228673482425689,
429        61,
430    )
431}
432
433#[inline(always)]
434#[cfg_attr(not(feature = "radix"), allow(dead_code))]
435fn u128_divrem_27(n: u128) -> (u128, u64) {
436    slow_u128_divrem(n, 4052555153018976267, 2)
437}
438
439#[inline(always)]
440#[cfg_attr(not(feature = "radix"), allow(dead_code))]
441fn u128_divrem_28(n: u128) -> (u128, u64) {
442    fast_u128_divrem(
443        n,
444        6502111422497947648,
445        1237940039285380274899124224,
446        26,
447        241348591538561183926479953354701294803,
448        62,
449    )
450}
451
452#[inline(always)]
453#[cfg_attr(not(feature = "radix"), allow(dead_code))]
454fn u128_divrem_29(n: u128) -> (u128, u64) {
455    moderate_u128_divrem(n, 10260628712958602189, 152941450056053853841698190746050519297, 62)
456}
457
458#[inline(always)]
459#[cfg_attr(not(feature = "radix"), allow(dead_code))]
460fn u128_divrem_30(n: u128) -> (u128, u64) {
461    slow_u128_divrem(n, 15943230000000000000, 0)
462}
463
464#[inline(always)]
465#[cfg_attr(not(feature = "radix"), allow(dead_code))]
466fn u128_divrem_31(n: u128) -> (u128, u64) {
467    moderate_u128_divrem(n, 787662783788549761, 124519929891402176328714857711808162537, 58)
468}
469
470#[inline(always)]
471#[cfg_attr(not(feature = "power-of-two"), allow(dead_code))]
472const fn u128_divrem_32(n: u128) -> (u128, u64) {
473    pow2_u128_divrem(n, 1152921504606846975, 60)
474}
475
476#[inline(always)]
477#[cfg_attr(not(feature = "radix"), allow(dead_code))]
478fn u128_divrem_33(n: u128) -> (u128, u64) {
479    slow_u128_divrem(n, 1667889514952984961, 3)
480}
481
482#[inline(always)]
483#[cfg_attr(not(feature = "radix"), allow(dead_code))]
484fn u128_divrem_34(n: u128) -> (u128, u64) {
485    fast_u128_divrem(
486        n,
487        2386420683693101056,
488        75557863725914323419136,
489        12,
490        328792707121977505492535302517672775183,
491        61,
492    )
493}
494
495#[inline(always)]
496#[cfg_attr(not(feature = "radix"), allow(dead_code))]
497fn u128_divrem_35(n: u128) -> (u128, u64) {
498    moderate_u128_divrem(n, 3379220508056640625, 116097442450503652080238494022501325491, 60)
499}
500
501#[inline(always)]
502#[cfg_attr(not(feature = "radix"), allow(dead_code))]
503fn u128_divrem_36(n: u128) -> (u128, u64) {
504    fast_u128_divrem(
505        n,
506        4738381338321616896,
507        309485009821345068724781056,
508        24,
509        165591931273573223021296166324748699891,
510        61,
511    )
512}