c2_chacha/
guts.rs

1#[cfg(feature = "rustcrypto_api")]
2pub use cipher::generic_array;
3
4pub use ppv_lite86::Machine;
5use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext};
6
/// Size in bytes of one ChaCha output block.
pub(crate) const BLOCK: usize = 64;
/// [`BLOCK`] expressed as a `u64`.
pub(crate) const BLOCK64: u64 = BLOCK as u64;
/// Base-2 logarithm of [`BUFBLOCKS`].
const LOG2_BUFBLOCKS: u64 = 2;
/// Number of blocks that fit in a [`BUFSZ`]-byte buffer.
const BUFBLOCKS: u64 = 1u64 << LOG2_BUFBLOCKS;
/// Size in bytes of the multi-block buffer, expressed as a `u64`.
pub(crate) const BUFSZ64: u64 = BUFBLOCKS * BLOCK64;
/// Size in bytes of the multi-block buffer.
pub(crate) const BUFSZ: usize = BUFSZ64 as usize;
13
14/// Parameters of a ChaCha stream, including fixed parameters and current position.
15#[derive(Clone, PartialEq, Eq)]
16pub struct ChaCha {
17    pub(crate) b: vec128_storage,
18    pub(crate) c: vec128_storage,
19    pub(crate) d: vec128_storage,
20}
21
/// The four rows of ChaCha working state, generic over the vector type `V` that holds each row.
///
/// `V` may be a single 4-word vector (one block) or a wider vector carrying several blocks.
#[derive(Clone, Eq, PartialEq)]
pub struct State<V> {
    /// First row (initialised from the ChaCha constants by the refill functions).
    pub(crate) a: V,
    /// Second row.
    pub(crate) b: V,
    /// Third row.
    pub(crate) c: V,
    /// Fourth row (counter and nonce words).
    pub(crate) d: V,
}
30
31#[inline(always)]
32pub(crate) fn round<V: ArithOps + BitOps32>(mut x: State<V>) -> State<V> {
33    x.a += x.b;
34    x.d = (x.d ^ x.a).rotate_each_word_right16();
35    x.c += x.d;
36    x.b = (x.b ^ x.c).rotate_each_word_right20();
37    x.a += x.b;
38    x.d = (x.d ^ x.a).rotate_each_word_right24();
39    x.c += x.d;
40    x.b = (x.b ^ x.c).rotate_each_word_right25();
41    x
42}
43
44#[inline(always)]
45pub(crate) fn diagonalize<V: LaneWords4>(mut x: State<V>) -> State<V> {
46    // Since b has the critical data dependency, avoid rotating b to hide latency.
47    //
48    // The order of these statements is important for performance on pre-AVX2 Intel machines, which
49    // are throughput-bound and operating near their superscalar limits during refill_wide. The
50    // permutations here and in undiagonalize have been found in testing on Nehalem to be optimal.
51    x.a = x.a.shuffle_lane_words1230();
52    x.c = x.c.shuffle_lane_words3012();
53    x.d = x.d.shuffle_lane_words2301();
54    x
55}
56
57#[inline(always)]
58pub(crate) fn undiagonalize<V: LaneWords4>(mut x: State<V>) -> State<V> {
59    // The order of these statements is magic. See comment in diagonalize.
60    x.c = x.c.shuffle_lane_words1230();
61    x.d = x.d.shuffle_lane_words2301();
62    x.a = x.a.shuffle_lane_words3012();
63    x
64}
65
66impl ChaCha {
67    pub fn new(key: &[u8; 32], nonce: &[u8]) -> Self {
68        let ctr_nonce = [
69            0,
70            if nonce.len() == 12 {
71                read_u32le(&nonce[0..4])
72            } else {
73                0
74            },
75            read_u32le(&nonce[nonce.len() - 8..nonce.len() - 4]),
76            read_u32le(&nonce[nonce.len() - 4..]),
77        ];
78        let key0 = [
79            read_u32le(&key[0..4]),
80            read_u32le(&key[4..8]),
81            read_u32le(&key[8..12]),
82            read_u32le(&key[12..16]),
83        ];
84        let key1 = [
85            read_u32le(&key[16..20]),
86            read_u32le(&key[20..24]),
87            read_u32le(&key[24..28]),
88            read_u32le(&key[28..32]),
89        ];
90
91        ChaCha {
92            b: key0.into(),
93            c: key1.into(),
94            d: ctr_nonce.into(),
95        }
96    }
97
98    #[inline(always)]
99    fn pos64<M: Machine>(&self, m: M) -> u64 {
100        let d: M::u32x4 = m.unpack(self.d);
101        ((d.extract(1) as u64) << 32) | d.extract(0) as u64
102    }
103
104    /// Set 64-bit block count, affecting next refill.
105    #[inline(always)]
106    pub(crate) fn seek64<M: Machine>(&mut self, m: M, blockct: u64) {
107        let d: M::u32x4 = m.unpack(self.d);
108        self.d = d
109            .insert((blockct >> 32) as u32, 1)
110            .insert(blockct as u32, 0)
111            .into();
112    }
113
114    /// Set 32-bit block count, affecting next refill.
115    #[inline(always)]
116    pub(crate) fn seek32<M: Machine>(&mut self, m: M, blockct: u32) {
117        let d: M::u32x4 = m.unpack(self.d);
118        self.d = d.insert(blockct, 0).into();
119    }
120
121    /// Produce output from the current state.
122    #[inline(always)]
123    fn output_narrow<M: Machine>(&mut self, m: M, x: State<M::u32x4>, out: &mut [u8; BLOCK]) {
124        let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
125        (x.a + k).write_le(&mut out[0..16]);
126        (x.b + m.unpack(self.b)).write_le(&mut out[16..32]);
127        (x.c + m.unpack(self.c)).write_le(&mut out[32..48]);
128        (x.d + m.unpack(self.d)).write_le(&mut out[48..64]);
129    }
130
131    /// Add one to the block counter (no overflow check).
132    #[inline(always)]
133    fn inc_block_ct<M: Machine>(&mut self, m: M) {
134        let mut pos = self.pos64(m);
135        let d0: M::u32x4 = m.unpack(self.d);
136        pos += 1;
137        let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
138        self.d = d1.into();
139    }
140
141    /// Produce 4 blocks of output, advancing the state
142    #[inline(always)]
143    pub fn refill4(&mut self, drounds: u32, out: &mut [u8; BUFSZ]) {
144        refill_wide(self, drounds, out)
145    }
146
147    /// Produce a block of output, advancing the state
148    #[inline(always)]
149    pub fn refill(&mut self, drounds: u32, out: &mut [u8; BLOCK]) {
150        refill_narrow(self, drounds, out)
151    }
152
153    #[inline(always)]
154    pub(crate) fn refill_rounds(&mut self, drounds: u32) -> State<vec128_storage> {
155        refill_narrow_rounds(self, drounds)
156    }
157
158    #[inline]
159    pub fn set_stream_param(&mut self, param: u32, value: u64) {
160        let mut d: [u32; 4] = self.d.into();
161        let p0 = ((param << 1) | 1) as usize;
162        let p1 = (param << 1) as usize;
163        d[p0] = (value >> 32) as u32;
164        d[p1] = value as u32;
165        self.d = d.into();
166    }
167
168    #[inline]
169    pub fn get_stream_param(&self, param: u32) -> u64 {
170        let d: [u32; 4] = self.d.into();
171        let p0 = ((param << 1) | 1) as usize;
172        let p1 = (param << 1) as usize;
173        ((d[p0] as u64) << 32) | d[p1] as u64
174    }
175
176    /// Return whether rhs represents the same stream, irrespective of current 32-bit position.
177    #[inline]
178    pub fn stream32_eq(&self, rhs: &Self) -> bool {
179        let self_d: [u32; 4] = self.d.into();
180        let rhs_d: [u32; 4] = rhs.d.into();
181        self.b == rhs.b
182            && self.c == rhs.c
183            && self_d[3] == rhs_d[3]
184            && self_d[2] == rhs_d[2]
185            && self_d[1] == rhs_d[1]
186    }
187
188    /// Return whether rhs represents the same stream, irrespective of current 64-bit position.
189    #[inline]
190    pub fn stream64_eq(&self, rhs: &Self) -> bool {
191        let self_d: [u32; 4] = self.d.into();
192        let rhs_d: [u32; 4] = rhs.d.into();
193        self.b == rhs.b && self.c == rhs.c && self_d[3] == rhs_d[3] && self_d[2] == rhs_d[2]
194    }
195}
196
// Portable variant: rebuilds the 64-bit position from words 0 and 1 with scalar arithmetic,
// so it does not depend on the platform's byte order.
#[inline(always)]
#[cfg(target_endian = "big")]
fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 {
    let low = d0.extract(0) as u64;
    let high = d0.extract(1) as u64;
    let advanced = ((high << 32) | low).wrapping_add(i);
    d0.insert((advanced >> 32) as u32, 1)
        .insert(advanced as u32, 0)
}
// Builds the four counter/nonce rows for positions pos, pos+1, pos+2 and pos+3 (wrapping),
// using scalar arithmetic on words 0 and 1.
#[inline(always)]
#[cfg(target_endian = "big")]
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
    let d0: Mach::u32x4 = m.unpack(d);
    let base = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
    // Row for the position `offset` blocks after `base`; the upper two words stay as in `d0`.
    let row_at = |offset: u64| {
        let pos = base.wrapping_add(offset);
        d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0)
    };
    Mach::u32x4x4::from_lanes([d0, row_at(1), row_at(2), row_at(3)])
}
218
219// Pos is packed into the state vectors as a little-endian u64,
220// so on LE platforms we can use native vector ops to increment it.
221#[inline(always)]
222#[cfg(target_endian = "little")]
223fn add_pos<Mach: Machine>(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 {
224    let d0: Mach::u64x2 = m.unpack(d.into());
225    let incr = m.vec([i, 0]);
226    m.unpack((d0 + incr).into())
227}
228#[inline(always)]
229#[cfg(target_endian = "little")]
230fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
231    let d0: Mach::u64x2 = m.unpack(d);
232    let incr = Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]);
233    m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into())
234}
235
236#[allow(clippy::many_single_char_names)]
237#[inline(always)]
238fn refill_wide_impl<Mach: Machine>(
239    m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u8; BUFSZ],
240) {
241    let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
242    let b = m.unpack(state.b);
243    let c = m.unpack(state.c);
244    let mut x = State {
245        a: Mach::u32x4x4::from_lanes([k, k, k, k]),
246        b: Mach::u32x4x4::from_lanes([b, b, b, b]),
247        c: Mach::u32x4x4::from_lanes([c, c, c, c]),
248        d: d0123(m, state.d),
249    };
250    for _ in 0..drounds {
251        x = round(x);
252        x = undiagonalize(round(diagonalize(x)));
253    }
254    let kk = Mach::u32x4x4::from_lanes([k, k, k, k]);
255    let sb = m.unpack(state.b);
256    let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]);
257    let sc = m.unpack(state.c);
258    let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]);
259    let sd = d0123(m, state.d);
260    let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd);
261    results.0.write_le(&mut out[0..64]);
262    results.1.write_le(&mut out[64..128]);
263    results.2.write_le(&mut out[128..192]);
264    results.3.write_le(&mut out[192..256]);
265    state.d = add_pos(m, sd.to_lanes()[0], 4).into();
266}
267
268dispatch!(m, Mach, {
269    fn refill_wide(state: &mut ChaCha, drounds: u32, out: &mut [u8; BUFSZ]) {
270        refill_wide_impl(m, state, drounds, out);
271    }
272});
273
274// Refill the buffer from a single-block round, updating the block count.
275dispatch_light128!(m, Mach, {
276    fn refill_narrow(state: &mut ChaCha, drounds: u32, out: &mut [u8; BLOCK]) {
277        let x = refill_narrow_rounds(state, drounds);
278        let x = State {
279            a: m.unpack(x.a),
280            b: m.unpack(x.b),
281            c: m.unpack(x.c),
282            d: m.unpack(x.d),
283        };
284        state.output_narrow(m, x, out);
285        state.inc_block_ct(m);
286    }
287});
288
289// Single-block, rounds-only; shared by try_apply_keystream for tails shorter than BUFSZ
290// and XChaCha's setup step.
291dispatch!(m, Mach, {
292    fn refill_narrow_rounds(state: &mut ChaCha, drounds: u32) -> State<vec128_storage> {
293        let k: Mach::u32x4 = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
294        let mut x = State {
295            a: k,
296            b: m.unpack(state.b),
297            c: m.unpack(state.c),
298            d: m.unpack(state.d),
299        };
300        for _ in 0..drounds {
301            x = round(x);
302            x = undiagonalize(round(diagonalize(x)));
303        }
304        State {
305            a: x.a.into(),
306            b: x.b.into(),
307            c: x.c.into(),
308            d: x.d.into(),
309        }
310    }
311});
312
/// Decodes exactly four bytes as a little-endian `u32`.
///
/// # Panics
///
/// Panics if `xs` is not exactly 4 bytes long.
fn read_u32le(xs: &[u8]) -> u32 {
    assert_eq!(xs.len(), 4);
    let mut word = [0u8; 4];
    word.copy_from_slice(xs);
    u32::from_le_bytes(word)
}
317
318dispatch_light128!(m, Mach, {
319    fn init_chacha_x(key: &[u8; 32], nonce: &[u8; 24], rounds: u32) -> ChaCha {
320        let key0: Mach::u32x4 = m.read_le(&key[..16]);
321        let key1: Mach::u32x4 = m.read_le(&key[16..]);
322        let nonce0: Mach::u32x4 = m.read_le(&nonce[..16]);
323        let mut state = ChaCha {
324            b: key0.into(),
325            c: key1.into(),
326            d: nonce0.into(),
327        };
328        let x = refill_narrow_rounds(&mut state, rounds);
329        let ctr_nonce1 = [0, 0, read_u32le(&nonce[16..20]), read_u32le(&nonce[20..24])];
330        state.b = x.a;
331        state.c = x.d;
332        state.d = ctr_nonce1.into();
333        state
334    }
335});
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Advancing the block count must break `==` while leaving both `streamXX_eq` checks true.
    #[test]
    fn test_stream_eq() {
        let key = hex!("fa44478c59ca70538e3549096ce8b523232c50d9e8e8d10c203ef6c8d07098a5");
        let nonce = hex!("8d3a0d6d7827c00701020304");
        let mut advanced = ChaCha::new(&key, &nonce);
        let pristine = advanced.clone();

        // Before any output the two states are identical in every respect.
        assert!(advanced == pristine);
        assert!(advanced.stream32_eq(&pristine));
        assert!(advanced.stream64_eq(&pristine));

        // Producing one block moves the position but not the stream identity.
        let mut sink = [0u8; BLOCK];
        advanced.refill(0, &mut sink);
        assert!(advanced != pristine);
        assert!(advanced.stream32_eq(&pristine));
        assert!(advanced.stream64_eq(&pristine));
    }
}
357}