use std::iter::zip;

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::engine::{
    tables::{self, Mul128, Multiply128lutT, Skew},
    utils, Engine, GfElement, ShardsRefMut, GF_MODULUS, GF_ORDER,
};

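/// Optimized [`Engine`] using SSSE3 instructions.
///
/// A minimal usage sketch (assuming the caller has already verified SSSE3
/// support at runtime):
///
/// ```ignore
/// if is_x86_feature_detected!("ssse3") {
///     let engine = Ssse3::new();
/// }
/// ```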
#[derive(Clone, Copy)]
pub struct Ssse3 {
    mul128: &'static Mul128,
    skew: &'static Skew,
}

impl Ssse3 {
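    /// Creates a new [`Ssse3`] engine.
    ///
    /// The engine only borrows the static lookup tables (`tables::MUL128`
    /// and `tables::SKEW`); the SSSE3-specific work happens in the
    /// `#[target_feature(enable = "ssse3")]` methods below.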
    pub fn new() -> Self {
        let mul128 = &*tables::MUL128;
        let skew = &*tables::SKEW;

        Self { mul128, skew }
    }
}

impl Engine for Ssse3 {
    fn fft(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        size: usize,
        truncated_size: usize,
        skew_delta: usize,
    ) {
        unsafe {
            self.fft_private_ssse3(data, pos, size, truncated_size, skew_delta);
        }
    }

    fn ifft(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        size: usize,
        truncated_size: usize,
        skew_delta: usize,
    ) {
        unsafe {
            self.ifft_private_ssse3(data, pos, size, truncated_size, skew_delta);
        }
    }

    fn mul(&self, x: &mut [[u8; 64]], log_m: GfElement) {
        unsafe {
            self.mul_ssse3(x, log_m);
        }
    }

    fn eval_poly(erasures: &mut [GfElement; GF_ORDER], truncated_size: usize) {
        unsafe { Self::eval_poly_ssse3(erasures, truncated_size) }
    }
}

impl Default for Ssse3 {
    fn default() -> Self {
        Self::new()
    }
}

impl Ssse3 {
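    /// Multiplies each 64-byte chunk of `x` in place by the GF(2^16)
    /// element whose logarithm is `log_m`. A chunk holds 32 elements in
    /// split layout: 32 low bytes followed by 32 high bytes.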
    #[target_feature(enable = "ssse3")]
    unsafe fn mul_ssse3(&self, x: &mut [[u8; 64]], log_m: GfElement) {
        let lut = &self.mul128[log_m as usize];

        for chunk in x.iter_mut() {
            let x_ptr = chunk.as_mut_ptr().cast::<__m128i>();
            unsafe {
                let x0_lo = _mm_loadu_si128(x_ptr);
                let x1_lo = _mm_loadu_si128(x_ptr.add(1));
                let x0_hi = _mm_loadu_si128(x_ptr.add(2));
                let x1_hi = _mm_loadu_si128(x_ptr.add(3));
                let (prod0_lo, prod0_hi) = Self::mul_128(x0_lo, x0_hi, lut);
                let (prod1_lo, prod1_hi) = Self::mul_128(x1_lo, x1_hi, lut);
                _mm_storeu_si128(x_ptr, prod0_lo);
                _mm_storeu_si128(x_ptr.add(1), prod1_lo);
                _mm_storeu_si128(x_ptr.add(2), prod0_hi);
                _mm_storeu_si128(x_ptr.add(3), prod1_hi);
            }
        }
    }

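    /// Multiplies 16 GF(2^16) elements, given as separate low-byte and
    /// high-byte lanes, by the constant encoded in `lut`. Each byte is
    /// split into 4-bit nibbles which index the lookup tables via
    /// `_mm_shuffle_epi8`; the four partial products are XORed together.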
    #[inline(always)]
    fn mul_128(value_lo: __m128i, value_hi: __m128i, lut: &Multiply128lutT) -> (__m128i, __m128i) {
        let mut prod_lo: __m128i;
        let mut prod_hi: __m128i;

        unsafe {
            let t0_lo = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.lo[0]).cast::<__m128i>());
            let t1_lo = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.lo[1]).cast::<__m128i>());
            let t2_lo = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.lo[2]).cast::<__m128i>());
            let t3_lo = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.lo[3]).cast::<__m128i>());

            let t0_hi = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.hi[0]).cast::<__m128i>());
            let t1_hi = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.hi[1]).cast::<__m128i>());
            let t2_hi = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.hi[2]).cast::<__m128i>());
            let t3_hi = _mm_loadu_si128(std::ptr::from_ref::<u128>(&lut.hi[3]).cast::<__m128i>());

            let clr_mask = _mm_set1_epi8(0x0f);

            let data_0 = _mm_and_si128(value_lo, clr_mask);
            prod_lo = _mm_shuffle_epi8(t0_lo, data_0);
            prod_hi = _mm_shuffle_epi8(t0_hi, data_0);

            let data_1 = _mm_and_si128(_mm_srli_epi64(value_lo, 4), clr_mask);
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(t1_lo, data_1));
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(t1_hi, data_1));

            let data_0 = _mm_and_si128(value_hi, clr_mask);
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(t2_lo, data_0));
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(t2_hi, data_0));

            let data_1 = _mm_and_si128(_mm_srli_epi64(value_hi, 4), clr_mask);
            prod_lo = _mm_xor_si128(prod_lo, _mm_shuffle_epi8(t3_lo, data_1));
            prod_hi = _mm_xor_si128(prod_hi, _mm_shuffle_epi8(t3_hi, data_1));
        }

        (prod_lo, prod_hi)
    }

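    /// Computes `x ^ (y * m)` on both byte planes, where `m` is the
    /// GF(2^16) constant encoded in `lut`. Shared core of the FFT and
    /// IFFT butterflies.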
    #[inline(always)]
    fn muladd_128(
        mut x_lo: __m128i,
        mut x_hi: __m128i,
        y_lo: __m128i,
        y_hi: __m128i,
        lut: &Multiply128lutT,
    ) -> (__m128i, __m128i) {
        let (prod_lo, prod_hi) = Self::mul_128(y_lo, y_hi, lut);
        unsafe {
            x_lo = _mm_xor_si128(x_lo, prod_lo);
            x_hi = _mm_xor_si128(x_hi, prod_hi);
        }
        (x_lo, x_hi)
    }
}

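// FFT (forward transform). The butterfly on a pair of 64-byte chunks is
// x = x + y * m followed by y = y + x, where addition is XOR in GF(2^16).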
impl Ssse3 {
    #[inline(always)]
    fn fftb_128(&self, x: &mut [u8; 64], y: &mut [u8; 64], log_m: GfElement) {
        let lut = &self.mul128[log_m as usize];
        let x_ptr = x.as_mut_ptr().cast::<__m128i>();
        let y_ptr = y.as_mut_ptr().cast::<__m128i>();
        unsafe {
            let mut x0_lo = _mm_loadu_si128(x_ptr);
            let mut x1_lo = _mm_loadu_si128(x_ptr.add(1));
            let mut x0_hi = _mm_loadu_si128(x_ptr.add(2));
            let mut x1_hi = _mm_loadu_si128(x_ptr.add(3));

            let mut y0_lo = _mm_loadu_si128(y_ptr);
            let mut y1_lo = _mm_loadu_si128(y_ptr.add(1));
            let mut y0_hi = _mm_loadu_si128(y_ptr.add(2));
            let mut y1_hi = _mm_loadu_si128(y_ptr.add(3));

            (x0_lo, x0_hi) = Self::muladd_128(x0_lo, x0_hi, y0_lo, y0_hi, lut);
            (x1_lo, x1_hi) = Self::muladd_128(x1_lo, x1_hi, y1_lo, y1_hi, lut);

            _mm_storeu_si128(x_ptr, x0_lo);
            _mm_storeu_si128(x_ptr.add(1), x1_lo);
            _mm_storeu_si128(x_ptr.add(2), x0_hi);
            _mm_storeu_si128(x_ptr.add(3), x1_hi);

            y0_lo = _mm_xor_si128(y0_lo, x0_lo);
            y1_lo = _mm_xor_si128(y1_lo, x1_lo);
            y0_hi = _mm_xor_si128(y0_hi, x0_hi);
            y1_hi = _mm_xor_si128(y1_hi, x1_hi);

            _mm_storeu_si128(y_ptr, y0_lo);
            _mm_storeu_si128(y_ptr.add(1), y1_lo);
            _mm_storeu_si128(y_ptr.add(2), y0_hi);
            _mm_storeu_si128(y_ptr.add(3), y1_hi);
        }
    }

    #[inline(always)]
    fn fft_butterfly_partial(&self, x: &mut [[u8; 64]], y: &mut [[u8; 64]], log_m: GfElement) {
        for (x_chunk, y_chunk) in zip(x.iter_mut(), y.iter_mut()) {
            self.fftb_128(x_chunk, y_chunk, log_m);
        }
    }

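    /// Applies two FFT layers to the four shard ranges at `pos`,
    /// `pos + dist`, `pos + 2 * dist` and `pos + 3 * dist`: first the
    /// layer pairing shards `2 * dist` apart (skew `log_m02`), then the
    /// layer pairing shards `dist` apart (skews `log_m01` and `log_m23`).
    /// A skew of `GF_MODULUS` is a sentinel: the multiply is skipped and
    /// only the XOR half of the butterfly is applied.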
    #[inline(always)]
    fn fft_butterfly_two_layers(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        dist: usize,
        log_m01: GfElement,
        log_m23: GfElement,
        log_m02: GfElement,
    ) {
        let (s0, s1, s2, s3) = data.dist4_mut(pos, dist);

        if log_m02 == GF_MODULUS {
            utils::xor(s2, s0);
            utils::xor(s3, s1);
        } else {
            self.fft_butterfly_partial(s0, s2, log_m02);
            self.fft_butterfly_partial(s1, s3, log_m02);
        }

        if log_m01 == GF_MODULUS {
            utils::xor(s1, s0);
        } else {
            self.fft_butterfly_partial(s0, s1, log_m01);
        }

        if log_m23 == GF_MODULUS {
            utils::xor(s3, s2);
        } else {
            self.fft_butterfly_partial(s2, s3, log_m23);
        }
    }

    #[target_feature(enable = "ssse3")]
    unsafe fn fft_private_ssse3(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        size: usize,
        truncated_size: usize,
        skew_delta: usize,
    ) {
        self.fft_private(data, pos, size, truncated_size, skew_delta);
    }

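    /// Forward transform of `size` shards starting at `pos`, two butterfly
    /// layers per pass (`dist` shrinks by a factor of four each pass).
    /// When the number of layers is odd, a final single layer with
    /// `dist == 1` is applied after the loop.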
    #[inline(always)]
    fn fft_private(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        size: usize,
        truncated_size: usize,
        skew_delta: usize,
    ) {
        let mut dist4 = size;
        let mut dist = size >> 2;
        while dist != 0 {
            let mut r = 0;
            while r < truncated_size {
                let base = r + dist + skew_delta - 1;

                let log_m01 = self.skew[base];
                let log_m02 = self.skew[base + dist];
                let log_m23 = self.skew[base + dist * 2];

                for i in r..r + dist {
                    self.fft_butterfly_two_layers(data, pos + i, dist, log_m01, log_m23, log_m02);
                }

                r += dist4;
            }
            dist4 = dist;
            dist >>= 2;
        }

        if dist4 == 2 {
            let mut r = 0;
            while r < truncated_size {
                let log_m = self.skew[r + skew_delta];

                let (x, y) = data.dist2_mut(pos + r, 1);

                if log_m == GF_MODULUS {
                    utils::xor(y, x);
                } else {
                    self.fft_butterfly_partial(x, y, log_m);
                }

                r += 2;
            }
        }
    }
}

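// IFFT (inverse transform). The butterfly is the mirror image of the FFT
// one: y = y + x first, then x = x + y * m.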
impl Ssse3 {
    #[inline(always)]
    fn ifftb_128(&self, x: &mut [u8; 64], y: &mut [u8; 64], log_m: GfElement) {
        let lut = &self.mul128[log_m as usize];
        let x_ptr = x.as_mut_ptr().cast::<__m128i>();
        let y_ptr = y.as_mut_ptr().cast::<__m128i>();

        unsafe {
            let mut x0_lo = _mm_loadu_si128(x_ptr);
            let mut x1_lo = _mm_loadu_si128(x_ptr.add(1));
            let mut x0_hi = _mm_loadu_si128(x_ptr.add(2));
            let mut x1_hi = _mm_loadu_si128(x_ptr.add(3));

            let mut y0_lo = _mm_loadu_si128(y_ptr);
            let mut y1_lo = _mm_loadu_si128(y_ptr.add(1));
            let mut y0_hi = _mm_loadu_si128(y_ptr.add(2));
            let mut y1_hi = _mm_loadu_si128(y_ptr.add(3));

            y0_lo = _mm_xor_si128(y0_lo, x0_lo);
            y1_lo = _mm_xor_si128(y1_lo, x1_lo);
            y0_hi = _mm_xor_si128(y0_hi, x0_hi);
            y1_hi = _mm_xor_si128(y1_hi, x1_hi);

            _mm_storeu_si128(y_ptr, y0_lo);
            _mm_storeu_si128(y_ptr.add(1), y1_lo);
            _mm_storeu_si128(y_ptr.add(2), y0_hi);
            _mm_storeu_si128(y_ptr.add(3), y1_hi);

            (x0_lo, x0_hi) = Self::muladd_128(x0_lo, x0_hi, y0_lo, y0_hi, lut);
            (x1_lo, x1_hi) = Self::muladd_128(x1_lo, x1_hi, y1_lo, y1_hi, lut);

            _mm_storeu_si128(x_ptr, x0_lo);
            _mm_storeu_si128(x_ptr.add(1), x1_lo);
            _mm_storeu_si128(x_ptr.add(2), x0_hi);
            _mm_storeu_si128(x_ptr.add(3), x1_hi);
        }
    }

    #[inline(always)]
    fn ifft_butterfly_partial(&self, x: &mut [[u8; 64]], y: &mut [[u8; 64]], log_m: GfElement) {
        for (x_chunk, y_chunk) in zip(x.iter_mut(), y.iter_mut()) {
            self.ifftb_128(x_chunk, y_chunk, log_m);
        }
    }

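    /// Applies two IFFT layers to the four shard ranges spaced `dist`
    /// apart. The layer order is the reverse of the FFT version: the pairs
    /// `dist` apart (skews `log_m01`, `log_m23`) come first, then the
    /// pairs `2 * dist` apart (skew `log_m02`). `GF_MODULUS` again marks
    /// a skipped multiply.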
    #[inline(always)]
    fn ifft_butterfly_two_layers(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        dist: usize,
        log_m01: GfElement,
        log_m23: GfElement,
        log_m02: GfElement,
    ) {
        let (s0, s1, s2, s3) = data.dist4_mut(pos, dist);

        if log_m01 == GF_MODULUS {
            utils::xor(s1, s0);
        } else {
            self.ifft_butterfly_partial(s0, s1, log_m01);
        }

        if log_m23 == GF_MODULUS {
            utils::xor(s3, s2);
        } else {
            self.ifft_butterfly_partial(s2, s3, log_m23);
        }

        if log_m02 == GF_MODULUS {
            utils::xor(s2, s0);
            utils::xor(s3, s1);
        } else {
            self.ifft_butterfly_partial(s0, s2, log_m02);
            self.ifft_butterfly_partial(s1, s3, log_m02);
        }
    }

    #[target_feature(enable = "ssse3")]
    unsafe fn ifft_private_ssse3(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        size: usize,
        truncated_size: usize,
        skew_delta: usize,
    ) {
        self.ifft_private(data, pos, size, truncated_size, skew_delta);
    }

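    /// Inverse transform of `size` shards starting at `pos`, two butterfly
    /// layers per pass with `dist` growing by a factor of four. If one
    /// unpaired layer remains (`dist < size` after the loop), it is
    /// applied separately at the end.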
    #[inline(always)]
    fn ifft_private(
        &self,
        data: &mut ShardsRefMut,
        pos: usize,
        size: usize,
        truncated_size: usize,
        skew_delta: usize,
    ) {
        let mut dist = 1;
        let mut dist4 = 4;
        while dist4 <= size {
            let mut r = 0;
            while r < truncated_size {
                let base = r + dist + skew_delta - 1;

                let log_m01 = self.skew[base];
                let log_m02 = self.skew[base + dist];
                let log_m23 = self.skew[base + dist * 2];

                for i in r..r + dist {
                    self.ifft_butterfly_two_layers(data, pos + i, dist, log_m01, log_m23, log_m02);
                }

                r += dist4;
            }
            dist = dist4;
            dist4 <<= 2;
        }

        if dist < size {
            let log_m = self.skew[dist + skew_delta - 1];
            if log_m == GF_MODULUS {
                utils::xor_within(data, pos + dist, pos, dist);
            } else {
                let (mut a, mut b) = data.split_at_mut(pos + dist);
                for i in 0..dist {
                    self.ifft_butterfly_partial(
                        &mut a[pos + i],
                        &mut b[i],
                        log_m,
                    );
                }
            }
        }
    }
}

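// Polynomial evaluation: delegates to the scalar `utils::eval_poly`,
// compiled here with the `ssse3` target feature enabled.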
impl Ssse3 {
    #[target_feature(enable = "ssse3")]
    unsafe fn eval_poly_ssse3(erasures: &mut [GfElement; GF_ORDER], truncated_size: usize) {
        utils::eval_poly(erasures, truncated_size);
    }
}