snarkvm_algorithms/fft/
domain.rs

// Copyright 2024 Aleo Network Foundation
// This file is part of the snarkVM library.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at:

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! This module contains an `EvaluationDomain` abstraction for
//! performing various kinds of polynomial arithmetic on top of
//! the scalar field.
//!
//! In pairing-based SNARKs like GM17, we need to calculate
//! a quotient polynomial over a target polynomial with roots
//! at distinct points associated with each constraint of the
//! constraint system. In order to be efficient, we choose these
//! roots to be the powers of a 2^n root of unity in the field.
//! This allows us to perform polynomial operations in O(n) in the
//! evaluation domain, at the cost of an O(n log n) FFT to move between
//! coefficient and evaluation representations.
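//!
//! A typical round trip between coefficient and evaluation form looks as
//! follows (an illustrative sketch; `bls12_377::Fr` is the scalar field
//! used in this crate's tests):
//! ```
//! use snarkvm_algorithms::fft::EvaluationDomain;
//! use snarkvm_curves::bls12_377::Fr;
//! use snarkvm_fields::One;
//!
//! let domain = EvaluationDomain::<Fr>::new(4).unwrap();
//! let coeffs = vec![Fr::one(); 4];
//! let evals = domain.fft(&coeffs);
//! assert_eq!(domain.ifft(&evals), coeffs);
//! ```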

use crate::{
    cfg_chunks_mut,
    cfg_into_iter,
    cfg_iter,
    cfg_iter_mut,
    fft::{DomainCoeff, SparsePolynomial},
};
use snarkvm_fields::{FftField, FftParameters, Field, batch_inversion};
#[cfg(not(feature = "serial"))]
use snarkvm_utilities::max_available_threads;
use snarkvm_utilities::{execute_with_max_available_threads, serialize::*};

use rand::Rng;
use std::{borrow::Cow, fmt};

use anyhow::{Result, ensure};

#[cfg(not(feature = "serial"))]
use rayon::prelude::*;

#[cfg(feature = "serial")]
use itertools::Itertools;

/// Returns the ceiling of the base-2 logarithm of `x`.
/// As a special case, `log2(0)` is defined to be 0.
///
/// ```
/// use snarkvm_algorithms::fft::domain::log2;
///
/// assert_eq!(log2(16), 4);
/// assert_eq!(log2(17), 5);
/// assert_eq!(log2(1), 0);
/// assert_eq!(log2(0), 0);
/// assert_eq!(log2(usize::MAX), (core::mem::size_of::<usize>() * 8) as u32);
/// assert_eq!(log2(1 << 15), 15);
/// assert_eq!(log2(2usize.pow(18)), 18);
/// ```
pub fn log2(x: usize) -> u32 {
    if x == 0 {
        0
    } else if x.is_power_of_two() {
        1usize.leading_zeros() - x.leading_zeros()
    } else {
        0usize.leading_zeros() - x.leading_zeros()
    }
}

// The minimum size of a parallelized chunk.
#[allow(unused)]
#[cfg(not(feature = "serial"))]
const MIN_PARALLEL_CHUNK_SIZE: usize = 1 << 7;

/// Defines a domain over which finite field (I)FFTs can be performed. Works
/// only for fields that have a large multiplicative subgroup whose size is a
/// power of two.
#[derive(Copy, Clone, Hash, Eq, PartialEq, CanonicalSerialize, CanonicalDeserialize)]
pub struct EvaluationDomain<F: FftField> {
    /// The size of the domain.
    pub size: u64,
    /// `log_2(self.size)`.
    pub log_size_of_group: u32,
    /// Size of the domain as a field element.
    pub size_as_field_element: F,
    /// Inverse of the size in the field.
    pub size_inv: F,
    /// A generator of the subgroup.
    pub group_gen: F,
    /// Inverse of the generator of the subgroup.
    pub group_gen_inv: F,
    /// Inverse of the multiplicative generator of the finite field.
    pub generator_inv: F,
}

impl<F: FftField> fmt::Debug for EvaluationDomain<F> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Multiplicative subgroup of size {}", self.size)
    }
}

impl<F: FftField> EvaluationDomain<F> {
    /// Sample an element that is *not* in the domain.
    pub fn sample_element_outside_domain<R: Rng>(&self, rng: &mut R) -> F {
        let mut t = F::rand(rng);
        while self.evaluate_vanishing_polynomial(t).is_zero() {
            t = F::rand(rng);
        }
        t
    }

    /// Construct a domain that is large enough for evaluations of a polynomial
    /// having `num_coeffs` coefficients.
    pub fn new(num_coeffs: usize) -> Option<Self> {
        // Compute the size of our evaluation domain.
        let size = num_coeffs.checked_next_power_of_two()? as u64;
        let log_size_of_group = size.trailing_zeros();

        // libfqfft uses a strict `>` comparison here; see
        // https://github.com/scipr-lab/libfqfft/blob/e0183b2cef7d4c5deb21a6eaf3fe3b586d738fe0/libfqfft/evaluation_domain/domains/basic_radix2_domain.tcc#L33
        if log_size_of_group > F::FftParameters::TWO_ADICITY {
            return None;
        }

        // Compute the generator for the multiplicative subgroup.
        // It should be a 2^(log_size_of_group)-th root of unity.
        let group_gen = F::get_root_of_unity(size as usize)?;

        // Check that it is indeed a 2^(log_size_of_group)-th root of unity.
        debug_assert_eq!(group_gen.pow([size]), F::one());

        let size_as_field_element = F::from(size);
        let size_inv = size_as_field_element.inverse()?;

        Some(EvaluationDomain {
            size,
            log_size_of_group,
            size_as_field_element,
            size_inv,
            group_gen,
            group_gen_inv: group_gen.inverse()?,
            generator_inv: F::multiplicative_generator().inverse()?,
        })
    }

    /// Return the size of a domain that is large enough for evaluations of a polynomial
    /// having `num_coeffs` coefficients.
    pub fn compute_size_of_domain(num_coeffs: usize) -> Option<usize> {
        let size = num_coeffs.checked_next_power_of_two()?;
        if size.trailing_zeros() <= F::FftParameters::TWO_ADICITY { Some(size) } else { None }
    }

    /// Return the size of `self`.
    pub fn size(&self) -> usize {
        self.size as usize
    }

    /// Compute an FFT.
    pub fn fft<T: DomainCoeff<F>>(&self, coeffs: &[T]) -> Vec<T> {
        let mut coeffs = coeffs.to_vec();
        self.fft_in_place(&mut coeffs);
        coeffs
    }

    /// Compute an FFT, modifying the vector in place.
    pub fn fft_in_place<T: DomainCoeff<F>>(&self, coeffs: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            coeffs.resize(self.size(), T::zero());
            self.in_order_fft_in_place(&mut *coeffs);
        });
    }

    /// Compute an IFFT.
    pub fn ifft<T: DomainCoeff<F>>(&self, evals: &[T]) -> Vec<T> {
        let mut evals = evals.to_vec();
        self.ifft_in_place(&mut evals);
        evals
    }

    /// Compute an IFFT, modifying the vector in place.
    #[inline]
    pub fn ifft_in_place<T: DomainCoeff<F>>(&self, evals: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            evals.resize(self.size(), T::zero());
            self.in_order_ifft_in_place(&mut *evals);
        });
    }

    /// Compute an FFT over a coset of the domain.
    pub fn coset_fft<T: DomainCoeff<F>>(&self, coeffs: &[T]) -> Vec<T> {
        let mut coeffs = coeffs.to_vec();
        self.coset_fft_in_place(&mut coeffs);
        coeffs
    }

    /// Compute an FFT over a coset of the domain, modifying the input vector
    /// in place.
    pub fn coset_fft_in_place<T: DomainCoeff<F>>(&self, coeffs: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            Self::distribute_powers(coeffs, F::multiplicative_generator());
            self.fft_in_place(coeffs);
        });
    }

    /// Compute an IFFT over a coset of the domain.
    pub fn coset_ifft<T: DomainCoeff<F>>(&self, evals: &[T]) -> Vec<T> {
        let mut evals = evals.to_vec();
        self.coset_ifft_in_place(&mut evals);
        evals
    }

    /// Compute an IFFT over a coset of the domain, modifying the input vector in place.
    pub fn coset_ifft_in_place<T: DomainCoeff<F>>(&self, evals: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            evals.resize(self.size(), T::zero());
            self.in_order_coset_ifft_in_place(&mut *evals);
        });
    }

    /// Multiply the `i`-th element of `coeffs` by `g^i`.
    fn distribute_powers<T: DomainCoeff<F>>(coeffs: &mut [T], g: F) {
        Self::distribute_powers_and_mul_by_const(coeffs, g, F::one());
    }

    /// Multiply the `i`-th element of `coeffs` by `c*g^i`.
    #[cfg(feature = "serial")]
    fn distribute_powers_and_mul_by_const<T: DomainCoeff<F>>(coeffs: &mut [T], g: F, c: F) {
        // Invariant: `pow = c*g^i` at the `i`-th iteration of the loop.
        let mut pow = c;
        coeffs.iter_mut().for_each(|coeff| {
            *coeff *= pow;
            pow *= &g
        })
    }

    /// Multiply the `i`-th element of `coeffs` by `c*g^i`.
    #[cfg(not(feature = "serial"))]
    fn distribute_powers_and_mul_by_const<T: DomainCoeff<F>>(coeffs: &mut [T], g: F, c: F) {
        let min_parallel_chunk_size = 1024;
        let num_cpus_available = max_available_threads();
        let num_elem_per_thread = core::cmp::max(coeffs.len() / num_cpus_available, min_parallel_chunk_size);

        cfg_chunks_mut!(coeffs, num_elem_per_thread).enumerate().for_each(|(i, chunk)| {
            let offset = c * g.pow([(i * num_elem_per_thread) as u64]);
            let mut pow = offset;
            chunk.iter_mut().for_each(|coeff| {
                *coeff *= pow;
                pow *= &g
            })
        });
    }

    /// Evaluate all the Lagrange polynomials defined by this domain at the
    /// point `tau`.
    pub fn evaluate_all_lagrange_coefficients(&self, tau: F) -> Vec<F> {
        // Evaluate all Lagrange polynomials.
        let size = self.size as usize;
        let t_size = tau.pow([self.size]);
        let one = F::one();
        if t_size.is_one() {
            let mut u = vec![F::zero(); size];
            let mut omega_i = one;
            for x in u.iter_mut().take(size) {
                if omega_i == tau {
                    *x = one;
                    break;
                }
                omega_i *= &self.group_gen;
            }
            u
        } else {
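            // Barycentric evaluation: over H = { 1, g, g^2, ..., g^{n-1} },
            //   L_i(tau) = Z_H(tau) * g^i / (n * (tau - g^i)),
            // where Z_H(X) = X^n - 1. Below, `ls[i]` accumulates the numerator
            // Z_H(tau) * g^i / n, while `u[i]` holds (tau - g^i); the latter
            // is batch-inverted before the final multiplication.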
            let mut l = (t_size - one) * self.size_inv;
            let mut r = one;
            let mut u = vec![F::zero(); size];
            let mut ls = vec![F::zero(); size];
            for i in 0..size {
                u[i] = tau - r;
                ls[i] = l;
                l *= &self.group_gen;
                r *= &self.group_gen;
            }

            batch_inversion(u.as_mut_slice());
            cfg_iter_mut!(u).zip_eq(ls).for_each(|(tau_minus_r, l)| {
                *tau_minus_r = l * *tau_minus_r;
            });
            u
        }
    }

    /// Return the sparse vanishing polynomial.
    pub fn vanishing_polynomial(&self) -> SparsePolynomial<F> {
        let coeffs = [(0, -F::one()), (self.size(), F::one())];
        SparsePolynomial::from_coefficients(coeffs)
    }

    /// This evaluates the vanishing polynomial for this domain at tau.
    /// For multiplicative subgroups, this polynomial is `z(X) = X^self.size - 1`.
    pub fn evaluate_vanishing_polynomial(&self, tau: F) -> F {
        tau.pow([self.size]) - F::one()
    }

    /// Return an iterator over the elements of the domain.
    pub fn elements(&self) -> Elements<F> {
        Elements { cur_elem: F::one(), cur_pow: 0, domain: *self }
    }

    /// The target polynomial is the zero polynomial in our
    /// evaluation domain, so we must perform division over
    /// a coset. On the coset `gH`, the vanishing polynomial
    /// `Z_H(X) = X^n - 1` takes the constant value `g^n - 1`, so division
    /// reduces to multiplying every evaluation by its inverse.
    pub fn divide_by_vanishing_poly_on_coset_in_place(&self, evals: &mut [F]) {
        let i = self.evaluate_vanishing_polynomial(F::multiplicative_generator()).inverse().unwrap();

        cfg_iter_mut!(evals).for_each(|eval| *eval *= &i);
    }

    /// Given an index into the `other` subdomain, return the corresponding index
    /// into this domain, `self`.
    /// This assumes that `other`'s elements are also `self`'s first elements.
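    ///
    /// For example (illustrative): if `self` has size 8 and `other` has size 4,
    /// then `period = 2`, so index 2 of `other` maps to index 4 of `self`,
    /// while index 5 (the second element of G \ S below) maps to index 3.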
    pub fn reindex_by_subdomain(&self, other: &Self, index: usize) -> Result<usize> {
        ensure!(self.size() > other.size(), "other.size() must be smaller than self.size()");

        // Let this subgroup be G, and the subgroup we're re-indexing by be S.
        // Since it's a subgroup, the 0th element of S is at index 0 in G, the first element of S is at
        // index |G|/|S|, the second at 2*|G|/|S|, etc.
        // Thus for an index i that corresponds to S, the index in G is i*|G|/|S|.
        let period = self.size() / other.size();
        if index < other.size() {
            Ok(index * period)
        } else {
            // Let i now be the index of this element in G \ S.
            // Let x be the number of elements in G \ S for every element in S. Then x = (|G|/|S| - 1).
            // At index i in G \ S, the number of elements in S that appear before the index in G to which
            // i corresponds is floor(i / x) + 1.
            // The +1 is because index 0 of G is S_0, so the position is offset by at least one.
            // The floor(i / x) term is because after x elements in G \ S, there is one more element from S
            // that will have appeared in G.
            let i = index - other.size();
            let x = period - 1;
            Ok(i + (i / x) + 1)
        }
    }

    /// Perform O(n) multiplication of two polynomials that are presented by their
    /// evaluations in the domain.
    /// Returns the evaluations of the product over the domain.
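    ///
    /// A minimal sketch of the intended usage (mirroring this crate's tests):
    /// ```
    /// use snarkvm_algorithms::fft::{DensePolynomial, EvaluationDomain};
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_utilities::{TestRng, Uniform};
    ///
    /// let rng = &mut TestRng::default();
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// let a = DensePolynomial::<Fr>::rand(1, rng);
    /// let b = DensePolynomial::<Fr>::rand(1, rng);
    /// // Multiply pointwise in the evaluation domain, then interpolate back.
    /// let product_evals = domain
    ///     .mul_polynomials_in_evaluation_domain(domain.fft(a.coeffs()), &domain.fft(b.coeffs()))
    ///     .unwrap();
    /// let product = DensePolynomial::from_coefficients_vec(domain.ifft(&product_evals));
    /// let x = Fr::rand(rng);
    /// assert_eq!(product.evaluate(x), a.evaluate(x) * b.evaluate(x));
    /// ```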
    pub fn mul_polynomials_in_evaluation_domain(&self, self_evals: Vec<F>, other_evals: &[F]) -> Result<Vec<F>> {
        let mut result = self_evals;

        ensure!(result.len() == other_evals.len(), "the two evaluation vectors must have the same length");
        cfg_iter_mut!(result).zip_eq(other_evals).for_each(|(a, b)| *a *= b);

        Ok(result)
    }
}

impl<F: FftField> EvaluationDomain<F> {
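    /// Precomputes the first `self.size / 2` powers of `self.group_gen`,
    /// so that repeated FFTs over this domain can reuse them.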
    pub fn precompute_fft(&self) -> FFTPrecomputation<F> {
        execute_with_max_available_threads(|| FFTPrecomputation {
            roots: self.roots_of_unity(self.group_gen),
            domain: *self,
        })
    }

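    /// Precomputes the first `self.size / 2` powers of `self.group_gen_inv`,
    /// for use in IFFTs over this domain.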
    pub fn precompute_ifft(&self) -> IFFTPrecomputation<F> {
        execute_with_max_available_threads(|| IFFTPrecomputation {
            inverse_roots: self.roots_of_unity(self.group_gen_inv),
            domain: *self,
        })
    }

    pub(crate) fn in_order_fft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold and check that the type is Fr
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_fft();
        self.fft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc)
    }

    pub fn in_order_fft_with_pc<T: DomainCoeff<F>>(&self, x_s: &[T], pc: &FFTPrecomputation<F>) -> Vec<T> {
        let mut x_s = x_s.to_vec();
        if self.size() != x_s.len() {
            x_s.extend(core::iter::repeat(T::zero()).take(self.size() - x_s.len()));
        }
        self.fft_helper_in_place_with_pc(&mut x_s, FFTOrder::II, pc);
        x_s
    }

    pub(crate) fn in_order_ifft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_ifft();
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc);
        cfg_iter_mut!(x_s).for_each(|val| *val *= self.size_inv);
    }

    pub(crate) fn in_order_coset_ifft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_ifft();
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc);
        let coset_shift = self.generator_inv;
        Self::distribute_powers_and_mul_by_const(x_s, coset_shift, self.size_inv);
    }

    #[allow(unused)]
    pub(crate) fn in_order_fft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &FFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        self.fft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp)
    }

    pub(crate) fn out_order_fft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &FFTPrecomputation<F>,
    ) {
        self.fft_helper_in_place_with_pc(x_s, FFTOrder::IO, pre_comp)
    }

    pub(crate) fn in_order_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp);
        cfg_iter_mut!(x_s).for_each(|val| *val *= self.size_inv);
    }

    pub(crate) fn out_order_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::OI, pre_comp);
        cfg_iter_mut!(x_s).for_each(|val| *val *= self.size_inv);
    }

    #[allow(unused)]
    pub(crate) fn in_order_coset_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            );
            if result.is_ok() {
                return;
            }
        }

        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp);
        let coset_shift = self.generator_inv;
        Self::distribute_powers_and_mul_by_const(x_s, coset_shift, self.size_inv);
    }

    fn fft_helper_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        ord: FFTOrder,
        pre_comp: &FFTPrecomputation<F>,
    ) {
        use FFTOrder::*;
        let pc = pre_comp.precomputation_for_subdomain(self).unwrap();

        let log_len = log2(x_s.len());

        if ord == OI {
            self.oi_helper_with_roots(x_s, &pc.roots);
        } else {
            self.io_helper_with_roots(x_s, &pc.roots);
        }

        if ord == II {
            derange_helper(x_s, log_len);
        }
    }

    // Handles an IFFT, accounting for the input and output each being
    // either in order or out of order. The results here must all be
    // divided by |x_s|, which is left up to the caller to do.
    fn ifft_helper_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        ord: FFTOrder,
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        use FFTOrder::*;
        let pc = pre_comp.precomputation_for_subdomain(self).unwrap();

        let log_len = log2(x_s.len());

        if ord == II {
            derange_helper(x_s, log_len);
        }

        if ord == IO {
            self.io_helper_with_roots(x_s, &pc.inverse_roots);
        } else {
            self.oi_helper_with_roots(x_s, &pc.inverse_roots);
        }
    }

    /// Computes the first `self.size / 2` roots of unity for the entire domain.
    /// e.g. for the domain [1, g, g^2, ..., g^{n - 1}], it computes
    /// [1, g, g^2, ..., g^{(n/2) - 1}].
    #[cfg(feature = "serial")]
    pub fn roots_of_unity(&self, root: F) -> Vec<F> {
        compute_powers_serial((self.size as usize) / 2, root)
    }

    /// Computes the first `self.size / 2` roots of unity.
    #[cfg(not(feature = "serial"))]
    pub fn roots_of_unity(&self, root: F) -> Vec<F> {
        // TODO: check if this method can replace parallel compute powers.
        let log_size = log2(self.size as usize);
        // Early exit for short inputs.
        if log_size <= LOG_ROOTS_OF_UNITY_PARALLEL_SIZE {
            compute_powers_serial((self.size as usize) / 2, root)
        } else {
            let mut temp = root;
            // w, w^2, w^4, w^8, ..., w^(2^(log_size - 2))
            let log_powers: Vec<F> = (0..(log_size - 1))
                .map(|_| {
                    let old_value = temp;
                    temp.square_in_place();
                    old_value
                })
                .collect();

            // Allocate the return array and start the recursion.
            let mut powers = vec![F::zero(); 1 << (log_size - 1)];
            Self::roots_of_unity_recursive(&mut powers, &log_powers);
            powers
        }
    }

    #[cfg(not(feature = "serial"))]
    fn roots_of_unity_recursive(out: &mut [F], log_powers: &[F]) {
        assert_eq!(out.len(), 1 << log_powers.len());
        // Base case: just compute the powers sequentially,
        // g = log_powers[0], out = [1, g, g^2, ...].
        if log_powers.len() <= LOG_ROOTS_OF_UNITY_PARALLEL_SIZE as usize {
            out[0] = F::one();
            for idx in 1..out.len() {
                out[idx] = out[idx - 1] * log_powers[0];
            }
            return;
        }

        // Recursive case:
        // 1. Split log_powers in half.
        let (lr_lo, lr_hi) = log_powers.split_at((1 + log_powers.len()) / 2);
        let mut scr_lo = vec![F::default(); 1 << lr_lo.len()];
        let mut scr_hi = vec![F::default(); 1 << lr_hi.len()];
        // 2. Compute each half individually.
        rayon::join(
            || Self::roots_of_unity_recursive(&mut scr_lo, lr_lo),
            || Self::roots_of_unity_recursive(&mut scr_hi, lr_hi),
        );
        // 3. Recombine halves.
        // At this point, out is a blank slice.
        out.par_chunks_mut(scr_lo.len()).zip(&scr_hi).for_each(|(out_chunk, scr_hi)| {
            for (out_elem, scr_lo) in out_chunk.iter_mut().zip(&scr_lo) {
                *out_elem = *scr_hi * scr_lo;
            }
        });
    }

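    // Radix-2 butterflies. `butterfly_fn_io` is the decimation-in-frequency
    // step (for in-order input): (lo, hi) -> (lo + hi, (lo - hi) * root).
    // `butterfly_fn_oi` is the decimation-in-time step (for bit-reversed
    // input): (lo, hi) -> (lo + hi * root, lo - hi * root).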
    #[inline(always)]
    fn butterfly_fn_io<T: DomainCoeff<F>>(((lo, hi), root): ((&mut T, &mut T), &F)) {
        let neg = *lo - *hi;
        *lo += *hi;
        *hi = neg;
        *hi *= *root;
    }

    #[inline(always)]
    fn butterfly_fn_oi<T: DomainCoeff<F>>(((lo, hi), root): ((&mut T, &mut T), &F)) {
        *hi *= *root;
        let neg = *lo - *hi;
        *lo += *hi;
        *hi = neg;
    }

    #[allow(clippy::too_many_arguments)]
    fn apply_butterfly<T: DomainCoeff<F>, G: Fn(((&mut T, &mut T), &F)) + Copy + Sync + Send>(
        g: G,
        xi: &mut [T],
        roots: &[F],
        step: usize,
        chunk_size: usize,
        num_chunks: usize,
        max_threads: usize,
        gap: usize,
    ) {
        cfg_chunks_mut!(xi, chunk_size).for_each(|cxi| {
            let (lo, hi) = cxi.split_at_mut(gap);
            // If the chunk is sufficiently big that parallelism helps,
            // we parallelize the butterfly operation within the chunk.
            if gap > MIN_GAP_SIZE_FOR_PARALLELISATION && num_chunks < max_threads {
                cfg_iter_mut!(lo).zip(hi).zip(cfg_iter!(roots).step_by(step)).for_each(g);
            } else {
                lo.iter_mut().zip(hi).zip(roots.iter().step_by(step)).for_each(g);
            }
        });
    }

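    // `io_helper_with_roots` performs the decimation-in-frequency passes (the
    // gap halves from n/2 down to 1), taking in-order input to bit-reversed
    // output; `oi_helper_with_roots` performs the decimation-in-time passes
    // (the gap doubles from 1 up to n/2), taking bit-reversed input to
    // in-order output.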
    #[allow(clippy::unnecessary_to_owned)]
    fn io_helper_with_roots<T: DomainCoeff<F>>(&self, xi: &mut [T], roots: &[F]) {
        let mut roots = std::borrow::Cow::Borrowed(roots);

        let mut step = 1;
        let mut first = true;

        #[cfg(not(feature = "serial"))]
        let max_threads = snarkvm_utilities::parallel::max_available_threads();
        #[cfg(feature = "serial")]
        let max_threads = 1;

        let mut gap = xi.len() / 2;
        while gap > 0 {
            // Each butterfly cluster uses 2*gap positions.
            let chunk_size = 2 * gap;
            let num_chunks = xi.len() / chunk_size;

            // Only compact the roots (for cache locality) if the roots lookup
            // is done a significant number of times, which also implies a
            // large lookup stride.
            if num_chunks >= MIN_NUM_CHUNKS_FOR_COMPACTION {
                if !first {
                    roots = Cow::Owned(cfg_into_iter!(roots.into_owned()).step_by(step * 2).collect());
                }
                step = 1;
                roots.to_mut().shrink_to_fit();
            } else {
                step = num_chunks;
            }
            first = false;

            Self::apply_butterfly(
                Self::butterfly_fn_io,
                xi,
                &roots[..],
                step,
                chunk_size,
                num_chunks,
                max_threads,
                gap,
            );

            gap /= 2;
        }
    }

    fn oi_helper_with_roots<T: DomainCoeff<F>>(&self, xi: &mut [T], roots_cache: &[F]) {
        // The `cmp::min` is only necessary for the case where
        // `MIN_NUM_CHUNKS_FOR_COMPACTION = 1`. Else, notice that we compact
        // the roots cache by a stride of at least `MIN_NUM_CHUNKS_FOR_COMPACTION`.
        let compaction_max_size =
            core::cmp::min(roots_cache.len() / 2, roots_cache.len() / MIN_NUM_CHUNKS_FOR_COMPACTION);
        let mut compacted_roots = vec![F::default(); compaction_max_size];

        #[cfg(not(feature = "serial"))]
        let max_threads = snarkvm_utilities::parallel::max_available_threads();
        #[cfg(feature = "serial")]
        let max_threads = 1;

        let mut gap = 1;
        while gap < xi.len() {
            // Each butterfly cluster uses 2*gap positions.
            let chunk_size = 2 * gap;
            let num_chunks = xi.len() / chunk_size;

            // Only compact the roots (for cache locality) if the roots lookup
            // is done a significant number of times, which also implies a
            // large lookup stride.
            let (roots, step) = if num_chunks >= MIN_NUM_CHUNKS_FOR_COMPACTION && gap < xi.len() / 2 {
                cfg_iter_mut!(compacted_roots[..gap])
                    .zip(cfg_iter!(roots_cache[..(gap * num_chunks)]).step_by(num_chunks))
                    .for_each(|(a, b)| *a = *b);
                (&compacted_roots[..gap], 1)
            } else {
                (roots_cache, num_chunks)
            };

            Self::apply_butterfly(Self::butterfly_fn_oi, xi, roots, step, chunk_size, num_chunks, max_threads, gap);

            gap *= 2;
        }
    }
}

/// The minimum number of chunks at which root compaction
/// is beneficial.
const MIN_NUM_CHUNKS_FOR_COMPACTION: usize = 1 << 7;

/// The minimum size of a chunk at which parallelization of `butterfly`s is
/// beneficial. This value was chosen empirically.
const MIN_GAP_SIZE_FOR_PARALLELISATION: usize = 1 << 10;

/// The minimum log-size at which `roots_of_unity` switches to the
/// parallel implementation.
#[cfg(not(feature = "serial"))]
const LOG_ROOTS_OF_UNITY_PARALLEL_SIZE: u32 = 7;

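/// Reverses the low `log_len` bits of `a`,
/// e.g. `bitrev(0b0011, 4) == 0b1100`.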
#[inline]
pub(super) fn bitrev(a: u64, log_len: u32) -> u64 {
    a.reverse_bits() >> (64 - log_len)
}

pub(crate) fn derange<T>(xi: &mut [T]) {
    derange_helper(xi, log2(xi.len()))
}

fn derange_helper<T>(xi: &mut [T], log_len: u32) {
    for idx in 1..(xi.len() as u64 - 1) {
        let ridx = bitrev(idx, log_len);
        if idx < ridx {
            xi.swap(idx as usize, ridx as usize);
        }
    }
}

#[derive(PartialEq, Eq, Debug)]
enum FFTOrder {
    /// Both the input and the output of the FFT must be in-order.
    II,
    /// The input of the FFT must be in-order, but the output does not have to
    /// be.
    IO,
    /// The input of the FFT can be out of order, but the output must be
    /// in-order.
    OI,
}

pub(crate) fn compute_powers_serial<F: Field>(size: usize, root: F) -> Vec<F> {
    compute_powers_and_mul_by_const_serial(size, root, F::one())
}

pub(crate) fn compute_powers_and_mul_by_const_serial<F: Field>(size: usize, root: F, c: F) -> Vec<F> {
    let mut value = c;
    (0..size)
        .map(|_| {
            let old_value = value;
            value *= root;
            old_value
        })
        .collect()
}

#[allow(unused)]
#[cfg(not(feature = "serial"))]
pub(crate) fn compute_powers<F: Field>(size: usize, g: F) -> Vec<F> {
    if size < MIN_PARALLEL_CHUNK_SIZE {
        return compute_powers_serial(size, g);
    }
    // Compute the number of threads we will be using.
    let num_cpus_available = max_available_threads();
    let num_elem_per_thread = core::cmp::max(size / num_cpus_available, MIN_PARALLEL_CHUNK_SIZE);
    let num_cpus_used = size / num_elem_per_thread;

    // Split up the powers to compute across each thread evenly.
    let res: Vec<F> = (0..num_cpus_used)
        .into_par_iter()
        .flat_map(|i| {
            let offset = g.pow([(i * num_elem_per_thread) as u64]);
            // Compute the size of this chunk's output
            // (num_elem_per_thread, unless fewer than num_elem_per_thread elements remain).
            let num_elements_to_compute = core::cmp::min(size - i * num_elem_per_thread, num_elem_per_thread);
            compute_powers_and_mul_by_const_serial(num_elements_to_compute, g, offset)
        })
        .collect();
    res
}

/// An iterator over the elements of the domain.
#[derive(Clone)]
pub struct Elements<F: FftField> {
    cur_elem: F,
    cur_pow: u64,
    domain: EvaluationDomain<F>,
}

impl<F: FftField> Iterator for Elements<F> {
    type Item = F;

    fn next(&mut self) -> Option<F> {
        if self.cur_pow == self.domain.size {
            None
        } else {
            let cur_elem = self.cur_elem;
            self.cur_elem *= &self.domain.group_gen;
            self.cur_pow += 1;
            Some(cur_elem)
        }
    }
}

/// Precomputed roots of unity for FFTs over a domain.
#[derive(Clone, Eq, PartialEq, Debug, CanonicalDeserialize, CanonicalSerialize)]
pub struct FFTPrecomputation<F: FftField> {
    roots: Vec<F>,
    domain: EvaluationDomain<F>,
}

impl<F: FftField> FFTPrecomputation<F> {
    pub fn to_ifft_precomputation(&self) -> IFFTPrecomputation<F> {
        let mut inverse_roots = self.roots.clone();
        snarkvm_fields::batch_inversion(&mut inverse_roots);
        IFFTPrecomputation { inverse_roots, domain: self.domain }
    }

    pub fn precomputation_for_subdomain<'a>(&'a self, domain: &EvaluationDomain<F>) -> Option<Cow<'a, Self>> {
        if domain.size() == 1 {
            return Some(Cow::Owned(Self { roots: vec![], domain: *domain }));
        }
        if &self.domain == domain {
            Some(Cow::Borrowed(self))
        } else if domain.size() < self.domain.size() {
            let size_ratio = self.domain.size() / domain.size();
            let roots = self.roots.iter().step_by(size_ratio).copied().collect();
            Some(Cow::Owned(Self { roots, domain: *domain }))
        } else {
            None
        }
    }
}

/// Precomputed inverse roots of unity for IFFTs over a domain.
#[derive(Clone, Eq, PartialEq, Debug, CanonicalSerialize, CanonicalDeserialize)]
pub struct IFFTPrecomputation<F: FftField> {
    inverse_roots: Vec<F>,
    domain: EvaluationDomain<F>,
}

impl<F: FftField> IFFTPrecomputation<F> {
    pub fn precomputation_for_subdomain<'a>(&'a self, domain: &EvaluationDomain<F>) -> Option<Cow<'a, Self>> {
        if domain.size() == 1 {
            return Some(Cow::Owned(Self { inverse_roots: vec![], domain: *domain }));
        }
        if &self.domain == domain {
            Some(Cow::Borrowed(self))
        } else if domain.size() < self.domain.size() {
            let size_ratio = self.domain.size() / domain.size();
            let inverse_roots = self.inverse_roots.iter().step_by(size_ratio).copied().collect();
            Some(Cow::Owned(Self { inverse_roots, domain: *domain }))
        } else {
            None
        }
    }
}

#[cfg(test)]
mod tests {
    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
    use crate::fft::domain::FFTOrder;
    use crate::fft::{DensePolynomial, EvaluationDomain};
    use rand::Rng;
    use snarkvm_curves::bls12_377::Fr;
    use snarkvm_fields::{FftField, Field, One, Zero};
    use snarkvm_utilities::{TestRng, Uniform};

    #[test]
    fn vanishing_polynomial_evaluation() {
        let rng = &mut TestRng::default();
        for coeffs in 0..10 {
            let domain = EvaluationDomain::<Fr>::new(coeffs).unwrap();
            let z = domain.vanishing_polynomial();
            for _ in 0..100 {
                let point = rng.gen();
                assert_eq!(z.evaluate(point), domain.evaluate_vanishing_polynomial(point))
            }
        }
    }

    #[test]
    fn vanishing_polynomial_vanishes_on_domain() {
        for coeffs in 0..1000 {
            let domain = EvaluationDomain::<Fr>::new(coeffs).unwrap();
            let z = domain.vanishing_polynomial();
            for point in domain.elements() {
                assert!(z.evaluate(point).is_zero())
            }
        }
    }

    #[test]
    fn size_of_elements() {
        for coeffs in 1..10 {
            let size = 1 << coeffs;
            let domain = EvaluationDomain::<Fr>::new(size).unwrap();
            let domain_size = domain.size();
            assert_eq!(domain_size, domain.elements().count());
        }
    }

    #[test]
    fn elements_contents() {
        for coeffs in 1..10 {
            let size = 1 << coeffs;
            let domain = EvaluationDomain::<Fr>::new(size).unwrap();
            for (i, element) in domain.elements().enumerate() {
                assert_eq!(element, domain.group_gen.pow([i as u64]));
            }
        }
    }

    /// Test that Lagrange interpolation for a random polynomial at a random point works.
    #[test]
    fn non_systematic_lagrange_coefficients_test() {
        let mut rng = TestRng::default();
        for domain_dimension in 1..10 {
            let domain_size = 1 << domain_dimension;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            // Get a random point & the Lagrange coefficients.
            let random_point = Fr::rand(&mut rng);
            let lagrange_coefficients = domain.evaluate_all_lagrange_coefficients(random_point);

            // Sample the random polynomial, evaluate it over the domain and at the random point.
            let random_polynomial = DensePolynomial::<Fr>::rand(domain_size - 1, &mut rng);
            let polynomial_evaluations = domain.fft(random_polynomial.coeffs());
            let actual_evaluation = random_polynomial.evaluate(random_point);

            // Do Lagrange interpolation, and compare against the actual evaluation.
            let mut interpolated_evaluation = Fr::zero();
            for i in 0..domain_size {
                interpolated_evaluation += lagrange_coefficients[i] * polynomial_evaluations[i];
            }
            assert_eq!(actual_evaluation, interpolated_evaluation);
        }
    }

    /// Test that the Lagrange coefficients for a point in the domain are correct.
    #[test]
    fn systematic_lagrange_coefficients_test() {
        // This runs in time O(N^2) in the domain size, so keep the domain dimension low.
        // We generate Lagrange coefficients for each element in the domain.
        for domain_dimension in 1..5 {
            let domain_size = 1 << domain_dimension;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let all_domain_elements: Vec<Fr> = domain.elements().collect();
            for (i, domain_element) in all_domain_elements.iter().enumerate().take(domain_size) {
                let lagrange_coefficients = domain.evaluate_all_lagrange_coefficients(*domain_element);
                for (j, lagrange_coefficient) in lagrange_coefficients.iter().enumerate().take(domain_size) {
                    // The Lagrange coefficient for the evaluation point should be 1.
                    if i == j {
                        assert_eq!(*lagrange_coefficient, Fr::one());
                    } else {
                        assert_eq!(*lagrange_coefficient, Fr::zero());
                    }
                }
            }
        }
    }

    /// Tests that the roots of unity result is the same as domain.elements().
    #[test]
    fn test_roots_of_unity() {
        let max_degree = 10;
        for log_domain_size in 0..max_degree {
            let domain_size = 1 << log_domain_size;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let actual_roots = domain.roots_of_unity(domain.group_gen);
            for &value in &actual_roots {
                assert!(domain.evaluate_vanishing_polynomial(value).is_zero());
            }
            let expected_roots_elements = domain.elements();
            for (expected, &actual) in expected_roots_elements.zip(&actual_roots) {
                assert_eq!(expected, actual);
            }
            assert_eq!(actual_roots.len(), domain_size / 2);
        }
    }

    /// Tests that the FFTs output the correct result.
    #[test]
    fn test_fft_correctness() {
        // This assumes a correct polynomial evaluation at point procedure.
        // It tests consistency of FFT/IFFT, and coset_fft/coset_ifft,
        // along with testing that each individual evaluation is correct.

        let mut rng = TestRng::default();

        // Runs in time O(degree^2)
        let log_degree = 5;
        let degree = 1 << log_degree;
        let random_polynomial = DensePolynomial::<Fr>::rand(degree - 1, &mut rng);

        for log_domain_size in log_degree..(log_degree + 2) {
            let domain_size = 1 << log_domain_size;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let polynomial_evaluations = domain.fft(&random_polynomial.coeffs);
            let polynomial_coset_evaluations = domain.coset_fft(&random_polynomial.coeffs);
            for (i, x) in domain.elements().enumerate() {
                let coset_x = Fr::multiplicative_generator() * x;

                assert_eq!(polynomial_evaluations[i], random_polynomial.evaluate(x));
                assert_eq!(polynomial_coset_evaluations[i], random_polynomial.evaluate(coset_x));
            }

            let random_polynomial_from_subgroup =
                DensePolynomial::from_coefficients_vec(domain.ifft(&polynomial_evaluations));
            let random_polynomial_from_coset =
                DensePolynomial::from_coefficients_vec(domain.coset_ifft(&polynomial_coset_evaluations));

            assert_eq!(
                random_polynomial, random_polynomial_from_subgroup,
                "degree = {degree}, domain size = {domain_size}"
            );
            assert_eq!(
                random_polynomial, random_polynomial_from_coset,
                "degree = {degree}, domain size = {domain_size}"
            );
        }
    }

    /// Tests that FFT precomputation is correctly restricted to subdomains.
    #[test]
    fn test_fft_precomputation() {
        for i in 1..10 {
            let big_domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = big_domain.precompute_fft();
            for j in 1..i {
                let small_domain = EvaluationDomain::<Fr>::new(j).unwrap();
                let small_pc = small_domain.precompute_fft();
                assert_eq!(pc.precomputation_for_subdomain(&small_domain).unwrap().as_ref(), &small_pc);
            }
        }
    }

    /// Tests that IFFT precomputation is correctly restricted to subdomains.
    #[test]
    fn test_ifft_precomputation() {
        for i in 1..10 {
            let big_domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = big_domain.precompute_ifft();
            for j in 1..i {
                let small_domain = EvaluationDomain::<Fr>::new(j).unwrap();
                let small_pc = small_domain.precompute_ifft();
                assert_eq!(pc.precomputation_for_subdomain(&small_domain).unwrap().as_ref(), &small_pc);
            }
        }
    }

    /// Tests that IFFT precomputation can be correctly computed from
    /// FFT precomputation.
    #[test]
    fn test_ifft_precomputation_from_fft() {
        for i in 1..10 {
            let domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = domain.precompute_ifft();
            let fft_pc = domain.precompute_fft();
            assert_eq!(pc, fft_pc.to_ifft_precomputation())
        }
    }

    /// Tests that the CUDA NTT matches the CPU FFT results.
    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
    #[test]
    fn test_fft_correctness_cuda() {
        let mut rng = TestRng::default();
        for log_domain in 2..20 {
            println!("Testing domain size 2^{log_domain}");
            let domain_size = 1 << log_domain;
            let random_polynomial = DensePolynomial::<Fr>::rand(domain_size - 1, &mut rng);
            let mut polynomial_evaluations = random_polynomial.coeffs.clone();
            let mut polynomial_evaluations_cuda = random_polynomial.coeffs.clone();

            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let pc = domain.precompute_fft();
            domain.fft_helper_in_place_with_pc(&mut polynomial_evaluations, FFTOrder::II, &pc);

            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            )
            .is_err()
            {
                println!("cuda error!");
            }

            assert_eq!(polynomial_evaluations, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // iNTT
            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            )
            .is_err()
            {
                println!("cuda error!");
            }
            assert_eq!(random_polynomial.coeffs, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // Coset NTT
            polynomial_evaluations = random_polynomial.coeffs.clone();
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let pc = domain.precompute_fft();
            EvaluationDomain::<Fr>::distribute_powers(&mut polynomial_evaluations, Fr::multiplicative_generator());
            domain.fft_helper_in_place_with_pc(&mut polynomial_evaluations, FFTOrder::II, &pc);

            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Coset,
            )
            .is_err()
            {
                println!("cuda error!");
            }

            assert_eq!(polynomial_evaluations, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // Coset iNTT
            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            )
            .is_err()
            {
                println!("cuda error!");
            }
            assert_eq!(random_polynomial.coeffs, polynomial_evaluations_cuda, "domain size = {domain_size}");
        }
    }
}