1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
//! Contains traits and implementations of multi-data used in SIMD.
//! The actual representation is driven by the feature flag `"simd"`, which, if set,
//! uses [`std::simd`].
use super::{days_ms, f16, i256, months_days_ns, BitChunk, BitChunkIter, NativeType};

/// Describes the ability to convert itself from a [`BitChunk`].
pub trait FromMaskChunk<T> {
    /// Convert itself from a slice.
    fn from_chunk(v: T) -> Self;
}

/// A struct that lends itself well to be compiled leveraging SIMD
/// # Safety
/// The `NativeType` and the `NativeSimd` must have possible a matching alignment.
/// e.g. slicing `&[NativeType]` by `align_of<NativeSimd>()` must be properly aligned/safe.
pub unsafe trait NativeSimd: Sized + Default + Copy {
    /// Number of lanes
    const LANES: usize;
    /// The [`NativeType`] of this struct. E.g. `f32` for a `NativeSimd = f32x16`.
    type Native: NativeType;
    /// The type holding bits for masks.
    type Chunk: BitChunk;
    /// Type used for masking.
    type Mask: FromMaskChunk<Self::Chunk>;

    /// Sets values to `default` based on `mask`.
    fn select(self, mask: Self::Mask, default: Self) -> Self;

    /// Convert itself from a slice.
    /// # Panics
    /// * iff `v.len()` != `T::LANES`
    fn from_chunk(v: &[Self::Native]) -> Self;

    /// creates a new Self from `v` by populating items from `v` up to its length.
    /// Items from `v` at positions larger than the number of lanes are ignored;
    /// remaining items are populated with `remaining`.
    fn from_incomplete_chunk(v: &[Self::Native], remaining: Self::Native) -> Self;

    /// Returns a tuple of 3 items whose middle item is itself, and the remaining
    /// are the head and tail of the un-aligned parts.
    fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]);
}

/// Trait implemented by some [`NativeType`] that have a SIMD representation.
pub trait Simd: NativeType {
    /// The SIMD type associated with this trait.
    /// This type supports SIMD operations
    type Simd: NativeSimd<Native = Self>;
}

#[cfg(not(feature = "simd"))]
mod native;
#[cfg(not(feature = "simd"))]
pub use native::*;
#[cfg(feature = "simd")]
mod packed;
#[cfg(feature = "simd")]
pub use packed::*;

macro_rules! native_simd {
    ($name:tt, $type:ty, $lanes:expr, $mask:ty) => {
        /// Multi-Data correspondence of the native type
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $name(pub [$type; $lanes]);

        unsafe impl NativeSimd for $name {
            const LANES: usize = $lanes;
            type Native = $type;
            type Chunk = $mask;
            type Mask = $mask;

            #[inline]
            fn select(self, mask: $mask, default: Self) -> Self {
                let mut reduced = default;
                let iter = BitChunkIter::new(mask, Self::LANES);
                for (i, b) in (0..Self::LANES).zip(iter) {
                    reduced[i] = if b { self[i] } else { reduced[i] };
                }
                reduced
            }

            #[inline]
            fn from_chunk(v: &[$type]) -> Self {
                ($name)(v.try_into().unwrap())
            }

            #[inline]
            fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self {
                let mut a = [remaining; $lanes];
                a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b);
                Self(a)
            }

            #[inline]
            fn align(values: &[Self::Native]) -> (&[Self::Native], &[Self], &[Self::Native]) {
                unsafe { values.align_to::<Self>() }
            }
        }

        impl std::ops::Index<usize> for $name {
            type Output = $type;

            #[inline]
            fn index(&self, index: usize) -> &Self::Output {
                &self.0[index]
            }
        }

        impl std::ops::IndexMut<usize> for $name {
            #[inline]
            fn index_mut(&mut self, index: usize) -> &mut Self::Output {
                &mut self.0[index]
            }
        }

        impl Default for $name {
            #[inline]
            fn default() -> Self {
                ($name)([<$type>::default(); $lanes])
            }
        }
    };
}

#[cfg(not(feature = "simd"))]
pub(super) use native_simd;

// Types do not have specific intrinsics and thus SIMD can't be specialized.
// Therefore, we can declare their MD representation as `[$t; 8]` irrespectively
// of how they are represented in the different channels.
native_simd!(f16x32, f16, 32, u32);
native_simd!(days_msx8, days_ms, 8, u8);
native_simd!(months_days_nsx8, months_days_ns, 8, u8);
native_simd!(i128x8, i128, 8, u8);
native_simd!(i256x8, i256, 8, u8);

// In the native implementation, a mask is 1 bit wide, as per AVX512.
impl<T: BitChunk> FromMaskChunk<T> for T {
    #[inline]
    fn from_chunk(v: T) -> Self {
        v
    }
}

macro_rules! native {
    ($type:ty, $simd:ty) => {
        impl Simd for $type {
            type Simd = $simd;
        }
    };
}

native!(u8, u8x64);
native!(u16, u16x32);
native!(u32, u32x16);
native!(u64, u64x8);
native!(i8, i8x64);
native!(i16, i16x32);
native!(i32, i32x16);
native!(i64, i64x8);
native!(f16, f16x32);
native!(f32, f32x16);
native!(f64, f64x8);
native!(i128, i128x8);
native!(i256, i256x8);
native!(days_ms, days_msx8);
native!(months_days_ns, months_days_nsx8);