1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
ix!();
/// Number of `__m128` slots in the scratch buffer that `process_block`
/// creates on each call. The buffer holds one slot per sample frame, so a
/// block with more frames than this indexes past the end of the buffer.
pub const HALFRATE_BLOCK_SIZE: usize = 256;
impl ProcessBlock for crate::HalfRateFilterSSE {
    /// Filters one block of stereo audio in place.
    ///
    /// The block is processed in three stages:
    ///
    /// 1. Every input frame is expanded into one `__m128` laid out as
    ///    `[L, L, R, R]`.
    /// 2. Each of the `self.m` cascaded sections applies the recurrence
    ///    `y[n] = x[n-2] + (x[n] - y[n-2]) * a` to all four lanes at once,
    ///    with the per-lane coefficients taken from `self.va[j]`. The
    ///    section history lives in `self.vx0..vx2` / `self.vy0..vy2` and is
    ///    carried over to the next call.
    /// 3. Lanes 0 and 2 of the current result are summed with lanes 1 and 3
    ///    of the previous result (kept in `self.oldout`), scaled by
    ///    `m128_half![]`, and written back to `l` and `r` respectively.
    ///
    /// # Arguments
    ///
    /// * `l`, `r` - left and right sample buffers; both are read and then
    ///   overwritten with `nsamples` output samples each.
    /// * `nsamples` - number of frames to process; `None` means 64.
    ///
    /// # Caller requirements
    ///
    /// The method is not marked `unsafe`, but it dereferences `l` and `r`:
    /// both must be 16-byte aligned (they are read as `__m128`) and valid
    /// for reads and writes of `nsamples` rounded up to a multiple of four
    /// `f32` values.
    ///
    /// The filter loop consumes two frames per iteration, so an odd
    /// `nsamples` also runs the (zero or stale-free, freshly initialised)
    /// scratch slot at index `nsamples` through the filter state.
    ///
    /// # Panics
    ///
    /// Panics if `nsamples` is greater than [`HALFRATE_BLOCK_SIZE`].
    fn process_block(&mut self,
        l: *mut f32,
        r: *mut f32,
        nsamples: Option<usize>)
    {
        let nsamples = nsamples.unwrap_or(64);

        // Fail early and clearly instead of through an index panic halfway
        // through the interleave loop.
        assert!(
            nsamples <= HALFRATE_BLOCK_SIZE,
            "process_block: nsamples ({}) exceeds HALFRATE_BLOCK_SIZE ({})",
            nsamples,
            HALFRATE_BLOCK_SIZE
        );

        // Scratch buffer on the stack: its size is a compile-time constant
        // and `__m128` is `Copy`, so no per-call container is needed.
        // SAFETY: presumably `z128!` only builds an all-zero vector and has
        // no preconditions beyond SSE support -- confirm against the macro.
        let zero: __m128 = unsafe { z128![] };
        let mut o: [__m128; HALFRATE_BLOCK_SIZE] = [zero; HALFRATE_BLOCK_SIZE];

        // Vector views of the inputs; each element covers four frames.
        let l_vec = l as *const __m128;
        let r_vec = r as *const __m128;

        // Stage 1: expand four frames at a time into [L, L, R, R] slots.
        for k in (0_usize..nsamples).step_by(4) {
            // SAFETY: relies on the caller requirements documented above
            // (alignment and length of `l` / `r`).
            unsafe {
                let lv: __m128 = *l_vec.add(k >> 2);
                let rv: __m128 = *r_vec.add(k >> 2);

                o[k]     = _mm_shuffle_ps(lv, rv, _MM_SHUFFLE(0, 0, 0, 0));
                o[k + 1] = _mm_shuffle_ps(lv, rv, _MM_SHUFFLE(1, 1, 1, 1));
                o[k + 2] = _mm_shuffle_ps(lv, rv, _MM_SHUFFLE(2, 2, 2, 2));
                o[k + 3] = _mm_shuffle_ps(lv, rv, _MM_SHUFFLE(3, 3, 3, 3));
            }
        }

        // Stage 2: run the buffer through every section, in place.
        for j in 0..self.m {
            // Work on local copies of the section state; written back below.
            let mut tx0: __m128 = self.vx0[j];
            let mut tx1: __m128 = self.vx1[j];
            let mut tx2: __m128 = self.vx2[j];
            let mut ty0: __m128 = self.vy0[j];
            let mut ty1: __m128 = self.vy1[j];
            let mut ty2: __m128 = self.vy2[j];
            let ta: __m128 = self.va[j];

            // Manually unrolled: two frames per iteration.
            for k in (0_usize..nsamples).step_by(2) {
                // First frame: shift the histories, then
                // y0 = x2 + (x0 - y2) * a.
                tx2 = tx1;
                tx1 = tx0;
                tx0 = o[k];
                ty2 = ty1;
                ty1 = ty0;
                // SAFETY: register-only arithmetic on valid `__m128` values.
                unsafe {
                    ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
                }
                o[k] = ty0;

                // Second frame: identical step on the next slot.
                tx2 = tx1;
                tx1 = tx0;
                tx0 = o[k + 1];
                ty2 = ty1;
                ty1 = ty0;
                // SAFETY: register-only arithmetic on valid `__m128` values.
                unsafe {
                    ty0 = _mm_add_ps(tx2, _mm_mul_ps(_mm_sub_ps(tx0, ty2), ta));
                }
                o[k + 1] = ty0;
            }

            self.vx0[j] = tx0;
            self.vx1[j] = tx1;
            self.vx2[j] = tx2;
            self.vy0[j] = ty0;
            self.vy1[j] = ty1;
            self.vy2[j] = ty2;
        }

        // Stage 3: combine with the previous result and store scalars.
        // SAFETY: `_mm_setzero_ps` has no preconditions beyond SSE support.
        let mut fa_r: __m128 = unsafe { _mm_setzero_ps() };
        let mut fb_r: __m128 = unsafe { _mm_setzero_ps() };

        for k in 0..nsamples {
            // SAFETY: `l` and `r` are valid for `nsamples` `f32` writes per
            // the caller requirements; everything else is register-only.
            unsafe {
                // Left: (lane 0 of current + lane 0 of oldout) * half.
                let mut v_l: __m128 = _mm_add_ss(o[k], self.oldout);
                v_l = _mm_mul_ss(v_l, m128_half![]);
                _mm_store_ss(l.add(k), v_l);

                // Bring lane 2 of each operand down to lane 0 so the scalar
                // ops can reach the right-channel values.
                fa_r = _mm_movehl_ps(fa_r, o[k]);
                fb_r = _mm_movehl_ps(fb_r, self.oldout);

                // Right: (lane 2 of current + lane 2 of oldout) * half.
                let mut v_r: __m128 = _mm_add_ss(fa_r, fb_r);
                v_r = _mm_mul_ss(v_r, m128_half![]);
                _mm_store_ss(r.add(k), v_r);

                // Remember lanes 1 and 3 of this result, placed in lanes 0
                // and 2, for the next frame (and the next call).
                self.oldout = _mm_shuffle_ps(o[k], o[k], _MM_SHUFFLE(3, 3, 1, 1));
            }
        }
    }
}