70 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
71 #define INCLUDED_volk_32fc_index_max_16u_a_H
/*
 * volk_32fc_index_max_16u_a_sse3 (fragment).
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and a number of original lines
 * (return type, braces, some declarations and statements) are not present
 * in this fragment. The comments below describe only what is visible.
 *
 * Visible behavior: the float components of src0 are squared and pairwise
 * summed with _mm_hadd_ps, giving one squared magnitude per lane. xmm3
 * holds a running per-lane maximum and xmm9 a per-lane candidate index.
 * After the loops the four lanes are reduced in scalar code and the chosen
 * index is written to target[0].
 *
 * target: receives the selected index in target[0].
 * src0:   input samples, read through aligned loads (_mm_load_ps).
 * num_points: sample count; converted to a byte count below.
 */
83 volk_32fc_index_max_16u_a_sse3(
unsigned int* target,
lv_32fc_t* src0,
84 unsigned int num_points)
/* 8 bytes per sample -- presumably sizeof(lv_32fc_t); TODO confirm. */
86 const unsigned int num_bytes = num_points*8;
93 __m128 xmm1, xmm2, xmm3;
94 __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
/* Zero the mask unions and the store-back holders. */
96 xmm5.int_vec = xmmfive = _mm_setzero_si128();
97 xmm4.int_vec = xmmfour = _mm_setzero_si128();
98 holderf.int_vec = holder0 = _mm_setzero_si128();
99 holderi.int_vec = holder1 = _mm_setzero_si128();
/*
 * bound:      number of full 32-byte chunks (four samples each).
 * leftovers0: 1 when bit 4 of num_bytes is set (one 16-byte chunk remains).
 * leftovers1: 1 when bit 3 of num_bytes is set (one 8-byte sample remains).
 */
101 int bound = num_bytes >> 5;
102 int leftovers0 = (num_bytes >> 4) & 1;
103 int leftovers1 = (num_bytes >> 3) & 1;
/*
 * NOTE(review): xmm8 is set to lane indices (3, 2, 1, 0) and then
 * immediately overwritten with zero by the chained assignment on the next
 * line, so every lane of the index vector starts at 0. This looks
 * unintended -- confirm whether only xmm9 was meant to be zeroed.
 */
106 xmm8 = _mm_set_epi32(3, 2, 1, 0);
107 xmm9 = xmm8 = _mm_setzero_si128();
/* xmm10 is the per-iteration increment added to xmm8; xmm3 is the running max. */
108 xmm10 = _mm_set_epi32(4, 4, 4, 4);
109 xmm3 = _mm_setzero_ps();
/* Main loop: one iteration per 32-byte chunk. No advance of src0 is visible in this fragment. */
113 for(; i < bound; ++
i) {
114 xmm1 = _mm_load_ps((
float*)src0);
115 xmm2 = _mm_load_ps((
float*)&src0[2]);
/* Square each float, then add adjacent pairs: four sums, one per lane. */
119 xmm1 = _mm_mul_ps(xmm1, xmm1);
120 xmm2 = _mm_mul_ps(xmm2, xmm2);
122 xmm1 = _mm_hadd_ps(xmm1, xmm2);
/* Update the running max, then build masks against the updated max. */
124 xmm3 = _mm_max_ps(xmm1, xmm3);
/* xmm4: lanes where the new value is below the max; xmm5: lanes where it equals the max. */
126 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
127 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
/* Take xmm8 where the new value is the max, keep the old xmm9 elsewhere. */
129 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
130 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
/* The two masks do not overlap, so the add merges the two selections. */
132 xmm9 = _mm_add_epi32(xmm11, xmm12);
134 xmm8 = _mm_add_epi32(xmm8, xmm10);
/* Runs at most once (leftovers0 is 0 or 1): handles a remaining 16-byte chunk. */
141 for(i = 0; i < leftovers0; ++
i) {
142 xmm2 = _mm_load_ps((
float*)src0);
/*
 * Copies the low half of xmm8 (viewed as floats) into both halves of xmm1.
 * NOTE(review): in the lines visible here xmm1 is overwritten by the
 * _mm_hadd_ps below before being read; a statement consuming it may be
 * missing from this fragment.
 */
144 xmm1 = _mm_movelh_ps(
bit128_p(&xmm8)->float_vec,
bit128_p(&xmm8)->float_vec);
147 xmm2 = _mm_mul_ps(xmm2, xmm2);
/* hadd of xmm2 with itself: the two pair sums appear in both halves of xmm1. */
151 xmm1 = _mm_hadd_ps(xmm2, xmm2);
153 xmm3 = _mm_max_ps(xmm1, xmm3);
/* The index increment becomes 2 for this chunk. */
155 xmm10 = _mm_set_epi32(2, 2, 2, 2);
/* Same mask-and-merge index update as in the main loop. */
157 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
158 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
160 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
161 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
163 xmm9 = _mm_add_epi32(xmm11, xmm12);
165 xmm8 = _mm_add_epi32(xmm8, xmm10);
/* Runs at most once (leftovers1 is 0 or 1): handles a final single sample. */
169 for(i = 0; i < leftovers1; ++
i) {
/* Broadcast sq_dist to all lanes; its declaration and computation are not visible in this fragment. */
174 xmm2 = _mm_load1_ps(&sq_dist);
/* _mm_max_ss updates only the lowest lane of xmm3. */
178 xmm3 = _mm_max_ss(xmm3, xmm2);
/* NOTE(review): no assignment to xmm1 inside this loop is visible here -- confirm what it holds. */
180 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
181 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
/* Broadcast lane 0 of xmm8 to all four lanes. */
183 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
/*
 * NOTE(review): the mask pairing here is the opposite of the two loops
 * above: xmm8 is selected with the less-than mask and xmm9 is kept with
 * the equal mask. Confirm this is intended for this loop.
 */
185 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
186 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
188 xmm9 = _mm_add_epi32(xmm11, xmm12);
/* Spill the per-lane maxima and per-lane indices for the scalar reduction. */
194 _mm_store_ps((
float*)&(holderf.f), xmm3);
195 _mm_store_si128(&(holderi.int_vec), xmm9);
/*
 * Scalar reduction over the four lanes. The comparisons are strict (>),
 * so on equal values the lowest-numbered lane's index is kept.
 */
197 target[0] = holderi.i[0];
198 sq_dist = holderf.f[0];
199 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
200 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
201 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
202 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
203 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
204 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
228 #ifdef LV_HAVE_GENERIC
/*
 * volk_32fc_index_max_16u_generic (fragment).
 *
 * NOTE(review): the leading integers look like listing line numbers, and
 * the declarations of i, max and sq_dist, the computation of sq_dist, and
 * any final store to target are not present in this fragment.
 *
 * Visible behavior: iterates i over num_bytes >> 3 steps and records in
 * index the value of i at which sq_dist was strictly greater than the
 * running max.
 */
230 volk_32fc_index_max_16u_generic(
unsigned int* target,
lv_32fc_t* src0,
231 unsigned int num_points)
/* 8 bytes per sample -- presumably sizeof(lv_32fc_t); TODO confirm. */
233 const unsigned int num_bytes = num_points*8;
237 unsigned int index = 0;
/* The shift binds tighter than '<', so the bound is (num_bytes >> 3). */
241 for(; i < num_bytes >> 3; ++
i) {
/* Strict '>' keeps the earliest index when equal values occur; index is updated before max so both tests see the old max. */
244 index = sq_dist > max ? i : index;
245 max = sq_dist > max ? sq_dist : max;
#define bit128_p(x)
Definition: volk_common.h:94
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
Definition: volk_common.h:78
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80