#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>

#define MAX(X,Y) ((X) > (Y)?(X):(Y))
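
/*
 * All kernels below compute the same quantity (a description inferred from
 * the generic implementation further down): each input sample is clipped
 * against a lower cutoff, a fourth-order polynomial with coefficients
 * c0..c4 = center_point_array[0..4] is evaluated at the clipped value, and
 * the results are summed:
 *
 *   x_i     = MAX(src0[i], *cutoff)
 *   *target = sum_i (c0*x_i + c1*x_i^2 + c2*x_i^3 + c3*x_i^4)
 *             + num_points * c4
 */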
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0,
                                   float* center_point_array, float* cutoff,
                                   unsigned int num_points)
{
  const unsigned int num_bytes = num_points * 4;

  float result = 0.0f;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

  xmm9 = _mm_setzero_ps();
  xmm1 = _mm_setzero_ps();

  xmm0 = _mm_load1_ps(&center_point_array[0]);
  xmm6 = _mm_load1_ps(&center_point_array[1]);
  xmm7 = _mm_load1_ps(&center_point_array[2]);
  xmm8 = _mm_load1_ps(&center_point_array[3]);

  xmm10 = _mm_load1_ps(cutoff);
  int bound = num_bytes >> 4;
  int leftovers = (num_bytes >> 2) & 3;
  int i = 0;

  for(; i < bound; ++i) {
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);
    xmm3 = _mm_mul_ps(xmm2, xmm2);   /* x^2 */
    xmm4 = _mm_mul_ps(xmm2, xmm3);   /* x^3 */
    xmm5 = _mm_mul_ps(xmm3, xmm3);   /* x^4 */

    /* scale each power by its coefficient */
    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm3 = _mm_mul_ps(xmm3, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm7);
    xmm5 = _mm_mul_ps(xmm5, xmm8);

    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm9 = _mm_add_ps(xmm2, xmm9);
    xmm1 = _mm_add_ps(xmm3, xmm1);
  }
  /* fold both partial-sum accumulators down to a single scalar */
  xmm2 = _mm_hadd_ps(xmm9, xmm1);
  xmm3 = _mm_hadd_ps(xmm2, xmm2);
  xmm4 = _mm_hadd_ps(xmm3, xmm3);

  _mm_store_ss(&result, xmm4);
  for(i = 0; i < leftovers; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }
  result += ((float)((bound * 4) + leftovers)) * center_point_array[4];

  *target = result;
}

#endif /* LV_HAVE_SSE3 */
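
/*
 * Worked example of the chunking arithmetic in the SSE3 kernel above (an
 * illustration, not part of the original source): for num_points = 10,
 * num_bytes = 40, so bound = 40 >> 4 = 2 full 4-sample iterations and
 * leftovers = (40 >> 2) & 3 = 2 samples for the scalar tail loop.
 */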
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0,
                                  float* center_point_array, float* cutoff,
                                  unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  unsigned int i;

  float fst = 0.0f;
  float sq, thrd, frth;
  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;

  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();
  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);  /* x^2 */
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);  /* x^3 */
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);  /* x^4 */

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  /* _mm256_hadd_ps adds within 128-bit lanes, so after one hadd the eight
     partial sums sit in elements 0, 1, 4 and 5 */
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;

    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += ((float)(num_points)) * center_point_array[4];
}

#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0,
                                    float* center_point_array, float* cutoff,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points * 4;

  float result = 0.0f;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  unsigned int i = 0;
  for(; i < num_bytes >> 2; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);

    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);

  *target = result;
}

#endif /* LV_HAVE_GENERIC */
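
/*
 * Example use (a minimal sketch; the sample values, coefficients and cutoff
 * below are illustrative assumptions, not values from the library):
 *
 *   float samples[8] = { 0.1f, 0.5f, -2.0f, 1.0f, 0.3f, -0.7f, 2.5f, 0.0f };
 *   float coeffs[5]  = { 1.0f, 0.5f, 0.25f, 0.125f, 0.0625f };  // c0..c4
 *   float cutoff     = -1.0f;
 *   float result     = 0.0f;
 *
 *   volk_32f_x3_sum_of_poly_32f_generic(&result, samples, coeffs,
 *                                       &cutoff, 8);
 */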
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0,
                                  float* center_point_array, float* cutoff,
                                  unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  unsigned int i;

  float fst = 0.0f;
  float sq, thrd, frth;
  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;

  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();
  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);  /* x^2 */
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);  /* x^3 */
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);  /* x^4 */

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  /* _mm256_hadd_ps adds within 128-bit lanes, so after one hadd the eight
     partial sums sit in elements 0, 1, 4 and 5 */
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;

    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += ((float)(num_points)) * center_point_array[4];
}

#endif /* LV_HAVE_AVX */
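
/*
 * Note: the _u_avx kernel above differs from the aligned _a_avx kernel only
 * in its use of _mm256_loadu_ps instead of _mm256_load_ps; it is the variant
 * to call when src0 is not guaranteed to be 32-byte aligned.
 */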
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
  unsigned int i;
  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x2_t cutoff_vector;
  float32x2x2_t x_low, x_high;
  float32x4_t x_qvector, c_qvector, cpa_qvector;
  float accumulator;
  float res_accumulators[4];
  c_qvector = vld1q_f32(zero);
  /* the cutoff in both lanes, the four coefficients as one quad vector */
  cutoff_vector = vdup_n_f32(*cutoff);
  cpa_qvector = vld1q_f32(center_point_array);
  for(i = 0; i < num_points; ++i) {
    /* duplicate the current sample into both lanes */
    x_to_1 = vdup_n_f32(*src0++);
    /* clip against the cutoff, then build the powers x^2..x^4 */
    x_to_1 = vmax_f32(x_to_1, cutoff_vector);
    x_to_2 = vmul_f32(x_to_1, x_to_1);
    x_to_3 = vmul_f32(x_to_2, x_to_1);
    x_to_4 = vmul_f32(x_to_3, x_to_1);
    /* zip {x, x^2} and {x^3, x^4}, then pack {x, x^2, x^3, x^4} */
    x_low = vzip_f32(x_to_1, x_to_2);
    x_high = vzip_f32(x_to_3, x_to_4);
    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
    /* multiply by the coefficients and accumulate, one lane per term */
    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
  }
  /* sum the four per-term accumulator lanes */
  vst1q_f32(res_accumulators, c_qvector);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  *target = accumulator + center_point_array[4] * (float)num_points;
}

#endif /* LV_HAVE_NEON */
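
/*
 * The kernel above works "horizontally": it packs {x, x^2, x^3, x^4} for a
 * single sample into one quad vector and keeps one accumulator lane per
 * polynomial term. The variant below works "vertically" instead, processing
 * four samples per iteration with a separate accumulator vector per term.
 */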
#ifdef LV_HAVE_NEON

static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
  unsigned int i;
  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

  float accumulator = 0.0f;
  float res_accumulators[4];
  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
  accumulator1_vec = vld1q_f32(zero);
  accumulator2_vec = vld1q_f32(zero);
  accumulator3_vec = vld1q_f32(zero);
  accumulator4_vec = vld1q_f32(zero);
  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

  cutoff_vector = vdupq_n_f32(*cutoff);
  cpa_0 = vdupq_n_f32(center_point_array[0]);
  cpa_1 = vdupq_n_f32(center_point_array[1]);
  cpa_2 = vdupq_n_f32(center_point_array[2]);
  cpa_3 = vdupq_n_f32(center_point_array[3]);
  for(i = 0; i < num_points / 4; ++i) {
    x_to_1 = vld1q_f32(src0);
    /* clip, build the powers, scale each by its coefficient */
    x_to_1 = vmaxq_f32(x_to_1, cutoff_vector);
    x_to_2 = vmulq_f32(x_to_1, x_to_1);
    x_to_3 = vmulq_f32(x_to_2, x_to_1);
    x_to_4 = vmulq_f32(x_to_3, x_to_1);
    x_to_1 = vmulq_f32(x_to_1, cpa_0);
    x_to_2 = vmulq_f32(x_to_2, cpa_1);
    x_to_3 = vmulq_f32(x_to_3, cpa_2);
    x_to_4 = vmulq_f32(x_to_4, cpa_3);
    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

    src0 += 4;
  }
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

  vst1q_f32(res_accumulators, accumulator1_vec);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];
  float fst, sq, thrd, frth;
  /* note: 4*num_points/4 would parse as (4*num_points)/4 == num_points and
     skip the leftovers, so the tail index is computed as 4*(num_points/4) */
  for(i = 4 * (num_points / 4); i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);

    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;

    accumulator += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
  }
  *target = accumulator + center_point_array[4] * (float)num_points;
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H */