#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <float.h>
#include <string.h>


#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points) {
  float * res = (float*) result;
  float * in = (float*) input;
  float * tp = (float*) taps;
  unsigned int n_2_ccomplex_blocks = num_points/2;
  unsigned int isodd = num_points & 1;

  float sum0[2] = {0,0};
  float sum1[2] = {0,0};
  unsigned int i = 0;
  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
    sum1[1] += in[2] * tp[3] + in[3] * tp[2];

    in += 4;
    tp += 4;
  }
  res[0] = sum0[0] + sum1[0];
  res[1] = sum0[1] + sum1[1];
  // Cleanup if we had an odd number of points
  for(i = 0; i < isodd; ++i) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
}

#endif /*LV_HAVE_GENERIC*/
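/*
 * For reference, what every kernel in this file computes, in plain scalar C.
 * This is an illustrative sketch only (dot() is a hypothetical name, not part
 * of the VOLK API); it relies on lv_32fc_t being C99 float complex:
 *
 *   static inline lv_32fc_t dot(const lv_32fc_t* x, const lv_32fc_t* y,
 *                               unsigned int n)
 *   {
 *     // (a + bi)(c + di) = (ac - bd) + (ad + bc)i, summed over all n points
 *     lv_32fc_t acc = lv_cmake(0.f, 0.f);
 *     unsigned int k;
 *     for(k = 0; k < n; ++k)
 *       acc += x[k] * y[k];
 *     return acc;
 *   }
 *
 * The generic kernel above is that loop unrolled by two, with independent
 * partial sums sum0/sum1 so the two accumulation chains can overlap.
 */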
#if LV_HAVE_SSE && LV_HAVE_64


static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points) {
  const unsigned int num_bytes = num_points*8;
  unsigned int isodd = num_points & 1;

  __VOLK_ASM __VOLK_VOLATILE
    (
     "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
     "# const float *taps, unsigned num_bytes)\n\t"
     "# float sum0 = 0;\n\t"
     "# float sum1 = 0;\n\t"
     "# float sum2 = 0;\n\t"
     "# float sum3 = 0;\n\t"
     "# do {\n\t"
     "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
     "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
     "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
     "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
     "# input += 4;\n\t"
     "# taps += 4;\n\t"
     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
     "# result[0] = sum0 + sum2;\n\t"
     "# result[1] = sum1 + sum3;\n\t"
     "# TODO: prefetch and better scheduling\n\t"
131 " xor %%r9, %%r9\n\t"
132 " xor %%r10, %%r10\n\t"
133 " movq %%rcx, %%rax\n\t"
134 " movq %%rcx, %%r8\n\t"
135 " movq %[rsi], %%r9\n\t"
136 " movq %[rdx], %%r10\n\t"
137 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
138 " movups 0(%%r9), %%xmm0\n\t"
139 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
140 " movups 0(%%r10), %%xmm2\n\t"
141 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
143 " jmp .%=L1_test\n\t"
144 " # 4 taps / loop\n\t"
145 " # something like ?? cycles / loop\n\t"
147 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
148 "# movups (%%r9), %%xmmA\n\t"
149 "# movups (%%r10), %%xmmB\n\t"
150 "# movups %%xmmA, %%xmmZ\n\t"
151 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
152 "# mulps %%xmmB, %%xmmA\n\t"
153 "# mulps %%xmmZ, %%xmmB\n\t"
154 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
155 "# xorps %%xmmPN, %%xmmA\n\t"
156 "# movups %%xmmA, %%xmmZ\n\t"
157 "# unpcklps %%xmmB, %%xmmA\n\t"
158 "# unpckhps %%xmmB, %%xmmZ\n\t"
159 "# movups %%xmmZ, %%xmmY\n\t"
160 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
161 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
162 "# addps %%xmmZ, %%xmmA\n\t"
163 "# addps %%xmmA, %%xmmC\n\t"
164 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
165 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
166 " movups 16(%%r9), %%xmm1\n\t"
167 " movups %%xmm0, %%xmm4\n\t"
168 " mulps %%xmm2, %%xmm0\n\t"
169 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
170 " movups 16(%%r10), %%xmm3\n\t"
171 " movups %%xmm1, %%xmm5\n\t"
172 " addps %%xmm0, %%xmm6\n\t"
173 " mulps %%xmm3, %%xmm1\n\t"
174 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
175 " addps %%xmm1, %%xmm6\n\t"
176 " mulps %%xmm4, %%xmm2\n\t"
177 " movups 32(%%r9), %%xmm0\n\t"
178 " addps %%xmm2, %%xmm7\n\t"
179 " mulps %%xmm5, %%xmm3\n\t"
181 " movups 32(%%r10), %%xmm2\n\t"
182 " addps %%xmm3, %%xmm7\n\t"
183 " add $32, %%r10\n\t"
187 " # We've handled the bulk of multiplies up to here.\n\t"
188 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
189 " # If so, we've got 2 more taps to do.\n\t"
192 " # The count was odd, do 2 more taps.\n\t"
193 " # Note that we've already got mm0/mm2 preloaded\n\t"
194 " # from the main loop.\n\t"
195 " movups %%xmm0, %%xmm4\n\t"
196 " mulps %%xmm2, %%xmm0\n\t"
197 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
198 " addps %%xmm0, %%xmm6\n\t"
199 " mulps %%xmm4, %%xmm2\n\t"
200 " addps %%xmm2, %%xmm7\n\t"
202 " # neg inversor\n\t"
203 " xorps %%xmm1, %%xmm1\n\t"
204 " mov $0x80000000, %%r9\n\t"
205 " movd %%r9, %%xmm1\n\t"
206 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
208 " xorps %%xmm1, %%xmm6\n\t"
209 " movups %%xmm6, %%xmm2\n\t"
210 " unpcklps %%xmm7, %%xmm6\n\t"
211 " unpckhps %%xmm7, %%xmm2\n\t"
212 " movups %%xmm2, %%xmm3\n\t"
213 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
214 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
215 " addps %%xmm2, %%xmm6\n\t"
216 " # xmm6 = r1 i2 r3 i4\n\t"
217 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
218 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
219 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
     :
     : [rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
     : "rax", "r8", "r9", "r10"
     );

  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }

  return;
}

#endif /*LV_HAVE_SSE && LV_HAVE_64*/
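/*
 * The "complex prod" recipe documented in the asm comments above, re-expressed
 * with SSE intrinsics. This is a sketch for readers, not a kernel; x holds two
 * complex inputs laid out (ar, ai, br, bi) and y holds (cr, ci, dr, di):
 *
 *   __m128 z  = _mm_shuffle_ps(x, x, 0xb1); // swap internals: ai, ar, bi, br
 *   __m128 p0 = _mm_mul_ps(x, y);           // ar*cr, ai*ci, br*dr, bi*di
 *   __m128 p1 = _mm_mul_ps(z, y);           // ai*cr, ar*ci, bi*dr, br*di
 *
 * The real parts need ar*cr - ai*ci, so the asm xors the p0 accumulator's odd
 * lanes with a (0, -0, 0, -0) sign mask, then pairs terms up with
 * unpcklps/unpckhps and shufps before the final addps; that sequence is the
 * SSE stand-in for 3DNow!'s pfpnacc mentioned in the comments.
 */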
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points) {
  lv_32fc_t dotProduct;
  memset(&dotProduct, 0x0, 2*sizeof(float));

  unsigned int number = 0;
  const unsigned int halfPoints = num_points/2;
  unsigned int isodd = num_points & 1;

  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  const lv_32fc_t* a = input;
  const lv_32fc_t* b = taps;

  dotProdVal = _mm_setzero_ps();
  for(;number < halfPoints; number++){

    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

    a += 2;
    b += 2;
  }
  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

  _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector

  dotProduct += ( dotProductVector[0] + dotProductVector[1] );

  if(isodd) {
    dotProduct += input[num_points - 1] * taps[num_points - 1];
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
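/*
 * Lane-by-lane view of the SSE3 pattern used above, for x = (ar,ai,br,bi)
 * multiplied by y = (cr,ci,dr,di):
 *
 *   yl   = _mm_moveldup_ps(y)        // cr, cr, dr, dr
 *   yh   = _mm_movehdup_ps(y)        // ci, ci, di, di
 *   tmp1 = x * yl                    // ar*cr, ai*cr, br*dr, bi*dr
 *   x'   = _mm_shuffle_ps(x,x,0xB1)  // ai, ar, bi, br
 *   tmp2 = x' * yh                   // ai*ci, ar*ci, bi*di, br*di
 *   z    = _mm_addsub_ps(tmp1, tmp2) // ar*cr-ai*ci, ai*cr+ar*ci, ...
 *
 * z holds two finished complex products per iteration; unlike the plain SSE
 * path, no sign mask or unpacking is needed because addsub subtracts in the
 * even lanes and adds in the odd lanes.
 */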
#ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points) {
  unsigned int i = 0;
  const unsigned int qtr_points = num_points/4;
  const unsigned int isodd = num_points & 3;

  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
  float *p_input, *p_taps;
  __m64* p_result;

  p_result = (__m64*)result;
  p_input = (float*)input;
  p_taps = (float*)taps;

  // Sign bit of float element 0 only. (The original over-wide literal
  // {0x000000000000000080000000} truncates to exactly this value.)
  static const __m128i neg = {0x0000000080000000, 0x0000000000000000};

  real0 = _mm_setzero_ps();
  real1 = _mm_setzero_ps();
  im0 = _mm_setzero_ps();
  im1 = _mm_setzero_ps();
  for(; i < qtr_points; ++i) {
    xmm0 = _mm_loadu_ps(p_input);
    xmm1 = _mm_loadu_ps(p_taps);

    p_input += 4;
    p_taps += 4;

    xmm2 = _mm_loadu_ps(p_input);
    xmm3 = _mm_loadu_ps(p_taps);

    p_input += 4;
    p_taps += 4;

    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

    // imaginary parts of the input
    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
    // real parts of the input
    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
    // imaginary parts of the taps
    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
    // real parts of the taps
    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
    // real-part terms (re*re and im*im) land in lane 0
    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

    // imaginary-part cross terms land in lane 1
    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

    real0 = _mm_add_ps(xmm4, real0);
    real1 = _mm_add_ps(xmm5, real1);
    im0 = _mm_add_ps(xmm6, im0);
    im1 = _mm_add_ps(xmm7, im1);
  }
  real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

  im0 = _mm_add_ps(im0, im1);
  real0 = _mm_add_ps(real0, real1);

  im0 = _mm_add_ps(im0, real0);

  _mm_storel_pi(p_result, im0);
  for(i = num_points-isodd; i < num_points; i++) {
    *result += input[i] * taps[i];
  }
}

#endif /*LV_HAVE_SSE4_1*/
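/*
 * A note on the _mm_dp_ps masks above, since 0xf1/0xf2 are easy to misread:
 * the high nibble selects which input lanes enter the dot product and the low
 * nibble selects which output lanes receive the sum (all others are zeroed).
 *
 *   _mm_dp_ps(a, b, 0xf1)  // sum a[k]*b[k] over all 4 lanes -> lane 0
 *   _mm_dp_ps(a, b, 0xf2)  // same sum                       -> lane 1
 *
 * Steering the real sums into lane 0 and the imaginary sums into lane 1 lets
 * the final _mm_add_ps/_mm_storel_pi pair emit the packed (re, im) result
 * with no extra shuffling.
 */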
#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points) {
  unsigned int isodd = num_points & 3;
  unsigned int i = 0;
  lv_32fc_t dotProduct;
  memset(&dotProduct, 0x0, 2*sizeof(float));

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  const lv_32fc_t* a = input;
  const lv_32fc_t* b = taps;

  dotProdVal = _mm256_setzero_ps();
  for(;number < quarterPoints; number++){

    x = _mm256_loadu_ps((float*)a); // Load ar,ai,br,bi,er,ei,fr,fi
    y = _mm256_loadu_ps((float*)b); // Load cr,ci,dr,di,gr,gi,hr,hi

    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr, ...

    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br, ...

    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di, ...

    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di, ...

    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together

    a += 4;
    b += 4;
  }
  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

  _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector

  dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);

  for(i = num_points-isodd; i < num_points; i++) {
    dotProduct += input[i] * taps[i];
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
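/*
 * The AVX path is the SSE3 recipe at twice the width: one iteration consumes
 * four complex points (eight floats). Worked tail example for num_points = 10:
 * quarterPoints = 10/4 = 2, so the vector loop covers 8 points, and
 * isodd = 10 & 3 = 2 leaves points 8 and 9 for the scalar cleanup loop.
 */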
#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <float.h>
#include <string.h>


#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points) {
  const unsigned int num_bytes = num_points*8;

  float * res = (float*) result;
  float * in = (float*) input;
  float * tp = (float*) taps;
  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
  unsigned int isodd = num_points & 1;

  float sum0[2] = {0,0};
  float sum1[2] = {0,0};
  unsigned int i = 0;
  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
    sum1[1] += in[2] * tp[3] + in[3] * tp[2];

    in += 4;
    tp += 4;
  }
  res[0] = sum0[0] + sum1[0];
  res[1] = sum0[1] + sum1[1];
  // Cleanup if we had an odd number of points
  for(i = 0; i < isodd; ++i) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
}

#endif /*LV_HAVE_GENERIC*/
#if LV_HAVE_SSE && LV_HAVE_64


static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points) {
  const unsigned int num_bytes = num_points*8;
  unsigned int isodd = num_points & 1;

  __VOLK_ASM __VOLK_VOLATILE
    (
     "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
     "# const float *taps, unsigned num_bytes)\n\t"
     "# float sum0 = 0;\n\t"
     "# float sum1 = 0;\n\t"
     "# float sum2 = 0;\n\t"
     "# float sum3 = 0;\n\t"
     "# do {\n\t"
     "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
     "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
     "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
     "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
     "# input += 4;\n\t"
     "# taps += 4;\n\t"
     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
     "# result[0] = sum0 + sum2;\n\t"
     "# result[1] = sum1 + sum3;\n\t"
     "# TODO: prefetch and better scheduling\n\t"
510 " xor %%r9, %%r9\n\t"
511 " xor %%r10, %%r10\n\t"
512 " movq %%rcx, %%rax\n\t"
513 " movq %%rcx, %%r8\n\t"
514 " movq %[rsi], %%r9\n\t"
515 " movq %[rdx], %%r10\n\t"
516 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
517 " movaps 0(%%r9), %%xmm0\n\t"
518 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
519 " movaps 0(%%r10), %%xmm2\n\t"
520 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
522 " jmp .%=L1_test\n\t"
523 " # 4 taps / loop\n\t"
524 " # something like ?? cycles / loop\n\t"
526 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
527 "# movaps (%%r9), %%xmmA\n\t"
528 "# movaps (%%r10), %%xmmB\n\t"
529 "# movaps %%xmmA, %%xmmZ\n\t"
530 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
531 "# mulps %%xmmB, %%xmmA\n\t"
532 "# mulps %%xmmZ, %%xmmB\n\t"
533 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
534 "# xorps %%xmmPN, %%xmmA\n\t"
535 "# movaps %%xmmA, %%xmmZ\n\t"
536 "# unpcklps %%xmmB, %%xmmA\n\t"
537 "# unpckhps %%xmmB, %%xmmZ\n\t"
538 "# movaps %%xmmZ, %%xmmY\n\t"
539 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
540 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
541 "# addps %%xmmZ, %%xmmA\n\t"
542 "# addps %%xmmA, %%xmmC\n\t"
543 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
544 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
545 " movaps 16(%%r9), %%xmm1\n\t"
546 " movaps %%xmm0, %%xmm4\n\t"
547 " mulps %%xmm2, %%xmm0\n\t"
548 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
549 " movaps 16(%%r10), %%xmm3\n\t"
550 " movaps %%xmm1, %%xmm5\n\t"
551 " addps %%xmm0, %%xmm6\n\t"
552 " mulps %%xmm3, %%xmm1\n\t"
553 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
554 " addps %%xmm1, %%xmm6\n\t"
555 " mulps %%xmm4, %%xmm2\n\t"
556 " movaps 32(%%r9), %%xmm0\n\t"
557 " addps %%xmm2, %%xmm7\n\t"
558 " mulps %%xmm5, %%xmm3\n\t"
560 " movaps 32(%%r10), %%xmm2\n\t"
561 " addps %%xmm3, %%xmm7\n\t"
562 " add $32, %%r10\n\t"
566 " # We've handled the bulk of multiplies up to here.\n\t"
567 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
568 " # If so, we've got 2 more taps to do.\n\t"
571 " # The count was odd, do 2 more taps.\n\t"
572 " # Note that we've already got mm0/mm2 preloaded\n\t"
573 " # from the main loop.\n\t"
574 " movaps %%xmm0, %%xmm4\n\t"
575 " mulps %%xmm2, %%xmm0\n\t"
576 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
577 " addps %%xmm0, %%xmm6\n\t"
578 " mulps %%xmm4, %%xmm2\n\t"
579 " addps %%xmm2, %%xmm7\n\t"
581 " # neg inversor\n\t"
582 " xorps %%xmm1, %%xmm1\n\t"
583 " mov $0x80000000, %%r9\n\t"
584 " movd %%r9, %%xmm1\n\t"
585 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
587 " xorps %%xmm1, %%xmm6\n\t"
588 " movaps %%xmm6, %%xmm2\n\t"
589 " unpcklps %%xmm7, %%xmm6\n\t"
590 " unpckhps %%xmm7, %%xmm2\n\t"
591 " movaps %%xmm2, %%xmm3\n\t"
592 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
593 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
594 " addps %%xmm2, %%xmm6\n\t"
595 " # xmm6 = r1 i2 r3 i4\n\t"
596 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
597 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
598 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
     :
     : [rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
     : "rax", "r8", "r9", "r10"
     );

  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }

  return;
}

#endif /*LV_HAVE_SSE && LV_HAVE_64*/
#if LV_HAVE_SSE && LV_HAVE_32

static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points) {
  volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);

#if 0
  /* The inline asm below expects the incoming arguments at fixed offsets from
     %ebp, which is not guaranteed once this function is inlined; it is kept
     for reference but disabled in favor of the generic call above. */
  const unsigned int num_bytes = num_points*8;
  unsigned int isodd = num_points & 1;

  __VOLK_ASM __VOLK_VOLATILE
    (
628 " #movl %%esp, %%ebp\n\t"
629 " movl 12(%%ebp), %%eax # input\n\t"
630 " movl 16(%%ebp), %%edx # taps\n\t"
631 " movl 20(%%ebp), %%ecx # n_bytes\n\t"
632 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
633 " movaps 0(%%eax), %%xmm0\n\t"
634 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
635 " movaps 0(%%edx), %%xmm2\n\t"
636 " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
637 " jmp .%=L1_test\n\t"
638 " # 4 taps / loop\n\t"
639 " # something like ?? cycles / loop\n\t"
641 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
642 "# movaps (%%eax), %%xmmA\n\t"
643 "# movaps (%%edx), %%xmmB\n\t"
644 "# movaps %%xmmA, %%xmmZ\n\t"
645 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
646 "# mulps %%xmmB, %%xmmA\n\t"
647 "# mulps %%xmmZ, %%xmmB\n\t"
648 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
649 "# xorps %%xmmPN, %%xmmA\n\t"
650 "# movaps %%xmmA, %%xmmZ\n\t"
651 "# unpcklps %%xmmB, %%xmmA\n\t"
652 "# unpckhps %%xmmB, %%xmmZ\n\t"
653 "# movaps %%xmmZ, %%xmmY\n\t"
654 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
655 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
656 "# addps %%xmmZ, %%xmmA\n\t"
657 "# addps %%xmmA, %%xmmC\n\t"
658 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
659 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
660 " movaps 16(%%eax), %%xmm1\n\t"
661 " movaps %%xmm0, %%xmm4\n\t"
662 " mulps %%xmm2, %%xmm0\n\t"
663 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
664 " movaps 16(%%edx), %%xmm3\n\t"
665 " movaps %%xmm1, %%xmm5\n\t"
666 " addps %%xmm0, %%xmm6\n\t"
667 " mulps %%xmm3, %%xmm1\n\t"
668 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
669 " addps %%xmm1, %%xmm6\n\t"
670 " mulps %%xmm4, %%xmm2\n\t"
671 " movaps 32(%%eax), %%xmm0\n\t"
672 " addps %%xmm2, %%xmm7\n\t"
673 " mulps %%xmm5, %%xmm3\n\t"
674 " addl $32, %%eax\n\t"
675 " movaps 32(%%edx), %%xmm2\n\t"
676 " addps %%xmm3, %%xmm7\n\t"
677 " addl $32, %%edx\n\t"
681 " # We've handled the bulk of multiplies up to here.\n\t"
682 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
683 " # If so, we've got 2 more taps to do.\n\t"
684 " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
685 " shrl $4, %%ecx\n\t"
686 " andl $1, %%ecx\n\t"
688 " # The count was odd, do 2 more taps.\n\t"
689 " # Note that we've already got mm0/mm2 preloaded\n\t"
690 " # from the main loop.\n\t"
691 " movaps %%xmm0, %%xmm4\n\t"
692 " mulps %%xmm2, %%xmm0\n\t"
693 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
694 " addps %%xmm0, %%xmm6\n\t"
695 " mulps %%xmm4, %%xmm2\n\t"
696 " addps %%xmm2, %%xmm7\n\t"
698 " # neg inversor\n\t"
699 " movl 8(%%ebp), %%eax \n\t"
700 " xorps %%xmm1, %%xmm1\n\t"
701 " movl $0x80000000, (%%eax)\n\t"
702 " movss (%%eax), %%xmm1\n\t"
703 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
705 " xorps %%xmm1, %%xmm6\n\t"
706 " movaps %%xmm6, %%xmm2\n\t"
707 " unpcklps %%xmm7, %%xmm6\n\t"
708 " unpckhps %%xmm7, %%xmm2\n\t"
709 " movaps %%xmm2, %%xmm3\n\t"
710 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
711 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
712 " addps %%xmm2, %%xmm6\n\t"
713 " # xmm6 = r1 i2 r3 i4\n\t"
714 " #movl 8(%%ebp), %%eax # @result\n\t"
715 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
716 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
717 " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
     :
     :
     : "eax", "ecx", "edx"
     );
  int getem = num_bytes % 16;

  if(getem) {
    *result += (input[num_points - 1] * taps[num_points - 1]);
  }
#endif
}

#endif /*LV_HAVE_SSE && LV_HAVE_32*/
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points) {
  const unsigned int num_bytes = num_points*8;
  unsigned int isodd = num_points & 1;

  lv_32fc_t dotProduct;
  memset(&dotProduct, 0x0, 2*sizeof(float));

  unsigned int number = 0;
  const unsigned int halfPoints = num_bytes >> 4;

  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  const lv_32fc_t* a = input;
  const lv_32fc_t* b = taps;

  dotProdVal = _mm_setzero_ps();
  for(;number < halfPoints; number++){

    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

    a += 2;
    b += 2;
  }
  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector

  dotProduct += ( dotProductVector[0] + dotProductVector[1] );

  if(isodd) {
    dotProduct += input[num_points - 1] * taps[num_points - 1];
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points) {
  unsigned int i = 0;
  const unsigned int qtr_points = num_points/4;
  const unsigned int isodd = num_points & 3;

  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
  float *p_input, *p_taps;
  __m64* p_result;

  // Sign bit of float element 0 only. (The original over-wide literal
  // {0x000000000000000080000000} truncates to exactly this value.)
  static const __m128i neg = {0x0000000080000000, 0x0000000000000000};

  p_result = (__m64*)result;
  p_input = (float*)input;
  p_taps = (float*)taps;

  real0 = _mm_setzero_ps();
  real1 = _mm_setzero_ps();
  im0 = _mm_setzero_ps();
  im1 = _mm_setzero_ps();
  for(; i < qtr_points; ++i) {
    xmm0 = _mm_load_ps(p_input);
    xmm1 = _mm_load_ps(p_taps);

    p_input += 4;
    p_taps += 4;

    xmm2 = _mm_load_ps(p_input);
    xmm3 = _mm_load_ps(p_taps);

    p_input += 4;
    p_taps += 4;

    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

    // imaginary parts of the input
    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
    // real parts of the input
    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
    // imaginary parts of the taps
    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
    // real parts of the taps
    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
    // real-part terms (re*re and im*im) land in lane 0
    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

    // imaginary-part cross terms land in lane 1
    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

    real0 = _mm_add_ps(xmm4, real0);
    real1 = _mm_add_ps(xmm5, real1);
    im0 = _mm_add_ps(xmm6, im0);
    im1 = _mm_add_ps(xmm7, im1);
  }
  real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

  im0 = _mm_add_ps(im0, im1);
  real0 = _mm_add_ps(real0, real1);

  im0 = _mm_add_ps(im0, real0);

  _mm_storel_pi(p_result, im0);
  for(i = num_points-isodd; i < num_points; i++) {
    *result += input[i] * taps[i];
  }
}

#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points) {

  unsigned int quarter_points = num_points / 4;
  unsigned int number;

  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
  lv_32fc_t* b_ptr = (lv_32fc_t*) input;
  // for 2-lane vectors, the 1st lane holds the real part,
  // the 2nd lane holds the imaginary part
  float32x4x2_t a_val, b_val, c_val, accumulator;
  float32x4x2_t tmp_real, tmp_imag;
  accumulator.val[0] = vdupq_n_f32(0);
  accumulator.val[1] = vdupq_n_f32(0);
  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    __builtin_prefetch(a_ptr+8);
    __builtin_prefetch(b_ptr+8);

    // multiply the real*real and imag*imag to get the real result
    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

    // multiply the cross terms to get the imaginary result
    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

    accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
    accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

    a_ptr += 4;
    b_ptr += 4;
  }
  lv_32fc_t accum_result[4];
  vst2q_f32((float*)accum_result, accumulator);
  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

  // tail case
  for(number = quarter_points*4; number < num_points; ++number) {
    *result += (*a_ptr++) * (*b_ptr++);
  }
}

#endif /*LV_HAVE_NEON*/
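/*
 * vld2q_f32 does in one instruction the de-interleave that the SSE kernels
 * build from unpack/shuffle sequences: from memory (r0,i0,r1,i1,r2,i2,r3,i3)
 * it produces
 *
 *   val[0] = { r0, r1, r2, r3 }   // all real parts
 *   val[1] = { i0, i1, i2, i3 }   // all imaginary parts
 *
 * so the complex arithmetic can be written directly on "all-real" and
 * "all-imag" vectors, and vst2q_f32 re-interleaves the accumulator into four
 * lv_32fc_t partial sums for the scalar reduction.
 */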
#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points) {

  unsigned int quarter_points = num_points / 4;
  unsigned int number;

  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
  lv_32fc_t* b_ptr = (lv_32fc_t*) input;

  float32x4x2_t a_val, b_val, accumulator;
  float32x4x2_t tmp_imag;
  accumulator.val[0] = vdupq_n_f32(0);
  accumulator.val[1] = vdupq_n_f32(0);
  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr);
    b_val = vld2q_f32((float*)b_ptr);
    __builtin_prefetch(a_ptr+8);
    __builtin_prefetch(b_ptr+8);

    // do the first multiplies
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

    // use multiply-accumulate/subtract to finish both parts
    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

    accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
    accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

    // increment pointers
    a_ptr += 4;
    b_ptr += 4;
  }
  lv_32fc_t accum_result[4];
  vst2q_f32((float*)accum_result, accumulator);
  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

  // tail case
  for(number = quarter_points*4; number < num_points; ++number) {
    *result += (*a_ptr++) * (*b_ptr++);
  }
}

#endif /*LV_HAVE_NEON*/
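/*
 * The _opttests variant folds the adds/subs of the basic NEON kernel into the
 * multiplies, cutting six arithmetic ops per iteration down to four. Per lane:
 *
 *   re = vmlsq_f32(a_re*b_re, a_im, b_im)   // a_re*b_re - a_im*b_im
 *   im = vmlaq_f32(a_im*b_re, a_re, b_im)   // a_im*b_re + a_re*b_im
 *
 * Like all the unrolled kernels in this file, this reassociates the floating
 * point accumulation, so results may differ from the generic kernel in the
 * last few ulps.
 */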
#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points) {

  unsigned int quarter_points = num_points / 4;
  unsigned int number;

  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
  lv_32fc_t* b_ptr = (lv_32fc_t*) input;

  float32x4x2_t a_val, b_val, accumulator1, accumulator2;
  accumulator1.val[0] = vdupq_n_f32(0);
  accumulator1.val[1] = vdupq_n_f32(0);
  accumulator2.val[0] = vdupq_n_f32(0);
  accumulator2.val[1] = vdupq_n_f32(0);
  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr);
    b_val = vld2q_f32((float*)b_ptr);
    __builtin_prefetch(a_ptr+8);
    __builtin_prefetch(b_ptr+8);

    // use 2 accumulators to remove inter-instruction data dependencies
    accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
    accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
    accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
    accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

    // increment pointers
    a_ptr += 4;
    b_ptr += 4;
  }
  accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
  accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);

  lv_32fc_t accum_result[4];
  vst2q_f32((float*)accum_result, accumulator1);
  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

  // tail case
  for(number = quarter_points*4; number < num_points; ++number) {
    *result += (*a_ptr++) * (*b_ptr++);
  }
}

#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points) {

  // NOTE: despite the name, each iteration consumes 8 complex points
  unsigned int quarter_points = num_points / 8;
  unsigned int number;

  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
  lv_32fc_t* b_ptr = (lv_32fc_t*) input;

  float32x4x4_t a_val, b_val, accumulator1, accumulator2;
  float32x4x2_t reduced_accumulator;
  accumulator1.val[0] = vdupq_n_f32(0);
  accumulator1.val[1] = vdupq_n_f32(0);
  accumulator1.val[2] = vdupq_n_f32(0);
  accumulator1.val[3] = vdupq_n_f32(0);
  accumulator2.val[0] = vdupq_n_f32(0);
  accumulator2.val[1] = vdupq_n_f32(0);
  accumulator2.val[2] = vdupq_n_f32(0);
  accumulator2.val[3] = vdupq_n_f32(0);
  // 8 input registers and 8 accumulators keep the NEON register file busy
  for(number = 0; number < quarter_points; ++number) {
    // vld4q de-interleaves by 4: val[0]/val[1] hold the re/im parts of the
    // even-indexed points, val[2]/val[3] those of the odd-indexed points
    a_val = vld4q_f32((float*)a_ptr);
    b_val = vld4q_f32((float*)b_ptr);
    __builtin_prefetch(a_ptr+8);
    __builtin_prefetch(b_ptr+8);

    // even-indexed points: re*re and re*im terms
    accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
    accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

    // odd-indexed points: re*re and re*im terms
    accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
    accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

    // even-indexed points: im*im and im*re terms
    accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
    accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

    // odd-indexed points: im*im and im*re terms
    accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
    accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);

    // increment pointers
    a_ptr += 8;
    b_ptr += 8;
  }
  // reduce the 8 accumulators down to 2 (1 real and 1 imaginary)
  accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
  accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
  accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
  accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
  reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
  reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);

  lv_32fc_t accum_result[4];
  vst2q_f32((float*)accum_result, reduced_accumulator);
  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

  // tail case
  for(number = quarter_points*8; number < num_points; ++number) {
    *result += (*a_ptr++) * (*b_ptr++);
  }
}

#endif /*LV_HAVE_NEON*/
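/*
 * Why eight accumulators: each vmlaq/vmlsq result feeds the next use of the
 * same accumulator, so a single chain serializes at the multiply-accumulate
 * latency. Splitting the stream across independent registers (even/odd points
 * times the four product terms) keeps the pipeline full, at the cost of the
 * small reduction tree before the store. The dependency shape, sketched:
 *
 *   acc_a += x0*y0;  acc_b += x1*y1;  // independent chains, can dual-issue
 *   acc_a += x2*y2;  acc_b += x3*y3;  // each waits only on its own chain
 *   total = acc_a + acc_b;            // one combine at the end
 */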
#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points) {
  unsigned int isodd = num_points & 3;
  unsigned int i = 0;
  lv_32fc_t dotProduct;
  memset(&dotProduct, 0x0, 2*sizeof(float));

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  const lv_32fc_t* a = input;
  const lv_32fc_t* b = taps;

  dotProdVal = _mm256_setzero_ps();
  for(;number < quarterPoints; number++){

    x = _mm256_load_ps((float*)a); // Load ar,ai,br,bi,er,ei,fr,fi
    y = _mm256_load_ps((float*)b); // Load cr,ci,dr,di,gr,gi,hr,hi

    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr, ...

    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br, ...

    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di, ...

    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di, ...

    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together

    a += 4;
    b += 4;
  }
  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

  _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector

  dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);

  for(i = num_points-isodd; i < num_points; i++) {
    dotProduct += input[i] * taps[i];
  }

  *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
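/*
 * Usage sketch: callers normally invoke the generated VOLK dispatcher rather
 * than one of the kernels above, and allocate with the library's aligned
 * helpers so the _a_ (aligned) kernels are eligible:
 *
 *   unsigned int n = 1024;
 *   lv_32fc_t* sig = (lv_32fc_t*)volk_malloc(n*sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t* tps = (lv_32fc_t*)volk_malloc(n*sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t out;
 *   // ... fill sig and tps ...
 *   volk_32fc_x2_dot_prod_32fc(&out, sig, tps, n);
 *   volk_free(sig);
 *   volk_free(tps);
 */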