GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32fc_32f_dot_prod_32fc
25  *
26  * \b Overview
27  *
28  * This block computes the dot product (or inner product) between two
29  * vectors, the \p input and \p taps vectors. Given a set of \p
30  * num_points taps, the result is the sum of products between the two
31  * vectors. The result is a single value stored in the \p result
32  * address and will be complex.
33  *
34  * <b>Dispatcher Prototype</b>
35  * \code
36  * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points)
37  * \endcode
38  *
39  * \b Inputs
40  * \li input: vector of complex samples
41  * \li taps: floating point taps
42  * \li num_points: number of samples in both \p input and \p taps
43  *
44  * \b Outputs
45  * \li result: pointer to a complex value to hold the dot product result.
46  *
47  * \b Example
48  * \code
49  * int N = 10000;
50  * lv_32fc_t y;
51  * lv_32fc_t *x = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), volk_get_alignment());
52  * float *t = (float*)volk_malloc(N*sizeof(float), volk_get_alignment());
53  *
54  * <populate x and t with some values>
55  *
56  * volk_32fc_dot_prod_32fc(&y, x, t, N);
57  *
58  * volk_free(x);
59  * volk_free(t);
60  * \endcode
61  */
62 
63 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
64 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
65 
66 #include <volk/volk_common.h>
67 #include <stdio.h>
68 
69 #ifdef LV_HAVE_GENERIC
70 
71 static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
72 
73  float res[2];
74  float *realpt = &res[0], *imagpt = &res[1];
75  const float* aPtr = (float*)input;
76  const float* bPtr= taps;
77  unsigned int number = 0;
78 
79  *realpt = 0;
80  *imagpt = 0;
81 
82  for(number = 0; number < num_points; number++){
83  *realpt += ((*aPtr++) * (*bPtr));
84  *imagpt += ((*aPtr++) * (*bPtr++));
85  }
86 
87  *result = *(lv_32fc_t*)(&res[0]);
88 }
89 
90 #endif /*LV_HAVE_GENERIC*/
91 
92 
93 #ifdef LV_HAVE_AVX
94 
95 #include <immintrin.h>
96 
97 static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
98 
99  unsigned int number = 0;
100  const unsigned int sixteenthPoints = num_points / 16;
101 
102  float res[2];
103  float *realpt = &res[0], *imagpt = &res[1];
104  const float* aPtr = (float*)input;
105  const float* bPtr = taps;
106 
107  __m256 a0Val, a1Val, a2Val, a3Val;
108  __m256 b0Val, b1Val, b2Val, b3Val;
109  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
110  __m256 c0Val, c1Val, c2Val, c3Val;
111 
112  __m256 dotProdVal0 = _mm256_setzero_ps();
113  __m256 dotProdVal1 = _mm256_setzero_ps();
114  __m256 dotProdVal2 = _mm256_setzero_ps();
115  __m256 dotProdVal3 = _mm256_setzero_ps();
116 
117  for(;number < sixteenthPoints; number++){
118 
119  a0Val = _mm256_load_ps(aPtr);
120  a1Val = _mm256_load_ps(aPtr+8);
121  a2Val = _mm256_load_ps(aPtr+16);
122  a3Val = _mm256_load_ps(aPtr+24);
123 
124  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
125  x1Val = _mm256_load_ps(bPtr+8);
126  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
127  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
128  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
129  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
130 
131  // TODO: it may be possible to rearrange swizzling to better pipeline data
132  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
133  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
134  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
135  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
136 
137  c0Val = _mm256_mul_ps(a0Val, b0Val);
138  c1Val = _mm256_mul_ps(a1Val, b1Val);
139  c2Val = _mm256_mul_ps(a2Val, b2Val);
140  c3Val = _mm256_mul_ps(a3Val, b3Val);
141 
142  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
143  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
144  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
145  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
146 
147  aPtr += 32;
148  bPtr += 16;
149  }
150 
151  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
152  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
153  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
154 
155  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
156 
157  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
158 
159  *realpt = dotProductVector[0];
160  *imagpt = dotProductVector[1];
161  *realpt += dotProductVector[2];
162  *imagpt += dotProductVector[3];
163  *realpt += dotProductVector[4];
164  *imagpt += dotProductVector[5];
165  *realpt += dotProductVector[6];
166  *imagpt += dotProductVector[7];
167 
168  number = sixteenthPoints*16;
169  for(;number < num_points; number++){
170  *realpt += ((*aPtr++) * (*bPtr));
171  *imagpt += ((*aPtr++) * (*bPtr++));
172  }
173 
174  *result = *(lv_32fc_t*)(&res[0]);
175 }
176 
177 #endif /*LV_HAVE_AVX*/
178 
179 
180 
181 
182 #ifdef LV_HAVE_SSE
183 
184 
185 static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
186 
187  unsigned int number = 0;
188  const unsigned int sixteenthPoints = num_points / 8;
189 
190  float res[2];
191  float *realpt = &res[0], *imagpt = &res[1];
192  const float* aPtr = (float*)input;
193  const float* bPtr = taps;
194 
195  __m128 a0Val, a1Val, a2Val, a3Val;
196  __m128 b0Val, b1Val, b2Val, b3Val;
197  __m128 x0Val, x1Val, x2Val, x3Val;
198  __m128 c0Val, c1Val, c2Val, c3Val;
199 
200  __m128 dotProdVal0 = _mm_setzero_ps();
201  __m128 dotProdVal1 = _mm_setzero_ps();
202  __m128 dotProdVal2 = _mm_setzero_ps();
203  __m128 dotProdVal3 = _mm_setzero_ps();
204 
205  for(;number < sixteenthPoints; number++){
206 
207  a0Val = _mm_load_ps(aPtr);
208  a1Val = _mm_load_ps(aPtr+4);
209  a2Val = _mm_load_ps(aPtr+8);
210  a3Val = _mm_load_ps(aPtr+12);
211 
212  x0Val = _mm_load_ps(bPtr);
213  x1Val = _mm_load_ps(bPtr);
214  x2Val = _mm_load_ps(bPtr+4);
215  x3Val = _mm_load_ps(bPtr+4);
216  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
217  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
218  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
219  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
220 
221  c0Val = _mm_mul_ps(a0Val, b0Val);
222  c1Val = _mm_mul_ps(a1Val, b1Val);
223  c2Val = _mm_mul_ps(a2Val, b2Val);
224  c3Val = _mm_mul_ps(a3Val, b3Val);
225 
226  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
227  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
228  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
229  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
230 
231  aPtr += 16;
232  bPtr += 8;
233  }
234 
235  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
236  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
237  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
238 
239  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
240 
241  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
242 
243  *realpt = dotProductVector[0];
244  *imagpt = dotProductVector[1];
245  *realpt += dotProductVector[2];
246  *imagpt += dotProductVector[3];
247 
248  number = sixteenthPoints*8;
249  for(;number < num_points; number++){
250  *realpt += ((*aPtr++) * (*bPtr));
251  *imagpt += ((*aPtr++) * (*bPtr++));
252  }
253 
254  *result = *(lv_32fc_t*)(&res[0]);
255 }
256 
257 #endif /*LV_HAVE_SSE*/
258 
259 
260 
261 #ifdef LV_HAVE_AVX
262 
263 #include <immintrin.h>
264 
265 static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
266 
267  unsigned int number = 0;
268  const unsigned int sixteenthPoints = num_points / 16;
269 
270  float res[2];
271  float *realpt = &res[0], *imagpt = &res[1];
272  const float* aPtr = (float*)input;
273  const float* bPtr = taps;
274 
275  __m256 a0Val, a1Val, a2Val, a3Val;
276  __m256 b0Val, b1Val, b2Val, b3Val;
277  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
278  __m256 c0Val, c1Val, c2Val, c3Val;
279 
280  __m256 dotProdVal0 = _mm256_setzero_ps();
281  __m256 dotProdVal1 = _mm256_setzero_ps();
282  __m256 dotProdVal2 = _mm256_setzero_ps();
283  __m256 dotProdVal3 = _mm256_setzero_ps();
284 
285  for(;number < sixteenthPoints; number++){
286 
287  a0Val = _mm256_loadu_ps(aPtr);
288  a1Val = _mm256_loadu_ps(aPtr+8);
289  a2Val = _mm256_loadu_ps(aPtr+16);
290  a3Val = _mm256_loadu_ps(aPtr+24);
291 
292  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
293  x1Val = _mm256_loadu_ps(bPtr+8);
294  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
295  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
296  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
297  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
298 
299  // TODO: it may be possible to rearrange swizzling to better pipeline data
300  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
301  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
302  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
303  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
304 
305  c0Val = _mm256_mul_ps(a0Val, b0Val);
306  c1Val = _mm256_mul_ps(a1Val, b1Val);
307  c2Val = _mm256_mul_ps(a2Val, b2Val);
308  c3Val = _mm256_mul_ps(a3Val, b3Val);
309 
310  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
311  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
312  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
313  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
314 
315  aPtr += 32;
316  bPtr += 16;
317  }
318 
319  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
320  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
321  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
322 
323  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
324 
325  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
326 
327  *realpt = dotProductVector[0];
328  *imagpt = dotProductVector[1];
329  *realpt += dotProductVector[2];
330  *imagpt += dotProductVector[3];
331  *realpt += dotProductVector[4];
332  *imagpt += dotProductVector[5];
333  *realpt += dotProductVector[6];
334  *imagpt += dotProductVector[7];
335 
336  number = sixteenthPoints*16;
337  for(;number < num_points; number++){
338  *realpt += ((*aPtr++) * (*bPtr));
339  *imagpt += ((*aPtr++) * (*bPtr++));
340  }
341 
342  *result = *(lv_32fc_t*)(&res[0]);
343 }
344 #endif /*LV_HAVE_AVX*/
345 
346 #ifdef LV_HAVE_NEON
347 #include <arm_neon.h>
348 
349 static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
350 
351  unsigned int number;
352  const unsigned int quarterPoints = num_points / 8;
353 
354  float res[2];
355  float *realpt = &res[0], *imagpt = &res[1];
356  const float* inputPtr = (float*)input;
357  const float* tapsPtr = taps;
358  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
359  float accVector_real[4];
360  float accVector_imag[4];
361 
362  float32x4x2_t inputVector0, inputVector1;
363  float32x4_t tapsVector0, tapsVector1;
364  float32x4_t tmp_real0, tmp_imag0;
365  float32x4_t tmp_real1, tmp_imag1;
366  float32x4_t real_accumulator0, imag_accumulator0;
367  float32x4_t real_accumulator1, imag_accumulator1;
368 
369  // zero out accumulators
370  // take a *float, return float32x4_t
371  real_accumulator0 = vld1q_f32( zero );
372  imag_accumulator0 = vld1q_f32( zero );
373  real_accumulator1 = vld1q_f32( zero );
374  imag_accumulator1 = vld1q_f32( zero );
375 
376  for(number=0 ;number < quarterPoints; number++){
377  // load doublewords and duplicate in to second lane
378  tapsVector0 = vld1q_f32(tapsPtr );
379  tapsVector1 = vld1q_f32(tapsPtr+4 );
380 
381  // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
382  inputVector0 = vld2q_f32(inputPtr );
383  inputVector1 = vld2q_f32(inputPtr+8 );
384  // inputVector is now a struct of two vectors, 0th is real, 1st is imag
385 
386  tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
387  tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
388 
389  tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
390  tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
391 
392  real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
393  imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
394 
395  real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
396  imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
397 
398  tapsPtr += 8;
399  inputPtr += 16;
400  }
401 
402  real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
403  imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
404  // void vst1q_f32( float32_t * ptr, float32x4_t val);
405  // store results back to a complex (array of 2 floats)
406  vst1q_f32(accVector_real, real_accumulator0);
407  vst1q_f32(accVector_imag, imag_accumulator0);
408  *realpt = accVector_real[0] + accVector_real[1] +
409  accVector_real[2] + accVector_real[3] ;
410 
411  *imagpt = accVector_imag[0] + accVector_imag[1] +
412  accVector_imag[2] + accVector_imag[3] ;
413 
414  // clean up the remainder
415  for(number=quarterPoints*8; number < num_points; number++){
416  *realpt += ((*inputPtr++) * (*tapsPtr));
417  *imagpt += ((*inputPtr++) * (*tapsPtr++));
418  }
419 
420  *result = *(lv_32fc_t*)(&res[0]);
421 }
422 
423 #endif /*LV_HAVE_NEON*/
424 
425 #ifdef LV_HAVE_NEON
426 #include <arm_neon.h>
427 
428 static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
429 
430  unsigned int number;
431  const unsigned int quarterPoints = num_points / 4;
432 
433  float res[2];
434  float *realpt = &res[0], *imagpt = &res[1];
435  const float* inputPtr = (float*)input;
436  const float* tapsPtr = taps;
437  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
438  float accVector_real[4];
439  float accVector_imag[4];
440 
441  float32x4x2_t inputVector;
442  float32x4_t tapsVector;
443  float32x4_t tmp_real, tmp_imag;
444  float32x4_t real_accumulator, imag_accumulator;
445 
446 
447  // zero out accumulators
448  // take a *float, return float32x4_t
449  real_accumulator = vld1q_f32( zero );
450  imag_accumulator = vld1q_f32( zero );
451 
452  for(number=0 ;number < quarterPoints; number++){
453  // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
454  // load doublewords and duplicate in to second lane
455  tapsVector = vld1q_f32(tapsPtr );
456 
457  // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
458  inputVector = vld2q_f32(inputPtr );
459 
460  tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
461  tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
462 
463  real_accumulator = vaddq_f32(real_accumulator, tmp_real);
464  imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
465 
466 
467  tapsPtr += 4;
468  inputPtr += 8;
469 
470  }
471 
472  // store results back to a complex (array of 2 floats)
473  vst1q_f32(accVector_real, real_accumulator);
474  vst1q_f32(accVector_imag, imag_accumulator);
475  *realpt = accVector_real[0] + accVector_real[1] +
476  accVector_real[2] + accVector_real[3] ;
477 
478  *imagpt = accVector_imag[0] + accVector_imag[1] +
479  accVector_imag[2] + accVector_imag[3] ;
480 
481  // clean up the remainder
482  for(number=quarterPoints*4; number < num_points; number++){
483  *realpt += ((*inputPtr++) * (*tapsPtr));
484  *imagpt += ((*inputPtr++) * (*tapsPtr++));
485  }
486 
487  *result = *(lv_32fc_t*)(&res[0]);
488 }
489 
490 #endif /*LV_HAVE_NEON*/
491 
492 #ifdef LV_HAVE_NEON
493 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
494 #endif /*LV_HAVE_NEON*/
495 
496 #ifdef LV_HAVE_NEON
497 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
498 #endif /*LV_HAVE_NEON*/
499 
500 #ifdef LV_HAVE_SSE
501 
502 static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
503 
504  unsigned int number = 0;
505  const unsigned int sixteenthPoints = num_points / 8;
506 
507  float res[2];
508  float *realpt = &res[0], *imagpt = &res[1];
509  const float* aPtr = (float*)input;
510  const float* bPtr = taps;
511 
512  __m128 a0Val, a1Val, a2Val, a3Val;
513  __m128 b0Val, b1Val, b2Val, b3Val;
514  __m128 x0Val, x1Val, x2Val, x3Val;
515  __m128 c0Val, c1Val, c2Val, c3Val;
516 
517  __m128 dotProdVal0 = _mm_setzero_ps();
518  __m128 dotProdVal1 = _mm_setzero_ps();
519  __m128 dotProdVal2 = _mm_setzero_ps();
520  __m128 dotProdVal3 = _mm_setzero_ps();
521 
522  for(;number < sixteenthPoints; number++){
523 
524  a0Val = _mm_loadu_ps(aPtr);
525  a1Val = _mm_loadu_ps(aPtr+4);
526  a2Val = _mm_loadu_ps(aPtr+8);
527  a3Val = _mm_loadu_ps(aPtr+12);
528 
529  x0Val = _mm_loadu_ps(bPtr);
530  x1Val = _mm_loadu_ps(bPtr);
531  x2Val = _mm_loadu_ps(bPtr+4);
532  x3Val = _mm_loadu_ps(bPtr+4);
533  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
534  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
535  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
536  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
537 
538  c0Val = _mm_mul_ps(a0Val, b0Val);
539  c1Val = _mm_mul_ps(a1Val, b1Val);
540  c2Val = _mm_mul_ps(a2Val, b2Val);
541  c3Val = _mm_mul_ps(a3Val, b3Val);
542 
543  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
544  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
545  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
546  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
547 
548  aPtr += 16;
549  bPtr += 8;
550  }
551 
552  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
553  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
554  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
555 
556  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
557 
558  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
559 
560  *realpt = dotProductVector[0];
561  *imagpt = dotProductVector[1];
562  *realpt += dotProductVector[2];
563  *imagpt += dotProductVector[3];
564 
565  number = sixteenthPoints*8;
566  for(;number < num_points; number++){
567  *realpt += ((*aPtr++) * (*bPtr));
568  *imagpt += ((*aPtr++) * (*bPtr++));
569  }
570 
571  *result = *(lv_32fc_t*)(&res[0]);
572 }
573 
574 #endif /*LV_HAVE_SSE*/
575 
576 
577 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56