GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_x2_dot_prod_32f
25  *
26  * \b Overview
27  *
28  * This block computes the dot product (or inner product) between two
29  * vectors, the \p input and \p taps vectors. Given a set of \p
30  * num_points taps, the result is the sum of products between the two
31  * vectors. The result is a single float written to the address given
32  * by \p result; the function itself returns void.
33  *
34  * <b>Dispatcher Prototype</b>
35  * \code
36  * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points)
37  * \endcode
38  *
39  * \b Inputs
40  * \li input: vector of floats.
41  * \li taps: float taps.
42  * \li num_points: number of samples in both \p input and \p taps.
43  *
44  * \b Outputs
45  * \li result: pointer to a float value to hold the dot product result.
46  *
47  * \b Example
48  * Take the dot product of an increasing vector and a vector of ones. The result is the sum of the integers 0 through 9, i.e. 45.
49  * \code
50  * int N = 10;
51  * unsigned int alignment = volk_get_alignment();
52  * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
53  * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
54  * float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
55  *
56  * for(unsigned int ii = 0; ii < N; ++ii){
57  * increasing[ii] = (float)ii;
58  * ones[ii] = 1.f;
59  * }
60  *
61  * volk_32f_x2_dot_prod_32f(out, increasing, ones, N);
62  *
63  * printf("out = %1.2f\n", *out);
64  *
65  * volk_free(increasing);
66  * volk_free(ones);
67  * volk_free(out);
68  *
69  * return 0;
70  * \endcode
71  */
72 
73 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
74 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
75 
76 #include <volk/volk_common.h>
77 #include<stdio.h>
78 
79 
80 #ifdef LV_HAVE_GENERIC
81 
82 
/*!
 * \brief Portable scalar dot product of \p input and \p taps.
 *
 * Adds input[i] * taps[i] for i = 0 .. num_points-1, in index order, into a
 * single float accumulator and writes the sum to \p result. When
 * \p num_points is 0 the value written is 0.
 *
 * \param result     address that receives the dot product
 * \param input      first vector, read for num_points elements
 * \param taps       second vector, read for num_points elements
 * \param num_points number of elements read from each vector
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {

  float sum = 0;
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    sum += input[idx] * taps[idx];
  }

  *result = sum;
}
96 
97 #endif /*LV_HAVE_GENERIC*/
98 
99 
100 #ifdef LV_HAVE_SSE
101 
102 
103 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
104 
105  unsigned int number = 0;
106  const unsigned int sixteenthPoints = num_points / 16;
107 
108  float dotProduct = 0;
109  const float* aPtr = input;
110  const float* bPtr = taps;
111 
112  __m128 a0Val, a1Val, a2Val, a3Val;
113  __m128 b0Val, b1Val, b2Val, b3Val;
114  __m128 c0Val, c1Val, c2Val, c3Val;
115 
116  __m128 dotProdVal0 = _mm_setzero_ps();
117  __m128 dotProdVal1 = _mm_setzero_ps();
118  __m128 dotProdVal2 = _mm_setzero_ps();
119  __m128 dotProdVal3 = _mm_setzero_ps();
120 
121  for(;number < sixteenthPoints; number++){
122 
123  a0Val = _mm_loadu_ps(aPtr);
124  a1Val = _mm_loadu_ps(aPtr+4);
125  a2Val = _mm_loadu_ps(aPtr+8);
126  a3Val = _mm_loadu_ps(aPtr+12);
127  b0Val = _mm_loadu_ps(bPtr);
128  b1Val = _mm_loadu_ps(bPtr+4);
129  b2Val = _mm_loadu_ps(bPtr+8);
130  b3Val = _mm_loadu_ps(bPtr+12);
131 
132  c0Val = _mm_mul_ps(a0Val, b0Val);
133  c1Val = _mm_mul_ps(a1Val, b1Val);
134  c2Val = _mm_mul_ps(a2Val, b2Val);
135  c3Val = _mm_mul_ps(a3Val, b3Val);
136 
137  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
138  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
139  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
140  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
141 
142  aPtr += 16;
143  bPtr += 16;
144  }
145 
146  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
147  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
148  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
149 
150  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
151 
152  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
153 
154  dotProduct = dotProductVector[0];
155  dotProduct += dotProductVector[1];
156  dotProduct += dotProductVector[2];
157  dotProduct += dotProductVector[3];
158 
159  number = sixteenthPoints*16;
160  for(;number < num_points; number++){
161  dotProduct += ((*aPtr++) * (*bPtr++));
162  }
163 
164  *result = dotProduct;
165 
166 }
167 
168 #endif /*LV_HAVE_SSE*/
169 
170 #ifdef LV_HAVE_SSE3
171 
172 #include <pmmintrin.h>
173 
174 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
175  unsigned int number = 0;
176  const unsigned int sixteenthPoints = num_points / 16;
177 
178  float dotProduct = 0;
179  const float* aPtr = input;
180  const float* bPtr = taps;
181 
182  __m128 a0Val, a1Val, a2Val, a3Val;
183  __m128 b0Val, b1Val, b2Val, b3Val;
184  __m128 c0Val, c1Val, c2Val, c3Val;
185 
186  __m128 dotProdVal0 = _mm_setzero_ps();
187  __m128 dotProdVal1 = _mm_setzero_ps();
188  __m128 dotProdVal2 = _mm_setzero_ps();
189  __m128 dotProdVal3 = _mm_setzero_ps();
190 
191  for(;number < sixteenthPoints; number++){
192 
193  a0Val = _mm_loadu_ps(aPtr);
194  a1Val = _mm_loadu_ps(aPtr+4);
195  a2Val = _mm_loadu_ps(aPtr+8);
196  a3Val = _mm_loadu_ps(aPtr+12);
197  b0Val = _mm_loadu_ps(bPtr);
198  b1Val = _mm_loadu_ps(bPtr+4);
199  b2Val = _mm_loadu_ps(bPtr+8);
200  b3Val = _mm_loadu_ps(bPtr+12);
201 
202  c0Val = _mm_mul_ps(a0Val, b0Val);
203  c1Val = _mm_mul_ps(a1Val, b1Val);
204  c2Val = _mm_mul_ps(a2Val, b2Val);
205  c3Val = _mm_mul_ps(a3Val, b3Val);
206 
207  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
208  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
209  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
210  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
211 
212  aPtr += 16;
213  bPtr += 16;
214  }
215 
216  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
217  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
218  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
219 
220  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
221  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
222 
223  dotProduct = dotProductVector[0];
224  dotProduct += dotProductVector[1];
225  dotProduct += dotProductVector[2];
226  dotProduct += dotProductVector[3];
227 
228  number = sixteenthPoints*16;
229  for(;number < num_points; number++){
230  dotProduct += ((*aPtr++) * (*bPtr++));
231  }
232 
233  *result = dotProduct;
234 }
235 
236 #endif /*LV_HAVE_SSE3*/
237 
238 #ifdef LV_HAVE_SSE4_1
239 
240 #include <smmintrin.h>
241 
242 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
243  unsigned int number = 0;
244  const unsigned int sixteenthPoints = num_points / 16;
245 
246  float dotProduct = 0;
247  const float* aPtr = input;
248  const float* bPtr = taps;
249 
250  __m128 aVal1, bVal1, cVal1;
251  __m128 aVal2, bVal2, cVal2;
252  __m128 aVal3, bVal3, cVal3;
253  __m128 aVal4, bVal4, cVal4;
254 
255  __m128 dotProdVal = _mm_setzero_ps();
256 
257  for(;number < sixteenthPoints; number++){
258 
259  aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
260  aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
261  aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
262  aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
263 
264  bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
265  bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
266  bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
267  bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
268 
269  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
270  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
271  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
272  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
273 
274  cVal1 = _mm_or_ps(cVal1, cVal2);
275  cVal3 = _mm_or_ps(cVal3, cVal4);
276  cVal1 = _mm_or_ps(cVal1, cVal3);
277 
278  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
279  }
280 
281  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
282  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
283 
284  dotProduct = dotProductVector[0];
285  dotProduct += dotProductVector[1];
286  dotProduct += dotProductVector[2];
287  dotProduct += dotProductVector[3];
288 
289  number = sixteenthPoints * 16;
290  for(;number < num_points; number++){
291  dotProduct += ((*aPtr++) * (*bPtr++));
292  }
293 
294  *result = dotProduct;
295 }
296 
297 #endif /*LV_HAVE_SSE4_1*/
298 
299 #ifdef LV_HAVE_AVX
300 
301 #include <immintrin.h>
302 
303 static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
304 
305  unsigned int number = 0;
306  const unsigned int sixteenthPoints = num_points / 16;
307 
308  float dotProduct = 0;
309  const float* aPtr = input;
310  const float* bPtr = taps;
311 
312  __m256 a0Val, a1Val;
313  __m256 b0Val, b1Val;
314  __m256 c0Val, c1Val;
315 
316  __m256 dotProdVal0 = _mm256_setzero_ps();
317  __m256 dotProdVal1 = _mm256_setzero_ps();
318 
319  for(;number < sixteenthPoints; number++){
320 
321  a0Val = _mm256_loadu_ps(aPtr);
322  a1Val = _mm256_loadu_ps(aPtr+8);
323  b0Val = _mm256_loadu_ps(bPtr);
324  b1Val = _mm256_loadu_ps(bPtr+8);
325 
326  c0Val = _mm256_mul_ps(a0Val, b0Val);
327  c1Val = _mm256_mul_ps(a1Val, b1Val);
328 
329  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
330  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
331 
332  aPtr += 16;
333  bPtr += 16;
334  }
335 
336  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
337 
338  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
339 
340  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
341 
342  dotProduct = dotProductVector[0];
343  dotProduct += dotProductVector[1];
344  dotProduct += dotProductVector[2];
345  dotProduct += dotProductVector[3];
346  dotProduct += dotProductVector[4];
347  dotProduct += dotProductVector[5];
348  dotProduct += dotProductVector[6];
349  dotProduct += dotProductVector[7];
350 
351  number = sixteenthPoints*16;
352  for(;number < num_points; number++){
353  dotProduct += ((*aPtr++) * (*bPtr++));
354  }
355 
356  *result = dotProduct;
357 
358 }
359 
360 #endif /*LV_HAVE_AVX*/
361 
362 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
363 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
364 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
365 
366 #include <volk/volk_common.h>
367 #include<stdio.h>
368 
369 
370 #ifdef LV_HAVE_GENERIC
371 
372 
/*!
 * \brief Portable scalar dot product of \p input and \p taps ("_a" variant).
 *
 * Walks both vectors front to back, adding each product into one float
 * accumulator, and stores the total at \p result. The body performs plain
 * scalar reads only. With \p num_points equal to 0 the stored value is 0.
 *
 * \param result     address that receives the dot product
 * \param input      first vector, read for num_points elements
 * \param taps       second vector, read for num_points elements
 * \param num_points number of elements read from each vector
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {

  const float* inPtr = input;
  const float* tapPtr = taps;
  const float* const inEnd = input + num_points;
  float total = 0;

  while(inPtr != inEnd){
    total += (*inPtr) * (*tapPtr);
    ++inPtr;
    ++tapPtr;
  }

  *result = total;
}
386 
387 #endif /*LV_HAVE_GENERIC*/
388 
389 
390 #ifdef LV_HAVE_SSE
391 
392 
393 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
394 
395  unsigned int number = 0;
396  const unsigned int sixteenthPoints = num_points / 16;
397 
398  float dotProduct = 0;
399  const float* aPtr = input;
400  const float* bPtr = taps;
401 
402  __m128 a0Val, a1Val, a2Val, a3Val;
403  __m128 b0Val, b1Val, b2Val, b3Val;
404  __m128 c0Val, c1Val, c2Val, c3Val;
405 
406  __m128 dotProdVal0 = _mm_setzero_ps();
407  __m128 dotProdVal1 = _mm_setzero_ps();
408  __m128 dotProdVal2 = _mm_setzero_ps();
409  __m128 dotProdVal3 = _mm_setzero_ps();
410 
411  for(;number < sixteenthPoints; number++){
412 
413  a0Val = _mm_load_ps(aPtr);
414  a1Val = _mm_load_ps(aPtr+4);
415  a2Val = _mm_load_ps(aPtr+8);
416  a3Val = _mm_load_ps(aPtr+12);
417  b0Val = _mm_load_ps(bPtr);
418  b1Val = _mm_load_ps(bPtr+4);
419  b2Val = _mm_load_ps(bPtr+8);
420  b3Val = _mm_load_ps(bPtr+12);
421 
422  c0Val = _mm_mul_ps(a0Val, b0Val);
423  c1Val = _mm_mul_ps(a1Val, b1Val);
424  c2Val = _mm_mul_ps(a2Val, b2Val);
425  c3Val = _mm_mul_ps(a3Val, b3Val);
426 
427  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
428  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
429  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
430  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
431 
432  aPtr += 16;
433  bPtr += 16;
434  }
435 
436  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
437  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
438  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
439 
440  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
441 
442  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
443 
444  dotProduct = dotProductVector[0];
445  dotProduct += dotProductVector[1];
446  dotProduct += dotProductVector[2];
447  dotProduct += dotProductVector[3];
448 
449  number = sixteenthPoints*16;
450  for(;number < num_points; number++){
451  dotProduct += ((*aPtr++) * (*bPtr++));
452  }
453 
454  *result = dotProduct;
455 
456 }
457 
458 #endif /*LV_HAVE_SSE*/
459 
460 #ifdef LV_HAVE_SSE3
461 
462 #include <pmmintrin.h>
463 
464 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
465  unsigned int number = 0;
466  const unsigned int sixteenthPoints = num_points / 16;
467 
468  float dotProduct = 0;
469  const float* aPtr = input;
470  const float* bPtr = taps;
471 
472  __m128 a0Val, a1Val, a2Val, a3Val;
473  __m128 b0Val, b1Val, b2Val, b3Val;
474  __m128 c0Val, c1Val, c2Val, c3Val;
475 
476  __m128 dotProdVal0 = _mm_setzero_ps();
477  __m128 dotProdVal1 = _mm_setzero_ps();
478  __m128 dotProdVal2 = _mm_setzero_ps();
479  __m128 dotProdVal3 = _mm_setzero_ps();
480 
481  for(;number < sixteenthPoints; number++){
482 
483  a0Val = _mm_load_ps(aPtr);
484  a1Val = _mm_load_ps(aPtr+4);
485  a2Val = _mm_load_ps(aPtr+8);
486  a3Val = _mm_load_ps(aPtr+12);
487  b0Val = _mm_load_ps(bPtr);
488  b1Val = _mm_load_ps(bPtr+4);
489  b2Val = _mm_load_ps(bPtr+8);
490  b3Val = _mm_load_ps(bPtr+12);
491 
492  c0Val = _mm_mul_ps(a0Val, b0Val);
493  c1Val = _mm_mul_ps(a1Val, b1Val);
494  c2Val = _mm_mul_ps(a2Val, b2Val);
495  c3Val = _mm_mul_ps(a3Val, b3Val);
496 
497  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
498  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
499  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
500  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
501 
502  aPtr += 16;
503  bPtr += 16;
504  }
505 
506  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
507  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
508  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
509 
510  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
511  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
512 
513  dotProduct = dotProductVector[0];
514  dotProduct += dotProductVector[1];
515  dotProduct += dotProductVector[2];
516  dotProduct += dotProductVector[3];
517 
518  number = sixteenthPoints*16;
519  for(;number < num_points; number++){
520  dotProduct += ((*aPtr++) * (*bPtr++));
521  }
522 
523  *result = dotProduct;
524 }
525 
526 #endif /*LV_HAVE_SSE3*/
527 
528 #ifdef LV_HAVE_SSE4_1
529 
530 #include <smmintrin.h>
531 
532 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
533  unsigned int number = 0;
534  const unsigned int sixteenthPoints = num_points / 16;
535 
536  float dotProduct = 0;
537  const float* aPtr = input;
538  const float* bPtr = taps;
539 
540  __m128 aVal1, bVal1, cVal1;
541  __m128 aVal2, bVal2, cVal2;
542  __m128 aVal3, bVal3, cVal3;
543  __m128 aVal4, bVal4, cVal4;
544 
545  __m128 dotProdVal = _mm_setzero_ps();
546 
547  for(;number < sixteenthPoints; number++){
548 
549  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
550  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
551  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
552  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
553 
554  bVal1 = _mm_load_ps(bPtr); bPtr += 4;
555  bVal2 = _mm_load_ps(bPtr); bPtr += 4;
556  bVal3 = _mm_load_ps(bPtr); bPtr += 4;
557  bVal4 = _mm_load_ps(bPtr); bPtr += 4;
558 
559  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
560  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
561  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
562  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
563 
564  cVal1 = _mm_or_ps(cVal1, cVal2);
565  cVal3 = _mm_or_ps(cVal3, cVal4);
566  cVal1 = _mm_or_ps(cVal1, cVal3);
567 
568  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
569  }
570 
571  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
572  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
573 
574  dotProduct = dotProductVector[0];
575  dotProduct += dotProductVector[1];
576  dotProduct += dotProductVector[2];
577  dotProduct += dotProductVector[3];
578 
579  number = sixteenthPoints * 16;
580  for(;number < num_points; number++){
581  dotProduct += ((*aPtr++) * (*bPtr++));
582  }
583 
584  *result = dotProduct;
585 }
586 
587 #endif /*LV_HAVE_SSE4_1*/
588 
589 #ifdef LV_HAVE_AVX
590 
591 #include <immintrin.h>
592 
593 static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
594 
595  unsigned int number = 0;
596  const unsigned int sixteenthPoints = num_points / 16;
597 
598  float dotProduct = 0;
599  const float* aPtr = input;
600  const float* bPtr = taps;
601 
602  __m256 a0Val, a1Val;
603  __m256 b0Val, b1Val;
604  __m256 c0Val, c1Val;
605 
606  __m256 dotProdVal0 = _mm256_setzero_ps();
607  __m256 dotProdVal1 = _mm256_setzero_ps();
608 
609  for(;number < sixteenthPoints; number++){
610 
611  a0Val = _mm256_load_ps(aPtr);
612  a1Val = _mm256_load_ps(aPtr+8);
613  b0Val = _mm256_load_ps(bPtr);
614  b1Val = _mm256_load_ps(bPtr+8);
615 
616  c0Val = _mm256_mul_ps(a0Val, b0Val);
617  c1Val = _mm256_mul_ps(a1Val, b1Val);
618 
619  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
620  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
621 
622  aPtr += 16;
623  bPtr += 16;
624  }
625 
626  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
627 
628  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
629 
630  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
631 
632  dotProduct = dotProductVector[0];
633  dotProduct += dotProductVector[1];
634  dotProduct += dotProductVector[2];
635  dotProduct += dotProductVector[3];
636  dotProduct += dotProductVector[4];
637  dotProduct += dotProductVector[5];
638  dotProduct += dotProductVector[6];
639  dotProduct += dotProductVector[7];
640 
641  number = sixteenthPoints*16;
642  for(;number < num_points; number++){
643  dotProduct += ((*aPtr++) * (*bPtr++));
644  }
645 
646  *result = dotProduct;
647 
648 }
649 
650 #endif /*LV_HAVE_AVX*/
651 
652 #ifdef LV_HAVE_NEON
653 #include <arm_neon.h>
654 
/*!
 * \brief NEON dot product of \p input and \p taps, 16 floats per iteration.
 *
 * Both vectors are read with vld4q_f32, so the 16 floats of a block are
 * split across four q registers in the same way for \p input and \p taps;
 * matching elements therefore still meet in the same lane. Each of the four
 * register pairs feeds its own multiply-accumulate (vmlaq_f32) chain. The
 * four accumulators are then added together, the four lanes are summed, and
 * the remaining (num_points % 16) elements are handled by a scalar loop.
 *
 * \param result     address that receives the dot product
 * \param input      first vector, read for num_points elements
 * \param taps       second vector, read for num_points elements
 * \param num_points number of elements read from each vector
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {

  const unsigned int sixteenth_points = num_points / 16;
  const float* in_ptr = input;
  const float* tap_ptr = taps;
  unsigned int blk, idx;
  float sum = 0;

  float32x4x4_t in_vec, tap_vec, acc;
  __VOLK_ATTR_ALIGNED(32) float lanes[4];

  acc.val[0] = vdupq_n_f32(0);
  acc.val[1] = vdupq_n_f32(0);
  acc.val[2] = vdupq_n_f32(0);
  acc.val[3] = vdupq_n_f32(0);

  // Unrolled by four with independent accumulators:
  // 4 (input) + 4 (taps) + 4 (accumulators) = 12 q registers in flight
  for(blk = 0; blk < sixteenth_points; ++blk, in_ptr += 16, tap_ptr += 16) {
    in_vec = vld4q_f32(in_ptr);
    tap_vec = vld4q_f32(tap_ptr);
    acc.val[0] = vmlaq_f32(acc.val[0], in_vec.val[0], tap_vec.val[0]);
    acc.val[1] = vmlaq_f32(acc.val[1], in_vec.val[1], tap_vec.val[1]);
    acc.val[2] = vmlaq_f32(acc.val[2], in_vec.val[2], tap_vec.val[2]);
    acc.val[3] = vmlaq_f32(acc.val[3], in_vec.val[3], tap_vec.val[3]);
  }

  // Pairwise fold: (acc2 + acc3) + (acc0 + acc1)
  acc.val[0] = vaddq_f32(acc.val[0], acc.val[1]);
  acc.val[2] = vaddq_f32(acc.val[2], acc.val[3]);
  acc.val[0] = vaddq_f32(acc.val[2], acc.val[0]);

  vst1q_f32(lanes, acc.val[0]);
  sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

  // Scalar tail for the elements that did not fill a block of 16
  for(idx = sixteenth_points * 16; idx < num_points; idx++){
    sum += (*in_ptr++) * (*tap_ptr++);
  }

  *result = sum;
}
693 
694 #endif
695 
696 
697 
698 
699 #ifdef LV_HAVE_NEON
/*!
 * \brief NEON dot product of \p input and \p taps, 8 floats per iteration.
 *
 * Both vectors are read with vld2q_f32, so the 8 floats of a block are split
 * across two q registers in the same way for \p input and \p taps; matching
 * elements therefore still meet in the same lane. Each register pair feeds
 * its own multiply-accumulate (vmlaq_f32) chain. The two accumulators are
 * then added, the four lanes are summed, and the remaining (num_points % 8)
 * elements are handled by a scalar loop.
 *
 * \param result     address that receives the dot product
 * \param input      first vector, read for num_points elements
 * \param taps       second vector, read for num_points elements
 * \param num_points number of elements read from each vector
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {

  const unsigned int eighth_points = num_points / 8;
  const float* in_ptr = input;
  const float* tap_ptr = taps;
  unsigned int blk, idx;
  float sum = 0;

  float32x4x2_t in_vec, tap_vec, acc;
  __VOLK_ATTR_ALIGNED(32) float lanes[4];

  acc.val[0] = vdupq_n_f32(0);
  acc.val[1] = vdupq_n_f32(0);

  // Unrolled by two with independent accumulators
  for(blk = 0; blk < eighth_points; ++blk, in_ptr += 8, tap_ptr += 8) {
    in_vec = vld2q_f32(in_ptr);
    tap_vec = vld2q_f32(tap_ptr);
    acc.val[0] = vmlaq_f32(acc.val[0], in_vec.val[0], tap_vec.val[0]);
    acc.val[1] = vmlaq_f32(acc.val[1], in_vec.val[1], tap_vec.val[1]);
  }

  acc.val[0] = vaddq_f32(acc.val[0], acc.val[1]);

  vst1q_f32(lanes, acc.val[0]);
  sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

  // Scalar tail for the elements that did not fill a block of 8
  for(idx = eighth_points * 8; idx < num_points; idx++){
    sum += (*in_ptr++) * (*tap_ptr++);
  }

  *result = sum;
}
731 
732 #endif /* LV_HAVE_NEON */
733 
734 #ifdef LV_HAVE_NEON
/*
 * Declaration only: the definition is not in this header and must be
 * supplied at link time. The signature has the same shape as the other
 * kernels in this file, but the parameters are named differently.
 * NOTE(review): presumably cVector is the output (result) and
 * aVector/bVector correspond to input/taps -- confirm against the
 * implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
736 #endif /* LV_HAVE_NEON */
737 
738 #ifdef LV_HAVE_NEON
/*
 * Declaration only: the definition is not in this header and must be
 * supplied at link time. The signature has the same shape as the other
 * kernels in this file, but the parameters are named differently.
 * NOTE(review): presumably cVector is the output (result) and
 * aVector/bVector correspond to input/taps, and this is a tuned variant of
 * the neonasm kernel -- confirm against the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
740 #endif /* LV_HAVE_NEON */
741 
742 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9