GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_x3_sum_of_poly_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_x3_sum_of_poly_32f
25  *
26  * \b Overview
27  *
28  * Calculates the unscaled area under a fourth order polynomial using the
29  * rectangular method. The result is the sum of y-values. To get the area,
30  * multiply by the rectangle/bin width.
31  *
32  * Expressed as a formula, this function calculates
33  * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$
34  *
35  * <b>Dispatcher Prototype</b>
36  * \code
37  * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
38  * \endcode
39  *
40  * \b Inputs
41  * \li src0: x values
42  * \li center_point_array: polynomial coefficients in order {c1, c2, c3, c4, c0}
43  * \li cutoff: the minimum x value to use (will clamp to cutoff if input < cutoff)
 44  * \li num_points: The number of values in src0.
45  *
46  * \b Outputs
 47  * \li target: The sum of y values generated by the polynomial.
48  *
49  * \b Example
50  * The following estimates \f$\int_0^\pi e^x dx\f$ by using the Taylor expansion
51  * centered around \f$x=1.5\f$,
 52  * \f$ e^x = e^{1.5} \cdot (1 + x + \frac{1}{2} x^2 + \frac{1}{6}x^3 + \frac{1}{24}x^4) \f$
53  * \code
54  * int npoints = 4096;
55  * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
56  * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
57  * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());
58  * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
59  * // load precomputed Taylor series coefficients
60  * coefficients[0] = 4.48168907033806f; // c1
61  * coefficients[1] = coefficients[0] * 0.5f; // c2
62  * coefficients[2] = coefficients[0] * 1.0f/6.0f; // c3
63  * coefficients[3] = coefficients[0] * 1.0f/24.0f; // c4
64  * coefficients[4] = coefficients[0]; // c0
65  * *cutoff = -2.0;
66  * *result = 0.0f;
67  * // generate uniform input data
68  * float dx = (float)M_PI/ (float)npoints;
69  * for(unsigned int ii=0; ii < npoints; ++ii){
70  * input[ii] = dx * (float)ii - 1.5f;
71  * }
72  * volk_32f_x3_sum_of_poly_32f(result, input, coefficients, cutoff, npoints);
73  * // multiply by bin width to get approximate area
74  * std::cout << "result is " << *result * (input[1]-input[0]) << std::endl;
75  * volk_free(coefficients);
76  * volk_free(input);
77  * volk_free(result);
78  * volk_free(cutoff);
79  * \endcode
80  */
81 
82 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
83 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
84 
85 #include<inttypes.h>
86 #include<stdio.h>
87 #include<volk/volk_complex.h>
88 
89 #ifndef MAX
90 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
91 #endif
92 
93 #ifdef LV_HAVE_SSE3
94 #include<xmmintrin.h>
95 #include<pmmintrin.h>
96 
97 static inline void
98 volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
99  float* cutoff, unsigned int num_points)
100 {
101  const unsigned int num_bytes = num_points*4;
102 
103  float result = 0.0;
104  float fst = 0.0;
105  float sq = 0.0;
106  float thrd = 0.0;
107  float frth = 0.0;
108  //float fith = 0.0;
109 
110  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
111 
112  xmm9 = _mm_setzero_ps();
113  xmm1 = _mm_setzero_ps();
114 
115  xmm0 = _mm_load1_ps(&center_point_array[0]);
116  xmm6 = _mm_load1_ps(&center_point_array[1]);
117  xmm7 = _mm_load1_ps(&center_point_array[2]);
118  xmm8 = _mm_load1_ps(&center_point_array[3]);
119  //xmm11 = _mm_load1_ps(&center_point_array[4]);
120  xmm10 = _mm_load1_ps(cutoff);
121 
122  int bound = num_bytes >> 4;
123  int leftovers = (num_bytes >> 2) & 3;
124  int i = 0;
125 
126  for(; i < bound; ++i) {
127  xmm2 = _mm_load_ps(src0);
128  xmm2 = _mm_max_ps(xmm10, xmm2);
129  xmm3 = _mm_mul_ps(xmm2, xmm2);
130  xmm4 = _mm_mul_ps(xmm2, xmm3);
131  xmm5 = _mm_mul_ps(xmm3, xmm3);
132  //xmm12 = _mm_mul_ps(xmm3, xmm4);
133 
134  xmm2 = _mm_mul_ps(xmm2, xmm0);
135  xmm3 = _mm_mul_ps(xmm3, xmm6);
136  xmm4 = _mm_mul_ps(xmm4, xmm7);
137  xmm5 = _mm_mul_ps(xmm5, xmm8);
138  //xmm12 = _mm_mul_ps(xmm12, xmm11);
139 
140  xmm2 = _mm_add_ps(xmm2, xmm3);
141  xmm3 = _mm_add_ps(xmm4, xmm5);
142 
143  src0 += 4;
144 
145  xmm9 = _mm_add_ps(xmm2, xmm9);
146 
147  xmm1 = _mm_add_ps(xmm3, xmm1);
148 
149  //xmm9 = _mm_add_ps(xmm12, xmm9);
150  }
151 
152  xmm2 = _mm_hadd_ps(xmm9, xmm1);
153  xmm3 = _mm_hadd_ps(xmm2, xmm2);
154  xmm4 = _mm_hadd_ps(xmm3, xmm3);
155 
156  _mm_store_ss(&result, xmm4);
157 
158 
159 
160  for(i = 0; i < leftovers; ++i) {
161  fst = src0[i];
162  fst = MAX(fst, *cutoff);
163  sq = fst * fst;
164  thrd = fst * sq;
165  frth = sq * sq;
166  //fith = sq * thrd;
167 
168  result += (center_point_array[0] * fst +
169  center_point_array[1] * sq +
170  center_point_array[2] * thrd +
171  center_point_array[3] * frth);// +
172  //center_point_array[4] * fith);
173  }
174 
175  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
176 
177  target[0] = result;
178 }
179 
180 
181 #endif /*LV_HAVE_SSE3*/
182 
183 
184 #ifdef LV_HAVE_AVX
185 #include<immintrin.h>
186 
187 static inline void
188 volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
189  float* cutoff, unsigned int num_points)
190 {
191  const unsigned int eighth_points = num_points / 8;
192  float fst = 0.0;
193  float sq = 0.0;
194  float thrd = 0.0;
195  float frth = 0.0;
196 
197  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
198  __m256 target_vec;
199  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
200 
201  cpa0 = _mm256_set1_ps(center_point_array[0]);
202  cpa1 = _mm256_set1_ps(center_point_array[1]);
203  cpa2 = _mm256_set1_ps(center_point_array[2]);
204  cpa3 = _mm256_set1_ps(center_point_array[3]);
205  cutoff_vec = _mm256_set1_ps(*cutoff);
206  target_vec = _mm256_setzero_ps();
207 
208  unsigned int i;
209 
210  for(i = 0; i < eighth_points; ++i) {
211  x_to_1 = _mm256_load_ps(src0);
212  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
213  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
214  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
215  // x^1 * x^3 is slightly faster than x^2 * x^2
216  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
217 
218  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
219  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
220  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
221  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
222 
223  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
224  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
225  // this is slightly faster than result += (x_to_1 + x_to_3)
226  target_vec = _mm256_add_ps(x_to_1, target_vec);
227  target_vec = _mm256_add_ps(x_to_3, target_vec);
228 
229  src0 += 8;
230  }
231 
232  // the hadd for vector reduction has very very slight impact @ 50k iters
233  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
234  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
235  _mm256_store_ps(temp_results, target_vec);
236  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
237 
238 
239  for(i = eighth_points*8; i < num_points; ++i) {
240  fst = *(src0++);
241  fst = MAX(fst, *cutoff);
242  sq = fst * fst;
243  thrd = fst * sq;
244  frth = sq * sq;
245 
246  *target += (center_point_array[0] * fst +
247  center_point_array[1] * sq +
248  center_point_array[2] * thrd +
249  center_point_array[3] * frth);
250  }
251 
252  *target += ((float)(num_points)) * center_point_array[4];
253 }
254 #endif // LV_HAVE_AVX
255 
256 
257 #ifdef LV_HAVE_GENERIC
258 
static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
                                    float* cutoff, unsigned int num_points)
{
  /*
   * Portable reference implementation.  For every input sample x (clamped
   * from below at *cutoff) accumulates c1*x + c2*x^2 + c3*x^3 + c4*x^4,
   * then adds the constant term center_point_array[4] once per point.
   */
  const unsigned int num_bytes = num_points * 4;
  const unsigned int count = num_bytes >> 2; /* == num_points */

  float acc = 0.0;
  unsigned int n;

  for(n = 0; n < count; ++n) {
    /* clamp: same semantics as MAX(x, *cutoff) */
    float x = src0[n];
    x = (x > *cutoff) ? x : *cutoff;

    float x2 = x * x;
    float x3 = x * x2;
    float x4 = x2 * x2;

    acc += (center_point_array[0] * x +
            center_point_array[1] * x2 +
            center_point_array[2] * x3 +
            center_point_array[3] * x4);
  }

  /* constant term c0 (stored at index 4) contributes once per point */
  acc += ((float)count) * (center_point_array[4]);

  *target = acc;
}
301 
302 #endif /*LV_HAVE_GENERIC*/
303 
304 
305 #ifdef LV_HAVE_AVX
306 #include<immintrin.h>
307 
308 static inline void
309 volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
310  float* cutoff, unsigned int num_points)
311 {
312  const unsigned int eighth_points = num_points / 8;
313  float fst = 0.0;
314  float sq = 0.0;
315  float thrd = 0.0;
316  float frth = 0.0;
317 
318  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
319  __m256 target_vec;
320  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
321 
322  cpa0 = _mm256_set1_ps(center_point_array[0]);
323  cpa1 = _mm256_set1_ps(center_point_array[1]);
324  cpa2 = _mm256_set1_ps(center_point_array[2]);
325  cpa3 = _mm256_set1_ps(center_point_array[3]);
326  cutoff_vec = _mm256_set1_ps(*cutoff);
327  target_vec = _mm256_setzero_ps();
328 
329  unsigned int i;
330 
331  for(i = 0; i < eighth_points; ++i) {
332  x_to_1 = _mm256_loadu_ps(src0);
333  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
334  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
335  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
336  // x^1 * x^3 is slightly faster than x^2 * x^2
337  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
338 
339  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
340  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
341  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
342  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
343 
344  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
345  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
346  // this is slightly faster than result += (x_to_1 + x_to_3)
347  target_vec = _mm256_add_ps(x_to_1, target_vec);
348  target_vec = _mm256_add_ps(x_to_3, target_vec);
349 
350  src0 += 8;
351  }
352 
353  // the hadd for vector reduction has very very slight impact @ 50k iters
354  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
355  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
356  _mm256_store_ps(temp_results, target_vec);
357  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
358 
359 
360  for(i = eighth_points*8; i < num_points; ++i) {
361  fst = *(src0++);
362  fst = MAX(fst, *cutoff);
363  sq = fst * fst;
364  thrd = fst * sq;
365  frth = sq * sq;
366 
367  *target += (center_point_array[0] * fst +
368  center_point_array[1] * sq +
369  center_point_array[2] * thrd +
370  center_point_array[3] * frth);
371  }
372 
373  *target += ((float)(num_points)) * center_point_array[4];
374 }
375 #endif // LV_HAVE_AVX
376 
377 #ifdef LV_HAVE_NEON
378 #include <arm_neon.h>
379 
static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff, unsigned int num_points)
{
  // One sample per iteration: packs [x, x^2, x^3, x^4] into a single quad
  // vector so one vmlaq multiply-accumulates all four power terms against
  // the coefficient vector {c1, c2, c3, c4} at once.  The constant term
  // center_point_array[4] is added once per point at the end.
  unsigned int i;
  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };

  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x2_t cutoff_vector;
  float32x2x2_t x_low, x_high;
  float32x4_t x_qvector, c_qvector, cpa_qvector;
  float accumulator;
  float res_accumulators[4];

  // per-lane accumulator: lane k holds the running sum of c_{k+1} * x^{k+1}
  c_qvector = vld1q_f32( zero );
  // load the cutoff in to a vector
  cutoff_vector = vdup_n_f32( *cutoff );
  // ... center point array (lanes: c1, c2, c3, c4)
  cpa_qvector = vld1q_f32( center_point_array );

  for(i=0; i < num_points; ++i) {
    // load x (src0), duplicated into both halves of a double vector
    x_to_1 = vdup_n_f32( *src0++ );

    // Get a vector of max(src0, cutoff)
    x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
    x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
    x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
    x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
    // zip up doubles to interleave
    x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
    x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
    // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
    // now we finally have [x^4 | x^3 | x^2 | x] !

    // fused accumulate: c_qvector += x_qvector * cpa_qvector (lane-wise)
    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);

  }
  // there should be better vector reduction techniques
  vst1q_f32(res_accumulators, c_qvector );
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  // constant term c0 (stored at index 4) contributes once per point
  *target = accumulator + center_point_array[4] * (float)num_points;
}
427 
428 #endif /* LV_HAVE_NEON */
429 
430 
431 #ifdef LV_HAVE_NEON
432 
static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff, unsigned int num_points)
{
  /*
   * Vertical NEON variant: four independent quad accumulators, one per
   * power term, four samples per iteration.  Computes
   *   sum_x (c1*x + c2*x^2 + c3*x^3 + c4*x^4) + num_points * c0
   * where each x is clamped from below at *cutoff and c0 is
   * center_point_array[4].
   */
  unsigned int i;
  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };

  float accumulator;

  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
  accumulator1_vec = vld1q_f32(zero);
  accumulator2_vec = vld1q_f32(zero);
  accumulator3_vec = vld1q_f32(zero);
  accumulator4_vec = vld1q_f32(zero);
  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

  // load the cutoff in to a vector
  cutoff_vector = vdupq_n_f32( *cutoff );
  // broadcast each polynomial coefficient
  cpa_0 = vdupq_n_f32(center_point_array[0]);
  cpa_1 = vdupq_n_f32(center_point_array[1]);
  cpa_2 = vdupq_n_f32(center_point_array[2]);
  cpa_3 = vdupq_n_f32(center_point_array[3]);

  for(i=0; i < num_points/4; ++i) {
    // load four x values
    x_to_1 = vld1q_f32( src0 );

    // Get a vector of max(src0, cutoff)
    x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
    x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
    x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
    x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
    x_to_1 = vmulq_f32(x_to_1, cpa_0);
    x_to_2 = vmulq_f32(x_to_2, cpa_1);
    x_to_3 = vmulq_f32(x_to_3, cpa_2);
    x_to_4 = vmulq_f32(x_to_4, cpa_3);
    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

    src0 += 4;
  }
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
  vst1q_f32(res_accumulators, accumulator1_vec );
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  // Scalar tail for the 0..3 leftover samples.  src0 was already advanced
  // past the vectorized samples inside the loop above, so dereference it
  // directly: the previous code indexed src0[i] with i starting at
  // 4*(num_points/4), which read far past the end of the input buffer.
  for(i = 4*(num_points/4); i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);

    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    accumulator += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
  }

  // constant term c0 (stored at index 4) contributes once per point
  *target = accumulator + center_point_array[4] * (float)num_points;
}
511 
512 #endif /* LV_HAVE_NEON */
513 
514 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
#define MAX(X, Y)
Definition: volk_32f_x3_sum_of_poly_32f.h:90
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27