GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_log2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_log2_32f
25  *
26  * \b Overview
27  *
28  * Computes base 2 log of input vector and stores results in output vector.
29  *
30  * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation
31  * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm
32  *
33  * Permission is hereby granted, free of charge, to any person obtaining a
34  * copy of this software and associated documentation files (the
35  * "Software"), to deal in the Software without restriction, including
36  * without limitation the rights to use, copy, modify, merge, publish,
37  * distribute, sub license, and/or sell copies of the Software, and to
38  * permit persons to whom the Software is furnished to do so, subject to
39  * the following conditions:
40  *
41  * The above copyright notice and this permission notice (including the
42  * next paragraph) shall be included in all copies or substantial portions
43  * of the Software.
44  *
45  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
46  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
47  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
48  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
49  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
50  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
51  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
52  *
53  * This is the MIT License (MIT)
54  *
55  * <b>Dispatcher Prototype</b>
56  * \code
57  * void volk_32f_log2_32f(float* bVector, const float* aVector, unsigned int num_points)
58  * \endcode
59  *
60  * \b Inputs
61  * \li aVector: the input vector of floats.
62  * \li num_points: The number of data points.
63  *
64  * \b Outputs
65  * \li cVector: The output vector.
66  *
67  * \b Example
68  * \code
69  * int N = 10;
70  * unsigned int alignment = volk_get_alignment();
71  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
72  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
73  *
74  * for(unsigned int ii = 0; ii < N; ++ii){
75  * in[ii] = std::pow(2.f,((float)ii));
76  * }
77  *
78  * volk_32f_log2_32f(out, in, N);
79  *
80  * for(unsigned int ii = 0; ii < N; ++ii){
81  * printf("out(%i) = %f\n", ii, out[ii]);
82  * }
83  *
84  * volk_free(in);
85  * volk_free(out);
86  * \endcode
87  */
88 
89 #ifndef INCLUDED_volk_32f_log2_32f_a_H
90 #define INCLUDED_volk_32f_log2_32f_a_H
91 
92 #include <stdio.h>
93 #include <stdlib.h>
94 #include <inttypes.h>
95 #include <math.h>
96 
97 #define POLY0(x, c0) _mm_set1_ps(c0)
98 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
99 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
100 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
101 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
102 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
103 
104 #define LOG_POLY_DEGREE 6
105 
106 #ifdef LV_HAVE_GENERIC
107 
108 static inline void
109 volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
110 {
111  float* bPtr = bVector;
112  const float* aPtr = aVector;
113  unsigned int number = 0;
114 
115  for(number = 0; number < num_points; number++) {
116  *bPtr++ = log2(*aPtr++);
117  }
118 
119 }
120 #endif /* LV_HAVE_GENERIC */
121 
122 
123 #ifdef LV_HAVE_SSE4_1
124 #include <smmintrin.h>
125 
126 static inline void
127 volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
128 {
129  float* bPtr = bVector;
130  const float* aPtr = aVector;
131 
132  unsigned int number = 0;
133  const unsigned int quarterPoints = num_points / 4;
134 
135  __m128 aVal, bVal, mantissa, frac, leadingOne;
136  __m128i bias, exp;
137 
138  for(;number < quarterPoints; number++){
139 
140  aVal = _mm_load_ps(aPtr);
141  bias = _mm_set1_epi32(127);
142  leadingOne = _mm_set1_ps(1.0f);
143  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
144  bVal = _mm_cvtepi32_ps(exp);
145 
146  // Now to extract mantissa
147  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
148 
149 #if LOG_POLY_DEGREE == 6
150  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
151 #elif LOG_POLY_DEGREE == 5
152  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
153 #elif LOG_POLY_DEGREE == 4
154  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
155 #elif LOG_POLY_DEGREE == 3
156  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
157 #else
158 #error
159 #endif
160 
161  bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
162  _mm_store_ps(bPtr, bVal);
163 
164  aPtr += 4;
165  bPtr += 4;
166  }
167 
168  number = quarterPoints * 4;
169  for(;number < num_points; number++){
170  *bPtr++ = log2(*aPtr++);
171  }
172 }
173 
174 #endif /* LV_HAVE_SSE4_1 for aligned */
175 
176 
177 #ifdef LV_HAVE_NEON
178 #include <arm_neon.h>
179 
180 /* these macros allow us to embed logs in other kernels */
181 #define VLOG2Q_NEON_PREAMBLE() \
182  int32x4_t one = vdupq_n_s32(0x000800000); \
183  /* minimax polynomial */ \
184  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
185  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
186  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
187  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
188  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
189  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
190  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
191  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
192  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
193  int32x4_t exp_bias = vdupq_n_s32(127);
194 
195 
196 #define VLOG2Q_NEON_F32(log2_approx, aval) \
197  int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
198  int32x4_t significand_i = vandq_s32(aval, sig_mask); \
199  exponent_i = vshrq_n_s32(exponent_i, 23); \
200  \
201  /* extract the exponent and significand \
202  we can treat this as fixed point to save ~9% on the \
203  conversion + float add */ \
204  significand_i = vorrq_s32(one, significand_i); \
205  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
206  /* debias the exponent and convert to float */ \
207  exponent_i = vsubq_s32(exponent_i, exp_bias); \
208  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
209  \
210  /* put the significand through a polynomial fit of log2(x) [1,2] \
211  add the result to the exponent */ \
212  log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
213  float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
214  log2_approx = vaddq_f32(log2_approx, tmp1); \
215  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
216  tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
217  log2_approx = vaddq_f32(log2_approx, tmp1); \
218  \
219  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
220  tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
221  log2_approx = vaddq_f32(log2_approx, tmp1); \
222  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
223  tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
224  log2_approx = vaddq_f32(log2_approx, tmp1); \
225  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
226  tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
227  log2_approx = vaddq_f32(log2_approx, tmp1); \
228  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
229  tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
230  log2_approx = vaddq_f32(log2_approx, tmp1);
231 
232 static inline void
233 volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
234 {
235  float* bPtr = bVector;
236  const float* aPtr = aVector;
237  unsigned int number;
238  const unsigned int quarterPoints = num_points / 4;
239 
240  int32x4_t aval;
241  float32x4_t log2_approx;
242 
243  VLOG2Q_NEON_PREAMBLE()
244  // lms
245  //p0 = vdupq_n_f32(-1.649132280361871);
246  //p1 = vdupq_n_f32(1.995047138579499);
247  //p2 = vdupq_n_f32(-0.336914839219728);
248 
249  // keep in mind a single precision float is represented as
250  // (-1)^sign * 2^exp * 1.significand, so the log2 is
251  // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
252  for(number = 0; number < quarterPoints; ++number){
253  // load float in to an int register without conversion
254  aval = vld1q_s32((int*)aPtr);
255 
256  VLOG2Q_NEON_F32(log2_approx, aval)
257 
258  vst1q_f32(bPtr, log2_approx);
259 
260  aPtr += 4;
261  bPtr += 4;
262  }
263 
264  for(number = quarterPoints * 4; number < num_points; number++){
265  *bPtr++ = log2(*aPtr++);
266  }
267 }
268 
269 #endif /* LV_HAVE_NEON */
270 
271 
272 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
273 
274 #ifndef INCLUDED_volk_32f_log2_32f_u_H
275 #define INCLUDED_volk_32f_log2_32f_u_H
276 
277 
278 #ifdef LV_HAVE_GENERIC
279 
280 static inline void
281 volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
282 {
283  float* bPtr = bVector;
284  const float* aPtr = aVector;
285  unsigned int number = 0;
286 
287  for(number = 0; number < num_points; number++){
288  *bPtr++ = log2(*aPtr++);
289  }
290 }
291 
292 #endif /* LV_HAVE_GENERIC */
293 
294 
295 #ifdef LV_HAVE_SSE4_1
296 #include <smmintrin.h>
297 
298 static inline void
299 volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
300 {
301  float* bPtr = bVector;
302  const float* aPtr = aVector;
303 
304  unsigned int number = 0;
305  const unsigned int quarterPoints = num_points / 4;
306 
307  __m128 aVal, bVal, mantissa, frac, leadingOne;
308  __m128i bias, exp;
309 
310  for(;number < quarterPoints; number++){
311 
312  aVal = _mm_loadu_ps(aPtr);
313  bias = _mm_set1_epi32(127);
314  leadingOne = _mm_set1_ps(1.0f);
315  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
316  bVal = _mm_cvtepi32_ps(exp);
317 
318  // Now to extract mantissa
319  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
320 
321 #if LOG_POLY_DEGREE == 6
322  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
323 #elif LOG_POLY_DEGREE == 5
324  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
325 #elif LOG_POLY_DEGREE == 4
326  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
327 #elif LOG_POLY_DEGREE == 3
328  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
329 #else
330 #error
331 #endif
332 
333  bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
334  _mm_storeu_ps(bPtr, bVal);
335 
336  aPtr += 4;
337  bPtr += 4;
338  }
339 
340  number = quarterPoints * 4;
341  for(;number < num_points; number++){
342  *bPtr++ = log2(*aPtr++);
343  }
344 }
345 
346 #endif /* LV_HAVE_SSE4_1 for unaligned */
347 
348 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
#define POLY3(x, c0, c1, c2, c3)
Definition: volk_32f_log2_32f.h:100
#define POLY4(x, c0, c1, c2, c3, c4)
Definition: volk_32f_log2_32f.h:101
#define POLY2(x, c0, c1, c2)
Definition: volk_32f_log2_32f.h:99
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition: volk_32f_log2_32f.h:102