GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_tanh_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_tanh_32f
25  *
26  * \b Overview
27  *
28  * Computes the hyperbolic tangent of each element of the aVector:
29  *
30  * c[i] = tanh(a[i])
31  *
32  * <b>Dispatcher Prototype</b>
33  * \code
34  * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
35  * \endcode
36  *
37  * \b Inputs
38  * \li aVector: The buffer of points.
39  * \li num_points: The number of values in the input buffer.
40  *
41  * \b Outputs
42  * \li cVector: The output buffer.
43  *
44  * \b Example
45  * \code
46  * int N = 10;
47  * unsigned int alignment = volk_get_alignment();
48  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
49  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
50  *
51  * for(unsigned int ii = 0; ii < N; ++ii){
52  * // in[ii] = artanh(x) for 0 <= x < 1, so tanh(in[ii]) should give back x
53  * float x = (float)ii / (float)N;
54  * in[ii] = 0.5 * std::log((1.f+x)/(1.f-x));
55  * }
56  *
57  * volk_32f_tanh_32f(out, in, N);
58  *
59  * for(unsigned int ii = 0; ii < N; ++ii){
60  * printf("out(%u) = %f\n", ii, out[ii]);
61  * }
62  *
63  * volk_free(in);
64  * volk_free(out);
65  * \endcode
66  */
67 
68 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
69 #define INCLUDED_volk_32f_tanh_32f_a_H
70 
71 #include <inttypes.h>
72 #include <stdio.h>
73 #include <math.h>
74 #include <string.h>
75 
76 #ifdef LV_HAVE_GENERIC
77 
78 static inline void
79 volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
80  unsigned int num_points)
81 {
82  unsigned int number = 0;
83  float* cPtr = cVector;
84  const float* aPtr = aVector;
85  for(; number < num_points; number++) {
86  *cPtr++ = tanh(*aPtr++);
87  }
88 }
89 
90 #endif /* LV_HAVE_GENERIC */
91 
92 
93 #ifdef LV_HAVE_GENERIC
94 
/*!
 * \brief Approximates tanh element-wise with a ratio of two polynomials.
 *
 * Inputs above 4.97 produce exactly 1 and inputs at or below -4.97 produce
 * exactly -1; every other input is evaluated as a(x)/b(x), where a is an odd
 * degree-7 polynomial and b an even degree-6 polynomial in x.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param num_points Number of elements to process (0 is a no-op).
 */
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector,
                         unsigned int num_points)
{
  unsigned int number = 0;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  for(; number < num_points; number++) {
    const float x = *aPtr;
    if(x > 4.97) {
      *cPtr = 1;
    }
    else if(x <= -4.97) {
      *cPtr = -1;
    }
    else {
      float x2 = x * x;
      float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
      *cPtr = a / b;
    }
    /* Advance both pointers on every iteration. Previously the input pointer
     * only moved in the polynomial branch, so a saturated sample was re-read
     * for all following outputs. */
    cPtr++;
    aPtr++;
  }
}
116 
117 #endif /* LV_HAVE_GENERIC */
118 
119 
120 
121 #ifdef LV_HAVE_SSE
122 #include <xmmintrin.h>
123 
/*!
 * \brief SSE tanh approximation for 16-byte aligned buffers.
 *
 * Processes four floats per iteration with aligned loads/stores, then handles
 * the remaining (num_points % 4) elements with scalar code. Both paths use the
 * same polynomial ratio and the same saturation: inputs above 4.97 give
 * exactly 1, inputs at or below -4.97 give exactly -1.
 *
 * \param cVector    Output buffer (must satisfy _mm_store_ps alignment).
 * \param aVector    Input buffer (must satisfy _mm_load_ps alignment).
 * \param num_points Number of elements to process (0 is a no-op).
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b, satHi, satLo;
  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);
  /* 4.97f rounds to just below 4.97, so on floats (x > 4.97f) and
   * (x < -4.97f) select the same inputs as the scalar tail's double
   * comparisons (x > 4.97) and (x <= -4.97). */
  const __m128 limit = _mm_set_ps1(4.97f);
  const __m128 negLimit = _mm_set_ps1(-4.97f);
  const __m128 one = _mm_set_ps1(1.0f);
  const __m128 minusOne = _mm_set_ps1(-1.0f);

  for(; number < quarterPoints; number++) {
    aVal = _mm_load_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    /* Saturate lanes outside the approximation's range; without this the
     * ratio grows past +/-1 (and becomes NaN once a and b overflow). */
    satHi = _mm_cmpgt_ps(aVal, limit);
    satLo = _mm_cmplt_ps(aVal, negLimit);
    cVal = _mm_or_ps(_mm_and_ps(satHi, one), _mm_andnot_ps(satHi, cVal));
    cVal = _mm_or_ps(_mm_and_ps(satLo, minusOne), _mm_andnot_ps(satLo, cVal));

    _mm_store_ps(cPtr, cVal); /* write four results */

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the last (num_points % 4) elements. */
  for(number = quarterPoints * 4; number < num_points; number++) {
    const float x = *aPtr;
    if(x > 4.97) {
      *cPtr = 1;
    }
    else if(x <= -4.97) {
      *cPtr = -1;
    }
    else {
      float xx = x * x;
      float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr = num / den;
    }
    /* Advance the input on every iteration, including saturated samples. */
    cPtr++;
    aPtr++;
  }
}
172 #endif /* LV_HAVE_SSE */
173 
174 
175 #ifdef LV_HAVE_AVX
176 #include <immintrin.h>
177 
178 static inline void
179 volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
180  unsigned int num_points)
181 {
182  unsigned int number = 0;
183  const unsigned int eighthPoints = num_points / 8;
184 
185  float* cPtr = cVector;
186  const float* aPtr = aVector;
187 
188  __m256 aVal, cVal, x2, a, b;
189  __m256 const1, const2, const3, const4, const5, const6;
190  const1 = _mm256_set1_ps(135135.0f);
191  const2 = _mm256_set1_ps(17325.0f);
192  const3 = _mm256_set1_ps(378.0f);
193  const4 = _mm256_set1_ps(62370.0f);
194  const5 = _mm256_set1_ps(3150.0f);
195  const6 = _mm256_set1_ps(28.0f);
196  for(;number < eighthPoints; number++){
197 
198  aVal = _mm256_load_ps(aPtr);
199  x2 = _mm256_mul_ps(aVal, aVal);
200  a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
201  b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
202 
203  cVal = _mm256_div_ps(a, b);
204 
205  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
206 
207  aPtr += 8;
208  cPtr += 8;
209  }
210 
211  number = eighthPoints * 8;
212  for(;number < num_points; number++) {
213  if(*aPtr > 4.97)
214  *cPtr++ = 1;
215  else if(*aPtr <= -4.97)
216  *cPtr++ = -1;
217  else {
218  float x2 = (*aPtr) * (*aPtr);
219  float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
220  float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
221  *cPtr++ = a / b;
222  aPtr++;
223  }
224  }
225 }
226 #endif /* LV_HAVE_AVX */
227 
228 
229 
230 
231 #ifdef LV_HAVE_SSE
232 #include <xmmintrin.h>
233 
/*!
 * \brief SSE tanh approximation for buffers of arbitrary alignment.
 *
 * Processes four floats per iteration with unaligned loads/stores, then
 * handles the remaining (num_points % 4) elements with scalar code. Both paths
 * use the same polynomial ratio and the same saturation: inputs above 4.97
 * give exactly 1, inputs at or below -4.97 give exactly -1.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    Input buffer; num_points floats are read.
 * \param num_points Number of elements to process (0 is a no-op).
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b, satHi, satLo;
  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);
  /* 4.97f rounds to just below 4.97, so on floats (x > 4.97f) and
   * (x < -4.97f) select the same inputs as the scalar tail's double
   * comparisons (x > 4.97) and (x <= -4.97). */
  const __m128 limit = _mm_set_ps1(4.97f);
  const __m128 negLimit = _mm_set_ps1(-4.97f);
  const __m128 one = _mm_set_ps1(1.0f);
  const __m128 minusOne = _mm_set_ps1(-1.0f);

  for(; number < quarterPoints; number++) {
    aVal = _mm_loadu_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    /* Saturate lanes outside the approximation's range; without this the
     * ratio grows past +/-1 (and becomes NaN once a and b overflow). */
    satHi = _mm_cmpgt_ps(aVal, limit);
    satLo = _mm_cmplt_ps(aVal, negLimit);
    cVal = _mm_or_ps(_mm_and_ps(satHi, one), _mm_andnot_ps(satHi, cVal));
    cVal = _mm_or_ps(_mm_and_ps(satLo, minusOne), _mm_andnot_ps(satLo, cVal));

    _mm_storeu_ps(cPtr, cVal); /* write four results */

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the last (num_points % 4) elements. */
  for(number = quarterPoints * 4; number < num_points; number++) {
    const float x = *aPtr;
    if(x > 4.97) {
      *cPtr = 1;
    }
    else if(x <= -4.97) {
      *cPtr = -1;
    }
    else {
      float xx = x * x;
      float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr = num / den;
    }
    /* Advance the input on every iteration, including saturated samples. */
    cPtr++;
    aPtr++;
  }
}
282 #endif /* LV_HAVE_SSE */
283 
284 
285 #ifdef LV_HAVE_AVX
286 #include <immintrin.h>
287 
288 static inline void
289 volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
290  unsigned int num_points)
291 {
292  unsigned int number = 0;
293  const unsigned int eighthPoints = num_points / 8;
294 
295  float* cPtr = cVector;
296  const float* aPtr = aVector;
297 
298  __m256 aVal, cVal, x2, a, b;
299  __m256 const1, const2, const3, const4, const5, const6;
300  const1 = _mm256_set1_ps(135135.0f);
301  const2 = _mm256_set1_ps(17325.0f);
302  const3 = _mm256_set1_ps(378.0f);
303  const4 = _mm256_set1_ps(62370.0f);
304  const5 = _mm256_set1_ps(3150.0f);
305  const6 = _mm256_set1_ps(28.0f);
306  for(;number < eighthPoints; number++){
307 
308  aVal = _mm256_loadu_ps(aPtr);
309  x2 = _mm256_mul_ps(aVal, aVal);
310  a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
311  b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
312 
313  cVal = _mm256_div_ps(a, b);
314 
315  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
316 
317  aPtr += 8;
318  cPtr += 8;
319  }
320 
321  number = eighthPoints * 8;
322  for(;number < num_points; number++) {
323  if(*aPtr > 4.97)
324  *cPtr++ = 1;
325  else if(*aPtr <= -4.97)
326  *cPtr++ = -1;
327  else {
328  float x2 = (*aPtr) * (*aPtr);
329  float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
330  float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
331  *cPtr++ = a / b;
332  aPtr++;
333  }
334  }
335 }
336 #endif /* LV_HAVE_AVX */
337 
338 #endif /* INCLUDED_volk_32f_tanh_32f_a_H */