GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_acos_32f
25  *
26  * \b Overview
27  *
28  * Computes arccosine of the input vector and stores results in the output vector.
29  *
30  * <b>Dispatcher Prototype</b>
31  * \code
32  * void volk_32f_acos_32f(float* bVector, const float* aVector, unsigned int num_points)
33  * \endcode
34  *
35  * \b Inputs
36  * \li aVector: The input vector of floats.
37  * \li num_points: The number of data points.
38  *
39  * \b Outputs
40  * \li bVector: The vector where results will be stored.
41  *
42  * \b Example
43  * Calculate common angles around the top half of the unit circle.
44  * \code
45  * int N = 10;
46  * unsigned int alignment = volk_get_alignment();
47  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
48  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
49  *
50  * in[0] = 1;
51  * in[1] = std::sqrt(3.f)/2.f;
52  * in[2] = std::sqrt(2.f)/2.f;
53  * in[3] = 0.5;
54  * in[4] = in[5] = 0;
55  * for(unsigned int ii = 6; ii < N; ++ii){
56  * in[ii] = - in[N-ii-1];
57  * }
58  *
59  * volk_32f_acos_32f(out, in, N);
60  *
61  * for(unsigned int ii = 0; ii < N; ++ii){
62  * printf("acos(%1.3f) = %1.3f\n", in[ii], out[ii]);
63  * }
64  *
65  * volk_free(in);
66  * volk_free(out);
67  * \endcode
68  */
69 
70 #include <stdio.h>
71 #include <math.h>
72 #include <inttypes.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
75 #define ACOS_TERMS 2
76 
77 #ifndef INCLUDED_volk_32f_acos_32f_a_H
78 #define INCLUDED_volk_32f_acos_32f_a_H
79 
80 #ifdef LV_HAVE_SSE4_1
81 #include <smmintrin.h>
82 
83 static inline void
84 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int quarterPoints = num_points / 4;
91  int i, j;
92 
93  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
94  __m128 fzeroes, fones, ftwos, ffours, condition;
95 
96  pi = _mm_set1_ps(3.14159265358979323846);
97  pio2 = _mm_set1_ps(3.14159265358979323846/2);
98  fzeroes = _mm_setzero_ps();
99  fones = _mm_set1_ps(1.0);
100  ftwos = _mm_set1_ps(2.0);
101  ffours = _mm_set1_ps(4.0);
102 
103  for(;number < quarterPoints; number++){
104  aVal = _mm_load_ps(aPtr);
105  d = aVal;
106  aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
107  z = aVal;
108  condition = _mm_cmplt_ps(z, fzeroes);
109  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
110  x = z;
111  condition = _mm_cmplt_ps(z, fones);
112  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
113 
114  for(i = 0; i < 2; i++)
115  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
116  x = _mm_div_ps(fones, x);
117  y = fzeroes;
118  for(j = ACOS_TERMS - 1; j >=0 ; j--)
119  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
120 
121  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
122  condition = _mm_cmpgt_ps(z, fones);
123 
124  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
125  arccosine = y;
126  condition = _mm_cmplt_ps(aVal, fzeroes);
127  arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
128  condition = _mm_cmplt_ps(d, fzeroes);
129  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
130 
131  _mm_store_ps(bPtr, arccosine);
132  aPtr += 4;
133  bPtr += 4;
134  }
135 
136  number = quarterPoints * 4;
137  for(;number < num_points; number++){
138  *bPtr++ = acos(*aPtr++);
139  }
140 }
141 
142 #endif /* LV_HAVE_SSE4_1 for aligned */
143 
144 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
145 
146 
147 #ifndef INCLUDED_volk_32f_acos_32f_u_H
148 #define INCLUDED_volk_32f_acos_32f_u_H
149 
150 #ifdef LV_HAVE_SSE4_1
151 #include <smmintrin.h>
152 
153 static inline void
154 volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
155 {
156  float* bPtr = bVector;
157  const float* aPtr = aVector;
158 
159  unsigned int number = 0;
160  unsigned int quarterPoints = num_points / 4;
161  int i, j;
162 
163  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
164  __m128 fzeroes, fones, ftwos, ffours, condition;
165 
166  pi = _mm_set1_ps(3.14159265358979323846);
167  pio2 = _mm_set1_ps(3.14159265358979323846/2);
168  fzeroes = _mm_setzero_ps();
169  fones = _mm_set1_ps(1.0);
170  ftwos = _mm_set1_ps(2.0);
171  ffours = _mm_set1_ps(4.0);
172 
173  for(;number < quarterPoints; number++){
174  aVal = _mm_loadu_ps(aPtr);
175  d = aVal;
176  aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
177  z = aVal;
178  condition = _mm_cmplt_ps(z, fzeroes);
179  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
180  x = z;
181  condition = _mm_cmplt_ps(z, fones);
182  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
183 
184  for(i = 0; i < 2; i++)
185  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
186  x = _mm_div_ps(fones, x);
187  y = fzeroes;
188 
189  for(j = ACOS_TERMS - 1; j >=0 ; j--)
190  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
191 
192  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
193  condition = _mm_cmpgt_ps(z, fones);
194 
195  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
196  arccosine = y;
197  condition = _mm_cmplt_ps(aVal, fzeroes);
198  arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
199  condition = _mm_cmplt_ps(d, fzeroes);
200  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
201 
202  _mm_storeu_ps(bPtr, arccosine);
203  aPtr += 4;
204  bPtr += 4;
205  }
206 
207  number = quarterPoints * 4;
208  for(;number < num_points; number++){
209  *bPtr++ = acos(*aPtr++);
210  }
211 }
212 
213 #endif /* LV_HAVE_SSE4_1 for aligned */
214 
215 #ifdef LV_HAVE_GENERIC
216 
217 static inline void
218 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
219 {
220  float* bPtr = bVector;
221  const float* aPtr = aVector;
222  unsigned int number = 0;
223 
224  for(number = 0; number < num_points; number++){
225  *bPtr++ = acos(*aPtr++);
226  }
227 
228 }
229 #endif /* LV_HAVE_GENERIC */
230 
231 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:75