GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_cos_32f
25  *
26  * \b Overview
27  *
28  * Computes cosine of the input vector and stores results in the output vector.
29  *
30  * <b>Dispatcher Prototype</b>
31  * \code
32  * void volk_32f_cos_32f(float* bVector, const float* aVector, unsigned int num_points)
33  * \endcode
34  *
35  * \b Inputs
36  * \li aVector: The input vector of floats.
37  * \li num_points: The number of data points.
38  *
39  * \b Outputs
40  * \li bVector: The vector where results will be stored.
41  *
42  * \b Example
43  * Calculate cos(theta) for common angles.
44  * \code
45  * int N = 10;
46  * unsigned int alignment = volk_get_alignment();
47  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
48  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
49  *
50  * in[0] = 0.000;
51  * in[1] = 0.524;
52  * in[2] = 0.786;
53  * in[3] = 1.047;
54  * in[4] = 1.571;
55  * in[5] = 1.571;
56  * in[6] = 2.094;
57  * in[7] = 2.356;
58  * in[8] = 2.618;
59  * in[9] = 3.142;
60  *
61  * volk_32f_cos_32f(out, in, N);
62  *
63  * for(unsigned int ii = 0; ii < N; ++ii){
64  * printf("cos(%1.3f) = %1.3f\n", in[ii], out[ii]);
65  * }
66  *
67  * volk_free(in);
68  * volk_free(out);
69  * \endcode
70  */
71 
72 #include <stdio.h>
73 #include <math.h>
74 #include <inttypes.h>
75 
76 #ifndef INCLUDED_volk_32f_cos_32f_a_H
77 #define INCLUDED_volk_32f_cos_32f_a_H
78 
79 #ifdef LV_HAVE_SSE4_1
80 #include <smmintrin.h>
81 
82 static inline void
83  volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  float* bPtr = bVector;
86  const float* aPtr = aVector;
87 
88  unsigned int number = 0;
89  unsigned int quarterPoints = num_points / 4;
90  unsigned int i = 0;
91 
92  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
93  __m128 sine, cosine, condition1, condition3;
94  __m128i q, r, ones, twos, fours;
95 
96  m4pi = _mm_set1_ps(1.273239545);
97  pio4A = _mm_set1_ps(0.78515625);
98  pio4B = _mm_set1_ps(0.241876e-3);
99  ffours = _mm_set1_ps(4.0);
100  ftwos = _mm_set1_ps(2.0);
101  fones = _mm_set1_ps(1.0);
102  fzeroes = _mm_setzero_ps();
103  ones = _mm_set1_epi32(1);
104  twos = _mm_set1_epi32(2);
105  fours = _mm_set1_epi32(4);
106 
107  cp1 = _mm_set1_ps(1.0);
108  cp2 = _mm_set1_ps(0.83333333e-1);
109  cp3 = _mm_set1_ps(0.2777778e-2);
110  cp4 = _mm_set1_ps(0.49603e-4);
111  cp5 = _mm_set1_ps(0.551e-6);
112 
113  for(;number < quarterPoints; number++){
114 
115  aVal = _mm_load_ps(aPtr);
116  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
117  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
118  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
119 
120  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
121  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
122 
123  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
124  s = _mm_mul_ps(s, s);
125  // Evaluate Taylor series
126  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
127 
128  for(i = 0; i < 3; i++)
129  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
130  s = _mm_div_ps(s, ftwos);
131 
132  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
133  cosine = _mm_sub_ps(fones, s);
134 
135  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
136 
137  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
138 
139  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
140  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
141  _mm_store_ps(bPtr, cosine);
142  aPtr += 4;
143  bPtr += 4;
144  }
145 
146  number = quarterPoints * 4;
147  for(;number < num_points; number++){
148  *bPtr++ = cos(*aPtr++);
149  }
150 }
151 
152 #endif /* LV_HAVE_SSE4_1 for aligned */
153 
154 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
155 
156 
157 
158 #ifndef INCLUDED_volk_32f_cos_32f_u_H
159 #define INCLUDED_volk_32f_cos_32f_u_H
160 
161 #ifdef LV_HAVE_SSE4_1
162 #include <smmintrin.h>
163 
164 static inline void
165 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
166 {
167  float* bPtr = bVector;
168  const float* aPtr = aVector;
169 
170  unsigned int number = 0;
171  unsigned int quarterPoints = num_points / 4;
172  unsigned int i = 0;
173 
174  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
175  __m128 sine, cosine, condition1, condition3;
176  __m128i q, r, ones, twos, fours;
177 
178  m4pi = _mm_set1_ps(1.273239545);
179  pio4A = _mm_set1_ps(0.78515625);
180  pio4B = _mm_set1_ps(0.241876e-3);
181  ffours = _mm_set1_ps(4.0);
182  ftwos = _mm_set1_ps(2.0);
183  fones = _mm_set1_ps(1.0);
184  fzeroes = _mm_setzero_ps();
185  ones = _mm_set1_epi32(1);
186  twos = _mm_set1_epi32(2);
187  fours = _mm_set1_epi32(4);
188 
189  cp1 = _mm_set1_ps(1.0);
190  cp2 = _mm_set1_ps(0.83333333e-1);
191  cp3 = _mm_set1_ps(0.2777778e-2);
192  cp4 = _mm_set1_ps(0.49603e-4);
193  cp5 = _mm_set1_ps(0.551e-6);
194 
195  for(;number < quarterPoints; number++){
196  aVal = _mm_loadu_ps(aPtr);
197  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
198  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
199  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
200 
201  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
202  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
203 
204  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
205  s = _mm_mul_ps(s, s);
206  // Evaluate Taylor series
207  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
208 
209  for(i = 0; i < 3; i++){
210  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
211  }
212  s = _mm_div_ps(s, ftwos);
213 
214  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
215  cosine = _mm_sub_ps(fones, s);
216 
217  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
218 
219  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
220 
221  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
222  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
223  _mm_storeu_ps(bPtr, cosine);
224  aPtr += 4;
225  bPtr += 4;
226  }
227 
228  number = quarterPoints * 4;
229  for(;number < num_points; number++){
230  *bPtr++ = cos(*aPtr++);
231  }
232 }
233 
234 #endif /* LV_HAVE_SSE4_1 for unaligned */
235 
236 
237 #ifdef LV_HAVE_GENERIC
238 
239 static inline void
240 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
241 {
242  float* bPtr = bVector;
243  const float* aPtr = aVector;
244  unsigned int number = 0;
245 
246  for(; number < num_points; number++){
247  *bPtr++ = cos(*aPtr++);
248  }
249 }
250 
251 #endif /* LV_HAVE_GENERIC */
252 
253 #endif /* INCLUDED_volk_32f_cos_32f_u_H */