76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
80 #include <smmintrin.h>
83 volk_32f_sin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
85 float* bPtr = bVector;
86 const float* aPtr = aVector;
88 unsigned int number = 0;
89 unsigned int quarterPoints = num_points / 4;
92 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
93 __m128 sine, cosine, condition1, condition2;
94 __m128i q, r, ones, twos, fours;
96 m4pi = _mm_set1_ps(1.273239545);
97 pio4A = _mm_set1_ps(0.78515625);
98 pio4B = _mm_set1_ps(0.241876e-3);
99 ffours = _mm_set1_ps(4.0);
100 ftwos = _mm_set1_ps(2.0);
101 fones = _mm_set1_ps(1.0);
102 fzeroes = _mm_setzero_ps();
103 ones = _mm_set1_epi32(1);
104 twos = _mm_set1_epi32(2);
105 fours = _mm_set1_epi32(4);
107 cp1 = _mm_set1_ps(1.0);
108 cp2 = _mm_set1_ps(0.83333333e-1);
109 cp3 = _mm_set1_ps(0.2777778e-2);
110 cp4 = _mm_set1_ps(0.49603e-4);
111 cp5 = _mm_set1_ps(0.551e-6);
113 for(;number < quarterPoints; number++) {
114 aVal = _mm_load_ps(aPtr);
115 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
116 q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
117 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
119 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
120 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
122 s = _mm_div_ps(s, _mm_set1_ps(8.0));
123 s = _mm_mul_ps(s, s);
125 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
127 for(i = 0; i < 3; i++) {
128 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
130 s = _mm_div_ps(s, ftwos);
132 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
133 cosine = _mm_sub_ps(fones, s);
135 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
136 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
140 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
141 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
142 _mm_store_ps(bPtr, sine);
147 number = quarterPoints * 4;
148 for(;number < num_points; number++) {
149 *bPtr++ = sin(*aPtr++);
158 #ifndef INCLUDED_volk_32f_sin_32f_u_H
159 #define INCLUDED_volk_32f_sin_32f_u_H
161 #ifdef LV_HAVE_SSE4_1
162 #include <smmintrin.h>
165 volk_32f_sin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
167 float* bPtr = bVector;
168 const float* aPtr = aVector;
170 unsigned int number = 0;
171 unsigned int quarterPoints = num_points / 4;
174 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
175 __m128 sine, cosine, condition1, condition2;
176 __m128i q, r, ones, twos, fours;
178 m4pi = _mm_set1_ps(1.273239545);
179 pio4A = _mm_set1_ps(0.78515625);
180 pio4B = _mm_set1_ps(0.241876e-3);
181 ffours = _mm_set1_ps(4.0);
182 ftwos = _mm_set1_ps(2.0);
183 fones = _mm_set1_ps(1.0);
184 fzeroes = _mm_setzero_ps();
185 ones = _mm_set1_epi32(1);
186 twos = _mm_set1_epi32(2);
187 fours = _mm_set1_epi32(4);
189 cp1 = _mm_set1_ps(1.0);
190 cp2 = _mm_set1_ps(0.83333333e-1);
191 cp3 = _mm_set1_ps(0.2777778e-2);
192 cp4 = _mm_set1_ps(0.49603e-4);
193 cp5 = _mm_set1_ps(0.551e-6);
195 for(;number < quarterPoints; number++) {
196 aVal = _mm_loadu_ps(aPtr);
197 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
198 q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
199 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
201 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
202 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
204 s = _mm_div_ps(s, _mm_set1_ps(8.0));
205 s = _mm_mul_ps(s, s);
207 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
209 for(i = 0; i < 3; i++) {
210 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
212 s = _mm_div_ps(s, ftwos);
214 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
215 cosine = _mm_sub_ps(fones, s);
217 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
218 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
220 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
221 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
222 _mm_storeu_ps(bPtr, sine);
227 number = quarterPoints * 4;
228 for(;number < num_points; number++){
229 *bPtr++ = sin(*aPtr++);
236 #ifdef LV_HAVE_GENERIC
239 volk_32f_sin_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points)
241 float* bPtr = bVector;
242 const float* aPtr = aVector;
243 unsigned int number = 0;
245 for(number = 0; number < num_points; number++) {
246 *bPtr++ = sin(*aPtr++);