71 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
72 #define INCLUDED_volk_32f_x2_pow_32f_a_H
79 #define POLY0(x, c0) _mm_set1_ps(c0)
80 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
81 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
82 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
83 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
84 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
86 #define POW_POLY_DEGREE 3
89 #include <smmintrin.h>
92 volk_32f_x2_pow_32f_a_sse4_1(
float* cVector,
const float* bVector,
93 const float* aVector,
unsigned int num_points)
95 float* cPtr = cVector;
96 const float* bPtr = bVector;
97 const float* aPtr = aVector;
99 unsigned int number = 0;
100 const unsigned int quarterPoints = num_points / 4;
102 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
103 __m128 tmp, fx, mask, pow2n, z, y;
104 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
105 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
106 __m128i bias, exp, emm0, pi32_0x7f;
108 one = _mm_set1_ps(1.0);
109 exp_hi = _mm_set1_ps(88.3762626647949);
110 exp_lo = _mm_set1_ps(-88.3762626647949);
111 ln2 = _mm_set1_ps(0.6931471805);
112 log2EF = _mm_set1_ps(1.44269504088896341);
113 half = _mm_set1_ps(0.5);
114 exp_C1 = _mm_set1_ps(0.693359375);
115 exp_C2 = _mm_set1_ps(-2.12194440e-4);
116 pi32_0x7f = _mm_set1_epi32(0x7f);
118 exp_p0 = _mm_set1_ps(1.9875691500e-4);
119 exp_p1 = _mm_set1_ps(1.3981999507e-3);
120 exp_p2 = _mm_set1_ps(8.3334519073e-3);
121 exp_p3 = _mm_set1_ps(4.1665795894e-2);
122 exp_p4 = _mm_set1_ps(1.6666665459e-1);
123 exp_p5 = _mm_set1_ps(5.0000001201e-1);
125 for(;number < quarterPoints; number++){
127 aVal = _mm_load_ps(aPtr);
128 bias = _mm_set1_epi32(127);
129 leadingOne = _mm_set1_ps(1.0f);
130 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
131 logarithm = _mm_cvtepi32_ps(exp);
133 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
135 #if POW_POLY_DEGREE == 6
136 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
137 #elif POW_POLY_DEGREE == 5
138 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
139 #elif POW_POLY_DEGREE == 4
140 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
141 #elif POW_POLY_DEGREE == 3
142 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
147 logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
148 logarithm = _mm_mul_ps(logarithm, ln2);
152 bVal = _mm_load_ps(bPtr);
153 bVal = _mm_mul_ps(bVal, logarithm);
156 tmp = _mm_setzero_ps();
158 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
160 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
162 emm0 = _mm_cvttps_epi32(fx);
163 tmp = _mm_cvtepi32_ps(emm0);
165 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
166 fx = _mm_sub_ps(tmp, mask);
168 tmp = _mm_mul_ps(fx, exp_C1);
169 z = _mm_mul_ps(fx, exp_C2);
170 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
171 z = _mm_mul_ps(bVal, bVal);
173 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
174 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
175 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
176 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
177 y = _mm_add_ps(y, one);
179 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
181 pow2n = _mm_castsi128_ps(emm0);
182 cVal = _mm_mul_ps(y, pow2n);
184 _mm_store_ps(cPtr, cVal);
191 number = quarterPoints * 4;
192 for(;number < num_points; number++){
193 *cPtr++ = pow(*aPtr++, *bPtr++);
203 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
204 #define INCLUDED_volk_32f_x2_pow_32f_u_H
206 #ifdef LV_HAVE_GENERIC
209 volk_32f_x2_pow_32f_generic(
float* cVector,
const float* bVector,
210 const float* aVector,
unsigned int num_points)
212 float* cPtr = cVector;
213 const float* bPtr = bVector;
214 const float* aPtr = aVector;
215 unsigned int number = 0;
217 for(number = 0; number < num_points; number++){
218 *cPtr++ = pow(*aPtr++, *bPtr++);
224 #ifdef LV_HAVE_SSE4_1
225 #include <smmintrin.h>
228 volk_32f_x2_pow_32f_u_sse4_1(
float* cVector,
const float* bVector,
229 const float* aVector,
unsigned int num_points)
231 float* cPtr = cVector;
232 const float* bPtr = bVector;
233 const float* aPtr = aVector;
235 unsigned int number = 0;
236 const unsigned int quarterPoints = num_points / 4;
238 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
239 __m128 tmp, fx, mask, pow2n, z, y;
240 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
241 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
242 __m128i bias, exp, emm0, pi32_0x7f;
244 one = _mm_set1_ps(1.0);
245 exp_hi = _mm_set1_ps(88.3762626647949);
246 exp_lo = _mm_set1_ps(-88.3762626647949);
247 ln2 = _mm_set1_ps(0.6931471805);
248 log2EF = _mm_set1_ps(1.44269504088896341);
249 half = _mm_set1_ps(0.5);
250 exp_C1 = _mm_set1_ps(0.693359375);
251 exp_C2 = _mm_set1_ps(-2.12194440e-4);
252 pi32_0x7f = _mm_set1_epi32(0x7f);
254 exp_p0 = _mm_set1_ps(1.9875691500e-4);
255 exp_p1 = _mm_set1_ps(1.3981999507e-3);
256 exp_p2 = _mm_set1_ps(8.3334519073e-3);
257 exp_p3 = _mm_set1_ps(4.1665795894e-2);
258 exp_p4 = _mm_set1_ps(1.6666665459e-1);
259 exp_p5 = _mm_set1_ps(5.0000001201e-1);
261 for(;number < quarterPoints; number++){
263 aVal = _mm_loadu_ps(aPtr);
264 bias = _mm_set1_epi32(127);
265 leadingOne = _mm_set1_ps(1.0f);
266 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
267 logarithm = _mm_cvtepi32_ps(exp);
269 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
271 #if POW_POLY_DEGREE == 6
272 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
273 #elif POW_POLY_DEGREE == 5
274 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
275 #elif POW_POLY_DEGREE == 4
276 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
277 #elif POW_POLY_DEGREE == 3
278 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
283 logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
284 logarithm = _mm_mul_ps(logarithm, ln2);
288 bVal = _mm_loadu_ps(bPtr);
289 bVal = _mm_mul_ps(bVal, logarithm);
292 tmp = _mm_setzero_ps();
294 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
296 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
298 emm0 = _mm_cvttps_epi32(fx);
299 tmp = _mm_cvtepi32_ps(emm0);
301 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
302 fx = _mm_sub_ps(tmp, mask);
304 tmp = _mm_mul_ps(fx, exp_C1);
305 z = _mm_mul_ps(fx, exp_C2);
306 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
307 z = _mm_mul_ps(bVal, bVal);
309 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
310 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
311 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
312 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
313 y = _mm_add_ps(y, one);
315 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
317 pow2n = _mm_castsi128_ps(emm0);
318 cVal = _mm_mul_ps(y, pow2n);
320 _mm_storeu_ps(cPtr, cVal);
327 number = quarterPoints * 4;
328 for(;number < num_points; number++){
329 *cPtr++ = pow(*aPtr++, *bPtr++);
#define POLY3(x, c0, c1, c2, c3)
Definition: volk_32f_x2_pow_32f.h:82
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition: volk_32f_x2_pow_32f.h:84
#define POLY2(x, c0, c1, c2)
Definition: volk_32f_x2_pow_32f.h:81
#define POLY4(x, c0, c1, c2, c3, c4)
Definition: volk_32f_x2_pow_32f.h:83