76 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
77 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
85 #include <immintrin.h>
/*
 * Multiplies every complex sample read through `a` by the complex constant
 * `scalar` and stores the products through `c`, four samples at a time, using
 * 256-bit AVX intrinsics with unaligned loads/stores.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): this listing omits several original lines (the declarations
 * of `a`, `c` and `i`, the pointer advances inside the loop and the closing
 * braces). `a`/`c` are presumably cursors over aVector/cVector - confirm
 * against the full source before editing.
 */
87 static inline void volk_32fc_s32fc_multiply_32fc_u_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
88 unsigned int number = 0;
/* A 256-bit register holds 8 floats = 4 complex samples per iteration. */
90 const unsigned int quarterPoints = num_points / 4;
/* Despite the name, this is the count (0..3) of samples left for the tail loop. */
91 unsigned int isodd = num_points & 3;
92 __m256 x, yl, yh, z, tmp1, tmp2;
/* yl: real part of the scalar broadcast to every lane. */
97 yl = _mm256_set1_ps(
lv_creal(scalar));
/* yh: imaginary part of the scalar broadcast to every lane. */
98 yh = _mm256_set1_ps(
lv_cimag(scalar));
100 for(;number < quarterPoints; number++){
/* x = ar,ai,br,bi,... (unaligned load of interleaved real/imag floats). */
101 x = _mm256_loadu_ps((
float*)a);
/* tmp1 = ar*cr, ai*cr, ... */
103 tmp1 = _mm256_mul_ps(x,yl);
/* 0xB1 swaps the two floats of each pair: x = ai,ar,bi,br,... */
105 x = _mm256_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*ci, ar*ci, ... */
107 tmp2 = _mm256_mul_ps(x,yh);
/* Subtract in even lanes, add in odd lanes: ar*cr-ai*ci, ai*cr+ar*ci, ... */
109 z = _mm256_addsub_ps(tmp1,tmp2);
/* Unaligned store of the four complex products. */
111 _mm256_storeu_ps((
float*)c,z);
/* Scalar tail: the last (num_points & 3) samples use a plain complex multiply. */
117 for(i = num_points-isodd; i < num_points; i++) {
118 *c++ = (*a++) * scalar;
125 #include <pmmintrin.h>
/*
 * SSE3 variant of the complex-vector-by-complex-scalar multiply: processes
 * two complex samples per iteration with 128-bit unaligned loads/stores.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): this listing omits the declarations of `a`/`c`, the
 * initialisation of `yl`/`yh`, the pointer advances, the body of the
 * odd-sample branch and the closing braces. `yl`/`yh` presumably hold the
 * broadcast real/imaginary parts of `scalar` - confirm against the full source.
 */
127 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
128 unsigned int number = 0;
/* A 128-bit register holds 4 floats = 2 complex samples per iteration. */
129 const unsigned int halfPoints = num_points / 2;
131 __m128 x, yl, yh, z, tmp1, tmp2;
139 for(;number < halfPoints; number++){
/* x = ar,ai,br,bi (unaligned load). */
141 x = _mm_loadu_ps((
float*)a);
/* Multiply every float by yl. */
143 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps the two floats of each pair: x = ai,ar,bi,br. */
145 x = _mm_shuffle_ps(x,x,0xB1);
/* Multiply the swapped floats by yh. */
147 tmp2 = _mm_mul_ps(x,yh);
/* Subtract in even lanes, add in odd lanes to form real/imag of each product. */
149 z = _mm_addsub_ps(tmp1,tmp2);
/* Unaligned store of the two complex products. */
151 _mm_storeu_ps((
float*)c,z);
/* One sample remains when num_points is odd; its handling is not visible here. */
157 if((num_points % 2) != 0) {
163 #ifdef LV_HAVE_GENERIC
/*
 * Portable C variant: multiplies complex samples by the complex constant
 * `scalar` using ordinary complex arithmetic, reading through `aPtr` and
 * writing through `cPtr`.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): the loop headers, the declarations of `cPtr`/`aPtr` and the
 * closing braces are missing from this listing. The eight identical
 * statements look like an 8x-unrolled loop body followed by a one-sample
 * remainder statement - confirm against the full source.
 */
165 static inline void volk_32fc_s32fc_multiply_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
/* Starts at the total sample count; how it is consumed is not visible here. */
168 unsigned int number = num_points;
/* Eight consecutive single-sample multiplies (presumably one unrolled iteration). */
172 *cPtr++ = (*aPtr++) * scalar;
173 *cPtr++ = (*aPtr++) * scalar;
174 *cPtr++ = (*aPtr++) * scalar;
175 *cPtr++ = (*aPtr++) * scalar;
176 *cPtr++ = (*aPtr++) * scalar;
177 *cPtr++ = (*aPtr++) * scalar;
178 *cPtr++ = (*aPtr++) * scalar;
179 *cPtr++ = (*aPtr++) * scalar;
/* Single-sample multiply (presumably the remainder loop body). */
185 *cPtr++ = *aPtr++ * scalar;
191 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
192 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
200 #include <immintrin.h>
/*
 * Aligned AVX variant: multiplies every complex sample read through `a` by
 * the complex constant `scalar` and stores the products through `c`, four
 * samples at a time. Uses _mm256_load_ps/_mm256_store_ps, so the addresses
 * passed to them must satisfy those intrinsics' alignment requirement.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): this listing omits several original lines (the declarations
 * of `a`, `c` and `i`, the pointer advances inside the loop and the closing
 * braces). `a`/`c` are presumably cursors over aVector/cVector - confirm
 * against the full source before editing.
 */
202 static inline void volk_32fc_s32fc_multiply_32fc_a_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
203 unsigned int number = 0;
/* A 256-bit register holds 8 floats = 4 complex samples per iteration. */
205 const unsigned int quarterPoints = num_points / 4;
/* Despite the name, this is the count (0..3) of samples left for the tail loop. */
206 unsigned int isodd = num_points & 3;
207 __m256 x, yl, yh, z, tmp1, tmp2;
/* yl: real part of the scalar broadcast to every lane. */
212 yl = _mm256_set1_ps(
lv_creal(scalar));
/* yh: imaginary part of the scalar broadcast to every lane. */
213 yh = _mm256_set1_ps(
lv_cimag(scalar));
215 for(;number < quarterPoints; number++){
/* x = ar,ai,br,bi,... (aligned load of interleaved real/imag floats). */
216 x = _mm256_load_ps((
float*)a);
/* tmp1 = ar*cr, ai*cr, ... */
218 tmp1 = _mm256_mul_ps(x,yl);
/* 0xB1 swaps the two floats of each pair: x = ai,ar,bi,br,... */
220 x = _mm256_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*ci, ar*ci, ... */
222 tmp2 = _mm256_mul_ps(x,yh);
/* Subtract in even lanes, add in odd lanes: ar*cr-ai*ci, ai*cr+ar*ci, ... */
224 z = _mm256_addsub_ps(tmp1,tmp2);
/* Aligned store of the four complex products. */
226 _mm256_store_ps((
float*)c,z);
/* Scalar tail: the last (num_points & 3) samples use a plain complex multiply. */
232 for(i = num_points-isodd; i < num_points; i++) {
233 *c++ = (*a++) * scalar;
240 #include <pmmintrin.h>
/*
 * Aligned SSE3 variant of the complex-vector-by-complex-scalar multiply:
 * processes two complex samples per iteration. Uses _mm_load_ps/_mm_store_ps,
 * so the addresses passed to them must satisfy those intrinsics' alignment
 * requirement.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): this listing omits the declarations of `a`/`c`, the
 * initialisation of `yl`/`yh`, the pointer advances, the body of the
 * odd-sample branch and the closing braces. `yl`/`yh` presumably hold the
 * broadcast real/imaginary parts of `scalar` - confirm against the full source.
 */
242 static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
243 unsigned int number = 0;
/* A 128-bit register holds 4 floats = 2 complex samples per iteration. */
244 const unsigned int halfPoints = num_points / 2;
246 __m128 x, yl, yh, z, tmp1, tmp2;
254 for(;number < halfPoints; number++){
/* x = ar,ai,br,bi (aligned load). */
256 x = _mm_load_ps((
float*)a);
/* Multiply every float by yl. */
258 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps the two floats of each pair: x = ai,ar,bi,br. */
260 x = _mm_shuffle_ps(x,x,0xB1);
/* Multiply the swapped floats by yh. */
262 tmp2 = _mm_mul_ps(x,yh);
/* Subtract in even lanes, add in odd lanes to form real/imag of each product. */
264 z = _mm_addsub_ps(tmp1,tmp2);
/* Aligned store of the two complex products. */
266 _mm_store_ps((
float*)c,z);
/* One sample remains when num_points is odd; its handling is not visible here. */
272 if((num_points % 2) != 0) {
279 #include <arm_neon.h>
/*
 * ARM NEON variant of the complex-vector-by-complex-scalar multiply:
 * de-interleaves four complex samples per iteration into separate
 * real/imaginary vectors, combines them with the scalar, and stores the
 * re-interleaved result; leftover samples go through a scalar tail loop.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): this listing omits the declarations of `cPtr`/`aPtr`, the
 * pointer advances inside the vector loop and the closing braces. Several
 * visible lines look suspicious (see the notes below) - confirm against the
 * full source and a reference implementation before relying on this kernel.
 */
281 static inline void volk_32fc_s32fc_multiply_32fc_neon(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
/* Initial value is overwritten by the `number = 0` in the vector loop below. */
284 unsigned int number = num_points;
/* Four complex samples are processed per vector iteration. */
285 unsigned int quarter_points = num_points / 4;
287 float32x4x2_t a_val, scalar_val;
288 float32x4x2_t tmp_imag;
/*
 * NOTE(review): vld2q_f32 loads 8 floats, but `scalar` is a single complex
 * value (2 floats). This looks like a read past the object, leaving only
 * lane 0 of val[0]/val[1] holding the scalar's real/imag parts - confirm.
 */
290 scalar_val = vld2q_f32((
const float*)&scalar);
291 for(number = 0; number < quarter_points; ++number) {
/* De-interleaving load: a_val.val[0] = real parts, a_val.val[1] = imaginary parts. */
292 a_val = vld2q_f32((
float*)aPtr);
/* val[1] = ai * scalar_val.val[0]  (imaginary accumulator). */
293 tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
/* val[0] = ar * scalar_val.val[0]  (real accumulator). */
294 tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
/* val[1] += ar * scalar_val.val[1]. */
296 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
/*
 * val[0] += ai * scalar_val.val[1].
 * NOTE(review): the real part of a complex product is ar*sr - ai*si; this
 * multiply-accumulate adds instead of subtracting - looks like a sign error, confirm.
 */
297 tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
/*
 * Interleaving store of four complex results.
 * NOTE(review): this writes through `cVector` while the tail loop writes
 * through `cPtr`; verify the two cursors are kept in sync.
 */
299 vst2q_f32((
float*)cVector, tmp_imag);
/* Scalar tail for the samples not covered by the vector loop. */
304 for(number = quarter_points*4; number < num_points; number++){
305 *cPtr++ = *aPtr++ * scalar;
310 #ifdef LV_HAVE_GENERIC
/*
 * Portable C variant (aligned-dispatch name): multiplies complex samples by
 * the complex constant `scalar` using ordinary complex arithmetic, reading
 * through `aPtr` and writing through `cPtr`.
 *
 * cVector    - output buffer for the products
 * aVector    - input complex samples
 * scalar     - complex multiplier applied to every sample
 * num_points - number of complex samples to process
 *
 * NOTE(review): the loop headers, the declarations of `cPtr`/`aPtr` and the
 * closing braces are missing from this listing. The eight identical
 * statements look like an 8x-unrolled loop body followed by a one-sample
 * remainder statement - confirm against the full source.
 */
312 static inline void volk_32fc_s32fc_multiply_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
/* Starts at the total sample count; how it is consumed is not visible here. */
315 unsigned int number = num_points;
/* Eight consecutive single-sample multiplies (presumably one unrolled iteration). */
319 *cPtr++ = (*aPtr++) * scalar;
320 *cPtr++ = (*aPtr++) * scalar;
321 *cPtr++ = (*aPtr++) * scalar;
322 *cPtr++ = (*aPtr++) * scalar;
323 *cPtr++ = (*aPtr++) * scalar;
324 *cPtr++ = (*aPtr++) * scalar;
325 *cPtr++ = (*aPtr++) * scalar;
326 *cPtr++ = (*aPtr++) * scalar;
/* Single-sample multiply (presumably the remainder loop body). */
332 *cPtr++ = *aPtr++ * scalar;
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80