66 #ifndef INCLUDED_volk_32u_byteswap_u_H
67 #define INCLUDED_volk_32u_byteswap_u_H
73 #include <emmintrin.h>
/*
 * Reverses the byte order of 32-bit words using SSE2, four words per
 * iteration, with unaligned 128-bit loads and stores.
 *
 * intsToSwap: buffer of 32-bit words to byte-swap.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): this listing omits several original lines (the declarations
 * of inputPtr and outputVal, the pointer increments and the closing braces).
 * Presumably inputPtr starts at intsToSwap and advances by 4 words per vector
 * iteration and by 1 word per tail iteration -- confirm against the full file.
 */
75 static inline void volk_32u_byteswap_u_sse2(
uint32_t* intsToSwap,
unsigned int num_points){
76 unsigned int number = 0;
79 __m128i input, byte1, byte2, byte3, byte4, output;
/* Per-lane masks that keep only bits 16..23 and bits 8..15 respectively. */
80 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
81 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
/* Number of whole 4-word vectors; any remainder goes to the scalar loop. */
83 const uint64_t quarterPoints = num_points / 4;
84 for(;number < quarterPoints; number++){
/* Unaligned load: the same address is stored back to further down. */
86 input = _mm_loadu_si128((__m128i*)inputPtr);
/*
 * Four shifted copies of every 32-bit lane:
 *   byte1: lowest byte moved to the top (the shift clears the rest)
 *   byte2: bits 8..15 end up in bits 16..23 (masked below)
 *   byte3: bits 16..23 end up in bits 8..15 (masked below)
 *   byte4: highest byte moved to the bottom (logical shift zero-fills)
 */
88 byte1 = _mm_slli_epi32(input, 24);
89 byte2 = _mm_slli_epi32(input, 8);
90 byte3 = _mm_srli_epi32(input, 8);
91 byte4 = _mm_srli_epi32(input, 24);
/* OR the outer bytes first, then mask and merge the two middle bytes. */
93 output = _mm_or_si128(byte1, byte4);
94 byte2 = _mm_and_si128(byte2, byte2mask);
95 output = _mm_or_si128(output, byte2);
96 byte3 = _mm_and_si128(byte3, byte3mask);
97 output = _mm_or_si128(output, byte3);
/* Unaligned store over the words that were just loaded. */
99 _mm_storeu_si128((__m128i*)inputPtr, output);
/* Scalar tail: resume at the first word not covered by a full vector. */
104 number = quarterPoints*4;
105 for(; number < num_points; number++){
/* Same swap with plain shifts and masks; outputVal is presumably loaded
 * from *inputPtr on a line not shown here -- confirm. */
107 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
108 *inputPtr = outputVal;
116 #include <arm_neon.h>
/*
 * Reverses the byte order of 32-bit words using NEON, eight words (32 bytes)
 * per iteration, by means of byte table lookups.
 *
 * intsToSwap: buffer of 32-bit words to byte-swap.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): this listing omits several original lines (the declarations
 * of inputPtr and output, the pointer increments and the closing braces).
 * Presumably inputPtr is a uint32_t* starting at intsToSwap that advances by
 * 8 words per vector iteration -- confirm against the full file.
 */
118 static inline void volk_32u_byteswap_neon(
uint32_t* intsToSwap,
unsigned int num_points){
120 unsigned int number = 0;
/* Number of whole 8-word groups; any remainder goes to the scalar loop. */
121 unsigned int n8points = num_points / 8;
123 uint8x8x4_t input_table;
124 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
125 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
/*
 * Precomputed table indices, one byte per output byte.  Read from least to
 * most significant byte, int_lookup01 is {24, 16, 8, 0, 25, 17, 9, 1}.
 * vld4_u8 de-interleaves with a stride of 4, so table slots 0..7 hold byte 0
 * of each word, 8..15 byte 1, 16..23 byte 2 and 24..31 byte 3; these indices
 * therefore emit words 0 and 1 with their bytes reversed.
 * The other three constants presumably follow the same pattern for words
 * 2/3, 4/5 and 6/7 -- TODO confirm by decoding them.
 */
135 int_lookup01 = vcreate_u8(74609667900706840);
136 int_lookup23 = vcreate_u8(219290013576860186);
137 int_lookup45 = vcreate_u8(363970359253013532);
138 int_lookup67 = vcreate_u8(508650704929166878);
140 for(number = 0; number < n8points; ++number){
/* Load 32 bytes, de-interleaved into four 8-byte table registers. */
141 input_table = vld4_u8((
uint8_t*) inputPtr);
/* Each lookup gathers 8 output bytes, i.e. two byte-swapped words. */
142 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
143 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
144 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
145 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
/* Write the four 8-byte results back at word offsets 0, 2, 4 and 6 from
 * the address that was loaded. */
146 vst1_u8((
uint8_t*) inputPtr, swapped_int01);
147 vst1_u8((
uint8_t*) (inputPtr+2), swapped_int23);
148 vst1_u8((
uint8_t*) (inputPtr+4), swapped_int45);
149 vst1_u8((
uint8_t*) (inputPtr+6), swapped_int67);
/* Scalar tail: resume at the first word not covered by a full 8-word group. */
154 for(number = n8points * 8; number < num_points; ++number){
/* Plain shift-and-mask swap; the load into output and the write-back are
 * on lines not shown in this listing. */
156 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
165 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar version: reverses the byte order of 32-bit words one at a
 * time with shifts and masks.
 *
 * intsToSwap: buffer of 32-bit words to byte-swap.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): the declarations of point and output, the load from and
 * store to the buffer, and the closing braces are on lines not shown in this
 * listing -- confirm against the full file.
 */
167 static inline void volk_32u_byteswap_generic(
uint32_t* intsToSwap,
unsigned int num_points){
171 for(point = 0; point < num_points; point++){
/* Byte 3 -> byte 0, byte 2 -> byte 1, byte 1 -> byte 2, byte 0 -> byte 3. */
173 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
183 #ifndef INCLUDED_volk_32u_byteswap_a_H
184 #define INCLUDED_volk_32u_byteswap_a_H
191 #include <emmintrin.h>
/*
 * Reverses the byte order of 32-bit words using SSE2, four words per
 * iteration, with aligned 128-bit loads and stores.
 *
 * intsToSwap: buffer of 32-bit words to byte-swap.  _mm_load_si128 and
 *             _mm_store_si128 require the accessed address to be 16-byte
 *             aligned.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): this listing omits several original lines (the declarations
 * of inputPtr and outputVal, the pointer increments and the closing braces).
 * Presumably inputPtr starts at intsToSwap and advances by 4 words per vector
 * iteration and by 1 word per tail iteration -- confirm against the full file.
 */
194 static inline void volk_32u_byteswap_a_sse2(
uint32_t* intsToSwap,
unsigned int num_points){
195 unsigned int number = 0;
198 __m128i input, byte1, byte2, byte3, byte4, output;
/* Per-lane masks that keep only bits 16..23 and bits 8..15 respectively. */
199 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
200 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
/* Number of whole 4-word vectors; any remainder goes to the scalar loop. */
202 const uint64_t quarterPoints = num_points / 4;
203 for(;number < quarterPoints; number++){
/* Aligned load: the same address is stored back to further down. */
205 input = _mm_load_si128((__m128i*)inputPtr);
/*
 * Four shifted copies of every 32-bit lane:
 *   byte1: lowest byte moved to the top (the shift clears the rest)
 *   byte2: bits 8..15 end up in bits 16..23 (masked below)
 *   byte3: bits 16..23 end up in bits 8..15 (masked below)
 *   byte4: highest byte moved to the bottom (logical shift zero-fills)
 */
207 byte1 = _mm_slli_epi32(input, 24);
208 byte2 = _mm_slli_epi32(input, 8);
209 byte3 = _mm_srli_epi32(input, 8);
210 byte4 = _mm_srli_epi32(input, 24);
/* OR the outer bytes first, then mask and merge the two middle bytes. */
212 output = _mm_or_si128(byte1, byte4);
213 byte2 = _mm_and_si128(byte2, byte2mask);
214 output = _mm_or_si128(output, byte2);
215 byte3 = _mm_and_si128(byte3, byte3mask);
216 output = _mm_or_si128(output, byte3);
/* Aligned store over the words that were just loaded. */
218 _mm_store_si128((__m128i*)inputPtr, output);
/* Scalar tail: resume at the first word not covered by a full vector. */
223 number = quarterPoints*4;
224 for(; number < num_points; number++){
/* Same swap with plain shifts and masks; outputVal is presumably loaded
 * from *inputPtr on a line not shown here -- confirm. */
226 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
227 *inputPtr = outputVal;
234 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar version under the "_a" naming: reverses the byte order of
 * 32-bit words one at a time with shifts and masks.  The visible statements
 * match those of volk_32u_byteswap_generic.
 *
 * intsToSwap: buffer of 32-bit words to byte-swap.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): the declarations of point and output, the load from and
 * store to the buffer, and the closing braces are on lines not shown in this
 * listing -- confirm against the full file.
 */
236 static inline void volk_32u_byteswap_a_generic(
uint32_t* intsToSwap,
unsigned int num_points){
240 for(point = 0; point < num_points; point++){
/* Byte 3 -> byte 0, byte 2 -> byte 1, byte 1 -> byte 2, byte 0 -> byte 3. */
242 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
unsigned char uint8_t
Definition: stdint.h:78
unsigned int uint32_t
Definition: stdint.h:80
unsigned __int64 uint64_t
Definition: stdint.h:90