53 #ifndef INCLUDED_volk_16u_byteswap_u_H
54 #define INCLUDED_volk_16u_byteswap_u_H
/* Portable C byteswap kernel, compiled when LV_HAVE_GENERIC is defined.
 * NOTE(review): only part of this definition is visible here -- the line with
 * the function name and first parameter, the braces, and whatever follows the
 * swap expression are not shown. */
59 #ifdef LV_HAVE_GENERIC
62 unsigned int num_points)
/* Cursor into the buffer; intsToSwap is presumably the first parameter
 * (its declaration is outside this view -- confirm). */
64 uint16_t* inputPtr = intsToSwap;
/* One iteration per 16-bit element. */
65 for (
unsigned int point = 0; point < num_points; point++) {
66 uint16_t output = *inputPtr;
/* Exchange the two bytes: the high byte moves down into bits 0-7 and the low
 * byte moves up into bits 8-15; the masks discard anything outside 16 bits
 * after integer promotion. */
67 output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
76 #include <immintrin.h>
/* AVX2 byteswap of 16-bit values, in place, using aligned loads and stores.
 *
 * intsToSwap: buffer of uint16_t that is read and overwritten.
 * num_points: number of 16-bit elements to process.
 *
 * _mm256_load_si256 / _mm256_store_si256 require a 32-byte aligned address,
 * so the buffer must satisfy that alignment.
 * NOTE(review): the braces, the declaration of `number`, and any pointer
 * advance between iterations are not visible in this view. */
77 static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap,
unsigned int num_points)
/* Each 256-bit vector holds 16 uint16_t values. */
81 const unsigned int nPerSet = 16;
/* Number of whole vectors; the remainder is handled by the scalar loop. */
82 const uint64_t nSets = num_points / nPerSet;
84 uint16_t* inputPtr = (uint16_t*)intsToSwap;
/* Byte-shuffle control: every adjacent pair of byte indices is exchanged
 * (1,0, 3,2, ...). Entries 16-31 describe the upper 128-bit lane; the shuffle
 * instruction works per lane and uses only the low four bits of each index. */
86 const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
87 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
88 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
/* The local table has no alignment guarantee, hence the unaligned load. */
90 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
/* Vector loop: 16 elements per iteration. */
92 for (number = 0; number < nSets; number++) {
94 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
95 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
/* Write the swapped bytes back over the data just read. */
98 _mm256_store_si256((__m256i*)inputPtr, output);
/* Scalar tail: the num_points % 16 elements the vector loop did not cover. */
105 for (number = nPerSet * nSets; number < num_points; number++) {
106 uint16_t outputVal = *inputPtr;
/* Same byte exchange as the vector path, on one element. */
107 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
108 *inputPtr = outputVal;
116 #include <immintrin.h>
/* AVX2 byteswap of 16-bit values, in place, using unaligned loads and stores.
 *
 * intsToSwap: buffer of uint16_t that is read and overwritten.
 * num_points: number of 16-bit elements to process.
 *
 * Differs from the aligned variant only in using _mm256_loadu_si256 /
 * _mm256_storeu_si256 for the data, which carry no alignment requirement.
 * NOTE(review): the braces, the declaration of `number`, and any pointer
 * advance between iterations are not visible in this view. */
117 static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap,
unsigned int num_points)
/* Each 256-bit vector holds 16 uint16_t values. */
121 const unsigned int nPerSet = 16;
/* Number of whole vectors; the remainder is handled by the scalar loop. */
122 const uint64_t nSets = num_points / nPerSet;
124 uint16_t* inputPtr = (uint16_t*)intsToSwap;
/* Byte-shuffle control exchanging each adjacent pair of bytes. Entries 16-31
 * cover the upper 128-bit lane; the shuffle is per lane and looks only at the
 * low four bits of each index. */
126 const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
127 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
128 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
130 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
/* Vector loop: 16 elements per iteration. */
132 for (number = 0; number < nSets; number++) {
134 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
135 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
/* Write the swapped bytes back over the data just read. */
138 _mm256_storeu_si256((__m256i*)inputPtr, output);
/* Scalar tail: the num_points % 16 elements the vector loop did not cover. */
145 for (number = nPerSet * nSets; number < num_points; number++) {
146 uint16_t outputVal = *inputPtr;
/* Same byte exchange as the vector path, on one element. */
147 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
148 *inputPtr = outputVal;
156 #include <emmintrin.h>
/* Body of an SSE2 byteswap kernel that uses unaligned 128-bit loads/stores.
 * NOTE(review): the function signature, braces and any pointer advance are
 * not visible in this view; intsToSwap and num_points are presumably its
 * parameters -- confirm. */
160 unsigned int number = 0;
161 uint16_t* inputPtr = intsToSwap;
162 __m128i input, left, right, output;
/* A 128-bit vector holds eight uint16_t values. */
164 const unsigned int eighthPoints = num_points / 8;
/* Vector loop: eight elements per iteration. */
165 for (; number < eighthPoints; number++) {
167 input = _mm_loadu_si128((__m128i*)inputPtr);
/* Per 16-bit lane: `left` carries the low byte moved to the high position,
 * `right` the high byte moved to the low position (logical shifts fill with
 * zeros), so OR-ing them yields the byte-swapped lane. */
169 left = _mm_slli_epi16(input, 8);
170 right = _mm_srli_epi16(input, 8);
172 output = _mm_or_si128(left, right);
/* Write the result back over the data just read. */
174 _mm_storeu_si128((__m128i*)inputPtr, output);
/* Scalar tail: start at the first element the vector loop did not cover. */
179 number = eighthPoints * 8;
180 for (; number < num_points; number++) {
181 uint16_t outputVal = *inputPtr;
182 outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
183 *inputPtr = outputVal;
191 #ifndef INCLUDED_volk_16u_byteswap_a_H
192 #define INCLUDED_volk_16u_byteswap_a_H
194 #include <inttypes.h>
198 #include <emmintrin.h>
/* Body of an SSE2 byteswap kernel that uses aligned 128-bit loads/stores.
 * _mm_load_si128 / _mm_store_si128 require a 16-byte aligned address.
 * NOTE(review): the function signature, braces, pointer advance and any tail
 * handling are not visible in this view; intsToSwap and num_points are
 * presumably its parameters -- confirm. */
202 uint16_t* inputPtr = intsToSwap;
203 __m128i input, left, right, output;
/* A 128-bit vector holds eight uint16_t values. */
205 const unsigned int eighthPoints = num_points / 8;
/* Vector loop: eight elements per iteration. */
206 for (
unsigned int number = 0; number < eighthPoints; number++) {
208 input = _mm_load_si128((__m128i*)inputPtr);
/* Per 16-bit lane: shift the low byte up and the high byte down (zero fill),
 * then OR the halves together to get the swapped value. */
210 left = _mm_slli_epi16(input, 8);
211 right = _mm_srli_epi16(input, 8);
213 output = _mm_or_si128(left, right);
/* Write the result back over the data just read. */
215 _mm_store_si128((__m128i*)inputPtr, output);
225 #include <arm_neon.h>
/* Body of a NEON byteswap kernel built on shift-and-insert instructions.
 * NOTE(review): the function signature, the declaration of `number`, braces,
 * pointer advance and any tail handling are not visible in this view. */
/* A 128-bit q register holds eight uint16_t values. */
230 unsigned int eighth_points = num_points / 8;
231 uint16x8_t input, output;
232 uint16_t* inputPtr = intsToSwap;
/* Vector loop: eight elements per iteration. */
234 for (number = 0; number < eighth_points; number++) {
235 input = vld1q_u16(inputPtr);
/* Shift-right-insert puts (input >> 8) into the low 8 bits of each lane and
 * keeps the lane's previous high 8 bits.
 * NOTE(review): `output` has no visible assignment before this read. The
 * following shift-left-insert replaces the high 8 bits, so every result bit
 * comes from `input`, but the read of an unset variable may still draw
 * compiler warnings -- confirm whether an initialisation is intended. */
236 output = vsriq_n_u16(output, input, 8);
/* Shift-left-insert puts (input << 8) into the high 8 bits of each lane and
 * keeps the low 8 bits written by the previous step. */
237 output = vsliq_n_u16(output, input, 8);
/* Write the swapped lanes back over the data just read. */
238 vst1q_u16(inputPtr, output);
247 #include <arm_neon.h>
/* NEON byteswap kernel that swaps bytes with table lookups, 16 elements
 * (32 bytes) per iteration.
 * NOTE(review): the line with the function name and first parameter, the
 * braces, the pointer advance and any tail handling are not visible here. */
250 unsigned int num_points)
252 uint16_t* inputPtr = intsToSwap;
253 unsigned int number = 0;
/* Number of whole 16-element groups. */
254 unsigned int n16points = num_points / 16;
/* 32-byte lookup table (four 8-byte vectors) filled from the input. */
256 uint8x8x4_t input_table;
257 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
258 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
/* Index vectors for vtbl4_u8, written as 64-bit decimal literals. The first
 * is 0x1119010910180008, i.e. byte indices {8, 0, 24, 16, 9, 1, 25, 17}
 * (lowest byte first); each following constant adds 0x0202020202020202, i.e.
 * +2 on every index. vld4_u8 de-interleaves with stride 4, so source byte i
 * sits at table slot (i % 4) * 8 + i / 4; these indices therefore select
 * source bytes 1,0,3,2,5,4,7,6 for the first output vector and the next
 * eight bytes, pairwise exchanged, for each subsequent one. */
268 int_lookup01 = vcreate_u8(1232017111498883080);
269 int_lookup23 = vcreate_u8(1376697457175036426);
270 int_lookup45 = vcreate_u8(1521377802851189772);
271 int_lookup67 = vcreate_u8(1666058148527343118);
273 for (number = 0; number < n16points; ++number) {
/* Load 32 bytes, de-interleaved into the four table vectors. */
274 input_table = vld4_u8((uint8_t*)inputPtr);
/* Each lookup produces eight output bytes (four swapped uint16_t values). */
275 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
276 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
277 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
278 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
/* Store the four 8-byte results back in order; inputPtr is a uint16_t
 * pointer, so the +4/+8/+12 offsets step 8 bytes each. */
279 vst1_u8((uint8_t*)inputPtr, swapped_int01);
280 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
281 vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
282 vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
/* Portable C byteswap kernel, compiled when LV_HAVE_GENERIC is defined.
 * NOTE(review): only part of this definition is visible here -- the line with
 * the function name and first parameter, the braces, and whatever follows the
 * swap expression are not shown. */
291 #ifdef LV_HAVE_GENERIC
294 unsigned int num_points)
/* Cursor into the buffer; intsToSwap is presumably the first parameter
 * (its declaration is outside this view -- confirm). */
296 uint16_t* inputPtr = intsToSwap;
/* One iteration per 16-bit element. */
297 for (
unsigned int point = 0; point < num_points; point++) {
298 uint16_t output = *inputPtr;
/* Exchange the two bytes: high byte down into bits 0-7, low byte up into
 * bits 8-15; the masks drop anything outside 16 bits after promotion. */
299 output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
/* Externally defined ORC implementation; its body is not part of this file
 * view. */
308 extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap,
unsigned int num_points);
/* Thin wrapper that forwards both arguments unchanged to the ORC
 * implementation.
 *
 * intsToSwap: buffer of uint16_t passed through to the implementation.
 * num_points: element count passed through to the implementation.
 *
 * NOTE(review): this wrapper is named `_u_` but calls the `_a_`-named
 * implementation; whether that implementation has alignment requirements
 * cannot be determined from here -- confirm. */
309 static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap,
unsigned int num_points)
311 volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);