Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_16u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16u_byteswap_u_H
54 #define INCLUDED_volk_16u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_GENERIC
60 
/*
 * Swap the two bytes of every 16-bit word in a buffer, in place.
 *
 * intsToSwap: buffer of 16-bit words; each processed word is overwritten
 *             with its byte-swapped value.
 * num_points: number of 16-bit words to process (0 leaves the buffer as is).
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        const uint16_t word = intsToSwap[idx];
        /* The high byte moves down, the low byte moves up. */
        const uint16_t highToLow = (uint16_t)((word >> 8) & 0xff);
        const uint16_t lowToHigh = (uint16_t)((word << 8) & 0xff00);
        intsToSwap[idx] = (uint16_t)(highToLow | lowToHigh);
    }
}
72 #endif /* LV_HAVE_GENERIC */
73 
74 
75 #if LV_HAVE_AVX2
76 #include <immintrin.h>
77 static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
78 {
79  unsigned int number;
80 
81  const unsigned int nPerSet = 16;
82  const uint64_t nSets = num_points / nPerSet;
83 
84  uint16_t* inputPtr = (uint16_t*)intsToSwap;
85 
86  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
87  10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
88  23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
89 
90  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
91 
92  for (number = 0; number < nSets; number++) {
93  // Load the 32t values, increment inputPtr later since we're doing it in-place.
94  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
95  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
96 
97  // Store the results
98  _mm256_store_si256((__m256i*)inputPtr, output);
99  inputPtr += nPerSet;
100  }
101 
102  _mm256_zeroupper();
103 
104  // Byteswap any remaining points:
105  for (number = nPerSet * nSets; number < num_points; number++) {
106  uint16_t outputVal = *inputPtr;
107  outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
108  *inputPtr = outputVal;
109  inputPtr++;
110  }
111 }
112 #endif /* LV_HAVE_AVX2 */
113 
114 
115 #if LV_HAVE_AVX2
116 #include <immintrin.h>
117 static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
118 {
119  unsigned int number;
120 
121  const unsigned int nPerSet = 16;
122  const uint64_t nSets = num_points / nPerSet;
123 
124  uint16_t* inputPtr = (uint16_t*)intsToSwap;
125 
126  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
127  10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
128  23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
129 
130  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
131 
132  for (number = 0; number < nSets; number++) {
133  // Load the 32t values, increment inputPtr later since we're doing it in-place.
134  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
135  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
136 
137  // Store the results
138  _mm256_storeu_si256((__m256i*)inputPtr, output);
139  inputPtr += nPerSet;
140  }
141 
142  _mm256_zeroupper();
143 
144  // Byteswap any remaining points:
145  for (number = nPerSet * nSets; number < num_points; number++) {
146  uint16_t outputVal = *inputPtr;
147  outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
148  *inputPtr = outputVal;
149  inputPtr++;
150  }
151 }
152 #endif /* LV_HAVE_AVX2 */
153 
154 
155 #ifdef LV_HAVE_SSE2
156 #include <emmintrin.h>
157 
/*
 * Swap the two bytes of every 16-bit word in a buffer, in place, handling
 * eight words per step with SSE2 and the leftover words one at a time.
 * All vector memory accesses use the unaligned load/store intrinsics.
 *
 * intsToSwap: buffer of 16-bit words; overwritten with the swapped words.
 * num_points: number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    /* Eight 16-bit words fit into one 128-bit register. */
    const unsigned int wordsPerVector = 8;
    const unsigned int fullVectors = num_points / wordsPerVector;

    uint16_t* cursor = intsToSwap;

    for (unsigned int vec = 0; vec < fullVectors; ++vec, cursor += wordsPerVector) {
        __m128i* const block = (__m128i*)cursor;
        const __m128i words = _mm_loadu_si128(block);
        /* Per 16-bit lane: (word << 8) | (word >> 8) exchanges the two bytes. */
        const __m128i swapped =
            _mm_or_si128(_mm_slli_epi16(words, 8), _mm_srli_epi16(words, 8));
        /* Written back to the address it was read from (in-place). */
        _mm_storeu_si128(block, swapped);
    }

    /* Scalar pass over the words that did not fill a whole register. */
    for (unsigned int left = num_points - fullVectors * wordsPerVector; left != 0;
         --left) {
        const uint16_t word = *cursor;
        *cursor++ = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
187 #endif /* LV_HAVE_SSE2 */
188 
189 
190 #endif /* INCLUDED_volk_16u_byteswap_u_H */
191 #ifndef INCLUDED_volk_16u_byteswap_a_H
192 #define INCLUDED_volk_16u_byteswap_a_H
193 
194 #include <inttypes.h>
195 #include <stdio.h>
196 
197 #ifdef LV_HAVE_SSE2
198 #include <emmintrin.h>
199 
/*
 * Swap the two bytes of every 16-bit word in a buffer, in place, handling
 * eight words per step with SSE2. Words left over after the last full
 * register are handed to volk_16u_byteswap_generic.
 *
 * intsToSwap: buffer of 16-bit words; overwritten with the swapped words.
 *             The vector loop uses the aligned _mm_load_si128 /
 *             _mm_store_si128 intrinsics, so the caller must supply a buffer
 *             that satisfies their alignment requirement.
 * num_points: number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    /* Eight 16-bit words fit into one 128-bit register. */
    const unsigned int wordsPerVector = 8;
    const unsigned int fullVectors = num_points / wordsPerVector;

    __m128i* block = (__m128i*)intsToSwap;

    for (unsigned int vec = 0; vec < fullVectors; ++vec, ++block) {
        const __m128i words = _mm_load_si128(block);
        /* Per 16-bit lane: low byte shifted up, high byte shifted down. */
        const __m128i lowToHigh = _mm_slli_epi16(words, 8);
        const __m128i highToLow = _mm_srli_epi16(words, 8);
        /* Combine both halves and write back in place. */
        _mm_store_si128(block, _mm_or_si128(lowToHigh, highToLow));
    }

    /* `block` now points just past the last vectorised word. */
    volk_16u_byteswap_generic((uint16_t*)block,
                              num_points - fullVectors * wordsPerVector);
}
222 #endif /* LV_HAVE_SSE2 */
223 
224 #ifdef LV_HAVE_NEON
225 #include <arm_neon.h>
226 
/*
 * Swap the two bytes of every 16-bit word in a buffer, in place, handling
 * eight words per step with NEON. Words left over after the last full
 * register are handed to volk_16u_byteswap_generic.
 *
 * intsToSwap: buffer of 16-bit words; overwritten with the swapped words.
 * num_points: number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    uint16_t* inputPtr = intsToSwap;

    for (unsigned int number = 0; number < eighth_points; number++) {
        const uint16x8_t input = vld1q_u16(inputPtr);
        /*
         * Reverse the byte order inside every 16-bit lane. The result is
         * derived from `input` only, so no uninitialized register is ever
         * read (the previous shift-and-insert sequence used `output` as a
         * source operand before it had been assigned).
         */
        const uint16x8_t output =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Scalar fallback for the words that did not fill a whole register. */
    volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
}
244 #endif /* LV_HAVE_NEON */
245 
246 #ifdef LV_HAVE_NEON
247 #include <arm_neon.h>
248 
/*
 * Swap the two bytes of every 16-bit word in a buffer, in place, handling
 * sixteen words (32 bytes) per step with NEON table lookups. Words left over
 * after the last full step are handed to volk_16u_byteswap_generic.
 *
 * intsToSwap: buffer of 16-bit words; overwritten with the swapped words.
 * num_points: number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    const unsigned int wordsPerPass = 16;
    const unsigned int fullPasses = num_points / wordsPerPass;

    /*
     * Pre-computed byte indices into the 32-byte table filled by vld4_u8.
     * Each 64-bit literal packs eight indices, least-significant byte first:
     *   words 0..3   -> { 8,  0, 24, 16,  9,  1, 25, 17}
     *   words 4..7   -> {10,  2, 26, 18, 11,  3, 27, 19}
     *   words 8..11  -> {12,  4, 28, 20, 13,  5, 29, 21}
     *   words 12..15 -> {14,  6, 30, 22, 15,  7, 31, 23}
     * A literal can be rebuilt from its index list with
     *   for (ii = 0; ii < 8; ++ii) index += ((uint64_t)chars[ii]) << (ii * 8);
     */
    const uint8x8_t lookupWords0to3 = vcreate_u8(0x1119010910180008ULL);
    const uint8x8_t lookupWords4to7 = vcreate_u8(0x131B030B121A020AULL);
    const uint8x8_t lookupWords8to11 = vcreate_u8(0x151D050D141C040CULL);
    const uint8x8_t lookupWords12to15 = vcreate_u8(0x171F070F161E060EULL);

    uint16_t* cursor = intsToSwap;

    for (unsigned int pass = 0; pass < fullPasses; ++pass) {
        uint8_t* const bytes = (uint8_t*)cursor;

        /* Pull all 32 bytes into registers before anything is overwritten. */
        const uint8x8x4_t table = vld4_u8(bytes);

        const uint8x8_t swapped0to3 = vtbl4_u8(table, lookupWords0to3);
        const uint8x8_t swapped4to7 = vtbl4_u8(table, lookupWords4to7);
        const uint8x8_t swapped8to11 = vtbl4_u8(table, lookupWords8to11);
        const uint8x8_t swapped12to15 = vtbl4_u8(table, lookupWords12to15);

        /* Four 8-byte stores cover the same 32 bytes that were loaded. */
        vst1_u8(bytes, swapped0to3);
        vst1_u8(bytes + 8, swapped4to7);
        vst1_u8(bytes + 16, swapped8to11);
        vst1_u8(bytes + 24, swapped12to15);

        cursor += wordsPerPass;
    }

    /* Scalar fallback for the words that did not fill a whole step. */
    volk_16u_byteswap_generic(cursor, num_points - fullPasses * wordsPerPass);
}
289 #endif /* LV_HAVE_NEON */
290 
291 #ifdef LV_HAVE_GENERIC
292 
/*
 * Swap the two bytes of every 16-bit word in a buffer, in place, using only
 * portable scalar code.
 *
 * intsToSwap: buffer of 16-bit words; each processed word is overwritten
 *             with its byte-swapped value.
 * num_points: number of 16-bit words to process (0 leaves the buffer as is).
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
                                               unsigned int num_points)
{
    uint16_t* cursor = intsToSwap;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        const uint16_t word = *cursor;
        /* High byte moves to the low position and vice versa. */
        *cursor = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
        ++cursor;
        --remaining;
    }
}
304 #endif /* LV_HAVE_GENERIC */
305 
306 #ifdef LV_HAVE_ORC
307 
/* Declared here, defined elsewhere (the definition is not part of this header). */
308 extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
/*
 * Thin wrapper: forwards both arguments unchanged to
 * volk_16u_byteswap_a_orc_impl and does no work of its own.
 *
 * intsToSwap: buffer of 16-bit words, passed through as is.
 * num_points: number of 16-bit words, passed through as is.
 */
309 static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
310 {
311  volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
312 }
313 #endif /* LV_HAVE_ORC */
314 
315 
316 #endif /* INCLUDED_volk_16u_byteswap_a_H */
volk_16u_byteswap_neon_table
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:249
volk_16u_byteswap_neon
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:227
volk_16u_byteswap_a_sse2
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:200
volk_16u_byteswap_a_generic
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:293
volk_16u_byteswap_u_sse2
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:158
volk_16u_byteswap_generic
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:61