53 #ifndef INCLUDED_volk_8i_convert_16i_u_H
54 #define INCLUDED_volk_8i_convert_16i_u_H
60 #include <immintrin.h>
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256 (the 8-bit value ends up in the upper byte of the result).
 * AVX2 variant using unaligned loads and stores.
 *
 * outputVector: destination, room for at least num_points int16_t values.
 * inputVector:  source, at least num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m256i* outputVectorPtr = (__m256i*)outputVector;
    __m128i inputVal;
    __m256i ret;

    /* 16 samples per iteration: 128 bits in, 256 bits out. */
    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_loadu_si128(inputVectorPtr);
        ret = _mm256_cvtepi8_epi16(inputVal); /* sign-extend int8 -> int16 */
        ret = _mm256_slli_epi16(ret, 8);      /* multiply by 256 */
        _mm256_storeu_si256(outputVectorPtr, ret);

        /* Advance both cursors so the next iteration handles the next 16
         * samples instead of reprocessing the same ones. */
        outputVectorPtr++;
        inputVectorPtr++;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number]) * 256;
    }
}
93 #include <smmintrin.h>
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256. SSE4.1 variant using unaligned loads and stores.
 *
 * outputVector: destination, room for at least num_points int16_t values.
 * inputVector:  source, at least num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m128i* outputVectorPtr = (__m128i*)outputVector;
    __m128i inputVal;
    __m128i ret;

    /* 16 samples per iteration, written out as two groups of eight. */
    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_loadu_si128(inputVectorPtr);

        /* Low eight bytes: sign-extend to int16, then multiply by 256. */
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_storeu_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        /* Move the high eight bytes into the low half and repeat. The output
         * cursor was advanced above so this store does not overwrite the
         * first eight results. */
        inputVal = _mm_srli_si128(inputVal, 8);
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_storeu_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        inputVectorPtr++;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number]) * 256;
    }
}
/* NOTE(review): judging by the LV_HAVE_GENERIC guard this is the plain-C
 * fallback kernel. The line that should carry the return type, the function
 * name and the first parameter is not present in this listing, and neither
 * are the braces or the matching #endif -- confirm against the full file. */
133 #ifdef LV_HAVE_GENERIC
/* inputVector is the int8 source and num_points the number of samples.
 * outputVector, referenced below, is presumably the missing first parameter. */
136 const int8_t* inputVector,
137 unsigned int num_points)
/* Cursors that walk the destination and source buffers. */
139 int16_t* outputVectorPtr = outputVector;
140 const int8_t* inputVectorPtr = inputVector;
141 unsigned int number = 0;
/* One sample per iteration: widen to int16_t, multiply by 256, then advance
 * both cursors via the post-increments. */
143 for (number = 0; number < num_points; number++) {
144 *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
153 #ifndef INCLUDED_volk_8i_convert_16i_a_H
154 #define INCLUDED_volk_8i_convert_16i_a_H
156 #include <inttypes.h>
160 #include <immintrin.h>
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256. AVX2 variant using aligned loads and stores.
 *
 * outputVector: destination, room for at least num_points int16_t values;
 *               must be 32-byte aligned (required by _mm256_store_si256).
 * inputVector:  source, at least num_points int8_t values; must be 16-byte
 *               aligned (required by _mm_load_si128).
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m256i* outputVectorPtr = (__m256i*)outputVector;
    __m128i inputVal;
    __m256i ret;

    /* 16 samples per iteration: 128 bits in, 256 bits out. */
    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_load_si128(inputVectorPtr);
        ret = _mm256_cvtepi8_epi16(inputVal); /* sign-extend int8 -> int16 */
        ret = _mm256_slli_epi16(ret, 8);      /* multiply by 256 */
        _mm256_store_si256(outputVectorPtr, ret);

        /* Advance both cursors to the next group of 16 samples. */
        outputVectorPtr++;
        inputVectorPtr++;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number]) * 256;
    }
}
192 #ifdef LV_HAVE_SSE4_1
193 #include <smmintrin.h>
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256. SSE4.1 variant using aligned loads and stores.
 *
 * outputVector: destination, room for at least num_points int16_t values;
 *               must be 16-byte aligned (required by _mm_store_si128).
 * inputVector:  source, at least num_points int8_t values; must be 16-byte
 *               aligned (required by _mm_load_si128).
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m128i* outputVectorPtr = (__m128i*)outputVector;
    __m128i inputVal;
    __m128i ret;

    /* 16 samples per iteration, written out as two groups of eight. */
    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_load_si128(inputVectorPtr);

        /* Low eight bytes: sign-extend to int16, then multiply by 256. */
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_store_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        /* Move the high eight bytes into the low half and repeat. The output
         * cursor was advanced above so this store does not overwrite the
         * first eight results. */
        inputVal = _mm_srli_si128(inputVal, 8);
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_store_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        inputVectorPtr++;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number]) * 256;
    }
}
/* NOTE(review): plain-C kernel inside the LV_HAVE_GENERIC guard of the
 * INCLUDED_volk_8i_convert_16i_a_H section. The header line (return type,
 * function name, first parameter), the braces and the closing #endif are
 * absent from this listing -- verify against the complete file. */
233 #ifdef LV_HAVE_GENERIC
/* Visible parameters: the int8 source buffer and the sample count. The
 * outputVector used below is presumably the parameter on the missing line. */
236 const int8_t* inputVector,
237 unsigned int num_points)
/* Local write/read cursors, so the parameters themselves are not modified. */
239 int16_t* outputVectorPtr = outputVector;
240 const int8_t* inputVectorPtr = inputVector;
241 unsigned int number = 0;
/* Scalar conversion: each int8 sample is cast to int16_t and scaled by 256;
 * both cursors move forward one element per iteration. */
243 for (number = 0; number < num_points; number++) {
244 *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
251 #include <arm_neon.h>
/* NOTE(review): ARM NEON kernel (it uses vld1_s8 / vmovl_s8 / vshlq_n_s16 /
 * vst1q_s16). The header line with the return type, function name and first
 * parameter is not present in this listing, nor are the braces or the
 * declarations of `number` and `input_vec` -- confirm against the full file. */
254 const int8_t* inputVector,
255 unsigned int num_points)
/* Cursors over the destination and source buffers. */
257 int16_t* outputVectorPtr = outputVector;
258 const int8_t* inputVectorPtr = inputVector;
/* Number of full 8-sample groups handled by the vector loop. */
260 const unsigned int eighth_points = num_points / 8;
263 int16x8_t converted_vec;
/* Vector loop: load eight int8 samples, widen them to eight int16 lanes,
 * shift left by 8 (i.e. multiply by 256) and store the eight results. */
268 for (number = 0; number < eighth_points; ++number) {
269 input_vec = vld1_s8(inputVectorPtr);
270 converted_vec = vmovl_s8(input_vec);
272 converted_vec = vshlq_n_s16(converted_vec, 8);
273 vst1q_s16(outputVectorPtr, converted_vec);
/* NOTE(review): only the output cursor advance is visible here; a matching
 * advance of inputVectorPtr by 8 is presumably on a line missing from this
 * listing -- verify. */
276 outputVectorPtr += 8;
/* Scalar tail for the remaining (num_points % 8) samples. */
279 for (number = eighth_points * 8; number < num_points; number++) {
280 *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
/*
 * Implementation defined outside this header (presumably the ORC-generated
 * kernel, going by its name -- confirm against the build).
 */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);

/*
 * Thin wrapper that forwards all three arguments unchanged to
 * volk_8i_convert_16i_a_orc_impl().
 *
 * outputVector: destination buffer of int16_t samples.
 * inputVector:  source buffer of int8_t samples.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}