92 #ifndef INCLUDED_volk_32f_log2_32f_a_H
93 #define INCLUDED_volk_32f_log2_32f_a_H
/*
 * Selects which "#if LOG_POLY_DEGREE == N" branch the SIMD kernels in this
 * file compile. With the value 6 they use the six-coefficient POLY5 form.
 */
100 #define LOG_POLY_DEGREE 6
/*
 * Scalar helper body (its signature is outside this view).
 * Takes log2f of f and, when that result is infinite, substitutes 127.0f
 * carrying the sign of the infinity: -inf becomes -127.0f and +inf becomes
 * +127.0f. Finite results and NaN are returned unchanged.
 */
104 float const result = log2f(f);
105 return isinf(result) ? copysignf(127.0f, result) : result;
108 #ifdef LV_HAVE_GENERIC
/*
 * Generic kernel fragment (signature and loop body are outside this view).
 * bPtr walks the output buffer, aPtr the input buffer, and the loop visits
 * num_points elements one at a time.
 */
113 float* bPtr = bVector;
114 const float* aPtr = aVector;
115 unsigned int number = 0;
117 for(number = 0; number < num_points; number++)
122 #if LV_HAVE_AVX2 && LV_HAVE_FMA
123 #include <immintrin.h>
/*
 * Horner-scheme polynomial builders for __m256 values using fused
 * multiply-add. POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *   c0 + x * (c1 + x * (... + x * cn))
 * with every coefficient broadcast through _mm256_set1_ps.
 * The argument x is pasted once per nesting level and is not parenthesised,
 * so pass a plain variable, not an expression.
 * NOTE(review): the same macro names are defined again later in this file.
 * C only allows a macro to be redefined when the definitions are
 * token-identical, so keep both copies in sync.
 */
125 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
126 #define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
127 #define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
128 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
129 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
130 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
/*
 * Aligned AVX2 + FMA kernel: approximate log2 of each input float, eight
 * values per loop iteration.
 *
 *   bVector    - output buffer
 *   aVector    - input buffer
 *   num_points - number of floats to process
 *
 * Data is moved with _mm256_load_ps / _mm256_store_ps (the aligned
 * variants), so both buffers are expected to be 32-byte aligned.
 * Several lines of this function (return type, braces, declarations of
 * exp/bias, pointer advances, tail handling) are outside this view.
 */
133 volk_32f_log2_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
135 float* bPtr = bVector;
136 const float* aPtr = aVector;
138 unsigned int number = 0;
/* Count of complete 8-float groups the vector loop can handle. */
139 const unsigned int eighthPoints = num_points / 8;
141 __m256 aVal, bVal, mantissa, frac, leadingOne;
144 for(;number < eighthPoints; number++){
146 aVal = _mm256_load_ps(aPtr);
147 bias = _mm256_set1_epi32(127);
148 leadingOne = _mm256_set1_ps(1.0f);
/*
 * Mask out the 8 exponent bits, shift them down to bit 0 and subtract the
 * single-precision bias of 127. The sign bit is not part of the mask, so
 * it does not influence the result. An all-zero exponent field yields -127.
 */
149 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
/* Integer exponent as float: the starting value of the result. */
150 bVal = _mm256_cvtepi32_ps(exp);
/*
 * Keep only the 23 mantissa bits and OR in the bit pattern of 1.0f, which
 * produces a float in [1, 2) with the input's mantissa.
 */
153 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Polynomial in frac; the coefficient set is chosen by LOG_POLY_DEGREE. */
155 #if LOG_POLY_DEGREE == 6
156 mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
157 #elif LOG_POLY_DEGREE == 5
158 mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
159 #elif LOG_POLY_DEGREE == 4
160 mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
161 #elif LOG_POLY_DEGREE == 3
162 mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = exponent + poly(frac) * (frac - 1), done as one fused multiply-add. */
167 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
168 _mm256_store_ps(bPtr, bVal);
/*
 * Index of the first element the vector loop did not cover. Whatever
 * processes the remaining elements is outside this view.
 */
174 number = eighthPoints * 8;
181 #include <immintrin.h>
/*
 * Horner-scheme polynomial builders for __m256 values using a separate
 * multiply and add (no FMA). POLYn_AVX2(x, c0, ..., cn) expands to
 *   c0 + x * (c1 + x * (... + x * cn))
 * where each coefficient is broadcast with _mm256_set1_ps.
 * x appears once per nesting level and is not parenthesised, so callers
 * should pass a simple variable.
 * NOTE(review): these names are defined a second time later in this file.
 * The two definitions must stay token-identical for the redefinition to be
 * valid C.
 */
183 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
184 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
185 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
186 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
187 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
188 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
/*
 * Aligned AVX2 kernel (no FMA): approximate log2 of each input float,
 * eight values per loop iteration.
 *
 *   bVector    - where results are written
 *   aVector    - values to take the logarithm of
 *   num_points - element count
 *
 * Uses the aligned _mm256_load_ps / _mm256_store_ps, so the buffers are
 * expected to be 32-byte aligned. Parts of the function (return type,
 * braces, exp/bias declarations, pointer advances, tail loop) are not
 * visible in this view.
 */
191 volk_32f_log2_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
193 float* bPtr = bVector;
194 const float* aPtr = aVector;
196 unsigned int number = 0;
/* Number of whole 8-element groups. */
197 const unsigned int eighthPoints = num_points / 8;
199 __m256 aVal, bVal, mantissa, frac, leadingOne;
202 for(;number < eighthPoints; number++){
204 aVal = _mm256_load_ps(aPtr);
205 bias = _mm256_set1_epi32(127);
206 leadingOne = _mm256_set1_ps(1.0f);
/*
 * Unbiased exponent: isolate bits 23..30, shift right by 23, subtract 127.
 * The sign bit is excluded by the mask.
 */
207 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
208 bVal = _mm256_cvtepi32_ps(exp);
/* Rebuild the mantissa as a float in [1, 2): mantissa bits OR bits of 1.0f. */
211 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Evaluate the polynomial whose size LOG_POLY_DEGREE selects. */
213 #if LOG_POLY_DEGREE == 6
214 mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
215 #elif LOG_POLY_DEGREE == 5
216 mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
217 #elif LOG_POLY_DEGREE == 4
218 mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
219 #elif LOG_POLY_DEGREE == 3
220 mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = poly(frac) * (frac - 1) + exponent, as a multiply followed by an add. */
225 bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
226 _mm256_store_ps(bPtr, bVal);
/* First index left over after the vector loop; the remainder code is not visible here. */
232 number = eighthPoints * 8;
238 #ifdef LV_HAVE_SSE4_1
239 #include <smmintrin.h>
/*
 * Horner-scheme polynomial builders for __m128 values.
 * POLYn(x, c0, ..., cn) expands to c0 + x * (c1 + x * (... + x * cn)),
 * broadcasting each coefficient with _mm_set1_ps and combining with
 * _mm_mul_ps / _mm_add_ps.
 * x is substituted textually at every level without parentheses, so it
 * should be a plain variable.
 * NOTE(review): POLY0..POLY5 are defined again later in this file. Both
 * copies have to remain token-identical, otherwise the second definition
 * is an invalid redefinition.
 */
241 #define POLY0(x, c0) _mm_set1_ps(c0)
242 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
243 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
244 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
245 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
246 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/*
 * Aligned 128-bit SSE kernel: approximate log2 of each input float, four
 * values per loop iteration.
 *
 *   bVector    - destination for the results
 *   aVector    - source values
 *   num_points - how many floats to process
 *
 * Loads and stores use _mm_load_ps / _mm_store_ps (aligned variants), so
 * both buffers are expected to be 16-byte aligned. The return type,
 * braces, declarations of exp/bias, pointer advances and the tail loop are
 * outside this view.
 */
249 volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
251 float* bPtr = bVector;
252 const float* aPtr = aVector;
254 unsigned int number = 0;
/* Number of complete groups of four floats. */
255 const unsigned int quarterPoints = num_points / 4;
257 __m128 aVal, bVal, mantissa, frac, leadingOne;
260 for(;number < quarterPoints; number++){
262 aVal = _mm_load_ps(aPtr);
263 bias = _mm_set1_epi32(127);
264 leadingOne = _mm_set1_ps(1.0f);
/*
 * Pull the exponent field out of the raw bits, shift it to bit 0 and
 * remove the bias of 127. The mask leaves the sign bit out.
 */
265 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
266 bVal = _mm_cvtepi32_ps(exp);
/* Mantissa bits combined with the bit pattern of 1.0f: a float in [1, 2). */
269 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
/* Polynomial in frac, sized by LOG_POLY_DEGREE. */
271 #if LOG_POLY_DEGREE == 6
272 mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
273 #elif LOG_POLY_DEGREE == 5
274 mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
275 #elif LOG_POLY_DEGREE == 4
276 mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
277 #elif LOG_POLY_DEGREE == 3
278 mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = exponent + poly(frac) * (frac - 1) */
283 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
284 _mm_store_ps(bPtr, bVal);
/* Resume position after the vector loop; the code handling the remainder is not visible here. */
290 number = quarterPoints * 4;
297 #include <arm_neon.h>
/*
 * Declares, in the scope where it is expanded, the constants that
 * VLOG2Q_NEON_F32 refers to by name:
 *   one      - bit 23 set (0x00800000, written with an extra leading zero
 *              digit): the implicit leading one of the significand
 *   p0..p6   - polynomial coefficients for powers 0 through 6 of the
 *              significand
 *   exp_mask - mask of the single-precision exponent field
 *   sig_mask - mask of the 23 significand bits
 *   exp_bias - exponent bias, 127
 * The expansion is a sequence of plain declarations, so it can appear only
 * once per scope.
 */
300 #define VLOG2Q_NEON_PREAMBLE() \
301 int32x4_t one = vdupq_n_s32(0x000800000); \
303 float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
304 float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
305 float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
306 float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
307 float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
308 float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
309 float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
310 int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
311 int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
312 int32x4_t exp_bias = vdupq_n_s32(127);
/*
 * Approximates log2 for four floats at once.
 *
 *   log2_approx - float32x4_t lvalue that receives the result
 *   aval        - int32x4_t holding the raw bits of the four input floats
 *
 * Relies on the names declared by VLOG2Q_NEON_PREAMBLE (one, p0..p6,
 * exp_mask, sig_mask, exp_bias). Steps:
 *   1. Split aval into its exponent and significand bit fields. The sign
 *      bit is in neither mask and is ignored.
 *   2. OR the implicit leading one into the significand and convert it as
 *      a fixed-point number with 23 fractional bits, giving s in [1, 2).
 *   3. Shift the exponent down by 23, subtract the bias, convert to float.
 *   4. Accumulate exponent + p0 + p1*s + p2*s^2 + ... + p6*s^6 one term at
 *      a time. Higher powers are formed as products of lower ones (sig_2,
 *      sig_3, ...) rather than by Horner's scheme.
 * The expansion declares locals (exponent_i, significand_i, tmp1, sig_2,
 * ...), so it can be used only once per scope.
 */
315 #define VLOG2Q_NEON_F32(log2_approx, aval) \
316 int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
317 int32x4_t significand_i = vandq_s32(aval, sig_mask); \
318 exponent_i = vshrq_n_s32(exponent_i, 23); \
323 significand_i = vorrq_s32(one, significand_i); \
324 float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
326 exponent_i = vsubq_s32(exponent_i, exp_bias); \
327 float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
331 log2_approx = vaddq_f32(exponent_f, p0); \
332 float32x4_t tmp1 = vmulq_f32(significand_f, p1); \
333 log2_approx = vaddq_f32(log2_approx, tmp1); \
334 float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
335 tmp1 = vmulq_f32(sig_2, p2); \
336 log2_approx = vaddq_f32(log2_approx, tmp1); \
338 float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
339 tmp1 = vmulq_f32(sig_3, p3); \
340 log2_approx = vaddq_f32(log2_approx, tmp1); \
341 float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
342 tmp1 = vmulq_f32(sig_4, p4); \
343 log2_approx = vaddq_f32(log2_approx, tmp1); \
344 float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
345 tmp1 = vmulq_f32(sig_5, p5); \
346 log2_approx = vaddq_f32(log2_approx, tmp1); \
347 float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
348 tmp1 = vmulq_f32(sig_6, p6); \
349 log2_approx = vaddq_f32(log2_approx, tmp1);
/*
 * NEON kernel fragment (the signature, the declarations of number/aval,
 * the code that fills log2_approx, the pointer advances and the tail
 * handling are outside this view).
 * bPtr walks the output buffer and aPtr the input buffer, four floats per
 * loop iteration.
 */
354 float* bPtr = bVector;
355 const float* aPtr = aVector;
/* Number of complete groups of four floats. */
357 const unsigned int quarterPoints = num_points / 4;
360 float32x4_t log2_approx;
371 for(number = 0; number < quarterPoints; ++number){
/*
 * Load four floats and reinterpret their bits as 32-bit integers.
 * This yields the same lanes as reading the memory through an int pointer,
 * but without casting away const or accessing float storage through an
 * incompatible pointer type.
 */
373 aval = vreinterpretq_s32_f32(vld1q_f32(aPtr));
377 vst1q_f32(bPtr, log2_approx);
/* First index not covered by the vector loop; the remainder code is not visible here. */
383 number = quarterPoints * 4;
392 #ifndef INCLUDED_volk_32f_log2_32f_u_H
393 #define INCLUDED_volk_32f_log2_32f_u_H
396 #ifdef LV_HAVE_GENERIC
/*
 * Generic (scalar) kernel fragment for the unaligned variant; the
 * signature and closing braces are outside this view.
 * Writes log2f of every one of the num_points input floats to the output
 * buffer, replacing infinite results by a finite value.
 */
401 float* bPtr = bVector;
402 const float* aPtr = aVector;
403 unsigned int number = 0;
405 for(number = 0; number < num_points; number++){
406 float const result = log2f(*aPtr++);
/*
 * Clamp infinities to +/-127 while keeping their sign: log2f(0) is -inf and
 * becomes -127.0f, log2f(+inf) is +inf and becomes +127.0f. This matches
 * the scalar helper used earlier in this file; the previous code mapped
 * +inf to -127.0f as well. NaN passes through unchanged.
 */
407 *bPtr++ = isinf(result) ? copysignf(127.0f, result) : result;
414 #ifdef LV_HAVE_SSE4_1
415 #include <smmintrin.h>
/*
 * Horner-scheme polynomial builders for __m128 values:
 * POLYn(x, c0, ..., cn) expands to c0 + x * (c1 + x * (... + x * cn)),
 * with coefficients broadcast by _mm_set1_ps.
 * x is inserted textually at each level, unparenthesised; pass a variable.
 * NOTE(review): this repeats the POLY0..POLY5 definitions that appear
 * earlier in this file. A macro may only be redefined with a
 * token-identical body, so any change here must be mirrored there.
 */
417 #define POLY0(x, c0) _mm_set1_ps(c0)
418 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
419 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
420 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
421 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
422 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/*
 * Unaligned 128-bit SSE kernel: approximate log2 of each input float, four
 * values per loop iteration.
 *
 *   bVector    - output buffer
 *   aVector    - input buffer
 *   num_points - number of floats
 *
 * Uses _mm_loadu_ps / _mm_storeu_ps, the unaligned load/store variants.
 * The return type, braces, declarations of exp/bias, pointer advances and
 * the remainder loop are outside this view.
 */
425 volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
427 float* bPtr = bVector;
428 const float* aPtr = aVector;
430 unsigned int number = 0;
/* How many full 4-float groups there are. */
431 const unsigned int quarterPoints = num_points / 4;
433 __m128 aVal, bVal, mantissa, frac, leadingOne;
436 for(;number < quarterPoints; number++){
438 aVal = _mm_loadu_ps(aPtr);
439 bias = _mm_set1_epi32(127);
440 leadingOne = _mm_set1_ps(1.0f);
/*
 * Exponent field (bits 23..30) moved to bit 0, minus the bias of 127.
 * The sign bit lies outside the mask and is dropped.
 */
441 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
442 bVal = _mm_cvtepi32_ps(exp);
/* 23 mantissa bits OR-ed with the bits of 1.0f: a float in [1, 2). */
445 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
/* Polynomial chosen by LOG_POLY_DEGREE. */
447 #if LOG_POLY_DEGREE == 6
448 mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
449 #elif LOG_POLY_DEGREE == 5
450 mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
451 #elif LOG_POLY_DEGREE == 4
452 mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
453 #elif LOG_POLY_DEGREE == 3
454 mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = exponent + poly(frac) * (frac - 1) */
459 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
460 _mm_storeu_ps(bPtr, bVal);
/* Index where the vector loop stopped; the code for the remainder is not visible here. */
466 number = quarterPoints * 4;
472 #if LV_HAVE_AVX2 && LV_HAVE_FMA
473 #include <immintrin.h>
/*
 * Horner-scheme polynomial builders for __m256 values based on
 * _mm256_fmadd_ps: POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *   c0 + x * (c1 + x * (... + x * cn)).
 * x is pasted in unparenthesised at every level; pass a variable.
 * NOTE(review): this repeats the POLY*_FMAAVX2 definitions found earlier
 * in this file. The two copies must remain token-identical, since C
 * rejects a differing redefinition.
 */
475 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
476 #define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
477 #define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
478 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
479 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
480 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
/*
 * Unaligned AVX2 + FMA kernel: approximate log2 of each input float, eight
 * values per loop iteration.
 *
 *   bVector    - buffer receiving the results
 *   aVector    - buffer of input values
 *   num_points - number of floats to process
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, the unaligned load/store
 * variants. The return type, braces, declarations of exp/bias, pointer
 * advances and the remainder loop are outside this view.
 */
483 volk_32f_log2_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
485 float* bPtr = bVector;
486 const float* aPtr = aVector;
488 unsigned int number = 0;
/* Full 8-float groups available to the vector loop. */
489 const unsigned int eighthPoints = num_points / 8;
491 __m256 aVal, bVal, mantissa, frac, leadingOne;
494 for(;number < eighthPoints; number++){
496 aVal = _mm256_loadu_ps(aPtr);
497 bias = _mm256_set1_epi32(127);
498 leadingOne = _mm256_set1_ps(1.0f);
/*
 * Isolate the exponent bits, shift right by 23 and subtract the bias of
 * 127. The sign bit is masked away.
 */
499 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
500 bVal = _mm256_cvtepi32_ps(exp);
/* Mantissa bits with the exponent of 1.0f substituted: a float in [1, 2). */
503 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Polynomial selected through LOG_POLY_DEGREE. */
505 #if LOG_POLY_DEGREE == 6
506 mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
507 #elif LOG_POLY_DEGREE == 5
508 mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
509 #elif LOG_POLY_DEGREE == 4
510 mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
511 #elif LOG_POLY_DEGREE == 3
512 mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* Fused multiply-add: poly(frac) * (frac - 1) + exponent. */
517 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
518 _mm256_storeu_ps(bPtr, bVal);
/* Position of the first unprocessed element; the remainder code is not visible here. */
524 number = eighthPoints * 8;
531 #include <immintrin.h>
/*
 * Horner-scheme polynomial builders for __m256 values built from
 * _mm256_mul_ps and _mm256_add_ps: POLYn_AVX2(x, c0, ..., cn) expands to
 *   c0 + x * (c1 + x * (... + x * cn)).
 * x is substituted textually and unparenthesised; pass a variable.
 * NOTE(review): this repeats the POLY*_AVX2 definitions found earlier in
 * this file. Keep both copies token-identical, because a differing macro
 * redefinition is not valid C.
 */
533 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
534 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
535 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
536 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
537 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
538 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
/*
 * Unaligned AVX2 kernel (no FMA): approximate log2 of each input float,
 * eight values per loop iteration.
 *
 *   bVector    - output array
 *   aVector    - input array
 *   num_points - length of both arrays in floats
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, the unaligned load/store
 * variants. The return type, braces, declarations of exp/bias, pointer
 * advances and the remainder loop are outside this view.
 */
541 volk_32f_log2_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
543 float* bPtr = bVector;
544 const float* aPtr = aVector;
546 unsigned int number = 0;
/* Whole groups of eight floats. */
547 const unsigned int eighthPoints = num_points / 8;
549 __m256 aVal, bVal, mantissa, frac, leadingOne;
552 for(;number < eighthPoints; number++){
554 aVal = _mm256_loadu_ps(aPtr);
555 bias = _mm256_set1_epi32(127);
556 leadingOne = _mm256_set1_ps(1.0f);
/*
 * Unbiased exponent from the raw bits: mask, shift right by 23, subtract
 * 127. The sign bit is not in the mask.
 */
557 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
558 bVal = _mm256_cvtepi32_ps(exp);
/* Mantissa bits merged with the bit pattern of 1.0f, i.e. a float in [1, 2). */
561 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
/* Polynomial in frac; LOG_POLY_DEGREE picks the coefficient set. */
563 #if LOG_POLY_DEGREE == 6
564 mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
565 #elif LOG_POLY_DEGREE == 5
566 mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
567 #elif LOG_POLY_DEGREE == 4
568 mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
569 #elif LOG_POLY_DEGREE == 3
570 mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* result = poly(frac) * (frac - 1) + exponent, using a separate multiply and add. */
575 bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
576 _mm256_storeu_ps(bPtr, bVal);
/* First element after the vectorised part; the remainder code is not visible here. */
582 number = eighthPoints * 8;