Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_x2_pow_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
72 #define INCLUDED_volk_32f_x2_pow_32f_a_H
73 
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <inttypes.h>
77 #include <math.h>
78 
79 #define POW_POLY_DEGREE 3
80 
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
83 
84 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
85 #define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
86 #define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
87 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
88 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
89 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
90 
91 static inline void
92 volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
93  const float* aVector, unsigned int num_points)
94 {
95  float* cPtr = cVector;
96  const float* bPtr = bVector;
97  const float* aPtr = aVector;
98 
99  unsigned int number = 0;
100  const unsigned int eighthPoints = num_points / 8;
101 
102  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
103  __m256 tmp, fx, mask, pow2n, z, y;
104  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
105  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
106  __m256i bias, exp, emm0, pi32_0x7f;
107 
108  one = _mm256_set1_ps(1.0);
109  exp_hi = _mm256_set1_ps(88.3762626647949);
110  exp_lo = _mm256_set1_ps(-88.3762626647949);
111  ln2 = _mm256_set1_ps(0.6931471805);
112  log2EF = _mm256_set1_ps(1.44269504088896341);
113  half = _mm256_set1_ps(0.5);
114  exp_C1 = _mm256_set1_ps(0.693359375);
115  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
116  pi32_0x7f = _mm256_set1_epi32(0x7f);
117 
118  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
119  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
120  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
121  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
122  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
123  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
124 
125  for(;number < eighthPoints; number++){
126  // First compute the logarithm
127  aVal = _mm256_load_ps(aPtr);
128  bias = _mm256_set1_epi32(127);
129  leadingOne = _mm256_set1_ps(1.0f);
130  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
131  logarithm = _mm256_cvtepi32_ps(exp);
132 
133  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
134 
135 #if POW_POLY_DEGREE == 6
136  mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
137 #elif POW_POLY_DEGREE == 5
138  mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
139 #elif POW_POLY_DEGREE == 4
140  mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
141 #elif POW_POLY_DEGREE == 3
142  mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
143 #else
144 #error
145 #endif
146 
147  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
148  logarithm = _mm256_mul_ps(logarithm, ln2);
149 
150  // Now calculate b*lna
151  bVal = _mm256_load_ps(bPtr);
152  bVal = _mm256_mul_ps(bVal, logarithm);
153 
154  // Now compute exp(b*lna)
155  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
156 
157  fx = _mm256_fmadd_ps(bVal, log2EF, half);
158 
159  emm0 = _mm256_cvttps_epi32(fx);
160  tmp = _mm256_cvtepi32_ps(emm0);
161 
162  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
163  fx = _mm256_sub_ps(tmp, mask);
164 
165  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
166  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
167  z = _mm256_mul_ps(bVal, bVal);
168 
169  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
170  y = _mm256_fmadd_ps(y, bVal, exp_p2);
171  y = _mm256_fmadd_ps(y, bVal, exp_p3);
172  y = _mm256_fmadd_ps(y, bVal, exp_p4);
173  y = _mm256_fmadd_ps(y, bVal, exp_p5);
174  y = _mm256_fmadd_ps(y, z, bVal);
175  y = _mm256_add_ps(y, one);
176 
177  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
178 
179  pow2n = _mm256_castsi256_ps(emm0);
180  cVal = _mm256_mul_ps(y, pow2n);
181 
182  _mm256_store_ps(cPtr, cVal);
183 
184  aPtr += 8;
185  bPtr += 8;
186  cPtr += 8;
187  }
188 
189  number = eighthPoints * 8;
190  for(;number < num_points; number++){
191  *cPtr++ = pow(*aPtr++, *bPtr++);
192  }
193 }
194 
195 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
196 
197 #ifdef LV_HAVE_AVX2
198 #include <immintrin.h>
199 
200 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
201 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
202 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
203 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
204 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
205 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
206 
207 static inline void
208 volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
209  const float* aVector, unsigned int num_points)
210 {
211  float* cPtr = cVector;
212  const float* bPtr = bVector;
213  const float* aPtr = aVector;
214 
215  unsigned int number = 0;
216  const unsigned int eighthPoints = num_points / 8;
217 
218  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
219  __m256 tmp, fx, mask, pow2n, z, y;
220  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
221  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
222  __m256i bias, exp, emm0, pi32_0x7f;
223 
224  one = _mm256_set1_ps(1.0);
225  exp_hi = _mm256_set1_ps(88.3762626647949);
226  exp_lo = _mm256_set1_ps(-88.3762626647949);
227  ln2 = _mm256_set1_ps(0.6931471805);
228  log2EF = _mm256_set1_ps(1.44269504088896341);
229  half = _mm256_set1_ps(0.5);
230  exp_C1 = _mm256_set1_ps(0.693359375);
231  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
232  pi32_0x7f = _mm256_set1_epi32(0x7f);
233 
234  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
235  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
236  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
237  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
238  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
239  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
240 
241  for(;number < eighthPoints; number++){
242  // First compute the logarithm
243  aVal = _mm256_load_ps(aPtr);
244  bias = _mm256_set1_epi32(127);
245  leadingOne = _mm256_set1_ps(1.0f);
246  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
247  logarithm = _mm256_cvtepi32_ps(exp);
248 
249  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
250 
251 #if POW_POLY_DEGREE == 6
252  mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
253 #elif POW_POLY_DEGREE == 5
254  mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
255 #elif POW_POLY_DEGREE == 4
256  mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
257 #elif POW_POLY_DEGREE == 3
258  mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
259 #else
260 #error
261 #endif
262 
263  logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
264  logarithm = _mm256_mul_ps(logarithm, ln2);
265 
266  // Now calculate b*lna
267  bVal = _mm256_load_ps(bPtr);
268  bVal = _mm256_mul_ps(bVal, logarithm);
269 
270  // Now compute exp(b*lna)
271  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
272 
273  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
274 
275  emm0 = _mm256_cvttps_epi32(fx);
276  tmp = _mm256_cvtepi32_ps(emm0);
277 
278  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
279  fx = _mm256_sub_ps(tmp, mask);
280 
281  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
282  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
283  z = _mm256_mul_ps(bVal, bVal);
284 
285  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
286  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
287  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
288  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
289  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
290  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
291  y = _mm256_add_ps(y, one);
292 
293  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
294 
295  pow2n = _mm256_castsi256_ps(emm0);
296  cVal = _mm256_mul_ps(y, pow2n);
297 
298  _mm256_store_ps(cPtr, cVal);
299 
300  aPtr += 8;
301  bPtr += 8;
302  cPtr += 8;
303  }
304 
305  number = eighthPoints * 8;
306  for(;number < num_points; number++){
307  *cPtr++ = pow(*aPtr++, *bPtr++);
308  }
309 }
310 
311 #endif /* LV_HAVE_AVX2 for aligned */
312 
313 
314 #ifdef LV_HAVE_SSE4_1
315 #include <smmintrin.h>
316 
317 #define POLY0(x, c0) _mm_set1_ps(c0)
318 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
319 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
320 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
321 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
322 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
323 
/*!
 * Computes c[i] = a[i] ^ b[i] for num_points floats, 4 lanes at a time
 * (aligned SSE4.1 implementation), as exp(b * ln(a)).
 *
 * ln(a) is rebuilt from the IEEE-754 exponent field plus a POW_POLY_DEGREE
 * polynomial fit of the mantissa; exp() uses Cephes-style range reduction
 * (x = n*ln2 + r), a degree-5 polynomial, and a 2^n rebuild via the
 * float exponent bits.
 *
 * NOTE(review): the bit-level log extraction assumes a[i] > 0 and finite
 * (negative, zero, denormal or NaN inputs produce garbage) -- confirm
 * against the kernel's documented contract.
 *
 * \param cVector    output buffer (16-byte aligned), receives a^b
 * \param bVector    exponent inputs (16-byte aligned)
 * \param aVector    base inputs (16-byte aligned)
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
                             const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m128 tmp, fx, mask, pow2n, z, y;
  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m128i bias, exp, emm0, pi32_0x7f;

  one = _mm_set1_ps(1.0);
  exp_hi = _mm_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm_set1_ps(0.6931471805);
  log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm_set1_ps(0.5);
  exp_C1 = _mm_set1_ps(0.693359375);         /* ln2 split, hi part */
  exp_C2 = _mm_set1_ps(-2.12194440e-4);      /* ln2 split, lo part */
  pi32_0x7f = _mm_set1_epi32(0x7f);          /* IEEE-754 exponent bias */

  exp_p0 = _mm_set1_ps(1.9875691500e-4);
  exp_p1 = _mm_set1_ps(1.3981999507e-3);
  exp_p2 = _mm_set1_ps(8.3334519073e-3);
  exp_p3 = _mm_set1_ps(4.1665795894e-2);
  exp_p4 = _mm_set1_ps(1.6666665459e-1);
  exp_p5 = _mm_set1_ps(5.0000001201e-1);

  for(;number < quarterPoints; number++){
    // First compute the logarithm: exponent bits give floor(log2(a))
    aVal = _mm_load_ps(aPtr);
    bias = _mm_set1_epi32(127);
    leadingOne = _mm_set1_ps(1.0f);
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm_cvtepi32_ps(exp);

    // mantissa in [1, 2): fraction bits OR'ed with an explicit leading one
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // log2(a) = exponent + poly(mantissa)*(mantissa-1); then scale to ln(a)
    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    logarithm = _mm_mul_ps(logarithm, ln2);


    // Now calculate b*lna
    bVal = _mm_load_ps(bPtr);
    bVal = _mm_mul_ps(bVal, logarithm);

    // Now compute exp(b*lna), input clamped to avoid overflow/underflow
    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

    // fx = round-to-nearest(x * log2(e)) via truncate(x*log2e + 0.5)
    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);

    // correct truncation for negative fx (subtract 1 where trunc > fx)
    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
    fx = _mm_sub_ps(tmp, mask);

    // r = x - fx*ln2, with ln2 split in two parts for extra precision
    tmp = _mm_mul_ps(fx, exp_C1);
    z = _mm_mul_ps(fx, exp_C2);
    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
    z = _mm_mul_ps(bVal, bVal);

    // degree-5 polynomial, interleaved Horner grouping (algebraically the
    // same coefficients p0..p5 as the AVX variants): e^r ~= 1 + r + r^2*poly(r)
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
    y = _mm_add_ps(y, one);

    // build 2^n by placing (n + 127) into the float exponent field
    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm_castsi128_ps(emm0);
    cVal = _mm_mul_ps(y, pow2n);

    _mm_store_ps(cPtr, cVal);

    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  // scalar tail for the remaining (num_points % 4) elements
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}
427 
428 #endif /* LV_HAVE_SSE4_1 for aligned */
429 
430 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
431 
432 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
433 #define INCLUDED_volk_32f_x2_pow_32f_u_H
434 
435 #include <stdio.h>
436 #include <stdlib.h>
437 #include <inttypes.h>
438 #include <math.h>
439 
440 #define POW_POLY_DEGREE 3
441 
442 #ifdef LV_HAVE_GENERIC
443 
444 static inline void
445 volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
446  const float* aVector, unsigned int num_points)
447 {
448  float* cPtr = cVector;
449  const float* bPtr = bVector;
450  const float* aPtr = aVector;
451  unsigned int number = 0;
452 
453  for(number = 0; number < num_points; number++){
454  *cPtr++ = powf(*aPtr++, *bPtr++);
455  }
456 }
457 #endif /* LV_HAVE_GENERIC */
458 
459 
460 #ifdef LV_HAVE_SSE4_1
461 #include <smmintrin.h>
462 
463 #define POLY0(x, c0) _mm_set1_ps(c0)
464 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
465 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
466 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
467 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
468 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
469 
/*!
 * Computes c[i] = a[i] ^ b[i] for num_points floats, 4 lanes at a time
 * (unaligned SSE4.1 implementation), as exp(b * ln(a)).
 *
 * Identical math to the aligned SSE4.1 variant but uses unaligned
 * loads/stores, so the buffers need no particular alignment.
 *
 * NOTE(review): the bit-level log extraction assumes a[i] > 0 and finite
 * (negative, zero, denormal or NaN inputs produce garbage) -- confirm
 * against the kernel's documented contract.
 *
 * \param cVector    output buffer, receives a^b
 * \param bVector    exponent inputs
 * \param aVector    base inputs
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
                             const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m128 tmp, fx, mask, pow2n, z, y;
  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m128i bias, exp, emm0, pi32_0x7f;

  one = _mm_set1_ps(1.0);
  exp_hi = _mm_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm_set1_ps(0.6931471805);
  log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm_set1_ps(0.5);
  exp_C1 = _mm_set1_ps(0.693359375);         /* ln2 split, hi part */
  exp_C2 = _mm_set1_ps(-2.12194440e-4);      /* ln2 split, lo part */
  pi32_0x7f = _mm_set1_epi32(0x7f);          /* IEEE-754 exponent bias */

  exp_p0 = _mm_set1_ps(1.9875691500e-4);
  exp_p1 = _mm_set1_ps(1.3981999507e-3);
  exp_p2 = _mm_set1_ps(8.3334519073e-3);
  exp_p3 = _mm_set1_ps(4.1665795894e-2);
  exp_p4 = _mm_set1_ps(1.6666665459e-1);
  exp_p5 = _mm_set1_ps(5.0000001201e-1);

  for(;number < quarterPoints; number++){
    // First compute the logarithm: exponent bits give floor(log2(a))
    aVal = _mm_loadu_ps(aPtr);
    bias = _mm_set1_epi32(127);
    leadingOne = _mm_set1_ps(1.0f);
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm_cvtepi32_ps(exp);

    // mantissa in [1, 2): fraction bits OR'ed with an explicit leading one
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // log2(a) = exponent + poly(mantissa)*(mantissa-1); then scale to ln(a)
    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    logarithm = _mm_mul_ps(logarithm, ln2);


    // Now calculate b*lna
    bVal = _mm_loadu_ps(bPtr);
    bVal = _mm_mul_ps(bVal, logarithm);

    // Now compute exp(b*lna), input clamped to avoid overflow/underflow
    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

    // fx = round-to-nearest(x * log2(e)) via truncate(x*log2e + 0.5)
    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);

    // correct truncation for negative fx (subtract 1 where trunc > fx)
    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
    fx = _mm_sub_ps(tmp, mask);

    // r = x - fx*ln2, with ln2 split in two parts for extra precision
    tmp = _mm_mul_ps(fx, exp_C1);
    z = _mm_mul_ps(fx, exp_C2);
    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
    z = _mm_mul_ps(bVal, bVal);

    // degree-5 polynomial, interleaved Horner grouping (algebraically the
    // same coefficients p0..p5 as the AVX variants): e^r ~= 1 + r + r^2*poly(r)
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
    y = _mm_add_ps(y, one);

    // build 2^n by placing (n + 127) into the float exponent field
    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm_castsi128_ps(emm0);
    cVal = _mm_mul_ps(y, pow2n);

    _mm_storeu_ps(cPtr, cVal);

    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  // scalar tail for the remaining (num_points % 4) elements
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}
573 
574 #endif /* LV_HAVE_SSE4_1 for unaligned */
575 
576 #if LV_HAVE_AVX2 && LV_HAVE_FMA
577 #include <immintrin.h>
578 
579 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
580 #define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
581 #define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
582 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
583 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
584 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
585 
586 static inline void
587 volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
588  const float* aVector, unsigned int num_points)
589 {
590  float* cPtr = cVector;
591  const float* bPtr = bVector;
592  const float* aPtr = aVector;
593 
594  unsigned int number = 0;
595  const unsigned int eighthPoints = num_points / 8;
596 
597  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
598  __m256 tmp, fx, mask, pow2n, z, y;
599  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
600  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
601  __m256i bias, exp, emm0, pi32_0x7f;
602 
603  one = _mm256_set1_ps(1.0);
604  exp_hi = _mm256_set1_ps(88.3762626647949);
605  exp_lo = _mm256_set1_ps(-88.3762626647949);
606  ln2 = _mm256_set1_ps(0.6931471805);
607  log2EF = _mm256_set1_ps(1.44269504088896341);
608  half = _mm256_set1_ps(0.5);
609  exp_C1 = _mm256_set1_ps(0.693359375);
610  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
611  pi32_0x7f = _mm256_set1_epi32(0x7f);
612 
613  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
614  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
615  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
616  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
617  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
618  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
619 
620  for(;number < eighthPoints; number++){
621  // First compute the logarithm
622  aVal = _mm256_loadu_ps(aPtr);
623  bias = _mm256_set1_epi32(127);
624  leadingOne = _mm256_set1_ps(1.0f);
625  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
626  logarithm = _mm256_cvtepi32_ps(exp);
627 
628  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
629 
630 #if POW_POLY_DEGREE == 6
631  mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
632 #elif POW_POLY_DEGREE == 5
633  mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
634 #elif POW_POLY_DEGREE == 4
635  mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
636 #elif POW_POLY_DEGREE == 3
637  mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
638 #else
639 #error
640 #endif
641 
642  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
643  logarithm = _mm256_mul_ps(logarithm, ln2);
644 
645 
646  // Now calculate b*lna
647  bVal = _mm256_loadu_ps(bPtr);
648  bVal = _mm256_mul_ps(bVal, logarithm);
649 
650  // Now compute exp(b*lna)
651  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
652 
653  fx = _mm256_fmadd_ps(bVal, log2EF, half);
654 
655  emm0 = _mm256_cvttps_epi32(fx);
656  tmp = _mm256_cvtepi32_ps(emm0);
657 
658  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
659  fx = _mm256_sub_ps(tmp, mask);
660 
661  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
662  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
663  z = _mm256_mul_ps(bVal, bVal);
664 
665  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
666  y = _mm256_fmadd_ps(y, bVal, exp_p2);
667  y = _mm256_fmadd_ps(y, bVal, exp_p3);
668  y = _mm256_fmadd_ps(y, bVal, exp_p4);
669  y = _mm256_fmadd_ps(y, bVal, exp_p5);
670  y = _mm256_fmadd_ps(y, z, bVal);
671  y = _mm256_add_ps(y, one);
672 
673  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
674 
675  pow2n = _mm256_castsi256_ps(emm0);
676  cVal = _mm256_mul_ps(y, pow2n);
677 
678  _mm256_storeu_ps(cPtr, cVal);
679 
680  aPtr += 8;
681  bPtr += 8;
682  cPtr += 8;
683  }
684 
685  number = eighthPoints * 8;
686  for(;number < num_points; number++){
687  *cPtr++ = pow(*aPtr++, *bPtr++);
688  }
689 }
690 
691 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
692 
693 #ifdef LV_HAVE_AVX2
694 #include <immintrin.h>
695 
696 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
697 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
698 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
699 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
700 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
701 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
702 
703 static inline void
704 volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
705  const float* aVector, unsigned int num_points)
706 {
707  float* cPtr = cVector;
708  const float* bPtr = bVector;
709  const float* aPtr = aVector;
710 
711  unsigned int number = 0;
712  const unsigned int eighthPoints = num_points / 8;
713 
714  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
715  __m256 tmp, fx, mask, pow2n, z, y;
716  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
717  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
718  __m256i bias, exp, emm0, pi32_0x7f;
719 
720  one = _mm256_set1_ps(1.0);
721  exp_hi = _mm256_set1_ps(88.3762626647949);
722  exp_lo = _mm256_set1_ps(-88.3762626647949);
723  ln2 = _mm256_set1_ps(0.6931471805);
724  log2EF = _mm256_set1_ps(1.44269504088896341);
725  half = _mm256_set1_ps(0.5);
726  exp_C1 = _mm256_set1_ps(0.693359375);
727  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
728  pi32_0x7f = _mm256_set1_epi32(0x7f);
729 
730  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
731  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
732  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
733  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
734  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
735  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
736 
737  for(;number < eighthPoints; number++){
738  // First compute the logarithm
739  aVal = _mm256_loadu_ps(aPtr);
740  bias = _mm256_set1_epi32(127);
741  leadingOne = _mm256_set1_ps(1.0f);
742  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
743  logarithm = _mm256_cvtepi32_ps(exp);
744 
745  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
746 
747 #if POW_POLY_DEGREE == 6
748  mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
749 #elif POW_POLY_DEGREE == 5
750  mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
751 #elif POW_POLY_DEGREE == 4
752  mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
753 #elif POW_POLY_DEGREE == 3
754  mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
755 #else
756 #error
757 #endif
758 
759  logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
760  logarithm = _mm256_mul_ps(logarithm, ln2);
761 
762  // Now calculate b*lna
763  bVal = _mm256_loadu_ps(bPtr);
764  bVal = _mm256_mul_ps(bVal, logarithm);
765 
766  // Now compute exp(b*lna)
767  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
768 
769  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
770 
771  emm0 = _mm256_cvttps_epi32(fx);
772  tmp = _mm256_cvtepi32_ps(emm0);
773 
774  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
775  fx = _mm256_sub_ps(tmp, mask);
776 
777  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
778  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
779  z = _mm256_mul_ps(bVal, bVal);
780 
781  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
782  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
783  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
784  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
785  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
786  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
787  y = _mm256_add_ps(y, one);
788 
789  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
790 
791  pow2n = _mm256_castsi256_ps(emm0);
792  cVal = _mm256_mul_ps(y, pow2n);
793 
794  _mm256_storeu_ps(cPtr, cVal);
795 
796  aPtr += 8;
797  bPtr += 8;
798  cPtr += 8;
799  }
800 
801  number = eighthPoints * 8;
802  for(;number < num_points; number++){
803  *cPtr++ = pow(*aPtr++, *bPtr++);
804  }
805 }
806 
807 #endif /* LV_HAVE_AVX2 for unaligned */
808 
809 #endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
volk_32f_x2_pow_32f_generic
static void volk_32f_x2_pow_32f_generic(float *cVector, const float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_x2_pow_32f.h:445