/* Mln2 has the numeric value of the natural logarithm of 2 (ln 2 ~= 0.6931471805).
 * The kernels in this file divide A by it to build their per-lane scale factor. */
69 #define Mln2 0.6931471805f
/* B equals 127 * 2^23 (0x3F800000), i.e. the integer value of the IEEE-754
 * single-precision bit pattern of 1.0f. The kernels in this file use (B - C) as
 * the additive bias applied before the float->int conversion whose bits are
 * then reinterpreted as a float.
 * NOTE(review): A and C are used by the kernels but their definitions are not
 * visible in this excerpt -- confirm their values in the full file. */
71 #define B 1065353216.0f
75 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
76 #define INCLUDED_volk_32f_expfast_32f_a_H
78 #if LV_HAVE_AVX && LV_HAVE_FMA
80 #include <immintrin.h>
/*
 * Approximate exponential over a float buffer: AVX + FMA path using the
 * aligned load/store intrinsics.
 *
 * bVector:    output buffer; results are written through bPtr.
 * aVector:    input buffer, read through aPtr. NOTE(review): its parameter
 *             declaration is not visible in this excerpt -- confirm in the
 *             full file.
 * num_points: number of floats to process.
 *
 * Complete groups of 8 floats go through the vector loop, which computes
 * int(a * x + b) per lane and reinterprets the integer bits as a float.
 * The remaining num_points % 8 elements are computed with expf().
 * NOTE(review): looks like a Schraudolph-style exp approximation -- confirm
 * against the definitions of A and C, which are outside this excerpt.
 */
82 static inline void volk_32f_expfast_32f_a_avx_fma(
float* bVector,
84 unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
/* Number of complete 8-float (256-bit) groups in the input. */
90 const unsigned int eighthPoints = num_points / 8;
92 __m256 aVal, bVal, a, b;
/* Broadcast the scale (A / ln 2) and the bias (B - C) to all 8 lanes. */
94 a = _mm256_set1_ps(
A /
Mln2);
95 b = _mm256_set1_ps(
B -
C);
97 for (; number < eighthPoints; number++) {
/* Aligned-load intrinsic: aPtr is expected to be 32-byte aligned here. */
98 aVal = _mm256_load_ps(aPtr);
/* Fused multiply-add a * aVal + b, then convert each lane to a 32-bit integer.
 * NOTE(review): the declaration of `exp` is not visible in this excerpt;
 * presumably a __m256i declared on an omitted line -- confirm. */
99 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
/* Reinterpret the integer lanes as floats (bit cast, no numeric conversion). */
100 bVal = _mm256_castsi256_ps(exp);
102 _mm256_store_ps(bPtr, bVal);
/* NOTE(review): no advance of aPtr/bPtr is visible in this excerpt -- confirm
 * that the full file steps both pointers by 8 per iteration. */
/* Scalar tail: resume at the first element not covered by the vector loop.
 * These elements use the standard expf(), not the approximation above. */
107 number = eighthPoints * 8;
108 for (; number < num_points; number++) {
109 *bPtr++ = expf(*aPtr++);
117 #include <immintrin.h>
/*
 * Body of an AVX kernel that uses aligned load/store and a separate multiply
 * and add (no fused multiply-add).
 * NOTE(review): the function header is not visible in this excerpt; bVector,
 * aVector and num_points are used below without a visible declaration --
 * confirm the signature in the full file.
 *
 * Complete groups of 8 floats are processed in the vector loop; the remaining
 * num_points % 8 elements are computed with expf().
 */
122 float* bPtr = bVector;
123 const float* aPtr = aVector;
125 unsigned int number = 0;
/* Count of full 8-float groups handled by the vector loop. */
126 const unsigned int eighthPoints = num_points / 8;
128 __m256 aVal, bVal, a, b;
/* Per-lane scale A / ln 2 and bias B - C; A and C are defined outside this
 * excerpt. */
130 a = _mm256_set1_ps(
A /
Mln2);
131 b = _mm256_set1_ps(
B -
C);
133 for (; number < eighthPoints; number++) {
/* Aligned-load intrinsic: aPtr is expected to be 32-byte aligned here. */
134 aVal = _mm256_load_ps(aPtr);
/* a * aVal + b as separate multiply and add, then float -> 32-bit int per lane.
 * NOTE(review): `exp` has no visible declaration; presumably a __m256i --
 * confirm. */
135 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
/* Bit cast of the integer lanes back to float lanes. */
136 bVal = _mm256_castsi256_ps(exp);
138 _mm256_store_ps(bPtr, bVal);
/* NOTE(review): the pointer advance for aPtr/bPtr is not visible in this
 * excerpt -- confirm both move by 8 floats per iteration. */
/* Scalar tail for the elements left after the last full group of 8; uses the
 * standard expf() rather than the vector approximation. */
143 number = eighthPoints * 8;
144 for (; number < num_points; number++) {
145 *bPtr++ = expf(*aPtr++);
151 #ifdef LV_HAVE_SSE4_1
152 #include <smmintrin.h>
/*
 * Approximate exponential over a float buffer: 128-bit SSE path using the
 * aligned load/store intrinsics.
 *
 * bVector:    output buffer; results are written through bPtr.
 * aVector:    input buffer; read through aPtr.
 * num_points: number of floats to process.
 *
 * Complete groups of 4 floats go through the vector loop, which computes
 * int(a * x + b) per lane and reinterprets the integer bits as a float.
 * The remaining num_points % 4 elements are computed with expf().
 * NOTE(review): looks like a Schraudolph-style exp approximation -- confirm
 * against the definitions of A and C, which are outside this excerpt.
 */
154 static inline void volk_32f_expfast_32f_a_sse4_1(
float* bVector,
155 const float* aVector,
156 unsigned int num_points)
158 float* bPtr = bVector;
159 const float* aPtr = aVector;
161 unsigned int number = 0;
/* Number of complete 4-float (128-bit) groups in the input. */
162 const unsigned int quarterPoints = num_points / 4;
164 __m128 aVal, bVal, a, b;
/* Broadcast the scale (A / ln 2) and the bias (B - C) to all 4 lanes. */
166 a = _mm_set1_ps(
A /
Mln2);
167 b = _mm_set1_ps(
B -
C);
169 for (; number < quarterPoints; number++) {
/* Aligned-load intrinsic: aPtr is expected to be 16-byte aligned here. */
170 aVal = _mm_load_ps(aPtr);
/* a * aVal + b, then convert each lane to a 32-bit integer.
 * NOTE(review): the declaration of `exp` is not visible in this excerpt;
 * presumably a __m128i declared on an omitted line -- confirm. */
171 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
/* Reinterpret the integer lanes as floats (bit cast, no numeric conversion). */
172 bVal = _mm_castsi128_ps(exp);
174 _mm_store_ps(bPtr, bVal);
/* NOTE(review): no advance of aPtr/bPtr is visible in this excerpt -- confirm
 * that the full file steps both pointers by 4 per iteration. */
/* Scalar tail: resume at the first element not covered by the vector loop.
 * These elements use the standard expf(), not the approximation above. */
179 number = quarterPoints * 4;
180 for (; number < num_points; number++) {
181 *bPtr++ = expf(*aPtr++);
189 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
190 #define INCLUDED_volk_32f_expfast_32f_u_H
192 #if LV_HAVE_AVX && LV_HAVE_FMA
193 #include <immintrin.h>
/*
 * Approximate exponential over a float buffer: AVX + FMA path using the
 * unaligned load/store intrinsics, so no alignment is required of the buffers.
 *
 * bVector:    output buffer; results are written through bPtr.
 * aVector:    input buffer; read through aPtr.
 * num_points: number of floats to process.
 *
 * Complete groups of 8 floats go through the vector loop, which computes
 * int(a * x + b) per lane and reinterprets the integer bits as a float.
 * The remaining num_points % 8 elements are computed with expf().
 * NOTE(review): looks like a Schraudolph-style exp approximation -- confirm
 * against the definitions of A and C, which are outside this excerpt.
 */
195 static inline void volk_32f_expfast_32f_u_avx_fma(
float* bVector,
196 const float* aVector,
197 unsigned int num_points)
199 float* bPtr = bVector;
200 const float* aPtr = aVector;
202 unsigned int number = 0;
/* Number of complete 8-float (256-bit) groups in the input. */
203 const unsigned int eighthPoints = num_points / 8;
205 __m256 aVal, bVal, a, b;
/* Broadcast the scale (A / ln 2) and the bias (B - C) to all 8 lanes. */
207 a = _mm256_set1_ps(
A /
Mln2);
208 b = _mm256_set1_ps(
B -
C);
210 for (; number < eighthPoints; number++) {
/* Unaligned load of 8 floats. */
211 aVal = _mm256_loadu_ps(aPtr);
/* Fused multiply-add a * aVal + b, then convert each lane to a 32-bit integer.
 * NOTE(review): the declaration of `exp` is not visible in this excerpt;
 * presumably a __m256i declared on an omitted line -- confirm. */
212 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
/* Reinterpret the integer lanes as floats (bit cast, no numeric conversion). */
213 bVal = _mm256_castsi256_ps(exp);
/* Unaligned store of the 8 results. */
215 _mm256_storeu_ps(bPtr, bVal);
/* NOTE(review): no advance of aPtr/bPtr is visible in this excerpt -- confirm
 * that the full file steps both pointers by 8 per iteration. */
/* Scalar tail: resume at the first element not covered by the vector loop.
 * These elements use the standard expf(), not the approximation above. */
220 number = eighthPoints * 8;
221 for (; number < num_points; number++) {
222 *bPtr++ = expf(*aPtr++);
229 #include <immintrin.h>
/*
 * Body of an AVX kernel that uses unaligned load/store and a separate multiply
 * and add (no fused multiply-add).
 * NOTE(review): the function header is not visible in this excerpt; bVector,
 * aVector and num_points are used below without a visible declaration --
 * confirm the signature in the full file.
 *
 * Complete groups of 8 floats are processed in the vector loop; the remaining
 * num_points % 8 elements are computed with expf().
 */
234 float* bPtr = bVector;
235 const float* aPtr = aVector;
237 unsigned int number = 0;
/* Count of full 8-float groups handled by the vector loop. */
238 const unsigned int eighthPoints = num_points / 8;
240 __m256 aVal, bVal, a, b;
/* Per-lane scale A / ln 2 and bias B - C; A and C are defined outside this
 * excerpt. */
242 a = _mm256_set1_ps(
A /
Mln2);
243 b = _mm256_set1_ps(
B -
C);
245 for (; number < eighthPoints; number++) {
/* Unaligned load of 8 floats. */
246 aVal = _mm256_loadu_ps(aPtr);
/* a * aVal + b as separate multiply and add, then float -> 32-bit int per lane.
 * NOTE(review): `exp` has no visible declaration; presumably a __m256i --
 * confirm. */
247 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
/* Bit cast of the integer lanes back to float lanes. */
248 bVal = _mm256_castsi256_ps(exp);
/* Unaligned store of the 8 results. */
250 _mm256_storeu_ps(bPtr, bVal);
/* NOTE(review): the pointer advance for aPtr/bPtr is not visible in this
 * excerpt -- confirm both move by 8 floats per iteration. */
/* Scalar tail for the elements left after the last full group of 8; uses the
 * standard expf() rather than the vector approximation. */
255 number = eighthPoints * 8;
256 for (; number < num_points; number++) {
257 *bPtr++ = expf(*aPtr++);
264 #ifdef LV_HAVE_SSE4_1
265 #include <smmintrin.h>
/*
 * Approximate exponential over a float buffer: 128-bit SSE path using the
 * unaligned load/store intrinsics, so no alignment is required of the buffers.
 *
 * bVector:    output buffer; results are written through bPtr.
 * aVector:    input buffer; read through aPtr.
 * num_points: number of floats to process.
 *
 * Complete groups of 4 floats go through the vector loop, which computes
 * int(a * x + b) per lane and reinterprets the integer bits as a float.
 * The remaining num_points % 4 elements are computed with expf().
 * NOTE(review): looks like a Schraudolph-style exp approximation -- confirm
 * against the definitions of A and C, which are outside this excerpt.
 */
267 static inline void volk_32f_expfast_32f_u_sse4_1(
float* bVector,
268 const float* aVector,
269 unsigned int num_points)
271 float* bPtr = bVector;
272 const float* aPtr = aVector;
274 unsigned int number = 0;
/* Number of complete 4-float (128-bit) groups in the input. */
275 const unsigned int quarterPoints = num_points / 4;
277 __m128 aVal, bVal, a, b;
/* Broadcast the scale (A / ln 2) and the bias (B - C) to all 4 lanes. */
279 a = _mm_set1_ps(
A /
Mln2);
280 b = _mm_set1_ps(
B -
C);
282 for (; number < quarterPoints; number++) {
/* Unaligned load of 4 floats. */
283 aVal = _mm_loadu_ps(aPtr);
/* a * aVal + b, then convert each lane to a 32-bit integer.
 * NOTE(review): the declaration of `exp` is not visible in this excerpt;
 * presumably a __m128i declared on an omitted line -- confirm. */
284 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
/* Reinterpret the integer lanes as floats (bit cast, no numeric conversion). */
285 bVal = _mm_castsi128_ps(exp);
/* Unaligned store of the 4 results. */
287 _mm_storeu_ps(bPtr, bVal);
/* NOTE(review): no advance of aPtr/bPtr is visible in this excerpt -- confirm
 * that the full file steps both pointers by 4 per iteration. */
/* Scalar tail: resume at the first element not covered by the vector loop.
 * These elements use the standard expf(), not the approximation above. */
292 number = quarterPoints * 4;
293 for (; number < num_points; number++) {
294 *bPtr++ = expf(*aPtr++);
301 #ifdef LV_HAVE_GENERIC
304 const float* aVector,
305 unsigned int num_points)
307 float* bPtr = bVector;
308 const float* aPtr = aVector;
309 unsigned int number = 0;
311 for (number = 0; number < num_points; number++) {
312 *bPtr++ = expf(*aPtr++);