77 #ifndef INCLUDED_volk_32f_atan_32f_a_H
78 #define INCLUDED_volk_32f_atan_32f_a_H
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
84 volk_32f_atan_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
90 unsigned int eighthPoints = num_points / 8;
93 __m256 aVal, pio2, x, y, z, arctangent;
94 __m256 fzeroes, fones, ftwos, ffours, condition;
96 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
97 fzeroes = _mm256_setzero_ps();
98 fones = _mm256_set1_ps(1.0);
99 ftwos = _mm256_set1_ps(2.0);
100 ffours = _mm256_set1_ps(4.0);
102 for(;number < eighthPoints; number++){
103 aVal = _mm256_load_ps(aPtr);
105 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
106 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
107 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
108 x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
110 for(
i = 0;
i < 2;
i++){
111 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
113 x = _mm256_div_ps(fones, x);
115 for(j =
TERMS - 1; j >=0 ; j--){
116 y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
119 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
120 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
122 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
124 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
125 arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
127 _mm256_store_ps(bPtr, arctangent);
132 number = eighthPoints * 8;
133 for(;number < num_points; number++){
134 *bPtr++ = atan(*aPtr++);
142 #include <immintrin.h>
/*
 * Body of an arctangent kernel built on 256-bit AVX intrinsics with aligned
 * loads and stores (_mm256_load_ps / _mm256_store_ps). The enclosing function
 * signature is not visible in this view.
 * NOTE(review): declarations of i, j and the TERMS constant are not visible
 * here either - confirm they exist in the surrounding file.
 */
147 float* bPtr = bVector;
148 const float* aPtr = aVector;
150 unsigned int number = 0;
/* Number of complete 8-float groups handled by the vector loop. */
151 unsigned int eighthPoints = num_points / 8;
154 __m256 aVal, pio2, x, y, z, arctangent;
155 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4. */
157 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
158 fzeroes = _mm256_setzero_ps();
159 fones = _mm256_set1_ps(1.0);
160 ftwos = _mm256_set1_ps(2.0);
161 ffours = _mm256_set1_ps(4.0);
163 for(;number < eighthPoints; number++){
164 aVal = _mm256_load_ps(aPtr);
/* NOTE(review): z is read below but no assignment to z is visible in this
 * view - presumably `z = aVal;` belongs here; confirm. */
/* Subtract 2*z from negative lanes, i.e. flip their sign. */
166 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
167 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* x = 1/z in lanes where z < 1, x = z elsewhere. */
168 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
169 x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of x <- x + sqrt(1 + x*x). */
171 for(
i = 0;
i < 2;
i++){
172 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
174 x = _mm256_div_ps(fones, x);
/* Horner step: y <- y*x^2 + (-1)^j/(2j+1), for j = TERMS-1 down to 0.
 * NOTE(review): no initialisation of y is visible before this loop - confirm
 * it is zeroed first. */
176 for(j =
TERMS - 1; j >=0 ; j--){
177 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
/* Scale the series by x and by 4. */
180 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/* In lanes where z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
181 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
183 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/* Negate the result in lanes where the input was negative.
 * NOTE(review): arctangent is read here without a visible assignment -
 * presumably `arctangent = y;` precedes this; confirm. */
185 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
186 arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
188 _mm256_store_ps(bPtr, arctangent);
/* NOTE(review): no advance of aPtr/bPtr by 8 is visible inside the vector
 * loop - confirm it is present. */
/* Scalar tail for the samples left over after the last full group of 8.
 * NOTE(review): this calls double-precision atan(), whereas the SSE4.1
 * variants in this file call atanf() - confirm which is intended. */
193 number = eighthPoints * 8;
194 for(;number < num_points; number++){
195 *bPtr++ = atan(*aPtr++);
201 #ifdef LV_HAVE_SSE4_1
202 #include <smmintrin.h>
205 volk_32f_atan_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
207 float* bPtr = bVector;
208 const float* aPtr = aVector;
210 unsigned int number = 0;
211 unsigned int quarterPoints = num_points / 4;
214 __m128 aVal, pio2, x, y, z, arctangent;
215 __m128 fzeroes, fones, ftwos, ffours, condition;
217 pio2 = _mm_set1_ps(3.14159265358979323846/2);
218 fzeroes = _mm_setzero_ps();
219 fones = _mm_set1_ps(1.0);
220 ftwos = _mm_set1_ps(2.0);
221 ffours = _mm_set1_ps(4.0);
223 for(;number < quarterPoints; number++){
224 aVal = _mm_load_ps(aPtr);
226 condition = _mm_cmplt_ps(z, fzeroes);
227 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
228 condition = _mm_cmplt_ps(z, fones);
229 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
231 for(
i = 0;
i < 2;
i++){
232 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
234 x = _mm_div_ps(fones, x);
236 for(j =
TERMS - 1; j >=0 ; j--){
237 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
240 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
241 condition = _mm_cmpgt_ps(z, fones);
243 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
245 condition = _mm_cmplt_ps(aVal, fzeroes);
246 arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
248 _mm_store_ps(bPtr, arctangent);
253 number = quarterPoints * 4;
254 for(;number < num_points; number++){
255 *bPtr++ = atanf(*aPtr++);
263 #ifndef INCLUDED_volk_32f_atan_32f_u_H
264 #define INCLUDED_volk_32f_atan_32f_u_H
266 #if LV_HAVE_AVX2 && LV_HAVE_FMA
267 #include <immintrin.h>
270 volk_32f_atan_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
272 float* bPtr = bVector;
273 const float* aPtr = aVector;
275 unsigned int number = 0;
276 unsigned int eighthPoints = num_points / 8;
279 __m256 aVal, pio2, x, y, z, arctangent;
280 __m256 fzeroes, fones, ftwos, ffours, condition;
282 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
283 fzeroes = _mm256_setzero_ps();
284 fones = _mm256_set1_ps(1.0);
285 ftwos = _mm256_set1_ps(2.0);
286 ffours = _mm256_set1_ps(4.0);
288 for(;number < eighthPoints; number++){
289 aVal = _mm256_loadu_ps(aPtr);
291 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
292 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
293 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
294 x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
296 for(
i = 0;
i < 2;
i++){
297 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
299 x = _mm256_div_ps(fones, x);
301 for(j =
TERMS - 1; j >=0 ; j--){
302 y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
305 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
306 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
308 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
310 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
311 arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
313 _mm256_storeu_ps(bPtr, arctangent);
318 number = eighthPoints * 8;
319 for(;number < num_points; number++){
320 *bPtr++ = atan(*aPtr++);
328 #include <immintrin.h>
/*
 * Body of an arctangent kernel built on 256-bit AVX intrinsics with unaligned
 * loads and stores (_mm256_loadu_ps / _mm256_storeu_ps). The enclosing
 * function signature is not visible in this view.
 * NOTE(review): declarations of i, j and the TERMS constant are not visible
 * here either - confirm they exist in the surrounding file.
 */
333 float* bPtr = bVector;
334 const float* aPtr = aVector;
336 unsigned int number = 0;
/* Number of complete 8-float groups handled by the vector loop. */
337 unsigned int eighthPoints = num_points / 8;
340 __m256 aVal, pio2, x, y, z, arctangent;
341 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4. */
343 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
344 fzeroes = _mm256_setzero_ps();
345 fones = _mm256_set1_ps(1.0);
346 ftwos = _mm256_set1_ps(2.0);
347 ffours = _mm256_set1_ps(4.0);
349 for(;number < eighthPoints; number++){
350 aVal = _mm256_loadu_ps(aPtr);
/* NOTE(review): z is read below but no assignment to z is visible in this
 * view - presumably `z = aVal;` belongs here; confirm. */
/* Subtract 2*z from negative lanes, i.e. flip their sign. */
352 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
353 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* x = 1/z in lanes where z < 1, x = z elsewhere. */
354 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
355 x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of x <- x + sqrt(1 + x*x). */
357 for(
i = 0;
i < 2;
i++){
358 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
360 x = _mm256_div_ps(fones, x);
/* Horner step: y <- y*x^2 + (-1)^j/(2j+1), for j = TERMS-1 down to 0.
 * NOTE(review): no initialisation of y is visible before this loop - confirm
 * it is zeroed first. */
362 for(j =
TERMS - 1; j >=0 ; j--){
363 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
/* Scale the series by x and by 4. */
366 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
/* In lanes where z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
367 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
369 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/* Negate the result in lanes where the input was negative.
 * NOTE(review): arctangent is read here without a visible assignment -
 * presumably `arctangent = y;` precedes this; confirm. */
371 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
372 arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
374 _mm256_storeu_ps(bPtr, arctangent);
/* NOTE(review): no advance of aPtr/bPtr by 8 is visible inside the vector
 * loop - confirm it is present. */
/* Scalar tail for the samples left over after the last full group of 8.
 * NOTE(review): this calls double-precision atan(), whereas the SSE4.1
 * variants in this file call atanf() - confirm which is intended. */
379 number = eighthPoints * 8;
380 for(;number < num_points; number++){
381 *bPtr++ = atan(*aPtr++);
387 #ifdef LV_HAVE_SSE4_1
388 #include <smmintrin.h>
391 volk_32f_atan_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
393 float* bPtr = bVector;
394 const float* aPtr = aVector;
396 unsigned int number = 0;
397 unsigned int quarterPoints = num_points / 4;
400 __m128 aVal, pio2, x, y, z, arctangent;
401 __m128 fzeroes, fones, ftwos, ffours, condition;
403 pio2 = _mm_set1_ps(3.14159265358979323846/2);
404 fzeroes = _mm_setzero_ps();
405 fones = _mm_set1_ps(1.0);
406 ftwos = _mm_set1_ps(2.0);
407 ffours = _mm_set1_ps(4.0);
409 for(;number < quarterPoints; number++){
410 aVal = _mm_loadu_ps(aPtr);
412 condition = _mm_cmplt_ps(z, fzeroes);
413 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
414 condition = _mm_cmplt_ps(z, fones);
415 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
417 for(
i = 0;
i < 2;
i++)
418 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
419 x = _mm_div_ps(fones, x);
421 for(j =
TERMS - 1; j >= 0; j--)
422 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
424 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
425 condition = _mm_cmpgt_ps(z, fones);
427 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
429 condition = _mm_cmplt_ps(aVal, fzeroes);
430 arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
432 _mm_storeu_ps(bPtr, arctangent);
437 number = quarterPoints * 4;
438 for(;number < num_points; number++){
439 *bPtr++ = atanf(*aPtr++);
445 #ifdef LV_HAVE_GENERIC
/*
 * Body of the portable (non-SIMD) arctangent kernel: writes atanf() of each
 * of the num_points input floats to the output buffer, in order.
 * The enclosing function signature is not visible in this view.
 */
450 float* bPtr = bVector;
451 const float* aPtr = aVector;
452 unsigned int number = 0;
/* One scalar atanf() per sample; both cursors advance together. */
454 for(number = 0; number < num_points; number++){
455 *bPtr++ = atanf(*aPtr++);