#ifndef INCLUDED_volk_32f_tan_32f_a_H
#define INCLUDED_volk_32f_tan_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        // s = |aVal|
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        // Octant index and argument reduction by multiples of pi/4 (split constant)
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        // Swap sine/cosine and fix signs according to the octant
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
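/*
 * Editor's note (summary inferred from the kernels in this file, not taken from an
 * original comment): all of the SIMD protokernels here appear to share the same scheme:
 *   1. s = |x|; octant index q = floor(s * 4/pi), rounded up to an even r = q + (q & 1);
 *   2. Cody-Waite style reduction s -= r*pio4A + r*pio4B, where pio4A + pio4B ~= pi/4,
 *      leaving s roughly in [-pi/4, pi/4];
 *   3. a Taylor polynomial in (s/8)^2 approximating 2*(1 - cos(s/8)), followed by three
 *      double-angle steps s <- s*(4 - s) and a halving, giving s ~= 1 - cos(s_reduced);
 *   4. sine = sqrt((2 - s)*s) and cosine = 1 - s, swapped/negated per octant and input
 *      sign, and finally tangent = sine / cosine.
 */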
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        // s = |aVal|
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        // Octant index and argument reduction by multiples of pi/4 (split constant)
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        // Swap sine/cosine and fix signs according to the octant
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);
    pio4A = _mm_set1_ps(0.78515625);
    pio4B = _mm_set1_ps(0.241876e-3);
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        // s = |aVal|
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        // Octant index and argument reduction by multiples of pi/4 (split constant)
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        // Swap sine/cosine and fix signs according to the octant
        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_store_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_tan_32f_a_H */
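/*
 * The unaligned protokernels below mirror the aligned ones above; the intended
 * difference is only the use of unaligned loads/stores (_mm256_loadu_ps /
 * _mm256_storeu_ps and _mm_loadu_ps / _mm_storeu_ps).
 */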
#ifndef INCLUDED_volk_32f_tan_32f_u_H
#define INCLUDED_volk_32f_tan_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        // s = |aVal|
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        // Octant index and argument reduction by multiples of pi/4 (split constant)
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        // Swap sine/cosine and fix signs according to the octant
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);
    pio4A = _mm256_set1_ps(0.78515625);
    pio4B = _mm256_set1_ps(0.241876e-3);
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        // s = |aVal|
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        // Octant index and argument reduction by multiples of pi/4 (split constant)
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        // Swap sine/cosine and fix signs according to the octant
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);
    pio4A = _mm_set1_ps(0.78515625);
    pio4B = _mm_set1_ps(0.241876e-3);
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        // s = |aVal|
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        // Octant index and argument reduction by multiples of pi/4 (split constant)
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        // Swap sine/cosine and fix signs according to the octant
        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_storeu_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */
#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
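/*
 * Example (illustrative sketch only; buffer contents are arbitrary): application code
 * normally calls the dispatcher, which selects the best protokernel at runtime:
 *
 *   float in[4] = { 0.0f, 0.25f, 0.5f, 0.75f };
 *   float out[4];
 *   volk_32f_tan_32f(out, in, 4); // out[i] ~= tanf(in[i])
 */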
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void
volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    float* bVectorPtr = bVector;
    const float* aVectorPtr = aVector;

    float32x4_t a_vec;
    float32x4_t b_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aVectorPtr);
        // Prefetch the next load
        __VOLK_PREFETCH(aVectorPtr + 4);
        // Vector tangent helper; assumed to come from volk_neon_intrinsics.h
        b_vec = _vtanq_f32(a_vec);
        vst1q_f32(bVectorPtr, b_vec);
        bVectorPtr += 4;
        aVectorPtr += 4;
    }

    // Deal with the remaining samples
    for (number = quarter_points * 4; number < num_points; number++) {
        *bVectorPtr++ = tanf(*aVectorPtr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_tan_32f_u_H */