Grok 10.0.3
math-inl.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Include guard (still compiled once per target)
17#if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
21#else
22#define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
23#endif
24
25#include "hwy/highway.h"
26
28namespace hwy {
29namespace HWY_NAMESPACE {
30
39template <class D, class V>
40HWY_INLINE V Acos(const D d, V x);
41template <class D, class V>
43 return Acos(d, x);
44}
45
54template <class D, class V>
55HWY_INLINE V Acosh(const D d, V x);
56template <class D, class V>
58 return Acosh(d, x);
59}
60
69template <class D, class V>
70HWY_INLINE V Asin(const D d, V x);
71template <class D, class V>
73 return Asin(d, x);
74}
75
84template <class D, class V>
85HWY_INLINE V Asinh(const D d, V x);
86template <class D, class V>
88 return Asinh(d, x);
89}
90
99template <class D, class V>
100HWY_INLINE V Atan(const D d, V x);
101template <class D, class V>
103 return Atan(d, x);
104}
105
114template <class D, class V>
115HWY_INLINE V Atanh(const D d, V x);
116template <class D, class V>
118 return Atanh(d, x);
119}
120
129template <class D, class V>
130HWY_INLINE V Cos(const D d, V x);
131template <class D, class V>
133 return Cos(d, x);
134}
135
144template <class D, class V>
145HWY_INLINE V Exp(const D d, V x);
146template <class D, class V>
148 return Exp(d, x);
149}
150
159template <class D, class V>
160HWY_INLINE V Expm1(const D d, V x);
161template <class D, class V>
163 return Expm1(d, x);
164}
165
174template <class D, class V>
175HWY_INLINE V Log(const D d, V x);
176template <class D, class V>
178 return Log(d, x);
179}
180
189template <class D, class V>
190HWY_INLINE V Log10(const D d, V x);
191template <class D, class V>
193 return Log10(d, x);
194}
195
204template <class D, class V>
205HWY_INLINE V Log1p(const D d, V x);
206template <class D, class V>
208 return Log1p(d, x);
209}
210
219template <class D, class V>
220HWY_INLINE V Log2(const D d, V x);
221template <class D, class V>
223 return Log2(d, x);
224}
225
234template <class D, class V>
235HWY_INLINE V Sin(const D d, V x);
236template <class D, class V>
238 return Sin(d, x);
239}
240
249template <class D, class V>
250HWY_INLINE V Sinh(const D d, V x);
251template <class D, class V>
253 return Sinh(d, x);
254}
255
264template <class D, class V>
265HWY_INLINE V Tanh(const D d, V x);
266template <class D, class V>
268 return Tanh(d, x);
269}
270
272// Implementation
274namespace impl {
275
276// Estrin's Scheme is a faster method for evaluating large polynomials on
277// super scalar architectures. It works by factoring the Horner's Method
278// polynomial into power of two sub-trees that can be evaluated in parallel.
279// Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
280template <class T>
282 return MulAdd(c1, x, c0);
283}
284template <class T>
285HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
286 T x2 = Mul(x, x);
287 return MulAdd(x2, c2, MulAdd(c1, x, c0));
288}
289template <class T>
290HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
291 T x2 = Mul(x, x);
292 return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
293}
294template <class T>
295HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
296 T x2 = Mul(x, x);
297 T x4 = Mul(x2, x2);
298 return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
299}
300template <class T>
301HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
302 T x2 = Mul(x, x);
303 T x4 = Mul(x2, x2);
304 return MulAdd(x4, MulAdd(c5, x, c4),
305 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
306}
307template <class T>
308HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
309 T c6) {
310 T x2 = Mul(x, x);
311 T x4 = Mul(x2, x2);
312 return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
313 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
314}
315template <class T>
316HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
317 T c6, T c7) {
318 T x2 = Mul(x, x);
319 T x4 = Mul(x2, x2);
320 return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
321 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
322}
323template <class T>
324HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
325 T c6, T c7, T c8) {
326 T x2 = Mul(x, x);
327 T x4 = Mul(x2, x2);
328 T x8 = Mul(x4, x4);
329 return MulAdd(x8, c8,
330 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
331 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
332}
333template <class T>
334HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
335 T c6, T c7, T c8, T c9) {
336 T x2 = Mul(x, x);
337 T x4 = Mul(x2, x2);
338 T x8 = Mul(x4, x4);
339 return MulAdd(x8, MulAdd(c9, x, c8),
340 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
341 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
342}
343template <class T>
344HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
345 T c6, T c7, T c8, T c9, T c10) {
346 T x2 = Mul(x, x);
347 T x4 = Mul(x2, x2);
348 T x8 = Mul(x4, x4);
349 return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
350 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
351 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
352}
353template <class T>
354HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
355 T c6, T c7, T c8, T c9, T c10, T c11) {
356 T x2 = Mul(x, x);
357 T x4 = Mul(x2, x2);
358 T x8 = Mul(x4, x4);
359 return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
360 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
361 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
362}
363template <class T>
364HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
365 T c6, T c7, T c8, T c9, T c10, T c11,
366 T c12) {
367 T x2 = Mul(x, x);
368 T x4 = Mul(x2, x2);
369 T x8 = Mul(x4, x4);
370 return MulAdd(
371 x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
372 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
373 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
374}
375template <class T>
376HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
377 T c6, T c7, T c8, T c9, T c10, T c11,
378 T c12, T c13) {
379 T x2 = Mul(x, x);
380 T x4 = Mul(x2, x2);
381 T x8 = Mul(x4, x4);
382 return MulAdd(x8,
383 MulAdd(x4, MulAdd(c13, x, c12),
384 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
385 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
386 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
387}
388template <class T>
389HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
390 T c6, T c7, T c8, T c9, T c10, T c11,
391 T c12, T c13, T c14) {
392 T x2 = Mul(x, x);
393 T x4 = Mul(x2, x2);
394 T x8 = Mul(x4, x4);
395 return MulAdd(x8,
396 MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
397 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
398 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
399 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
400}
401template <class T>
402HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
403 T c6, T c7, T c8, T c9, T c10, T c11,
404 T c12, T c13, T c14, T c15) {
405 T x2 = Mul(x, x);
406 T x4 = Mul(x2, x2);
407 T x8 = Mul(x4, x4);
408 return MulAdd(x8,
409 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
410 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
411 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
412 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
413}
414template <class T>
415HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
416 T c6, T c7, T c8, T c9, T c10, T c11,
417 T c12, T c13, T c14, T c15, T c16) {
418 T x2 = Mul(x, x);
419 T x4 = Mul(x2, x2);
420 T x8 = Mul(x4, x4);
421 T x16 = Mul(x8, x8);
422 return MulAdd(
423 x16, c16,
424 MulAdd(x8,
425 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
426 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
427 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
428 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
429}
430template <class T>
431HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
432 T c6, T c7, T c8, T c9, T c10, T c11,
433 T c12, T c13, T c14, T c15, T c16, T c17) {
434 T x2 = Mul(x, x);
435 T x4 = Mul(x2, x2);
436 T x8 = Mul(x4, x4);
437 T x16 = Mul(x8, x8);
438 return MulAdd(
439 x16, MulAdd(c17, x, c16),
440 MulAdd(x8,
441 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
442 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
443 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
444 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
445}
446template <class T>
447HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
448 T c6, T c7, T c8, T c9, T c10, T c11,
449 T c12, T c13, T c14, T c15, T c16, T c17,
450 T c18) {
451 T x2 = Mul(x, x);
452 T x4 = Mul(x2, x2);
453 T x8 = Mul(x4, x4);
454 T x16 = Mul(x8, x8);
455 return MulAdd(
456 x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
457 MulAdd(x8,
458 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
459 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
460 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
461 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
462}
463
464template <class FloatOrDouble>
465struct AsinImpl {};
466template <class FloatOrDouble>
467struct AtanImpl {};
468template <class FloatOrDouble>
469struct CosSinImpl {};
470template <class FloatOrDouble>
471struct ExpImpl {};
472template <class FloatOrDouble>
473struct LogImpl {};
474
475template <>
476struct AsinImpl<float> {
477 // Polynomial approximation for asin(x) over the range [0, 0.5).
478 template <class D, class V>
479 HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
480 const auto k0 = Set(d, +0.1666677296f);
481 const auto k1 = Set(d, +0.07495029271f);
482 const auto k2 = Set(d, +0.04547423869f);
483 const auto k3 = Set(d, +0.02424046025f);
484 const auto k4 = Set(d, +0.04197454825f);
485
486 return Estrin(x2, k0, k1, k2, k3, k4);
487 }
488};
489
490#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
491
492template <>
493struct AsinImpl<double> {
494 // Polynomial approximation for asin(x) over the range [0, 0.5).
495 template <class D, class V>
496 HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
497 const auto k0 = Set(d, +0.1666666666666497543);
498 const auto k1 = Set(d, +0.07500000000378581611);
499 const auto k2 = Set(d, +0.04464285681377102438);
500 const auto k3 = Set(d, +0.03038195928038132237);
501 const auto k4 = Set(d, +0.02237176181932048341);
502 const auto k5 = Set(d, +0.01735956991223614604);
503 const auto k6 = Set(d, +0.01388715184501609218);
504 const auto k7 = Set(d, +0.01215360525577377331);
505 const auto k8 = Set(d, +0.006606077476277170610);
506 const auto k9 = Set(d, +0.01929045477267910674);
507 const auto k10 = Set(d, -0.01581918243329996643);
508 const auto k11 = Set(d, +0.03161587650653934628);
509
510 return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
511 }
512};
513
514#endif
515
516template <>
517struct AtanImpl<float> {
518 // Polynomial approximation for atan(x) over the range [0, 1.0).
519 template <class D, class V>
521 const auto k0 = Set(d, -0.333331018686294555664062f);
522 const auto k1 = Set(d, +0.199926957488059997558594f);
523 const auto k2 = Set(d, -0.142027363181114196777344f);
524 const auto k3 = Set(d, +0.106347933411598205566406f);
525 const auto k4 = Set(d, -0.0748900920152664184570312f);
526 const auto k5 = Set(d, +0.0425049886107444763183594f);
527 const auto k6 = Set(d, -0.0159569028764963150024414f);
528 const auto k7 = Set(d, +0.00282363896258175373077393f);
529
530 const auto y = Mul(x, x);
531 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
532 }
533};
534
535#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
536
537template <>
538struct AtanImpl<double> {
539 // Polynomial approximation for atan(x) over the range [0, 1.0).
540 template <class D, class V>
541 HWY_INLINE V AtanPoly(D d, V x) {
542 const auto k0 = Set(d, -0.333333333333311110369124);
543 const auto k1 = Set(d, +0.199999999996591265594148);
544 const auto k2 = Set(d, -0.14285714266771329383765);
545 const auto k3 = Set(d, +0.111111105648261418443745);
546 const auto k4 = Set(d, -0.090908995008245008229153);
547 const auto k5 = Set(d, +0.0769219538311769618355029);
548 const auto k6 = Set(d, -0.0666573579361080525984562);
549 const auto k7 = Set(d, +0.0587666392926673580854313);
550 const auto k8 = Set(d, -0.0523674852303482457616113);
551 const auto k9 = Set(d, +0.0466667150077840625632675);
552 const auto k10 = Set(d, -0.0407629191276836500001934);
553 const auto k11 = Set(d, +0.0337852580001353069993897);
554 const auto k12 = Set(d, -0.0254517624932312641616861);
555 const auto k13 = Set(d, +0.016599329773529201970117);
556 const auto k14 = Set(d, -0.00889896195887655491740809);
557 const auto k15 = Set(d, +0.00370026744188713119232403);
558 const auto k16 = Set(d, -0.00110611831486672482563471);
559 const auto k17 = Set(d, +0.000209850076645816976906797);
560 const auto k18 = Set(d, -1.88796008463073496563746e-5);
561
562 const auto y = Mul(x, x);
563 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
564 k12, k13, k14, k15, k16, k17, k18),
565 Mul(y, x), x);
566 }
567};
568
569#endif
570
571template <>
572struct CosSinImpl<float> {
573 // Rounds float toward zero and returns as int32_t.
574 template <class D, class V>
576 return ConvertTo(Rebind<int32_t, D>(), x);
577 }
578
579 template <class D, class V>
580 HWY_INLINE V Poly(D d, V x) {
581 const auto k0 = Set(d, -1.66666597127914428710938e-1f);
582 const auto k1 = Set(d, +8.33307858556509017944336e-3f);
583 const auto k2 = Set(d, -1.981069071916863322258e-4f);
584 const auto k3 = Set(d, +2.6083159809786593541503e-6f);
585
586 const auto y = Mul(x, x);
587 return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
588 }
589
590 template <class D, class V, class VI32>
591 HWY_INLINE V CosReduce(D d, V x, VI32 q) {
592 // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
593 const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
594 const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
595 const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
596 const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
597
598 // Extended precision modular arithmetic.
599 const V qf = ConvertTo(d, q);
600 x = MulAdd(qf, kHalfPiPart0f, x);
601 x = MulAdd(qf, kHalfPiPart1f, x);
602 x = MulAdd(qf, kHalfPiPart2f, x);
603 x = MulAdd(qf, kHalfPiPart3f, x);
604 return x;
605 }
606
607 template <class D, class V, class VI32>
608 HWY_INLINE V SinReduce(D d, V x, VI32 q) {
609 // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
610 const V kPiPart0f = Set(d, -3.140625f);
611 const V kPiPart1f = Set(d, -0.0009670257568359375f);
612 const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
613 const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
614
615 // Extended precision modular arithmetic.
616 const V qf = ConvertTo(d, q);
617 x = MulAdd(qf, kPiPart0f, x);
618 x = MulAdd(qf, kPiPart1f, x);
619 x = MulAdd(qf, kPiPart2f, x);
620 x = MulAdd(qf, kPiPart3f, x);
621 return x;
622 }
623
624 // (q & 2) == 0 ? -0.0 : +0.0
625 template <class D, class VI32>
627 const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
628 return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
629 }
630
631 // ((q & 1) ? -0.0 : +0.0)
632 template <class D, class VI32>
634 const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
635 return BitCast(d, ShiftLeft<31>(And(q, kOne)));
636 }
637};
638
639#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
640
641template <>
642struct CosSinImpl<double> {
643 // Rounds double toward zero and returns as int32_t.
644 template <class D, class V>
645 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
646 return DemoteTo(Rebind<int32_t, D>(), x);
647 }
648
649 template <class D, class V>
650 HWY_INLINE V Poly(D d, V x) {
651 const auto k0 = Set(d, -0.166666666666666657414808);
652 const auto k1 = Set(d, +0.00833333333333332974823815);
653 const auto k2 = Set(d, -0.000198412698412696162806809);
654 const auto k3 = Set(d, +2.75573192239198747630416e-6);
655 const auto k4 = Set(d, -2.50521083763502045810755e-8);
656 const auto k5 = Set(d, +1.60590430605664501629054e-10);
657 const auto k6 = Set(d, -7.64712219118158833288484e-13);
658 const auto k7 = Set(d, +2.81009972710863200091251e-15);
659 const auto k8 = Set(d, -7.97255955009037868891952e-18);
660
661 const auto y = Mul(x, x);
662 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
663 }
664
665 template <class D, class V, class VI32>
666 HWY_INLINE V CosReduce(D d, V x, VI32 q) {
667 // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
668 const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
669 const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
670 const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
671 const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
672
673 // Extended precision modular arithmetic.
674 const V qf = PromoteTo(d, q);
675 x = MulAdd(qf, kHalfPiPart0d, x);
676 x = MulAdd(qf, kHalfPiPart1d, x);
677 x = MulAdd(qf, kHalfPiPart2d, x);
678 x = MulAdd(qf, kHalfPiPart3d, x);
679 return x;
680 }
681
682 template <class D, class V, class VI32>
683 HWY_INLINE V SinReduce(D d, V x, VI32 q) {
684 // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
685 const V kPiPart0d = Set(d, -3.1415926218032836914);
686 const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
687 const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
688 const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
689
690 // Extended precision modular arithmetic.
691 const V qf = PromoteTo(d, q);
692 x = MulAdd(qf, kPiPart0d, x);
693 x = MulAdd(qf, kPiPart1d, x);
694 x = MulAdd(qf, kPiPart2d, x);
695 x = MulAdd(qf, kPiPart3d, x);
696 return x;
697 }
698
699 // (q & 2) == 0 ? -0.0 : +0.0
700 template <class D, class VI32>
701 HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
702 const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
703 return BitCast(
704 d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
705 }
706
707 // ((q & 1) ? -0.0 : +0.0)
708 template <class D, class VI32>
709 HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
710 const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
711 return BitCast(
712 d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
713 }
714};
715
716#endif
717
718template <>
719struct ExpImpl<float> {
720 // Rounds float toward zero and returns as int32_t.
721 template <class D, class V>
723 return ConvertTo(Rebind<int32_t, D>(), x);
724 }
725
726 template <class D, class V>
727 HWY_INLINE V ExpPoly(D d, V x) {
728 const auto k0 = Set(d, +0.5f);
729 const auto k1 = Set(d, +0.166666671633720397949219f);
730 const auto k2 = Set(d, +0.0416664853692054748535156f);
731 const auto k3 = Set(d, +0.00833336077630519866943359f);
732 const auto k4 = Set(d, +0.00139304355252534151077271f);
733 const auto k5 = Set(d, +0.000198527617612853646278381f);
734
735 return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
736 }
737
738 // Computes 2^x, where x is an integer.
739 template <class D, class VI32>
741 const Rebind<int32_t, D> di32;
742 const VI32 kOffset = Set(di32, 0x7F);
743 return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
744 }
745
746 // Sets the exponent of 'x' to 2^e.
747 template <class D, class V, class VI32>
748 HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
749 const VI32 y = ShiftRight<1>(e);
750 return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
751 }
752
753 template <class D, class V, class VI32>
754 HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
755 // kLn2Part0f + kLn2Part1f ~= -ln(2)
756 const V kLn2Part0f = Set(d, -0.693145751953125f);
757 const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
758
759 // Extended precision modular arithmetic.
760 const V qf = ConvertTo(d, q);
761 x = MulAdd(qf, kLn2Part0f, x);
762 x = MulAdd(qf, kLn2Part1f, x);
763 return x;
764 }
765};
766
767template <>
768struct LogImpl<float> {
769 template <class D, class V>
771 const Rebind<int32_t, D> di32;
772 const Rebind<uint32_t, D> du32;
773 const auto kBias = Set(di32, 0x7F);
774 return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
775 }
776
777 // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
778 template <class D, class V>
779 HWY_INLINE V LogPoly(D d, V x) {
780 const V k0 = Set(d, 0.66666662693f);
781 const V k1 = Set(d, 0.40000972152f);
782 const V k2 = Set(d, 0.28498786688f);
783 const V k3 = Set(d, 0.24279078841f);
784
785 const V x2 = Mul(x, x);
786 const V x4 = Mul(x2, x2);
787 return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
788 }
789};
790
791#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
792template <>
793struct ExpImpl<double> {
794 // Rounds double toward zero and returns as int32_t.
795 template <class D, class V>
796 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
797 return DemoteTo(Rebind<int32_t, D>(), x);
798 }
799
800 template <class D, class V>
801 HWY_INLINE V ExpPoly(D d, V x) {
802 const auto k0 = Set(d, +0.5);
803 const auto k1 = Set(d, +0.166666666666666851703837);
804 const auto k2 = Set(d, +0.0416666666666665047591422);
805 const auto k3 = Set(d, +0.00833333333331652721664984);
806 const auto k4 = Set(d, +0.00138888888889774492207962);
807 const auto k5 = Set(d, +0.000198412698960509205564975);
808 const auto k6 = Set(d, +2.4801587159235472998791e-5);
809 const auto k7 = Set(d, +2.75572362911928827629423e-6);
810 const auto k8 = Set(d, +2.75573911234900471893338e-7);
811 const auto k9 = Set(d, +2.51112930892876518610661e-8);
812 const auto k10 = Set(d, +2.08860621107283687536341e-9);
813
814 return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
815 Mul(x, x), x);
816 }
817
818 // Computes 2^x, where x is an integer.
819 template <class D, class VI32>
820 HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
821 const Rebind<int32_t, D> di32;
822 const Rebind<int64_t, D> di64;
823 const VI32 kOffset = Set(di32, 0x3FF);
824 return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
825 }
826
827 // Sets the exponent of 'x' to 2^e.
828 template <class D, class V, class VI32>
829 HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
830 const VI32 y = ShiftRight<1>(e);
831 return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
832 }
833
834 template <class D, class V, class VI32>
835 HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
836 // kLn2Part0d + kLn2Part1d ~= -ln(2)
837 const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
838 const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
839
840 // Extended precision modular arithmetic.
841 const V qf = PromoteTo(d, q);
842 x = MulAdd(qf, kLn2Part0d, x);
843 x = MulAdd(qf, kLn2Part1d, x);
844 return x;
845 }
846};
847
848template <>
849struct LogImpl<double> {
850 template <class D, class V>
851 HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
852 const Rebind<int64_t, D> di64;
853 const Rebind<uint64_t, D> du64;
854 return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
855 Set(di64, 0x3FF));
856 }
857
858 // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
859 template <class D, class V>
860 HWY_INLINE V LogPoly(D d, V x) {
861 const V k0 = Set(d, 0.6666666666666735130);
862 const V k1 = Set(d, 0.3999999999940941908);
863 const V k2 = Set(d, 0.2857142874366239149);
864 const V k3 = Set(d, 0.2222219843214978396);
865 const V k4 = Set(d, 0.1818357216161805012);
866 const V k5 = Set(d, 0.1531383769920937332);
867 const V k6 = Set(d, 0.1479819860511658591);
868
869 const V x2 = Mul(x, x);
870 const V x4 = Mul(x2, x2);
871 return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
872 (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
873 }
874};
875
876#endif
877
878template <class D, class V, bool kAllowSubnormals = true>
879HWY_INLINE V Log(const D d, V x) {
880 // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
881 using T = TFromD<D>;
882 impl::LogImpl<T> impl;
883
884 constexpr bool kIsF32 = (sizeof(T) == 4);
885
886 // Float Constants
887 const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
888 : static_cast<T>(0.693147180369123816490));
889 const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
890 : static_cast<T>(1.90821492927058770002e-10));
891 const V kOne = Set(d, static_cast<T>(+1.0));
892 const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
893 : static_cast<T>(2.2250738585072014e-308));
894 const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
895 : static_cast<T>(1.8014398509481984e+16));
896
897 // Integer Constants
898 using TI = MakeSigned<T>;
899 const Rebind<TI, D> di;
900 using VI = decltype(Zero(di));
901 const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
902 : static_cast<TI>(0xFFFFFFFFLL));
903 const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
904 : static_cast<TI>(0x3FE6A09E00000000LL));
905 const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
906 : static_cast<TI>(0x3FF0000000000000LL));
907 const VI kExpScale =
908 Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
909 const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
910 : static_cast<TI>(0xFFFFF00000000LL));
911
912 // Scale up 'x' so that it is no longer denormalized.
913 VI exp_bits;
914 V exp;
915 if (kAllowSubnormals == true) {
916 const auto is_denormal = Lt(x, kMinNormal);
917 x = IfThenElse(is_denormal, Mul(x, kScale), x);
918
919 // Compute the new exponent.
920 exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
921 const VI exp_scale =
922 BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
923 exp = ConvertTo(
924 d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
925 } else {
926 // Compute the new exponent.
927 exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
928 exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
929 }
930
931 // Renormalize.
932 const V y = Or(And(x, BitCast(d, kLowerBits)),
933 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
934
935 // Approximate and reconstruct.
936 const V ym1 = Sub(y, kOne);
937 const V z = Div(ym1, Add(y, kOne));
938
939 return MulSub(
940 exp, kLn2Hi,
941 Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
942}
943
944} // namespace impl
945
946template <class D, class V>
947HWY_INLINE V Acos(const D d, V x) {
948 using T = TFromD<D>;
949
950 const V kZero = Zero(d);
951 const V kHalf = Set(d, static_cast<T>(+0.5));
952 const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
953 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
954
955 const V sign_x = And(SignBit(d), x);
956 const V abs_x = Xor(x, sign_x);
957 const auto mask = Lt(abs_x, kHalf);
958 const V yy =
959 IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
960 const V y = IfThenElse(mask, abs_x, Sqrt(yy));
961
963 const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
964
965 const V t_plus_y = Add(t, y);
966 const V z =
967 IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
968 Add(t_plus_y, t_plus_y));
969 return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
970}
971
972template <class D, class V>
973HWY_INLINE V Acosh(const D d, V x) {
974 using T = TFromD<D>;
975
976 const V kLarge = Set(d, static_cast<T>(268435456.0));
977 const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
978 const V kOne = Set(d, static_cast<T>(+1.0));
979 const V kTwo = Set(d, static_cast<T>(+2.0));
980
981 const auto is_x_large = Gt(x, kLarge);
982 const auto is_x_gt_2 = Gt(x, kTwo);
983
984 const V x_minus_1 = Sub(x, kOne);
985 const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
986 const V y1 =
987 Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
988 const V y2 =
989 IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
990 const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
991
992 const auto is_pole = Eq(y2, kOne);
993 const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
994 return Add(IfThenElse(is_x_gt_2, z,
995 IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
996 IfThenElseZero(is_x_large, kLog2));
997}
998
999template <class D, class V>
1000HWY_INLINE V Asin(const D d, V x) {
1001 using T = TFromD<D>;
1002
1003 const V kHalf = Set(d, static_cast<T>(+0.5));
1004 const V kTwo = Set(d, static_cast<T>(+2.0));
1005 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1006
1007 const V sign_x = And(SignBit(d), x);
1008 const V abs_x = Xor(x, sign_x);
1009 const auto mask = Lt(abs_x, kHalf);
1010 const V yy =
1011 IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
1012 const V y = IfThenElse(mask, abs_x, Sqrt(yy));
1013
1014 impl::AsinImpl<T> impl;
1015 const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
1016 const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
1017 return Or(IfThenElse(mask, z0, z1), sign_x);
1018}
1019
1020template <class D, class V>
1021HWY_INLINE V Asinh(const D d, V x) {
1022 using T = TFromD<D>;
1023
1024 const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
1025 const V kLarge = Set(d, static_cast<T>(268435456.0));
1026 const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
1027 const V kOne = Set(d, static_cast<T>(+1.0));
1028 const V kTwo = Set(d, static_cast<T>(+2.0));
1029
1030 const V sign_x = And(SignBit(d), x); // Extract the sign bit
1031 const V abs_x = Xor(x, sign_x);
1032
1033 const auto is_x_large = Gt(abs_x, kLarge);
1034 const auto is_x_lt_2 = Lt(abs_x, kTwo);
1035
1036 const V x2 = Mul(x, x);
1037 const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
1038
1039 const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
1040 const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
1041 const V y2 =
1042 IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
1043 const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
1044
1045 const auto is_pole = Eq(y2, kOne);
1046 const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
1047 const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
1048 const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
1049 return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
1050 sign_x);
1051}
1052
1053template <class D, class V>
1054HWY_INLINE V Atan(const D d, V x) {
1055 using T = TFromD<D>;
1056
1057 const V kOne = Set(d, static_cast<T>(+1.0));
1058 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1059
1060 const V sign = And(SignBit(d), x);
1061 const V abs_x = Xor(x, sign);
1062 const auto mask = Gt(abs_x, kOne);
1063
1064 impl::AtanImpl<T> impl;
1065 const auto divisor = IfThenElse(mask, abs_x, kOne);
1066 const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
1067 return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
1068}
1069
1070template <class D, class V>
1071HWY_INLINE V Atanh(const D d, V x) {
1072 using T = TFromD<D>;
1073
1074 const V kHalf = Set(d, static_cast<T>(+0.5));
1075 const V kOne = Set(d, static_cast<T>(+1.0));
1076
1077 const V sign = And(SignBit(d), x); // Extract the sign bit
1078 const V abs_x = Xor(x, sign);
1079 return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
1080 Xor(kHalf, sign));
1081}
1082
1083template <class D, class V>
1084HWY_INLINE V Cos(const D d, V x) {
1085 using T = TFromD<D>;
1087
1088 // Float Constants
1089 const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
1090
1091 // Integer Constants
1092 const Rebind<int32_t, D> di32;
1093 using VI32 = decltype(Zero(di32));
1094 const VI32 kOne = Set(di32, 1);
1095
1096 const V y = Abs(x); // cos(x) == cos(|x|)
1097
1098 // Compute the quadrant, q = int(|x| / pi) * 2 + 1
1099 const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
1100
1101 // Reduce range, apply sign, and approximate.
1102 return impl.Poly(
1103 d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
1104}
1105
1106template <class D, class V>
1107HWY_INLINE V Exp(const D d, V x) {
1108 using T = TFromD<D>;
1109
1110 const V kHalf = Set(d, static_cast<T>(+0.5));
1111 const V kLowerBound =
1112 Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
1113 const V kNegZero = Set(d, static_cast<T>(-0.0));
1114 const V kOne = Set(d, static_cast<T>(+1.0));
1115 const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
1116
1117 impl::ExpImpl<T> impl;
1118
1119 // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1120 const auto q =
1121 impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1122
1123 // Reduce, approximate, and then reconstruct.
1124 const V y = impl.LoadExpShortRange(
1125 d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
1126 return IfThenElseZero(Ge(x, kLowerBound), y);
1127}
1128
1129template <class D, class V>
1130HWY_INLINE V Expm1(const D d, V x) {
1131 using T = TFromD<D>;
1132
1133 const V kHalf = Set(d, static_cast<T>(+0.5));
1134 const V kLowerBound =
1135 Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
1136 const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
1137 const V kNegOne = Set(d, static_cast<T>(-1.0));
1138 const V kNegZero = Set(d, static_cast<T>(-0.0));
1139 const V kOne = Set(d, static_cast<T>(+1.0));
1140 const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
1141
1142 impl::ExpImpl<T> impl;
1143
1144 // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1145 const auto q =
1146 impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1147
1148 // Reduce, approximate, and then reconstruct.
1149 const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
1150 const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
1151 Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
1152 return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
1153}
1154
1155template <class D, class V>
1156HWY_INLINE V Log(const D d, V x) {
1157 return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
1158}
1159
1160template <class D, class V>
1161HWY_INLINE V Log10(const D d, V x) {
1162 using T = TFromD<D>;
1163 return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
1164}
1165
1166template <class D, class V>
1167HWY_INLINE V Log1p(const D d, V x) {
1168 using T = TFromD<D>;
1169 const V kOne = Set(d, static_cast<T>(+1.0));
1170
1171 const V y = Add(x, kOne);
1172 const auto is_pole = Eq(y, kOne);
1173 const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
1174 const auto non_pole =
1175 Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
1176 return IfThenElse(is_pole, x, non_pole);
1177}
1178
1179template <class D, class V>
1180HWY_INLINE V Log2(const D d, V x) {
1181 using T = TFromD<D>;
1182 return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
1183}
1184
1185template <class D, class V>
1186HWY_INLINE V Sin(const D d, V x) {
1187 using T = TFromD<D>;
1189
1190 // Float Constants
1191 const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
1192 const V kHalf = Set(d, static_cast<T>(0.5));
1193
1194 // Integer Constants
1195 const Rebind<int32_t, D> di32;
1196 using VI32 = decltype(Zero(di32));
1197
1198 const V abs_x = Abs(x);
1199 const V sign_x = Xor(abs_x, x);
1200
1201 // Compute the quadrant, q = int((|x| / pi) + 0.5)
1202 const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
1203
1204 // Reduce range, apply sign, and approximate.
1205 return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
1206 Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
1207}
1208
1209template <class D, class V>
1210HWY_INLINE V Sinh(const D d, V x) {
1211 using T = TFromD<D>;
1212 const V kHalf = Set(d, static_cast<T>(+0.5));
1213 const V kOne = Set(d, static_cast<T>(+1.0));
1214 const V kTwo = Set(d, static_cast<T>(+2.0));
1215
1216 const V sign = And(SignBit(d), x); // Extract the sign bit
1217 const V abs_x = Xor(x, sign);
1218 const V y = Expm1(d, abs_x);
1219 const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
1220 return Xor(z, sign); // Reapply the sign bit
1221}
1222
1223template <class D, class V>
1224HWY_INLINE V Tanh(const D d, V x) {
1225 using T = TFromD<D>;
1226 const V kLimit = Set(d, static_cast<T>(18.714973875));
1227 const V kOne = Set(d, static_cast<T>(+1.0));
1228 const V kTwo = Set(d, static_cast<T>(+2.0));
1229
1230 const V sign = And(SignBit(d), x); // Extract the sign bit
1231 const V abs_x = Xor(x, sign);
1232 const V y = Expm1(d, Mul(abs_x, kTwo));
1233 const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
1234 return Xor(z, sign); // Reapply the sign bit
1235}
1236
1237// NOLINTNEXTLINE(google-readability-namespace-comments)
1238} // namespace HWY_NAMESPACE
1239} // namespace hwy
1241
1242#endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#define HWY_NOINLINE
Definition: base.h:63
#define HWY_INLINE
Definition: base.h:62
#define HWY_MAYBE_UNUSED
Definition: base.h:73
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1)
Definition: math-inl.h:281
HWY_INLINE V Log(const D d, V x)
Definition: math-inl.h:879
d
Definition: rvv-inl.h:1742
HWY_NOINLINE V CallSin(const D d, VecArg< V > x)
Definition: math-inl.h:237
V VecArg
Definition: ops/shared-inl.h:306
HWY_NOINLINE V CallAsin(const D d, VecArg< V > x)
Definition: math-inl.h:72
HWY_INLINE V Atan(const D d, V x)
Highway SIMD version of std::atan(x).
Definition: math-inl.h:1054
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_INLINE V Cos(const D d, V x)
Highway SIMD version of std::cos(x).
Definition: math-inl.h:1084
HWY_NOINLINE V CallAcos(const D d, VecArg< V > x)
Definition: math-inl.h:42
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_INLINE V Sin(const D d, V x)
Highway SIMD version of std::sin(x).
Definition: math-inl.h:1186
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_INLINE V Exp(const D d, V x)
Highway SIMD version of std::exp(x).
Definition: math-inl.h:1107
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_INLINE V Log10(const D d, V x)
Highway SIMD version of std::log10(x).
Definition: math-inl.h:1161
HWY_INLINE V Log1p(const D d, V x)
Highway SIMD version of std::log1p(x).
Definition: math-inl.h:1167
HWY_NOINLINE V CallExpm1(const D d, VecArg< V > x)
Definition: math-inl.h:162
HWY_NOINLINE V CallLog1p(const D d, VecArg< V > x)
Definition: math-inl.h:207
HWY_INLINE V Atanh(const D d, V x)
Highway SIMD version of std::atanh(x).
Definition: math-inl.h:1071
HWY_NOINLINE V CallLog10(const D d, VecArg< V > x)
Definition: math-inl.h:192
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_NOINLINE V CallLog2(const D d, VecArg< V > x)
Definition: math-inl.h:222
HWY_NOINLINE V CallExp(const D d, VecArg< V > x)
Definition: math-inl.h:147
HWY_NOINLINE V CallAtanh(const D d, VecArg< V > x)
Definition: math-inl.h:117
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_INLINE V Log2(const D d, V x)
Highway SIMD version of std::log2(x).
Definition: math-inl.h:1180
HWY_INLINE V Acos(const D d, V x)
Highway SIMD version of std::acos(x).
Definition: math-inl.h:947
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_NOINLINE V CallAtan(const D d, VecArg< V > x)
Definition: math-inl.h:102
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_INLINE V Acosh(const D d, V x)
Highway SIMD version of std::acosh(x).
Definition: math-inl.h:973
HWY_NOINLINE V CallLog(const D d, VecArg< V > x)
Definition: math-inl.h:177
HWY_INLINE V Tanh(const D d, V x)
Highway SIMD version of std::tanh(x).
Definition: math-inl.h:1224
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_INLINE V Log(const D d, V x)
Highway SIMD version of std::log(x).
Definition: math-inl.h:1156
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_INLINE V Asin(const D d, V x)
Highway SIMD version of std::asin(x).
Definition: math-inl.h:1000
HWY_INLINE V Asinh(const D d, V x)
Highway SIMD version of std::asinh(x).
Definition: math-inl.h:1021
HWY_NOINLINE V CallAsinh(const D d, VecArg< V > x)
Definition: math-inl.h:87
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_INLINE V Expm1(const D d, V x)
Highway SIMD version of std::expm1(x).
Definition: math-inl.h:1130
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_NOINLINE V CallCos(const D d, VecArg< V > x)
Definition: math-inl.h:132
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_NOINLINE V CallSinh(const D d, VecArg< V > x)
Definition: math-inl.h:252
HWY_INLINE V Sinh(const D d, V x)
Highway SIMD version of std::sinh(x).
Definition: math-inl.h:1210
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_NOINLINE V CallTanh(const D d, VecArg< V > x)
Definition: math-inl.h:267
typename D::T TFromD
Definition: ops/shared-inl.h:191
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
HWY_NOINLINE V CallAcosh(const D d, VecArg< V > x)
Definition: math-inl.h:57
Definition: aligned_allocator.h:27
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
HWY_INLINE V AsinPoly(D d, V x2, V)
Definition: math-inl.h:479
Definition: math-inl.h:465
HWY_INLINE V AtanPoly(D d, V x)
Definition: math-inl.h:520
Definition: math-inl.h:467
HWY_INLINE Vec< Rebind< float, D > > SinSignFromQuadrant(D d, VI32 q)
Definition: math-inl.h:633
HWY_INLINE Vec< Rebind< float, D > > CosSignFromQuadrant(D d, VI32 q)
Definition: math-inl.h:626
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition: math-inl.h:575
HWY_INLINE V SinReduce(D d, V x, VI32 q)
Definition: math-inl.h:608
HWY_INLINE V Poly(D d, V x)
Definition: math-inl.h:580
HWY_INLINE V CosReduce(D d, V x, VI32 q)
Definition: math-inl.h:591
Definition: math-inl.h:469
HWY_INLINE Vec< D > Pow2I(D d, VI32 x)
Definition: math-inl.h:740
HWY_INLINE V ExpReduce(D d, V x, VI32 q)
Definition: math-inl.h:754
HWY_INLINE V ExpPoly(D d, V x)
Definition: math-inl.h:727
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e)
Definition: math-inl.h:748
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition: math-inl.h:722
Definition: math-inl.h:471
HWY_INLINE Vec< Rebind< int32_t, D > > Log2p1NoSubnormal(D, V x)
Definition: math-inl.h:770
HWY_INLINE V LogPoly(D d, V x)
Definition: math-inl.h:779
Definition: math-inl.h:473