emu128-inl.h
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>

#include <cmath>  // std::abs, std::isnan, std::sqrt

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
  T raw[16 / sizeof(T)] = {};
};
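
// Illustrative sketch (not part of the library API): even a partial vector
// such as Vec128<float, 2> stores 16 / sizeof(float) = 4 lanes; only the
// first N = 2 are meaningful, and the zero-initialized tail is the padding
// described above.
//   Vec128<float, 2> v;  // v.raw[0..3] all start at 0.0f
//   v.raw[0] = 1.0f;     // lanes 2 and 3 remain padding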

// 0 or FF..FF, same size as Vec128.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};

namespace detail {

// Deduce Simd<T, N, 0> from Vec128<T, N>
struct Deduce128 {
  template <typename T, size_t N>
  Simd<T, N, 0> operator()(Vec128<T, N>) const {
    return Simd<T, N, 0>();
  }
};

}  // namespace detail

template <class V>
using DFromV = decltype(detail::Deduce128()(V()));

template <class V>
using TFromV = TFromD<DFromV<V>>;

// ------------------------------ BitCast

template <typename T, size_t N, typename FromT, size_t FromN>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
  Vec128<T, N> to;
  static_assert(sizeof(T) * N == sizeof(FromT) * FromN,
                "Casting does not change size");
  CopyBytes<sizeof(T) * N>(v.raw, to.raw);
  return to;
}

// ------------------------------ Set

template <typename T, size_t N>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  Vec128<T, N> v;
  ZeroBytes<sizeof(T) * N>(v.raw);
  return v;
}

template <class D>
using VFromD = decltype(Zero(D()));

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Set(Simd<T, N, 0> /* tag */, const T2 t) {
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(t);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
  return Zero(d);
}

namespace detail {

template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE constexpr T IncrementWithWraparound(T t) {
  return t + T{1};
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T IncrementWithWraparound(T t) {
  using TU = MakeUnsigned<T>;
  return static_cast<T>(static_cast<TU>(static_cast<TU>(t) + TU{1}) &
                        hwy::LimitsMax<TU>());
}

}  // namespace detail

template <typename T, size_t N, typename T2>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> /* tag */, T2 first) {
  Vec128<T, N> v;
  T counter = static_cast<T>(first);
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = counter;
    counter = detail::IncrementWithWraparound(counter);
  }
  return v;
}
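
// Illustrative usage sketch (assumes a full vector of int32_t; `d`, `ramp`
// and `ones` are example names, not library identifiers):
//   const Full128<int32_t> d;       // 4 lanes
//   const auto ramp = Iota(d, 10);  // {10, 11, 12, 13}
//   const auto ones = Set(d, 1);    // {1, 1, 1, 1}
//   const auto next = ramp + ones;  // {11, 12, 13, 14}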

// ================================================== LOGICAL

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] &= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

// ------------------------------ AndNot
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(Not(a), b);
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] |= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] ^= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ Or3

template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(const Vec128<T, N> o, const Vec128<T, N> a1,
                           const Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Simd<T, N, 0>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
}

// ------------------------------ BroadcastSignBit
template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
  }
  return v;
}

// ------------------------------ Mask

template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
                                   Mask128<TFrom, N> mask) {
  Mask128<TTo, N> to;
  static_assert(sizeof(TTo) * N == sizeof(TFrom) * N, "Must have same size");
  CopyBytes<sizeof(TTo) * N>(mask.bits, to.bits);
  return to;
}

// v must be 0 or FF..FF.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  Mask128<T, N> mask;
  static_assert(sizeof(v) == sizeof(mask), "Must have same size");
  CopyBytes<sizeof(T) * N>(v.raw, mask.bits);
  return mask;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(mask.bits, v.raw);
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> mask) {
  return VecFromMask(mask);
}

template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(Simd<T, N, 0> /*tag*/, size_t n) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(i < n);
  }
  return m;
}

// Returns mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(const Mask128<T, N> mask,
                                const Vec128<T, N> yes,
                                const Vec128<T, N> no) {
  return IfVecThenElse(VecFromMask(mask), yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
                                    const Vec128<T, N> yes) {
  return IfVecThenElse(VecFromMask(mask), yes, Zero(Simd<T, N, 0>()));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
                                    const Vec128<T, N> no) {
  return IfVecThenElse(VecFromMask(mask), Zero(Simd<T, N, 0>()), no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(const Vec128<T, N> v) {
  return IfNegativeThenElse(v, Zero(Simd<T, N, 0>()), v);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << kBits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
    }
  }
#endif
  return v;
}
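
// Worked example of the emulation above (illustrative): for T = int8_t and
// kBits = 2, the lane value -96 (0xA0) yields shifted = 0xA0 >> 2 = 0x28,
// sign = 0xFF, sign_shift = 8 - 1 - 2 = 5, upper = 0xFF << 5 = 0xE0, and the
// result 0x28 | 0xE0 = 0xE8 = -24, i.e. -96 >> 2 rounded toward -infinity.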

// ------------------------------ RotateRight (ShiftRight)

namespace detail {

// For partial specialization: kBits == 0 results in an invalid shift count
template <int kBits>
struct RotateRight {
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
  }
};

template <>
struct RotateRight<0> {
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
    return v;
  }
};

}  // namespace detail

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return detail::RotateRight<kBits>()(v);
}
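
// Illustrative example: with du16 denoting a uint16_t tag,
// RotateRight<8>(Set(du16, 0xABCD)) yields 0xCDAB in every lane because
// Or(ShiftRight<8>, ShiftLeft<8>) recombines the two byte halves. The
// kBits == 0 specialization exists so the general template never
// instantiates the invalid ShiftLeft<16>.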

// ------------------------------ ShiftLeftSame

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift =
          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
    }
  }
#endif
  return v;
}

// ------------------------------ Shl

template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
                         << bits.raw[i];
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
  }
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    for (size_t i = 0; i < N; ++i) {
      const TU shifted =
          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
      const size_t sign_shift = static_cast<size_t>(
          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
      const TU upper = static_cast<TU>(sign << sign_shift);
      v.raw[i] = static_cast<T>(shifted | upper);
    }
  } else {  // T is unsigned
    for (size_t i = 0; i < N; ++i) {
      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
    }
  }
#endif
  return v;
}

// ================================================== ARITHMETIC

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] += b.raw[i];
  }
  return a;
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] -= b.raw[i];
  }
  return a;
}

// ------------------------------ SumsOf8

template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}
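
// Illustrative example: for a full vector of uint8_t (N = 16), lanes 0..7
// accumulate into sums.raw[0] and lanes 8..15 into sums.raw[1], so
// SumsOf8(Iota(Full128<uint8_t>(), 0)) yields {0+1+...+7, 8+9+...+15}
// = {28, 92}.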

// ------------------------------ SaturatedAdd
template <typename T, size_t N>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] + b.raw[i]),
                hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ SaturatedSub
template <typename T, size_t N>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] - b.raw[i]),
                hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ AverageRound
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
  }
  return a;
}

// ------------------------------ Abs

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
  for (size_t i = 0; i < N; ++i) {
    const T s = a.raw[i];
    const T min = hwy::LimitsMin<T>();
    a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
  }
  return a;
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = std::abs(v.raw[i]);
  }
  return v;
}

// ------------------------------ Min/Max

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
  }
  return a;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (std::isnan(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (std::isnan(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
  }
  return a;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (std::isnan(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (std::isnan(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return Xor(v, SignBit(Simd<T, N, 0>()));
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return Zero(Simd<T, N, 0>()) - v;
}

// ------------------------------ Mul/Div

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] *= b.raw[i];
  }
  return a;
}

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<int64_t>(a.raw[i]) * b.raw[i]);
  }
  return a;
}

template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * b.raw[i]);
  }
  return a;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, const Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] /= b.raw[i];
  }
  return a;
}

// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
  }
  return a;
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
        16);
  }
  return a;
}
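
// Worked example (illustrative): for uint16_t lanes a = 300 and b = 400, the
// full product is 120000 = 0x1D4C0, so MulHigh returns 0x0001 while the
// truncating operator* above would return the low half, 0xD4C0.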

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((2 * a.raw[i] * b.raw[i] + 32768) >> 16);
  }
  return a;
}
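
// Worked example (illustrative): interpreting int16_t lanes as Q15 fractions,
// a = 0x4000 (0.5) and b = 0x2000 (0.25) give
// (2 * 0x4000 * 0x2000 + 32768) >> 16 = 0x1000, i.e. 0.125 with rounding to
// nearest.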

// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  Vec128<int64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const int64_t a64 = a.raw[i];
    mul.raw[i / 2] = a64 * b.raw[i];
  }
  return mul;
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  Vec128<uint64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const uint64_t a64 = a.raw[i];
    mul.raw[i / 2] = a64 * b.raw[i];
  }
  return mul;
}

template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(const Vec128<int32_t, N> a,
                                            const Vec128<int32_t, N> b) {
  Vec128<int64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const int64_t a64 = a.raw[i + 1];
    mul.raw[i / 2] = a64 * b.raw[i + 1];
  }
  return mul;
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(const Vec128<uint32_t, N> a,
                                             const Vec128<uint32_t, N> b) {
  Vec128<uint64_t, (N + 1) / 2> mul;
  for (size_t i = 0; i < N; i += 2) {
    const uint64_t a64 = a.raw[i + 1];
    mul.raw[i / 2] = a64 * b.raw[i + 1];
  }
  return mul;
}

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    // Zero inputs are allowed, but callers are responsible for replacing the
    // return value with something else (typically using IfThenElse). This
    // check avoids a ubsan error. The result is arbitrary.
    v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
  }
  return v;
}

template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

template <typename T, size_t N>
HWY_API Vec128<T, N> MulAdd(const Vec128<T, N> mul, const Vec128<T, N> x,
                            const Vec128<T, N> add) {
  return mul * x + add;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulAdd(const Vec128<T, N> mul, const Vec128<T, N> x,
                               const Vec128<T, N> add) {
  return add - mul * x;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MulSub(const Vec128<T, N> mul, const Vec128<T, N> x,
                            const Vec128<T, N> sub) {
  return mul * x - sub;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> NegMulSub(const Vec128<T, N> mul, const Vec128<T, N> x,
                               const Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    const float half = v.raw[i] * 0.5f;
    uint32_t bits;
    CopyBytes<4>(&v.raw[i], &bits);
    // Initial guess based on log2(f)
    bits = 0x5F3759DF - (bits >> 1);
    CopyBytes<4>(&bits, &v.raw[i]);
    // One Newton-Raphson iteration
    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
  }
  return v;
}
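
// This is the classic "fast inverse square root" bit trick (magic constant
// 0x5F3759DF plus one Newton-Raphson step); the relative error is a fraction
// of a percent, e.g. an input of 4.0f produces roughly 0.4991f instead of
// the exact 0.5f.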

template <typename T, size_t N>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = std::sqrt(v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point rounding

template <typename T, size_t N>
HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> a = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
    const TI rounded = static_cast<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
      continue;
    }
    const T rounded_f = static_cast<T>(rounded);
    // Round to even
    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
      v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
      continue;
    }
    v.raw[i] = rounded_f;
  }
  return v;
}

// Round-to-nearest even.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  using T = float;
  using TI = int32_t;

  const Vec128<float, N> abs = Abs(v);
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = std::signbit(v.raw[i]);

    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      // Check if too large to cast or NaN
      if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
        continue;
      }
      ret.raw[i] = static_cast<TI>(v.raw[i]);
      continue;
    }
    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
    const TI rounded = static_cast<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = static_cast<T>(rounded);
    // Round to even
    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> abs = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(abs.raw[i] <= MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const TI truncated = static_cast<TI>(v.raw[i]);
    if (truncated == 0) {
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
      continue;
    }
    v.raw[i] = static_cast<T>(truncated);
  }
  return v;
}

// Toward +infinity, aka ceiling
template <typename Float, size_t N>
HWY_API Vec128<Float, N> Ceil(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool positive = v.raw[i] > Float(0.0);

    Bits bits;
    CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => 0 or 1.
    if (exponent < 0) {
      v.raw[i] = positive ? Float{1} : Float{-0.0};
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round up
    if (positive) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
  }
  return v;
}

// Toward -infinity, aka floor
template <typename Float, size_t N>
HWY_API Vec128<Float, N> Floor(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool negative = v.raw[i] < Float(0.0);

    Bits bits;
    CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => -1 or 0.
    if (exponent < 0) {
      v.raw[i] = negative ? Float(-1.0) : Float(0.0);
      continue;
    }

    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round down
    if (negative) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point classification

template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  Mask128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
    MakeUnsigned<T> bits;
    memcpy(&bits, &v.raw[i], sizeof(T));
    bits += bits;
    bits >>= 1;  // clear sign bit
    // NaN if all exponent bits are set and the mantissa is not zero.
    ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
  }
  return ret;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  using VI = VFromD<decltype(di)>;
  using VU = VFromD<decltype(du)>;
  const VU vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VI exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
  }
  return m;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
  }
  return m;
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(const Vec128<T, N> a, const Vec128<T, N> b) {
  Mask128<T, N> m;
  for (size_t i = 0; i < N; ++i) {
    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
  }
  return m;
}

// ------------------------------ Lt128

// Only makes sense for full vectors of u64.
HWY_API Mask128<uint64_t> Lt128(Simd<uint64_t, 2, 0> /* tag */,
                                const Vec128<uint64_t> a,
                                const Vec128<uint64_t> b) {
  const bool lt =
      (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}
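
// Illustrative example: viewing each Vec128<uint64_t> as one 128-bit number
// with raw[1] as the upper half, a = {raw[0] = ~0ull, raw[1] = 0} (2^64 - 1)
// is less than b = {raw[0] = 0, raw[1] = 1} (2^64), so both lanes of the
// returned mask are FF..FF.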

HWY_API Mask128<uint64_t> Lt128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     const Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
  const bool lt = a.raw[1] < b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
  return ret;
}

// ------------------------------ Min128, Max128 (Lt128)

template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}

// ================================================== MEMORY

// ------------------------------ Load

template <typename T, size_t N>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
                          const T* HWY_RESTRICT aligned) {
  Vec128<T, N> v;
  CopyBytes<sizeof(T) * N>(aligned, v.raw);
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
                                const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// ------------------------------ Store

template <typename T, size_t N>
HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                   T* HWY_RESTRICT aligned) {
  CopyBytes<sizeof(T) * N>(v.raw, aligned);
}

template <typename T, size_t N>
HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T, size_t N>
HWY_API void BlendedStore(const Vec128<T, N> v, Mask128<T, N> m,
                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
  for (size_t i = 0; i < N; ++i) {
    if (m.bits[i]) p[i] = v.raw[i];
  }
}

// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
// We implement those here because scalar code is likely faster than emulation
// via shuffles.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <typename T, size_t N>
HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
}

template <typename T, size_t N>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1,
                              Vec128<T, N>& v2) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  alignas(16) T buf2[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
}

template <typename T, size_t N>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              Vec128<T, N>& v0, Vec128<T, N>& v1,
                              Vec128<T, N>& v2, Vec128<T, N>& v3) {
  alignas(16) T buf0[N];
  alignas(16) T buf1[N];
  alignas(16) T buf2[N];
  alignas(16) T buf3[N];
  for (size_t i = 0; i < N; ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
    buf3[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
  v3 = Load(d, buf3);
}

// ------------------------------ StoreInterleaved2/3/4

template <typename T, size_t N>
HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
  }
}

template <typename T, size_t N>
HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               const Vec128<T, N> v2, Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
  }
}

template <typename T, size_t N>
HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
                               const Vec128<T, N> v2, const Vec128<T, N> v3,
                               Simd<T, N, 0> /* tag */,
                               T* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < N; ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
    *unaligned++ = v3.raw[i];
  }
}

// ------------------------------ Stream

template <typename T, size_t N>
HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
                    T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ Scatter

template <typename T, size_t N, typename Offset>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  for (size_t i = 0; i < N; ++i) {
    uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
    CopyBytes<sizeof(T)>(&v.raw[i], base8);
  }
}

template <typename T, size_t N, typename Index>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
                          T* HWY_RESTRICT base, const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  for (size_t i = 0; i < N; ++i) {
    base[index.raw[i]] = v.raw[i];
  }
}

// ------------------------------ Gather

template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
                                  const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    const uint8_t* base8 =
        reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
    CopyBytes<sizeof(T)>(base8, &v.raw[i]);
  }
  return v;
}

template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> /* tag */,
                                 const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  Vec128<T, N> v;
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = base[index.raw[i]];
  }
  return v;
}

// ================================================== CONVERT

// ConvertTo and DemoteTo with floating-point input and integer output truncate
// (rounding toward zero).

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> PromoteTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // For bits Y > X, floatX->floatY and intX->intY are always representable.
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
                                  Vec128<double, N> from) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Prevent ubsan errors when converting double to the narrower float.
    if (std::isinf(from.raw[i]) ||
        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
                                             : HighestValue<float>();
      continue;
    }
    ret.raw[i] = static_cast<float>(from.raw[i]);
  }
  return ret;
}
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
                                    Vec128<double, N> from) {
  Vec128<int32_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Prevent ubsan errors when converting double to the narrower int32_t.
    if (std::isinf(from.raw[i]) ||
        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<int32_t>())) {
      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<int32_t>()
                                             : HighestValue<int32_t>();
      continue;
    }
    ret.raw[i] = static_cast<int32_t>(from.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N>
HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT, N, 0> /* tag */,
                                Vec128<FromT, N> from) {
  static_assert(!IsFloat<FromT>(), "FromT=double is handled above");
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    from.raw[i] =
        HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw[i]), LimitsMax<ToT>());
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
  const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}

namespace detail {

HWY_INLINE void StoreU16ToF16(const uint16_t val,
                              float16_t* HWY_RESTRICT to) {
#if HWY_NATIVE_FLOAT16
  CopyBytes<2>(&val, to);
#else
  to->bits = val;
#endif
}

HWY_INLINE uint16_t U16FromF16(const float16_t* HWY_RESTRICT from) {
#if HWY_NATIVE_FLOAT16
  uint16_t bits16;
  CopyBytes<2>(from, &bits16);
  return bits16;
#else
  return from->bits;
#endif
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
                                   const Vec128<float16_t, N> v) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const uint16_t bits16 = detail::U16FromF16(&v.raw[i]);
    const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
    const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
    const uint32_t mantissa = bits16 & 0x3FF;

    // Subnormal or zero
    if (biased_exp == 0) {
      const float subnormal =
          (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
      ret.raw[i] = sign ? -subnormal : subnormal;
      continue;
    }

    // Normalized: convert the representation directly (faster than
    // ldexp/tables).
    const uint32_t biased_exp32 = biased_exp + (127 - 15);
    const uint32_t mantissa32 = mantissa << (23 - 10);
    const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
    CopyBytes<4>(&bits32, &ret.raw[i]);
  }
  return ret;
}
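
// Worked example (illustrative): bits16 = 0x3C00 has sign = 0,
// biased_exp = 15 and mantissa = 0, so biased_exp32 = 15 + (127 - 15) = 127
// and bits32 = 0x3F800000, which is 1.0f. The subnormal branch scales by
// 2^-14 * 2^-10 per mantissa step, e.g. bits16 = 0x0001 -> 2^-24
// (about 5.96e-8f).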

template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
                                   const Vec128<bfloat16_t, N> v) {
  Vec128<float, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = F32FromBF16(v.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
                                      const Vec128<float, N> v) {
  Vec128<float16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    uint32_t bits32;
    CopyBytes<4>(&v.raw[i], &bits32);
    const uint32_t sign = bits32 >> 31;
    const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
    const uint32_t mantissa32 = bits32 & 0x7FFFFF;

    const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

    // Tiny or zero => zero.
    if (exp < -24) {
      ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]);
      continue;
    }

    uint32_t biased_exp16, mantissa16;

    // exp = [-24, -15] => subnormal
    if (exp < -14) {
      biased_exp16 = 0;
      const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
      HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
      mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
                                         (mantissa32 >> (13 + sub_exp)));
    } else {
      // exp = [-14, 15]
      biased_exp16 = static_cast<uint32_t>(exp + 15);
      HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
      mantissa16 = mantissa32 >> 13;
    }

    HWY_DASSERT(mantissa16 < 1024);
    const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
    HWY_DASSERT(bits16 < 0x10000);
    const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
    detail::StoreU16ToF16(narrowed, &ret.raw[i]);
  }
  return ret;
}
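
// Worked example (illustrative): v = 1.0f has bits32 = 0x3F800000, hence
// biased_exp32 = 127, exp = 0, biased_exp16 = 15 and mantissa16 = 0,
// producing bits16 = 0x3C00, the float16 encoding of 1.0. Inputs smaller
// than 2^-24 in magnitude take the early-out and become zero.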

template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> /* tag */,
                                       const Vec128<float, N> v) {
  Vec128<bfloat16_t, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = BF16FromF32(v.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N, HWY_IF_FLOAT(FromT)>
HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // float## -> int##: return closest representable value. We cannot exactly
    // represent LimitsMax<ToT> in FromT, so use double.
    const double f = static_cast<double>(from.raw[i]);
    if (std::isinf(from.raw[i]) ||
        std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
      ret.raw[i] =
          std::signbit(from.raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>();
      continue;
    }
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <typename FromT, typename ToT, size_t N, HWY_IF_NOT_FLOAT(FromT)>
HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
                                 Vec128<FromT, N> from) {
  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
  Vec128<ToT, N> ret;
  for (size_t i = 0; i < N; ++i) {
    // int## -> float##: no check needed
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
}

// ================================================== COMBINE

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
                                   Vec128<T, N> v) {
  return LowerHalf(v);
}

template <typename T, size_t N>
HWY_API Vec128<T, N / 2> UpperHalf(Simd<T, N / 2, 0> /* tag */,
                                   Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(&v.raw[N / 2], ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> /* tag */,
                                      Vec128<T, N / 2> v) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> /* tag */, Vec128<T, N / 2> hi_half,
                             Vec128<T, N / 2> lo_half) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo_half.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi_half.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> /* tag */,
                                      const Vec128<T, N> hi,
                                      const Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                      Vec128<T, N> lo) {
  Vec128<T, N> ret;
  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                                Vec128<T, N> lo) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[i] = lo.raw[2 * i];
  }
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[N / 2 + i] = hi.raw[2 * i];
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
                               Vec128<T, N> lo) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[i] = lo.raw[2 * i + 1];
  }
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[N / 2 + i] = hi.raw[2 * i + 1];
  }
  return ret;
}

// ------------------------------ CombineShiftRightBytes

template <int kBytes, typename T, size_t N, class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N, 0> /* tag */, V hi, V lo) {
  V ret;
  const uint8_t* HWY_RESTRICT lo8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<sizeof(T) * N - kBytes>(lo8 + kBytes, ret8);
  CopyBytes<kBytes>(hi.raw, ret8 + sizeof(T) * N - kBytes);
  return ret;
}
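
// Illustrative example: for full uint8_t vectors with lo = {0, 1, ..., 15}
// and hi = {16, 17, ..., 31}, CombineShiftRightBytes<4>(d, hi, lo) yields
// {4, 5, ..., 15, 16, 17, 18, 19}: the first 12 bytes come from the upper
// part of lo and the last 4 from the bottom of hi.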

// ------------------------------ ShiftLeftBytes

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  Vec128<T, N> ret;
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  ZeroBytes<kBytes>(ret8);
  CopyBytes<sizeof(T) * N - kBytes>(v.raw, ret8 + kBytes);
  return ret;
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  Vec128<T, N> ret;
  const uint8_t* HWY_RESTRICT v8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<sizeof(T) * N - kBytes>(v8 + kBytes, ret8);
  ZeroBytes<kBytes>(ret8 + sizeof(T) * N - kBytes);
  return ret;
}

// ------------------------------ ShiftRightLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

// ================================================== SWIZZLE

template <typename T, size_t N>
HWY_API T GetLane(const Vec128<T, N> v) {
  return v.raw[0];
}

template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
  v.raw[i] = t;
  return v;
}

template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
  return v.raw[i];
}

template <typename T, size_t N>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  for (size_t i = 0; i < N; i += 2) {
    v.raw[i + 1] = v.raw[i];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  for (size_t i = 0; i < N; i += 2) {
    v.raw[i] = v.raw[i + 1];
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
  for (size_t i = 0; i < N; i += 2) {
    odd.raw[i] = even.raw[i];
  }
  return odd;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks

template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  MakeSigned<T> raw[N];
};

template <typename T, size_t N, typename TI>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> /* tag */,
                                        Vec128<TI, N> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
  Indices128<T, N> ret;
  CopyBytes<N * sizeof(T)>(vec.raw, ret.raw);
  return ret;
}

template <typename T, size_t N, typename TI>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Simd<TI, N, 0>(), idx));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(const Vec128<T, N> v,
                                      const Indices128<T, N> idx) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = v.raw[idx.raw[i]];
  }
  return ret;
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <typename T, size_t N>
HWY_API Vec128<T, N> ReverseBlocks(Simd<T, N, 0> /* tag */,
                                   const Vec128<T, N> v) {
  return v;
}

// ------------------------------ Reverse

template <typename T, size_t N>
HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = v.raw[N - 1 - i];
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; i += 2) {
    ret.raw[i + 0] = v.raw[i + 1];
    ret.raw[i + 1] = v.raw[i + 0];
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; i += 4) {
    ret.raw[i + 0] = v.raw[i + 3];
    ret.raw[i + 1] = v.raw[i + 2];
    ret.raw[i + 2] = v.raw[i + 1];
    ret.raw[i + 3] = v.raw[i + 0];
  }
  return ret;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; i += 8) {
    ret.raw[i + 0] = v.raw[i + 7];
    ret.raw[i + 1] = v.raw[i + 6];
    ret.raw[i + 2] = v.raw[i + 5];
    ret.raw[i + 3] = v.raw[i + 4];
    ret.raw[i + 4] = v.raw[i + 3];
    ret.raw[i + 5] = v.raw[i + 2];
    ret.raw[i + 6] = v.raw[i + 1];
    ret.raw[i + 7] = v.raw[i + 0];
  }
  return ret;
}

// ================================================== BLOCKWISE

// ------------------------------ Shuffle*

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Reverse2(DFromV<decltype(v)>(), v);
}

// Swap 64-bit halves
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
  Vec128<T> ret;
  ret.raw[3] = v.raw[1];
  ret.raw[2] = v.raw[0];
  ret.raw[1] = v.raw[3];
  ret.raw[0] = v.raw[2];
  return ret;
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
  return Reverse2(DFromV<decltype(v)>(), v);
}

// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
  Vec128<T> ret;
  ret.raw[3] = v.raw[0];
  ret.raw[2] = v.raw[3];
  ret.raw[1] = v.raw[2];
  ret.raw[0] = v.raw[1];
  return ret;
}

// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
  Vec128<T> ret;
  ret.raw[3] = v.raw[2];
  ret.raw[2] = v.raw[1];
  ret.raw[1] = v.raw[0];
  ret.raw[0] = v.raw[3];
  return ret;
}

template <typename T>
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
  return Reverse4(DFromV<decltype(v)>(), v);
}

1979// ------------------------------ Broadcast/splat any lane
1980
1981template <int kLane, typename T, size_t N>
1983 for (size_t i = 0; i < N; ++i) {
1984 v.raw[i] = v.raw[kLane];
1985 }
1986 return v;
1987}
1988
1989// ------------------------------ TableLookupBytes, TableLookupBytesOr0
1990
1991template <typename T, size_t N, typename TI, size_t NI>
1992HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> v,
1993 const Vec128<TI, NI> indices) {
1994 const uint8_t* HWY_RESTRICT v_bytes =
1995 reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
1996 const uint8_t* HWY_RESTRICT idx_bytes =
1997 reinterpret_cast<const uint8_t*>(indices.raw);
1998 Vec128<TI, NI> ret;
1999 uint8_t* HWY_RESTRICT ret_bytes =
2000 reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2001 for (size_t i = 0; i < NI * sizeof(TI); ++i) {
2002 const size_t idx = idx_bytes[i];
2003 // Avoid out of bounds reads.
2004 ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
2005 }
2006 return ret;
2007}
2008
2009template <typename T, size_t N, typename TI, size_t NI>
2010HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> v,
2011 const Vec128<TI, NI> indices) {
2012 // Same as TableLookupBytes, which already returns 0 if out of bounds.
2013 return TableLookupBytes(v, indices);
2014}
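
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): byte-level shuffle. Any index >= the
// number of input bytes (e.g. 0x80) produces a zero byte.
inline void ExampleTableLookupBytes() {
  const Simd<uint8_t, 8, 0> d;
  const Vec128<uint8_t, 8> bytes = Iota(d, 0x41);  // {'A', 'B', ..., 'H'}
  alignas(16) const uint8_t idx[8] = {7, 6, 5, 4, 0x80, 0x80, 1, 0};
  const Vec128<uint8_t, 8> indices = Load(d, idx);
  const Vec128<uint8_t, 8> r = TableLookupBytesOr0(bytes, indices);
  // r = {'H', 'G', 'F', 'E', 0, 0, 'B', 'A'}
  (void)r;
}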
2015
2016// ------------------------------ InterleaveLower/InterleaveUpper
2017
2018template <typename T, size_t N>
2019HWY_API Vec128<T, N> InterleaveLower(const Vec128<T, N> a,
2020 const Vec128<T, N> b) {
2021 Vec128<T, N> ret;
2022 for (size_t i = 0; i < N / 2; ++i) {
2023 ret.raw[2 * i + 0] = a.raw[i];
2024 ret.raw[2 * i + 1] = b.raw[i];
2025 }
2026 return ret;
2027}
2028
2029// Additional overload for the optional tag (also for 256/512).
2030template <class V>
2031HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2032 return InterleaveLower(a, b);
2033}
2034
2035template <typename T, size_t N>
2036HWY_API Vec128<T, N> InterleaveUpper(Simd<T, N, 0> /* tag */,
2037 const Vec128<T, N> a,
2038 const Vec128<T, N> b) {
2039 Vec128<T, N> ret;
2040 for (size_t i = 0; i < N / 2; ++i) {
2041 ret.raw[2 * i + 0] = a.raw[N / 2 + i];
2042 ret.raw[2 * i + 1] = b.raw[N / 2 + i];
2043 }
2044 return ret;
2045}
2046
2047// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2048
2049// Same as Interleave*, except that the return lanes are double-width integers;
2050// this is necessary because the single-lane scalar cannot return two values.
2051template <class V, class DW = RepartitionToWide<DFromV<V>>>
2052HWY_API VFromD<DW> ZipLower(V a, V b) {
2053 return BitCast(DW(), InterleaveLower(a, b));
2054}
2055template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2056HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2057 return BitCast(dw, InterleaveLower(D(), a, b));
2058}
2059
2060template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2061HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2062 return BitCast(dw, InterleaveUpper(D(), a, b));
2063}
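
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): ZipLower interleaves the lower halves
// and reinterprets each (a, b) pair as one double-width lane.
inline void ExampleZipLower() {
  const Simd<uint8_t, 8, 0> d8;
  const Vec128<uint8_t, 8> a = Set(d8, 0x01);
  const Vec128<uint8_t, 8> b = Set(d8, 0x02);
  // On a little-endian host, each uint16 lane is 0x0201: the low byte
  // comes from a, the high byte from b.
  const Vec128<uint16_t, 4> w = ZipLower(a, b);
  (void)w;
}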
2064
2065// ================================================== MASK
2066
2067template <typename T, size_t N>
2068HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2069 typename Mask128<T, N>::Raw or_sum = 0;
2070 for (size_t i = 0; i < N; ++i) {
2071 or_sum |= mask.bits[i];
2072 }
2073 return or_sum == 0;
2074}
2075
2076template <typename T, size_t N>
2077HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2078 using Bits = typename Mask128<T, N>::Raw;
2079 constexpr Bits kAll = static_cast<Bits>(~Bits{0});
2080 Bits and_sum = kAll;
2081 for (size_t i = 0; i < N; ++i) {
2082 and_sum &= mask.bits[i];
2083 }
2084 return and_sum == kAll;
2085}
2086
2087// `bits` points to at least 8 readable bytes, not all of which need be valid.
2088template <typename T, size_t N>
2089HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
2090 const uint8_t* HWY_RESTRICT bits) {
2091 Mask128<T, N> m;
2092 for (size_t i = 0; i < N; ++i) {
2093 const size_t bit = size_t{1} << (i & 7);
2094 const size_t idx_byte = i >> 3;
2095 m.bits[i] = Mask128<T, N>::FromBool((bits[idx_byte] & bit) != 0);
2096 }
2097 return m;
2098}
2099
2100// `bits` points to at least 8 writable bytes.
2101template <typename T, size_t N>
2102HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
2103 uint8_t* bits) {
2104 bits[0] = 0;
2105 if (N > 8) bits[1] = 0; // N <= 16, so max two bytes
2106 for (size_t i = 0; i < N; ++i) {
2107 const size_t bit = size_t{1} << (i & 7);
2108 const size_t idx_byte = i >> 3;
2109 if (mask.bits[i]) {
2110 bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
2111 }
2112 }
2113 return N > 8 ? 2 : 1;
2114}
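
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): round-tripping a mask through its
// packed representation (one bit per lane, bit 0 = lane 0).
inline void ExampleMaskBits() {
  const Simd<int32_t, 4, 0> d;
  alignas(16) const uint8_t packed[8] = {5};  // 0b0101: lanes 0 and 2 true
  const Mask128<int32_t, 4> m = LoadMaskBits(d, packed);
  uint8_t out[8] = {};
  const size_t written = StoreMaskBits(d, m, out);  // 1 byte; out[0] == 5
  (void)written;
}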
2115
2116template <typename T, size_t N>
2117HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
2118 size_t count = 0;
2119 for (size_t i = 0; i < N; ++i) {
2120 count += mask.bits[i] != 0;
2121 }
2122 return count;
2123}
2124
2125template <typename T, size_t N>
2126HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
2127 const Mask128<T, N> mask) {
2128 for (size_t i = 0; i < N; ++i) {
2129 if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
2130 }
2131 return intptr_t{-1};
2132}
2133
2134// ------------------------------ Compress
2135
2136template <typename T>
2137struct CompressIsPartition {
2138 enum { value = 1 };
2139};
2140
2141template <typename T, size_t N>
2142HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
2143 size_t count = 0;
2144 Vec128<T, N> ret;
2145 for (size_t i = 0; i < N; ++i) {
2146 if (mask.bits[i]) {
2147 ret.raw[count++] = v.raw[i];
2148 }
2149 }
2150 for (size_t i = 0; i < N; ++i) {
2151 if (!mask.bits[i]) {
2152 ret.raw[count++] = v.raw[i];
2153 }
2154 }
2155 HWY_DASSERT(count == N);
2156 return ret;
2157}
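
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): selected lanes move to the front;
// because CompressIsPartition is set, the remaining lanes follow in their
// original order instead of being zeroed.
inline void ExampleCompress() {
  const Simd<int32_t, 4, 0> d;
  const Vec128<int32_t, 4> v = Iota(d, 1);      // {1, 2, 3, 4}
  const Mask128<int32_t, 4> m = Set(d, 2) < v;  // selects lanes {3, 4}
  const Vec128<int32_t, 4> r = Compress(v, m);  // {3, 4, 1, 2}
  (void)r;
}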
2158
2159// ------------------------------ CompressNot
2160template <typename T, size_t N>
2161HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, const Mask128<T, N> mask) {
2162 size_t count = 0;
2163 Vec128<T, N> ret;
2164 for (size_t i = 0; i < N; ++i) {
2165 if (!mask.bits[i]) {
2166 ret.raw[count++] = v.raw[i];
2167 }
2168 }
2169 for (size_t i = 0; i < N; ++i) {
2170 if (mask.bits[i]) {
2171 ret.raw[count++] = v.raw[i];
2172 }
2173 }
2174 HWY_DASSERT(count == N);
2175 return ret;
2176}
2177
2178// ------------------------------ CompressBlocksNot
2179HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
2180 Mask128<uint64_t> /* m */) {
2181 return v;
2182}
2183
2184// ------------------------------ CompressBits
2185template <typename T, size_t N>
2186HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
2187 const uint8_t* HWY_RESTRICT bits) {
2188 return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
2189}
2190
2191// ------------------------------ CompressStore
2192template <typename T, size_t N>
2193HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
2194 Simd<T, N, 0> /* tag */,
2195 T* HWY_RESTRICT unaligned) {
2196 size_t count = 0;
2197 for (size_t i = 0; i < N; ++i) {
2198 if (mask.bits[i]) {
2199 unaligned[count++] = v.raw[i];
2200 }
2201 }
2202 return count;
2203}
2204
2205// ------------------------------ CompressBlendedStore
2206template <typename T, size_t N>
2207HWY_API size_t CompressBlendedStore(Vec128<T, N> v, const Mask128<T, N> mask,
2208 Simd<T, N, 0> d,
2209 T* HWY_RESTRICT unaligned) {
2210 return CompressStore(v, mask, d, unaligned);
2211}
2212
2213// ------------------------------ CompressBitsStore
2214template <typename T, size_t N>
2215HWY_API size_t CompressBitsStore(Vec128<T, N> v,
2216 const uint8_t* HWY_RESTRICT bits,
2217 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
2218 const Mask128<T, N> mask = LoadMaskBits(d, bits);
2219 StoreU(Compress(v, mask), d, unaligned);
2220 return CountTrue(d, mask);
2221}
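
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): writing only the selected lanes and
// obtaining the number of lanes written.
inline size_t ExampleCompressStore() {
  const Simd<int32_t, 4, 0> d;
  const Vec128<int32_t, 4> v = Iota(d, 1);      // {1, 2, 3, 4}
  const Mask128<int32_t, 4> m = Set(d, 2) < v;  // selects lanes {3, 4}
  int32_t out[4] = {};
  return CompressStore(v, m, d, out);  // stores 3, 4; returns 2
}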
2222
2223// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2224template <size_t N>
2225HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
2226 Vec128<bfloat16_t, 2 * N> a,
2227 Vec128<bfloat16_t, 2 * N> b,
2228 const Vec128<float, N> sum0,
2229 Vec128<float, N>& sum1) {
2230 const Rebind<bfloat16_t, decltype(df32)> dbf16;
2231 // Avoid ZipLower/Upper so this also works on big-endian systems.
2232 const Vec128<float, N> a0 = PromoteTo(df32, LowerHalf(dbf16, a));
2233 const Vec128<float, N> a1 = PromoteTo(df32, UpperHalf(dbf16, a));
2234 const Vec128<float, N> b0 = PromoteTo(df32, LowerHalf(dbf16, b));
2235 const Vec128<float, N> b1 = PromoteTo(df32, UpperHalf(dbf16, b));
2236 sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2237 return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2238}
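
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): each float lane accumulates one
// product in sum0 and one in sum1; adding the two afterwards yields the
// usual dot-product terms. 1.5f and 2.0f are exactly representable in
// bfloat16, so each lane of the result is 1.5 * 2.0 + 1.5 * 2.0 = 6.
inline Vec128<float, 4> ExampleReorderWidenMulAccumulate() {
  const Simd<float, 4, 0> df32;
  const Simd<bfloat16_t, 8, 0> dbf16;
  const Vec128<bfloat16_t, 8> a = Set(dbf16, BF16FromF32(1.5f));
  const Vec128<bfloat16_t, 8> b = Set(dbf16, BF16FromF32(2.0f));
  Vec128<float, 4> sum1 = Zero(df32);
  const Vec128<float, 4> sum0 =
      ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
  return sum0 + sum1;  // {6, 6, 6, 6}
}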
2239
2240// ================================================== REDUCTIONS
2241
2242template <typename T, size_t N>
2243HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2244 T sum = T{0};
2245 for (size_t i = 0; i < N; ++i) {
2246 sum += v.raw[i];
2247 }
2248 return Set(d, sum);
2249}
2250template <typename T, size_t N>
2251HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2252 T min = HighestValue<T>();
2253 for (size_t i = 0; i < N; ++i) {
2254 min = HWY_MIN(min, v.raw[i]);
2255 }
2256 return Set(d, min);
2257}
2258template <typename T, size_t N>
2259HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
2260 T max = LowestValue<T>();
2261 for (size_t i = 0; i < N; ++i) {
2262 max = HWY_MAX(max, v.raw[i]);
2263 }
2264 return Set(d, max);
2265}
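
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): reductions return the result
// broadcast to all lanes; use GetLane to obtain the scalar.
inline int32_t ExampleSumOfLanes() {
  const Simd<int32_t, 4, 0> d;
  const Vec128<int32_t, 4> v = Iota(d, 1);  // {1, 2, 3, 4}
  return GetLane(SumOfLanes(d, v));         // 1 + 2 + 3 + 4 = 10
}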
2266
2267// ================================================== OPS WITH DEPENDENCIES
2268
2269// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2270
2271HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
2272 const Vec128<uint64_t> b) {
2273 alignas(16) uint64_t mul[2];
2274 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2275 return Load(Full128<uint64_t>(), mul);
2276}
2277
2278HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
2279 const Vec128<uint64_t> b) {
2280 alignas(16) uint64_t mul[2];
2281 const Half<Full128<uint64_t>> d2;
2282 mul[0] =
2283 Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2284 return Load(Full128<uint64_t>(), mul);
2285}
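
// Editor's example (illustrative sketch, not part of the original header;
// the Example* name is hypothetical): MulEven returns the full 128-bit
// product of the lane-0 operands, low half in lane 0 and high half in
// lane 1.
inline void ExampleMulEven() {
  const Full128<uint64_t> d;
  const Vec128<uint64_t> a = Set(d, ~uint64_t{0});  // 2^64 - 1
  const Vec128<uint64_t> b = Set(d, 2);
  // (2^64 - 1) * 2 = 2^65 - 2: lane 0 = 0xFFFF...FFFE, lane 1 = 1.
  const Vec128<uint64_t> p = MulEven(a, b);
  (void)p;
}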
2286
2287// ================================================== Operator wrapper
2288
2289template <class V>
2290HWY_API V Add(V a, V b) {
2291 return a + b;
2292}
2293template <class V>
2294HWY_API V Sub(V a, V b) {
2295 return a - b;
2296}
2297
2298template <class V>
2299HWY_API V Mul(V a, V b) {
2300 return a * b;
2301}
2302template <class V>
2303HWY_API V Div(V a, V b) {
2304 return a / b;
2305}
2306
2307template <class V>
2308V Shl(V a, V b) {
2309 return a << b;
2310}
2311template <class V>
2312V Shr(V a, V b) {
2313 return a >> b;
2314}
2315
2316template <class V>
2317HWY_API auto Eq(V a, V b) -> decltype(a == b) {
2318 return a == b;
2319}
2320template <class V>
2321HWY_API auto Ne(V a, V b) -> decltype(a == b) {
2322 return a != b;
2323}
2324template <class V>
2325HWY_API auto Lt(V a, V b) -> decltype(a == b) {
2326 return a < b;
2327}
2328
2329template <class V>
2330HWY_API auto Gt(V a, V b) -> decltype(a == b) {
2331 return a > b;
2332}
2333template <class V>
2334HWY_API auto Ge(V a, V b) -> decltype(a == b) {
2335 return a >= b;
2336}
2337
2338template <class V>
2339HWY_API auto Le(V a, V b) -> decltype(a == b) {
2340 return a <= b;
2341}
2342
2343// NOLINTNEXTLINE(google-readability-namespace-comments)
2344} // namespace HWY_NAMESPACE
2345} // namespace hwy
2346HWY_AFTER_NAMESPACE();