#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightLanes(const D d, const V hi, const V lo) {
  constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif  // HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
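// --- Example (illustrative, not part of the original header): a sliding
// window advanced by one lane, as used in stencil kernels. Assumes the usual
// HWY_NAMESPACE setup around this file; a minimal sketch.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
template <class D, class V = VFromD<D>>
HWY_INLINE V NextLaneWindow(D d, const V hi, const V lo) {
  // Per 128-bit block: result[i] = window[i + 1], where window = (hi:lo).
  return CombineShiftRightLanes<1>(d, hi, lo);
}
#endif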
// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  using T = TFromD<D>;
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
  // Half of the doubled max exponent is the infinity bit pattern.
  return BitCast(d, Set(du, max_x2 >> 1));
}
// ------------------------------ SafeFillN

template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = value;
  }
#else
  BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}
// ------------------------------ SafeCopyN

template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = from[i];
  }
#else
  const Mask<D> mask = FirstN(d, num);
  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
#endif
}
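// --- Example (illustrative, not part of the original header): zero-padding
// the tail of an output buffer. SafeFillN touches only `num` elements even
// when num < Lanes(d), which matters on targets where vector memory ops might
// fault. A minimal sketch.
template <class D, typename T = TFromD<D>>
HWY_INLINE void ZeroPadTail(D d, T* HWY_RESTRICT out, size_t count,
                            size_t capacity) {
  // Zero the remainder without reading or writing past out + capacity.
  SafeFillN(capacity - count, T{0}, d, out + count);
}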
// ------------------------------ LoadInterleaved2/3/4, StoreInterleaved2/3/4

// "Include guard": skip if native load/store-interleaved ops are available.
#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif
// ------------------------------ LoadInterleaved2

template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1) {
  const V A = LoadU(d, unaligned + 0 * N);  // v1[1] v0[1] v1[0] v0[0]
  const V B = LoadU(d, unaligned + 1 * N);
  v0 = ConcatEven(d, B, A);
  v1 = ConcatOdd(d, B, A);
}
// Any T x1
template <typename T, class V>
HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}
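// --- Example (illustrative, not part of the original header): splitting an
// interleaved re/im array into planar outputs. `num` is assumed to be a
// multiple of Lanes(d); a minimal sketch without remainder handling.
HWY_INLINE void SplitComplex(const float* HWY_RESTRICT interleaved, size_t num,
                             float* HWY_RESTRICT re, float* HWY_RESTRICT im) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num; i += N) {
    Vec<decltype(d)> vre, vim;
    LoadInterleaved2(d, interleaved + 2 * i, vre, vim);
    StoreU(vre, d, re + i);
    StoreU(vim, d, im + i);
  }
}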
// ------------------------------ LoadInterleaved3

namespace detail {

template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
                                   const T* HWY_RESTRICT unaligned, V& A,
                                   V& B, V& C) {
  A = LoadU(d, unaligned + 0 * N);
  B = LoadU(d, unaligned + 1 * N);
  C = LoadU(d, unaligned + 2 * N);
}

}  // namespace detail
// 8-bit lanes x16
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  const RebindToUnsigned<decltype(d)> du;
  V A;  // v0[5] v2[4] v1[4] v0[4] .. v2[0] v1[0] v0[0]
  V B;  // v1[10] v0[10] v2[9] .. v0[6] v2[5] v1[5]
  V C;  // v2[15] v1[15] v0[15] .. v0[11] v2[10]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes; Z (0x80)
  // makes TableLookupBytesOr0 output zero so the results can be OR-ed.
  constexpr uint8_t Z = 0x80;
  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
                                                8, 11, 14, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, Z, 1, 4, 7, 10, 13};
  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
                                                9, 12, 15, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, Z, 2, 5, 8, 11, 14};
  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
                                                10, 13, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, 0, 3, 6, 9, 12, 15};
  const V v0L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v0A)));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v0B)));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v0C)));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v1A)));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v1B)));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v1C)));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v2A)));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v2B)));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v2C)));
  v0 = Or3(v0L, v0M, v0U);
  v1 = Or3(v1L, v1M, v1U);
  v2 = Or3(v2L, v2M, v2U);
}
// 8-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  const RebindToUnsigned<decltype(d)> du;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
  const V v0L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v0A)));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v0B)));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v0C)));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v1A)));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v1B)));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v1C)));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v2A)));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v2B)));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v2C)));
  v0 = Or3(v0L, v0M, v0U);
  v1 = Or3(v1L, v1M, v1U);
  v2 = Or3(v2L, v2M, v2U);
}
// 16-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, v1, V& v2) = delete;  // see below

template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  const RebindToUnsigned<decltype(d)> du;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as the
  // 8-bit case, but each array entry holds the two byte indices of one lane.
  constexpr uint16_t Z = 0x8080;
  alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
                                                0x0908, 0x0F0E, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
                                                Z, Z, 0x0504, 0x0B0A};
  alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
                                                0x0B0A, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
                                                Z, 0x0100, 0x0706, 0x0D0C};
  alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
                                                0x0D0C, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
                                                Z, 0x0302, 0x0908, 0x0F0E};
  const V v0L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v0A)));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v0B)));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v0C)));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v1A)));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v1B)));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v1C)));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, Load(du, kIdx_v2A)));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, Load(du, kIdx_v2B)));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, Load(du, kIdx_v2C)));
  v0 = Or3(v0L, v0M, v0U);
  v1 = Or3(v1L, v1M, v1U);
  v2 = Or3(v2L, v2M, v2U);
}
// 32-bit lanes x4
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);

  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::Shuffle1230(A, vxx_02_03_xx);

  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11. Use OddEven instead.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);

  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::Shuffle3012(vxx_20_21_xx, C);
}
// 64-bit lanes x2
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  V A;  // v1[0] v0[0]
  V B;  // v0[1] v2[0]
  V C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  v0 = OddEven(B, A);
  v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
  v2 = OddEven(C, B);
}
// Any T x1
template <typename T, class V>
HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}
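// --- Example (illustrative, not part of the original header): deinterleaving
// packed RGB bytes into planes. `num_pixels` is assumed to be a multiple of
// Lanes(d); a minimal sketch without remainder handling.
HWY_INLINE void RgbToPlanar(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
                            uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                            uint8_t* HWY_RESTRICT b) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    Vec<decltype(d)> vr, vg, vb;
    LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    StoreU(vr, d, r + i);
    StoreU(vg, d, g + i);
    StoreU(vb, d, b + i);
  }
}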
// ------------------------------ LoadInterleaved4

namespace detail {

template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
                                   const T* HWY_RESTRICT unaligned, V& A,
                                   V& B, V& C, V& D) {
  A = LoadU(d, unaligned + 0 * N);
  B = LoadU(d, unaligned + 1 * N);
  C = LoadU(d, unaligned + 2 * N);
  D = LoadU(d, unaligned + 3 * N);
}

}  // namespace detail
// 8-bit lanes x16
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  // ...
}
// 16-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  // ...
}
// 32-bit lanes x4
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  // ...
}
// 64-bit lanes x2
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  // ...
}
// Any T x1
template <typename T, class V>
HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}
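// --- Example (illustrative, not part of the original header): extracting the
// alpha channel from packed RGBA. `num_pixels` is assumed to be a multiple of
// Lanes(d); a minimal sketch.
HWY_INLINE void ExtractAlpha(const uint8_t* HWY_RESTRICT rgba,
                             size_t num_pixels, uint8_t* HWY_RESTRICT alpha) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    Vec<decltype(d)> vr, vg, vb, va;
    LoadInterleaved4(d, rgba + 4 * i, vr, vg, vb, va);
    StoreU(va, d, alpha + i);
  }
}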
// ------------------------------ StoreInterleaved2

namespace detail {

template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
                                    T* HWY_RESTRICT unaligned) {
  StoreU(A, d, unaligned + 0 * N);
  StoreU(B, d, unaligned + 1 * N);
}

}  // namespace detail

// >= 128 bits
template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
                               T* HWY_RESTRICT unaligned) {
  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
  const auto v10U = InterleaveUpper(d, v0, v1);  // v1[N/2] v0[N/2] ..
  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}

// 64 bits: use a full vector so a single store suffices.
template <typename T>
HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  const Full128<T> d_full;
  const auto v10 =
      InterleaveLower(d_full, Vec128<T>{part0.raw}, Vec128<T>{part1.raw});
  StoreU(v10, d_full, unaligned);
}
// <= 32 bits
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
                               const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  // Use a full vector, then copy only the valid bytes.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const auto v10 = InterleaveLower(d_full, v0, v1);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(v10, d_full, buf);
  CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
}
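// --- Example (illustrative, not part of the original header): the inverse of
// SplitComplex above, merging planar re/im back into pairs. `num` is assumed
// to be a multiple of Lanes(d); a minimal sketch.
HWY_INLINE void MergeComplex(const float* HWY_RESTRICT re,
                             const float* HWY_RESTRICT im, size_t num,
                             float* HWY_RESTRICT interleaved) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num; i += N) {
    StoreInterleaved2(LoadU(d, re + i), LoadU(d, im + i), d,
                      interleaved + 2 * i);
  }
}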
// ------------------------------ StoreInterleaved3

namespace detail {

template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
                                    Simd<T, N, 0> d,
                                    T* HWY_RESTRICT unaligned) {
  StoreU(A, d, unaligned + 0 * N);
  StoreU(B, d, unaligned + 1 * N);
  StoreU(C, d, unaligned + 2 * N);
}

}  // namespace detail
// >= 128-bit vector, 8-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  const auto k5 = Set(du, 5);
  const auto k6 = Set(du, 6);

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. 0x80 zeroes lanes that are
  // to be filled from the other vectors, enabling blending by OR.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors are A, B, C; suffixes 0..2 indicate which input
  // vector's lanes a temporary holds.
  const auto shuf_A0 = LoadDup128(du, tbl_v0);
  const auto shuf_A1 = LoadDup128(du, tbl_v1);  // cannot reuse shuf_A0 (has 5)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const V A = BitCast(d, A0 | A1 | A2);

  const auto shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const V B = BitCast(d, B0 | B1 | B2);

  const auto shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const auto shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const auto shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const V C = BitCast(d, C0 | C1 | C2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 16-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  const auto k2 = Set(du8, 2 * sizeof(T));
  const auto k3 = Set(du8, 3 * sizeof(T));

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0].
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  const auto shuf_A1 = LoadDup128(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = LoadDup128(du8, tbl_v2);  // ..1..0..
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const V A = BitCast(d, A0 | A1 | A2);

  const auto shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const auto shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const auto shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const V B = BitCast(d, B0 | B1 | B2);

  const auto shuf_C0 = shuf_B1 + k3;  // 8..7..6.
  const auto shuf_C1 = shuf_B2 + k3;  // ..7..6..
  const auto shuf_C2 = shuf_B0 + k2;  // .7..6..5
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const V C = BitCast(d, C0 | C1 | C2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 32-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;

  const V v10_v00 = InterleaveLower(d, v0, v1);
  const V v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const V A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));

  const V v1_321 = ShiftRightLanes<1>(d, v1);
  const V v0_32 = ShiftRightLanes<2>(d, v0);
  const V v21_v11 = OddEven(v2, v1_321);
  const V v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const V B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));

  const V v23_v13 = OddEven(v2, v1_321);
  const V v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const V C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// 64-bit vector, 8-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, Full64<T> dh,
                               T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors for the shuffles and the first result.
  const Full128<uint8_t> du;
  const Full128<T> d_full;
  const auto k5 = Set(du, 5);
  const auto k6 = Set(du, 6);

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. 0x80 zeroes lanes that are
  // to be filled from the other vectors, enabling blending by OR.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (has 5)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * N);

  // Second (half) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const Vec64<T> B{(B0 | B1 | B2).raw};
  StoreU(B, dh, unaligned + 1 * N);
}
// 64-bit vector, 16-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, Full64<T> dh,
                               T* HWY_RESTRICT unaligned) {
  const Full128<T> d;
  const Full128<uint8_t> du8;
  constexpr size_t N = 16 / sizeof(T);
  const auto k2 = Set(du8, 2 * sizeof(T));
  const auto k3 = Set(du8, 3 * sizeof(T));

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 zeroes lanes that
  // are to be filled from the other vectors, enabling blending by OR.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  const auto shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const Vec128<T> A = BitCast(d, A0 | A1 | A2);
  StoreU(A, d, unaligned + 0 * N);

  // Second (half) vector: v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // ..3.
  const auto shuf_B1 = shuf_A2 + k3;  // .3..
  const auto shuf_B2 = shuf_A0 + k2;  // 3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const Vec128<T> B = BitCast(d, B0 | B1 | B2);
  StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
}
// 64-bit vector, 32-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
                               const Vec64<T> v2, Full64<T> d,
                               T* HWY_RESTRICT unaligned) {
  // Same approach as the 128-bit vector, 64-bit lane case.
  constexpr size_t N = 2;
  const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
  const Vec64<T> v01_v20 = OddEven(v0, v2);
  const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
  StoreU(v10_v00, d, unaligned + 0 * N);
  StoreU(v01_v20, d, unaligned + 1 * N);
  StoreU(v21_v11, d, unaligned + 2 * N);
}
// <= 32-bit vector, 8-bit lanes
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
                               const Vec128<T, N> part1,
                               const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and the result.
  const Full128<uint8_t> du;
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Expand v0 lanes to every third byte; 0x80 zeroes the rest for OR-blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
      0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(A, d_full, buf);
  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
}
// 32-bit vector, 16-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
                               const Vec128<T, 2> part1,
                               const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 4 / sizeof(T);
  // Use full vectors for the shuffles and the result.
  const Full128<uint8_t> du8;
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Expand v2 lanes; 0x80 zeroes the rest so the results can be OR-ed.
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  const auto shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  const auto shuf_A1 =
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);  // .1..0...
  const auto shuf_A0 =
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);  // 1..0....
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(A, d_full, buf);
  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
}
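// --- Example (illustrative, not part of the original header): planar-to-
// packed RGB, the inverse of RgbToPlanar above. `num_pixels` is assumed to
// be a multiple of Lanes(d); a minimal sketch.
HWY_INLINE void PlanarToRgb(const uint8_t* HWY_RESTRICT r,
                            const uint8_t* HWY_RESTRICT g,
                            const uint8_t* HWY_RESTRICT b, size_t num_pixels,
                            uint8_t* HWY_RESTRICT rgb) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    StoreInterleaved3(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i), d,
                      rgb + 3 * i);
  }
}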
// ------------------------------ StoreInterleaved4

namespace detail {

template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C,
                                    const V D, Simd<T, N, 0> d,
                                    T* HWY_RESTRICT unaligned) {
  StoreU(A, d, unaligned + 0 * N);
  StoreU(B, d, unaligned + 1 * N);
  StoreU(C, d, unaligned + 2 * N);
  StoreU(D, d, unaligned + 3 * N);
}

}  // namespace detail

// >= 128-bit vector
template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
  const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
  const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
  const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
  const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
  detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
}
// 64-bit vector, 8..32-bit lanes
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, const Vec64<T> part3,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
  StoreU(A, d_full, unaligned + 0 * N);
  StoreU(B, d_full, unaligned + 1 * N);
}
// 64-bit vector, 64-bit lane
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, const Vec64<T> part3,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
  const auto B = InterleaveLower(d_full, v2, v3);  // v3[0] v2[0]
  StoreU(A, d_full, unaligned + 0 * N);
  StoreU(B, d_full, unaligned + 1 * N);
}
// <= 32-bit vectors
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
                               const Vec128<T, N> part1,
                               const Vec128<T, N> part2,
                               const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  // Use a full vector, then copy only the valid bytes.
  const Full128<T> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(v3210, d_full, buf);
  CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
}
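// --- Example (illustrative, not part of the original header): packing planar
// channels into RGBA. `num_pixels` is assumed to be a multiple of Lanes(d);
// a minimal sketch.
HWY_INLINE void PlanarToRgba(const uint8_t* HWY_RESTRICT r,
                             const uint8_t* HWY_RESTRICT g,
                             const uint8_t* HWY_RESTRICT b,
                             const uint8_t* HWY_RESTRICT a, size_t num_pixels,
                             uint8_t* HWY_RESTRICT rgba) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    StoreInterleaved4(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i),
                      LoadU(d, a + i), d, rgba + 4 * i);
  }
}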
// ------------------------------ AESRound

// Cannot implement on scalar: TableLookupBytes needs at least 16 bytes.
#if HWY_TARGET != HWY_SCALAR

namespace detail {

// Constant-time SubBytes via GF(2^4) inversion, based on "Accelerating AES
// with vector permute instructions" (https://crypto.stanford.edu/vpaes/).
template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  const auto mask = Set(du, 0xF);

  // Change polynomial basis to GF(2^4).
  {
    alignas(16) static constexpr uint8_t basisL[16] = {
        0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
        0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
    alignas(16) static constexpr uint8_t basisU[16] = {
        0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
        0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
    const auto sL = And(state, mask);
    const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
    const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
    const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
    state = Xor(gf4L, gf4U);
  }

  // Inversion in GF(2^4): index 0x80 represents "infinity" (division by 0)
  // and makes TableLookupBytesOr0 return 0.
  alignas(16) static constexpr uint8_t kZetaInv[16] = {
      0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
  alignas(16) static constexpr uint8_t kInv[16] = {
      0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
  const auto tbl = LoadDup128(du, kInv);
  const auto sL = And(state, mask);      // L = low nibble, U = upper
  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
  const auto sX = Xor(sU, sL);
  const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
  const auto invU = TableLookupBytes(tbl, sU);
  const auto invX = TableLookupBytes(tbl, sX);
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

  // Linear skew (the 0x63 bias cannot be baked into the tables because out*
  // indices may be 128 if inv* is zero).
  alignas(16) static constexpr uint8_t kAffineL[16] = {
      0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
      0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
  alignas(16) static constexpr uint8_t kAffineU[16] = {
      0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
  const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
  const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
  return Xor(Xor(affL, affU), Set(du, 0x63));
}

}  // namespace detail

#endif  // HWY_TARGET != HWY_SCALAR
// "Include guard": skip if native AES instructions are available.
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif
#if HWY_TARGET != HWY_SCALAR

namespace detail {

template <class V>  // u8
HWY_API V ShiftRows(const V state) {
  const DFromV<V> du;
  // The state is column-major; this shuffle rotates row r of the 4x4 byte
  // matrix left by r positions.
  alignas(16) static constexpr uint8_t kShiftRow[16] = {
      0,  5,  10, 15,  // transposed: state is column major
      4,  9,  14, 3,   //
      8,  13, 2,  7,   //
      12, 1,  6,  11};
  const auto shift_row = LoadDup128(du, kShiftRow);
  return TableLookupBytes(state, shift_row);
}

template <class V>  // u8
HWY_API V MixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  // 2 3 1 1  // Let s := state*1, d := state*2, t := state*3.
  // 1 2 3 1  // d are on diagonal, no permutation needed.
  // 1 1 2 3  // t1230 indicates column indices of threes for the 4 rows.
  // 3 1 1 2  // We also need to compute s2301 and s3012 (=1230 o 2301).
  alignas(16) static constexpr uint8_t k2301[16] = {
      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
  alignas(16) static constexpr uint8_t k1230[16] = {
      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
  const RebindToSigned<decltype(du)> di;  // can only do signed comparisons
  const auto msb = Lt(BitCast(di, state), Zero(di));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
  const auto d = Xor(Add(state, state), overflow);  // = state*2 in GF(2^8)
  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
  const auto d_s2301 = Xor(d, s2301);
  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d}
  const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
}

}  // namespace detail
template <class V>  // u8
HWY_API V AESRound(V state, const V round_key) {
  // Intel docs list ShiftRows before SubBytes, but the order does not matter
  // because SubBytes is applied independently per lane.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // Like AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}
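// --- Example (illustrative, not part of the original header): encrypting one
// AES-128 block given 11 pre-expanded round keys (the key schedule itself is
// out of scope here). A hedged sketch; verify against a reference
// implementation before use.
HWY_INLINE Vec128<uint8_t> AesEncryptBlock(
    Vec128<uint8_t> block, const Vec128<uint8_t> (&round_keys)[11]) {
  block = Xor(block, round_keys[0]);  // initial AddRoundKey
  for (size_t r = 1; r < 10; ++r) {
    block = AESRound(block, round_keys[r]);
  }
  return AESLastRound(block, round_keys[10]);
}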
// Constant-time CLMul inspired by https://www.bearssl.org/constanttime.html.
// Each operand is split into strided groups of every fourth bit so that the
// integer partial products cannot carry into a neighboring group.
template <class V>  // u64
HWY_API V CLMulLower(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);
  // mK accumulates the partial products whose bit positions are K mod 4.
  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}
// As above, but for the upper (odd-indexed) 64-bit lanes.
template <class V>  // u64
HWY_API V CLMulUpper(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);
  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}

#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_AES
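// --- Example (illustrative, not part of the original header): the full
// 128-bit carryless product of two u64 values, as used in GHASH and CRC.
// CLMulLower multiplies the lower (even-indexed) u64 lanes, mirroring
// PCLMULQDQ with imm8 = 0x00. A minimal sketch.
#if HWY_TARGET != HWY_SCALAR
HWY_INLINE Vec128<uint64_t> CarrylessMul64(uint64_t a, uint64_t b) {
  const Full128<uint64_t> d;
  // Lane 0 = low 64 bits of the product, lane 1 = high 64 bits.
  return CLMulLower(Set(d, a), Set(d, b));
}
#endif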
// ------------------------------ PopulationCount

// "Include guard": skip if native POPCNT-related instructions are available.
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
#undef HWY_MIN_POW2_FOR_128
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
#define HWY_MIN_POW2_FOR_128 0
#endif
// This overload requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the other overload below.
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
          HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
HWY_API V PopulationCount(V v) {
  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
  const D d;
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };
  const auto lo = And(v, Set(d, 0xF));
  const auto hi = ShiftRight<4>(v);
  const auto lookup = LoadDup128(d, kLookup);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Fallback for capped vectors smaller than 16 bytes: the standard
// bit-slicing (SWAR) popcount per byte.
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
          HWY_IF_LT128_D(D)>
HWY_API V PopulationCount(V v) {
  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
  const D d;
  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
}
#endif  // HWY_TARGET != HWY_RVV
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_API V PopulationCount(V v) {
  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
  const D d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API V PopulationCount(V v) {
  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
  const D d;
  const Repartition<uint16_t, decltype(d)> d16;
  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}
#if HWY_HAVE_INTEGER64
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API V PopulationCount(V v) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
  const D d;
  const Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
}
#endif  // HWY_HAVE_INTEGER64

#endif  // HWY_NATIVE_POPCNT
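// --- Example (illustrative, not part of the original header): counting all
// set bits in a byte array via per-lane popcount plus a scalar horizontal
// sum. `num` is assumed to be a multiple of Lanes(d); a minimal sketch.
HWY_INLINE uint64_t CountBits(const uint8_t* HWY_RESTRICT bytes, size_t num) {
  const ScalableTag<uint8_t> d;
  const size_t N = Lanes(d);
  uint64_t total = 0;
  HWY_ALIGN uint8_t counts[HWY_MAX_BYTES];
  for (size_t i = 0; i < num; i += N) {
    Store(PopulationCount(LoadU(d, bytes + i)), d, counts);
    for (size_t j = 0; j < N; ++j) total += counts[j];
  }
  return total;
}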