#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG

#if HWY_COMPILER_CLANGCL

#include <avx2intrin.h>
#include <f16cintrin.h>

#include <avx512fintrin.h>
#include <avx512vlintrin.h>
#include <avx512bwintrin.h>
#include <avx512dqintrin.h>
#include <avx512vlbwintrin.h>
#include <avx512vldqintrin.h>
#include <avx512bitalgintrin.h>
#include <avx512vlbitalgintrin.h>
#include <avx512vpopcntdqintrin.h>
#include <avx512vpopcntdqvlintrin.h>

#include <sanitizer/msan_interface.h>
  // Compound-assignment operators of Vec512<T> (signatures as declared in
  // this header); each forwards to the corresponding binary operator.
  HWY_INLINE Vec512& operator*=(const Vec512 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec512& operator/=(const Vec512 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec512& operator+=(const Vec512 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec512& operator-=(const Vec512 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec512& operator&=(const Vec512 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec512& operator|=(const Vec512 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec512& operator^=(const Vec512 other) {
    return *this = (*this ^ other);
  }
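// Usage sketch (not part of the original listing): these operators let
// Vec512 values be updated in place like scalars. Assumes only the public
// Highway API from hwy/highway.h.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full512<float> d;
//   auto acc = hn::Zero(d);
//   acc += hn::Set(d, 1.0f);  // acc is now 1.0f in all 16 lanes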
// BitCastToInteger for double vectors:
return _mm512_castpd_si512(v);

template <typename T, typename FromT>

// Zero:
return Vec512<T>{_mm512_setzero_si512()};

// Set for 64-bit lanes:
_mm512_set1_epi64(static_cast<long long>(t))};
_mm512_set1_epi64(static_cast<long long>(t))};

// Undefined:
return Vec512<T>{_mm512_undefined_epi32()};

// Not, in a single ternary-logic instruction (imm 0x55 computes ~a):
Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(

using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(

using VU = VFromD<decltype(du)>;
#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#endif
#define HWY_NATIVE_POPCNT
static_assert(IsFloat<T>(), "Only makes sense for floating-point");

const __m512i out = _mm512_ternarylogic_epi32(
// FirstN: build a mask with the first n lanes set.
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
  const uint32_t all = ~uint32_t(0);
  // BZHI only looks at the lower 8 bits of n!
  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
  return Mask512<T>{static_cast<__mmask64>(bits)};

  const uint64_t all = ~uint64_t(0);
  // BZHI only looks at the lower 8 bits of n!
  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));

  return detail::FirstN<T>(n);
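// Sketch (not in the original listing): FirstN is typically used for loop
// remainders. Assumes only the public Highway API; MaskedLoad and
// BlendedStore are existing Highway ops.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   void AddOne(float* HWY_RESTRICT p, size_t count) {
//     const hn::Full512<float> d;
//     const size_t N = hn::Lanes(d);
//     size_t i = 0;
//     for (; i + N <= count; i += N) {
//       hn::Store(hn::Load(d, p + i) + hn::Set(d, 1.0f), d, p + i);
//     }
//     const auto m = hn::FirstN(d, count - i);  // remaining lanes only
//     const auto v = hn::MaskedLoad(m, d, p + i);
//     hn::BlendedStore(v + hn::Set(d, 1.0f), m, d, p + i);
//   }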
static_assert(IsSigned<T>(), "Only works for signed/float");

template <typename T, HWY_IF_FLOAT(T)>
// ShiftLeft for 8-bit lanes: shift as wider lanes, then clear the bits that
// crossed in from the neighboring byte.
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

// Unsigned 8-bit ShiftRight: mask off bits shifted in from the left.
return shifted & Set(d8, 0xFF >> kBits);

// Signed 8-bit ShiftRight: shift as unsigned, then sign-extend via XOR/sub.
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
return (shifted ^ shifted_sign) - shifted_sign;
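// Why the XOR/subtract works (a scalar note, not in the original): after a
// logical right shift, the sign bit of the byte sits at position (7 - kBits).
// XORing with s = 0x80 >> kBits flips that bit, and subtracting s then
// propagates it upward, i.e. sign-extends.
//
//   // int8_t x = -4 (0xFC), kBits = 2:
//   //   logical shift:  0xFC >> 2          = 0x3F
//   //   s = 0x80 >> 2                      = 0x20
//   //   (0x3F ^ 0x20) - 0x20 = 0x1F - 0x20 = -1 == 0xFF
//   // 0xFF is -1, the correct arithmetic shift of -4 by 2.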
static_assert(0 <= kBits && kBits < 32, "Invalid shift count");

static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
// Run-time shift counts for 8-bit lanes (ShiftLeftSame/ShiftRightSame), same
// mask-off/sign-extend approach as the compile-time versions above:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));

  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));

  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;
template <typename T, HWY_IF_SIGNED(T)>

template <typename T, HWY_IF_FLOAT(T)>

// Neg for non-float types is 0 - v:
template <typename T, HWY_IF_NOT_FLOAT(T)>
  return Zero(Full512<T>()) - v;
// Round/Trunc/Ceil/Floor via ROUNDSCALE with exceptions suppressed:
    v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
    v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};

_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};

_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};

_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
template <typename TFrom, typename TTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
// operator== per lane size:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};

// operator!= per lane size:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

// VecFromMask for float lanes: expand each mask bit to an all-ones lane.
return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
template <typename T>

// detail::Not for masks, with a mask-intrinsic fast path per mask width:
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};

// detail::And/AndNot/Or/Xor for masks follow the same pattern: a mask
// intrinsic when HWY_COMPILER_HAS_MASK_INTRINSICS, plain integer ops
// otherwise.
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

// BroadcastSignBit via arithmetic shift by (lane bits - 1):
return ShiftRight<15>(v);

return ShiftRight<31>(v);
// Load (aligned):
template <typename T>
  return Vec512<T>{_mm512_load_si512(aligned)};

// LoadU:
template <typename T>
  return Vec512<T>{_mm512_loadu_si512(p)};

// MaskedLoad per lane size:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};

// LoadDup128: broadcast one 128-bit block into all four blocks.
template <typename T>
  return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};

const __m128 x4 = _mm_loadu_ps(p);

const __m128d x2 = _mm_loadu_pd(p);
// Store (aligned):
template <typename T>
  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);

  _mm512_store_ps(aligned, v.raw);

  _mm512_store_pd(aligned, v.raw);

// StoreU:
template <typename T>
  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);

  _mm512_storeu_ps(p, v.raw);

  _mm512_storeu_pd(p, v.raw);
// BlendedStore per lane size:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  _mm512_mask_storeu_epi8(p, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  _mm512_mask_storeu_epi16(p, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  _mm512_mask_storeu_epi32(p, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  _mm512_mask_storeu_epi64(p, m.raw, v.raw);

  _mm512_mask_storeu_ps(p, m.raw, v.raw);

  _mm512_mask_storeu_pd(p, m.raw, v.raw);
// Stream (non-temporal store):
template <typename T>
  _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);

  _mm512_stream_ps(aligned, v.raw);

  _mm512_stream_pd(aligned, v.raw);
// Scatter: offsets are in bytes (scale 1), indices in lanes (scale 4 or 8).
template <typename T>
  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
template <typename T>
  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);

template <typename T>
  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
template <typename T>
  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);

template <typename T, typename Offset>
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
// Gather, same scale convention as Scatter:
template <typename T>
  return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
template <typename T>
  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};

template <typename T>
  return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
template <typename T>
  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
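// Usage sketch (not in the original listing): GatherIndex reads base[index[i]]
// for each lane; the intrinsic scales each index by sizeof(T). Assumes the
// public Highway API; `table` is a caller-provided const int32_t* and is
// hypothetical.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full512<int32_t> d;
//   alignas(64) const int32_t idx[16] = {0, 2, 4, 6, 8, 10, 12, 14,
//                                        1, 3, 5, 7, 9, 11, 13, 15};
//   const auto v = hn::GatherIndex(d, table, hn::Load(d, idx));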
template <typename T, typename Offset>
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
// LowerHalf:
template <typename T>
  return Vec256<T>{_mm512_castsi512_si256(v.raw)};

template <typename T>

// UpperHalf:
template <typename T>
  return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};

// Lane access spills to an aligned buffer:
template <typename T>
  alignas(64) T lanes[64 / sizeof(T)];

template <typename T>
  alignas(64) T lanes[64 / sizeof(T)];
  return Load(d, lanes);

template <typename T>

// ZeroExtendVector:
template <typename T>
  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};

// Combine(hi, lo):
template <typename T>
  return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
// ShiftLeftBytes: operates within each 128-bit block.
template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};

template <int kBytes, typename T>

template <int kLanes, typename T>
template <int kLanes, typename T>

// ShiftRightBytes, also per 128-bit block:
template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};

template <int kLanes, typename T>

template <int kBytes, typename T, class V = Vec512<T>>
// Broadcast<kLane>, 16-bit lanes: replicate within each 128-bit block.
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

// 32-bit lanes:
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);

// 64-bit lanes:
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;

// Signed variants:
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);

static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;

// float:
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);

// double:
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
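// Why 0x55 * kLane works (a note, not in the original): a 4-lane shuffle
// immediate packs four 2-bit selectors. 0x55 is 0b01010101, so multiplying by
// kLane (0..3) replicates kLane into all four selector fields:
//
//   static_assert(0x55 * 3 == 0xFF);  // _MM_SHUFFLE(3, 3, 3, 3)
//   static_assert(0x55 * 2 == 0xAA);  // _MM_SHUFFLE(2, 2, 2, 2)
//
// Likewise 0xFF * kLane yields 0x00 or 0xFF, i.e. all selectors 0 or all 1,
// for the 2-lane (64-bit/double) case.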
// Shuffle2301 (swap adjacent 32-bit lanes) and related per-block shuffles:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T>

// IndicesFromVec: in debug builds, verify all indices are within [0, lanes).
template <typename T, typename TI>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));

// SetTableIndices:
template <typename T, typename TI>
  const Rebind<TI, decltype(d)> di;

// TableLookupLanes:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
// Reverse, via full-vector permutes with constant index tables:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(64) constexpr int16_t kReverse[32] = {
      31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
      15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                                7,  6,  5,  4,  3,  2,  1, 0};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};

// Reverse2:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

// Reverse4:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(64) constexpr int16_t kReverse4[32] = {
      3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
      19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};

  return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};

// Reverse8:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(64) constexpr int16_t kReverse8[32] = {
      7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8,
      23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
  _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
  const Vec512<int32_t> idx = Load(di, kReverse8);
  _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
template <typename T, class V = Vec512<T>>

template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>

// ConcatLowerLower(hi, lo): lower 256-bit halves of both inputs.
template <typename T>
  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};

// ConcatUpperUpper:
template <typename T>
  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};

// ConcatLowerUpper:
template <typename T>
  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};

// ConcatUpperLower: blend with constant half-vector masks.
template <typename T>
  const __mmask32 mask = (0x0000FFFF);

  const __mmask16 mask = (0x00FF);

  const __mmask8 mask = (0x0F);
// ConcatOdd, 1-byte lanes (AVX3_DL has a single 2-source byte permute):
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
  alignas(64) constexpr uint8_t kIdx[64] = {
      1,   3,   5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,
      27,  29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,
      53,  55,  57,  59,  61,  63,  65,  67,  69,  71,  73,  75,  77,
      79,  81,  83,  85,  87,  89,  91,  93,  95,  97,  99,  101, 103,
      105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
  __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});

  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};

// 2-byte lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(64) constexpr uint16_t kIdx[32] = {
      1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
      33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
  return BitCast(
      d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
             __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});

// 4-byte lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                             17, 19, 21, 23, 25, 27, 29, 31};
  return BitCast(
      d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
             __mmask16{0xFFFF}, BitCast(du, hi).raw)});

// float:
  alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                             17, 19, 21, 23, 25, 27, 29, 31};
  __mmask16{0xFFFF}, hi.raw)};

// 8-byte lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return BitCast(
      d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
             BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},

// double:
  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  __mmask8{0xFF}, hi.raw)};

// ConcatEven, same structure with even indices:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
  alignas(64) constexpr uint8_t kIdx[64] = {
      0,   2,   4,   6,   8,   10,  12,  14,  16,  18,  20,  22,  24,
      26,  28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,
      52,  54,  56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,
      78,  80,  82,  84,  86,  88,  90,  92,  94,  96,  98,  100, 102,
      104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
  __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});

  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  alignas(64) constexpr uint16_t kIdx[32] = {
      0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
  return BitCast(
      d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi16(
             __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                             16, 18, 20, 22, 24, 26, 28, 30};
  return BitCast(
      d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
             __mmask16{0xFFFF}, BitCast(du, hi).raw)});

  alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                             16, 18, 20, 22, 24, 26, 28, 30};
  __mmask16{0xFFFF}, hi.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return BitCast(
      d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
             BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},

  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  __mmask8{0xFF}, hi.raw)};
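// How the two-source permute works (a note, not in the original):
// _mm512_permutex2var_* treats {lo, hi} as one 2N-lane table; index i < N
// selects lo[i], index i >= N selects hi[i - N]. With indices 1, 3, ..., 2N-1
// the result is exactly the odd lanes of the lo:hi concatenation. A scalar
// model, with hypothetical buffers lo[N], hi[N], idx[N], out[N]:
//
//   for (size_t i = 0; i < N; ++i) {
//     out[i] = (idx[i] < N) ? lo[idx[i]] : hi[idx[i] - N];
//   }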
// DupEven / DupOdd for 32-bit lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

// OddEven: derive the per-lane blend mask from the lane size.
template <typename T>
  constexpr size_t s = sizeof(T);
  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;

// OddEvenBlocks: even 128-bit blocks from `even`, odd blocks from `odd`.
template <typename T>
  return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};

  _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};

  _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
// SwapAdjacentBlocks:
template <typename T>
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};

  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};

// ReverseBlocks:
template <typename T>
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};

  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
// TableLookupBytes for mixed vector widths: narrower arguments are widened
// via their 256/128-bit halves, looked up at 512 bits, then truncated.
template <typename T, typename TI>

template <typename T, typename TI, size_t NI>
  const Half<decltype(d512)> d256;
  const Half<decltype(d256)> d128;
  const auto from_512 =

template <typename T, typename TI>

template <typename T, size_t N, typename TI>
  const Half<decltype(d512)> d256;
  const Half<decltype(d256)> d128;
  const auto bytes_512 =

template <typename T, typename TI>
const Rebind<uint16_t, decltype(df32)> du16;

// DemoteTo: the 512-bit pack instructions interleave 128-bit blocks, so a
// permute of 64-bit groups restores the expected lane order.
alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};

alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};

_mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};

alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};

alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};

alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
                                                    0, 4, 8, 12, 0, 4, 8, 12};
const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};

alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};

// bfloat16 conversion: the upper 16 bits of a float are its bfloat16.
const Rebind<int32_t, decltype(dbf16)> di32;
const Rebind<uint32_t, decltype(dbf16)> du32;
const Rebind<uint16_t, decltype(dbf16)> du16;
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));

const Repartition<uint32_t, decltype(dbf16)> du32;

// U8FromU32: gather byte 0 of each 32-bit lane within blocks, then gather
// the blocks' results.
alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
                                                     ~0u};
alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
_mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
#if !defined(HWY_DISABLE_PCLMUL_AES)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#endif
#define HWY_NATIVE_AES
// AESRound / AESLastRound: native on AVX3_DL, otherwise split into halves.
#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

// CLMulLower / CLMulUpper: native on AVX3_DL; otherwise spill to arrays and
// apply the 128-bit CLMul per pair of lanes.
#if HWY_TARGET == HWY_AVX3_DL
  alignas(64) uint64_t a[8];
  alignas(64) uint64_t b[8];
  for (size_t i = 0; i < 8; i += 2) {
    Store(mul, d128, a + i);

#if HWY_TARGET == HWY_AVX3_DL
  alignas(64) uint64_t a[8];
  alignas(64) uint64_t b[8];
  for (size_t i = 0; i < 8; i += 2) {
    Store(mul, d128, a + i);
// Iota: fill a buffer with first, first + 1, ... and load it.
template <typename T, typename T2>
  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
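// Usage sketch (not in the original listing):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full512<int32_t> d;
//   const auto v = hn::Iota(d, 100);  // lanes are 100, 101, ..., 115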
// AllFalse: with mask intrinsics, KORTEST sets ZF iff mask | mask == 0.
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask64_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif
template <typename T>

// AllTrue: KORTEST sets CF iff mask | mask is all-ones.
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask64_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFFFFFFFFFull;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFull;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFull;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFull;
#endif
template <typename T>
template <typename T>

// StoreMaskBits: one bit per lane, so 64 / sizeof(T) lanes need
// 8 / sizeof(T) bytes.
template <typename T>
  const size_t kNumBytes = 8 / sizeof(T);
  CopyBytes<kNumBytes>(&mask.raw, bits);

template <typename T>

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
    const Mask512<T> mask) {
// Compress, 4-byte lanes: native VPCOMPRESSD.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
// Compress, 8-byte lanes: 8 lanes => 256 possible masks. Each 32-bit table
// entry packs eight 4-bit lane indices; entry [mask] lists the selected
// lanes first, in ascending order, followed by the rest.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t packed_array[256] = {
      0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
      0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
      0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
      0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
      0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
      0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
      0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
      0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
      0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
      0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
      0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
      0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
      0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
      0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
      0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
      0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
      0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
      0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
      0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
      0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
      0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
      0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
      0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
      0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
      0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
      0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
      0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
      0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
      0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
      0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
      0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
      0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
      0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
      0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
      0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
      0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
      0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
      0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
      0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
      0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
      0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
      0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
      0x10765432, 0x17654320, 0x07654321, 0x76543210};

  // Unpack the nibbles into one index per 64-bit lane, then permute.
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
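// Worked example (not in the original): mask.raw = 0b00000110 selects lanes
// 1 and 2. packed_array[6] = 0x76543021; reading nibbles from low to high
// gives indices 1, 2, 0, 3, 4, 5, 6, 7 - the selected lanes first, the rest
// after. Right-shifting the broadcast entry by {0, 4, 8, ...} leaves each
// lane's own nibble in its low bits, so the permute moves lane 1 to slot 0
// and lane 2 to slot 1.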
// Compress, 2-byte lanes: native VPCOMPRESSW on AVX3_DL; otherwise promote
// to 32-bit, compress, and stitch the halves back together.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Rebind<uint16_t, decltype(d)> du;
#if HWY_TARGET == HWY_AVX3_DL
  const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
#else
  const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Rebind<uint16_t, decltype(d)> du;
#if HWY_TARGET == HWY_AVX3_DL
  const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};
#else
  const Half<decltype(du)> duh;

  const uint32_t mask_bits{mask.raw};
  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
  const auto compressed0 = Compress(promoted0, mask0);
  const auto compressed1 = Compress(promoted1, mask1);

  // Concatenate: the second compressed half starts right after the num0
  // valid lanes of the first.
  const size_t num0 = CountTrue(dw, mask0);
  const __mmask32 m_upper = ~((1u << num0) - 1);
  alignas(64) uint16_t iota[64] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
  const auto idx = LoadU(du, iota + 32 - num0);
  const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
      demoted0.raw, m_upper, idx.raw, demoted1.raw)};
#endif
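// Note (not in the original): the iota buffer is 32 zeros followed by 0..31,
// so loading at iota + 32 - num0 gives idx[i] = i - num0 for i >= num0. With
// m_upper masking off the first num0 lanes, lane i takes demoted0[i] below
// num0 and demoted1[i - num0] above it, which concatenates the two
// compressed halves.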
// CompressNot: non-8-byte lanes simply invert the mask and Compress; 8-byte
// lanes reuse the nibble-packed index scheme, with entries arranged for the
// complemented mask.
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t packed_array[256] = {
      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
      0x76543210, 0x76543201, 0x76543210, 0x76543210};

  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
template <typename T>

// CompressStore, 2-byte lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Rebind<uint16_t, decltype(d)> du;
  const uint64_t mask_bits{mask.raw};
#if HWY_TARGET == HWY_AVX3_DL
  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
#else
  const Half<decltype(du)> duh;

  const uint64_t maskL = mask_bits & 0xFFFF;
  const uint64_t maskH = mask_bits >> 16;
  const auto compressed0 = Compress(promoted0, mask0);
  const auto compressed1 = Compress(promoted1, mask1);

  const Half<decltype(d)> dh;
  StoreU(demoted0, dh, unaligned);
#endif
// CompressStore, 4- and 8-byte lanes: native compress-store. MSan does not
// understand the masked store, so unpoison the written bytes explicitly.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  __msan_unpoison(unaligned, count * sizeof(T));

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  __msan_unpoison(unaligned, count * sizeof(T));

  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  __msan_unpoison(unaligned, count * sizeof(float));

  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  __msan_unpoison(unaligned, count * sizeof(double));

// CompressBlendedStore / CompressBitsStore:
template <typename T>
  __msan_unpoison(unaligned, count * sizeof(T));

template <typename T>
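// Usage sketch (not in the original listing): CompressStore writes only the
// selected lanes, contiguously, and returns how many it wrote. A filter
// loop, assuming the public Highway API:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   size_t CopyPositive(const float* HWY_RESTRICT in, size_t count,
//                       float* HWY_RESTRICT out) {
//     const hn::Full512<float> d;
//     size_t written = 0;
//     for (size_t i = 0; i + hn::Lanes(d) <= count; i += hn::Lanes(d)) {
//       const auto v = hn::LoadU(d, in + i);
//       const auto m = v > hn::Zero(d);
//       written += hn::CompressStore(v, m, d, out + written);
//     }
//     return written;  // remainder handling omitted for brevity
//   }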
// detail::Shuffle128: shuffle 128-bit blocks from two source vectors.
template <_MM_PERM_ENUM kPerm, typename T>
template <_MM_PERM_ENUM kPerm>
template <_MM_PERM_ENUM kPerm>
// LoadTransposedBlocks3: de-interleave three rows of 128-bit blocks.
template <typename T>
  constexpr size_t N = 64 / sizeof(T);
  const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
  const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
  A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
  B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
  C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);

// LoadTransposedBlocks4:
template <typename T>
  constexpr size_t N = 64 / sizeof(T);
  const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
  const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
  const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
  const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
  A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
  B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
  C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
  D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);

// StoreTransposedBlocks2: interleave the blocks of i and j.
template <typename T>
  constexpr size_t N = 64 / sizeof(T);
  const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
  const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
  const auto j1_i1_j0_i0 =
      detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
  const auto j3_i3_j2_i2 =
      detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
  StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
  StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);

// StoreTransposedBlocks3:
template <typename T>
  constexpr size_t N = 64 / sizeof(T);
  const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
  const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
  const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
  detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
  detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
  detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);

// StoreTransposedBlocks4:
template <typename T>
  constexpr size_t N = 64 / sizeof(T);
  const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
  const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
  const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
  const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
  detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
  detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
  detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
  detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
// MulEven(u64): 64x64 -> 128-bit product built from 32x32 partial products.
const DFromV<decltype(a)> du64;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);
const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);
const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);
const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;

// MulOdd: the same decomposition applied to the odd lanes.
const DFromV<decltype(a)> du64;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);
const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);
const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);
const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;
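// The schoolbook decomposition (a note, not in the original): write
// a = aH * 2^32 + aL and b = bH * 2^32 + bL. Then
//
//   a * b = aH*bH * 2^64 + (aH*bL + aL*bH) * 2^32 + aL*bL
//
// Each 32x32 product fits in 64 bits; the code accumulates the cross terms
// and their carries (w1, k) into mulH and folds the low halves into mulL.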
// SumOfLanes: broadcast the reduction back into all lanes.
return Set(d, _mm512_reduce_add_epi32(v.raw));
return Set(d, _mm512_reduce_add_epi64(v.raw));
return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
return Set(d, _mm512_reduce_add_ps(v.raw));
return Set(d, _mm512_reduce_add_pd(v.raw));

// MinOfLanes:
return Set(d, _mm512_reduce_min_epi32(v.raw));
return Set(d, _mm512_reduce_min_epi64(v.raw));
return Set(d, _mm512_reduce_min_epu32(v.raw));
return Set(d, _mm512_reduce_min_epu64(v.raw));
return Set(d, _mm512_reduce_min_ps(v.raw));
return Set(d, _mm512_reduce_min_pd(v.raw));

// 16-bit lanes: reduce the even/odd 16-bit halves as 32-bit, then splat the
// result into both halves of each 32-bit lane.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  return BitCast(d, Or(min, ShiftLeft<16>(min)));

// MaxOfLanes:
return Set(d, _mm512_reduce_max_epi32(v.raw));
return Set(d, _mm512_reduce_max_epi64(v.raw));
return Set(d, _mm512_reduce_max_epu32(v.raw));
return Set(d, _mm512_reduce_max_epu64(v.raw));
return Set(d, _mm512_reduce_max_ps(v.raw));
return Set(d, _mm512_reduce_max_pd(v.raw));

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  return BitCast(d, Or(min, ShiftLeft<16>(min)));
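// Usage sketch (not in the original listing): these return a vector with the
// reduction in every lane; use GetLane for a scalar. `data` is a
// caller-provided const float* and is hypothetical.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full512<float> d;
//   const float total = hn::GetLane(hn::SumOfLanes(d, hn::LoadU(d, data)));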
Definition: arm_neon-inl.h:1021
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2718
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4514
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1705
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:4068
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:3352
typename D::Half Half
Definition: ops/shared-inl.h:215
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4936
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3629
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:206
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
N
Definition: rvv-inl.h:1742
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5837
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:1429
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition: arm_neon-inl.h:3448
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:1346
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1870
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4548
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5823
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4005
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:616
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1527
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2882
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition: arm_neon-inl.h:1210
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition: emu128-inl.h:605
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition: emu128-inl.h:392
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3800
const vfloat64m1_t v
Definition: rvv-inl.h:1742
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:4176
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1758
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3635
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition: arm_neon-inl.h:5763
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1846
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:674
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
#define HWY_ALIGN
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82