Grok 10.0.3
base.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef HIGHWAY_HWY_BASE_H_
17#define HIGHWAY_HWY_BASE_H_
18
19// For SIMD module implementations and their callers, target-independent.
20
21#include <stddef.h>
22#include <stdint.h>
23
25#include "hwy/highway_export.h"
26
27#if HWY_ARCH_X86
28#include <atomic>
29#endif
30
31//------------------------------------------------------------------------------
32// Compiler-specific definitions
33
// Two-level stringification: HWY_STR first macro-expands its argument, then
// HWY_STR_IMPL converts the resulting tokens to a string literal.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
36
#if HWY_COMPILER_MSVC

#include <intrin.h>

// MSVC spellings. HWY_FLATTEN and HWY_MAYBE_UNUSED expand to nothing here, and
// the branch hints evaluate to the bare expression.
#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

// Attribute/builtin spellings used for every non-MSVC compiler.
#define HWY_RESTRICT __restrict__
#define HWY_INLINE inline __attribute__((always_inline))
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
// `!!` normalizes the condition to 0/1 before passing it to the builtin.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC
77
78//------------------------------------------------------------------------------
79// Builtin/attributes
80
81// Enables error-checking of format strings.
82#if HWY_HAS_ATTRIBUTE(__format__)
83#define HWY_FORMAT(idx_fmt, idx_arg) \
84 __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
85#else
86#define HWY_FORMAT(idx_fmt, idx_arg)
87#endif
88
89// Returns a void* pointer which the compiler then assumes is N-byte aligned.
90// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
91//
92// The assignment semantics are required by GCC/Clang. ICC provides an in-place
93// __assume_aligned, whereas MSVC's __assume appears unsuitable.
94#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
95#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
96#else
97#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
98#endif
99
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
// Every HWY_PUSH_ATTRIBUTES must be paired with a HWY_POP_ATTRIBUTES; on other
// compilers both expand to nothing.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
116
117//------------------------------------------------------------------------------
118// Macros
119
// Prefix for functions defined in headers: internal linkage, forced inlining,
// flattening, and no unused-function warning.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
121
// Token pasting with an extra level of indirection so that macro arguments
// are expanded before being concatenated.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
124
// Smaller/larger of two values. Each argument may be evaluated twice, so do
// not pass expressions with side effects.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
127
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
// Until then, HWY_FENCE is a no-op on non-x86 targets.
#define HWY_FENCE
#endif
137
// 4 instances of a given literal value, useful as input to LoadDup128.
// Expands to a comma-separated list, e.g. for use in an array initializer.
#define HWY_REP4(literal) literal, literal, literal, literal
140
// Calls ::hwy::Abort with the current file/line plus a printf-style format
// string and its arguments. `##__VA_ARGS__` allows an empty argument list.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
143
// Always enabled. Aborts via HWY_ABORT, reporting the stringified condition,
// if `condition` evaluates to false. Wrapped in do/while(0) so it behaves as a
// single statement.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
151
152#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
153#define HWY_IS_MSAN 1
154#else
155#define HWY_IS_MSAN 0
156#endif
157
158#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
159#define HWY_IS_ASAN 1
160#else
161#define HWY_IS_ASAN 0
162#endif
163
164#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
165#define HWY_IS_TSAN 1
166#else
167#define HWY_IS_TSAN 0
168#endif
169
// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
// Expands to nothing when MSAN is not active.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif
177
// For enabling HWY_DASSERT and shortening tests in slower debug builds.
// May be predefined by the build system, in which case it is left unchanged.
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
// Sanitizer and static-analyzer builds are also treated as debug builds.
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD
189
// Debug-only assertion: same as HWY_ASSERT in debug builds; otherwise an empty
// statement that does not evaluate `condition`.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
197
198namespace hwy {
199
200//------------------------------------------------------------------------------
201// kMaxVectorSize (undocumented, pending removal)
202
203#if HWY_ARCH_X86
204static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
205#elif HWY_ARCH_RVV && defined(__riscv_vector)
206// Not actually an upper bound on the size.
207static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
208#else
209static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
210#endif
211
212//------------------------------------------------------------------------------
213// Alignment
214
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
225
226//------------------------------------------------------------------------------
227// Lane types
228
229// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
230// by concatenating base type and bits.
231
// 1 if targeting ARM and bit 1 of __ARM_FP is set, in which case float16_t
// below aliases the compiler's __fp16; otherwise 0.
#if HWY_ARCH_ARM && (__ARM_FP & 2)
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif
237
#pragma pack(push, 1)

// 16-bit floating-point lane types. Unless natively supported, these are
// opaque wrappers around the raw 16-bit representation.
#if HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
#else
struct float16_t {
  uint16_t bits;
};
#endif

// Raw bits of a bfloat16 value; converted to/from float by F32FromBF16 and
// BF16FromF32.
struct bfloat16_t {
  uint16_t bits;
};

#pragma pack(pop)
256
// Float lane types named after their width, matching the [u]int##_t scheme.
using float32_t = float;
using float64_t = double;
259
#pragma pack(push, 1)

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};

// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};

#pragma pack(pop)
277
278static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
279 const uint128_t& b) {
280 return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
281}
282// Required for std::greater.
283static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
284 const uint128_t& b) {
285 return b < a;
286}
287
288static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
289 const K64V64& b) {
290 return a.key < b.key;
291}
292// Required for std::greater.
293static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
294 const K64V64& b) {
295 return b < a;
296}
297
298//------------------------------------------------------------------------------
299// Controlling overload resolution (SFINAE)
300
// Minimal enable_if: only the `true` specialization has a nested `type`
// (void), so EnableIf<false> is a substitution failure.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
310
// Type-equality trait: `value` is 1 if T and U are the same type, else 0.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};
320
321template <typename T, typename U>
322HWY_API constexpr bool IsSame() {
324}
325
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
// Parenthesized so that `>` is not parsed as the end of the template argument.
#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr

// Enable only for lane types of the given category. All names are qualified
// with hwy:: so the macros also work when expanded outside namespace hwy.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Enable based on the lane size in bytes.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr

// Enable only if the number of lanes per block (vector size capped at 16
// bytes, divided by the lane size) equals LANES.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
355
// Empty struct used as a size tag type, e.g. to select an overload based on a
// compile-time size N.
template <size_t N>
struct SizeTag {};
359
// Strips a top-level `const` qualifier from T; other types are unchanged.
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
371
372//------------------------------------------------------------------------------
373// Type relations
374
375namespace detail {
376
377template <typename T>
379template <>
380struct Relations<uint8_t> {
381 using Unsigned = uint8_t;
382 using Signed = int8_t;
383 using Wide = uint16_t;
384};
385template <>
386struct Relations<int8_t> {
387 using Unsigned = uint8_t;
388 using Signed = int8_t;
389 using Wide = int16_t;
390};
391template <>
392struct Relations<uint16_t> {
393 using Unsigned = uint16_t;
394 using Signed = int16_t;
395 using Wide = uint32_t;
396 using Narrow = uint8_t;
397};
398template <>
399struct Relations<int16_t> {
400 using Unsigned = uint16_t;
401 using Signed = int16_t;
402 using Wide = int32_t;
403 using Narrow = int8_t;
404};
405template <>
406struct Relations<uint32_t> {
407 using Unsigned = uint32_t;
408 using Signed = int32_t;
409 using Float = float;
410 using Wide = uint64_t;
411 using Narrow = uint16_t;
412};
413template <>
414struct Relations<int32_t> {
415 using Unsigned = uint32_t;
416 using Signed = int32_t;
417 using Float = float;
418 using Wide = int64_t;
419 using Narrow = int16_t;
420};
421template <>
422struct Relations<uint64_t> {
423 using Unsigned = uint64_t;
424 using Signed = int64_t;
425 using Float = double;
427 using Narrow = uint32_t;
428};
429template <>
430struct Relations<int64_t> {
431 using Unsigned = uint64_t;
432 using Signed = int64_t;
433 using Float = double;
434 using Narrow = int32_t;
435};
436template <>
439 using Narrow = uint64_t;
440};
441template <>
443 using Unsigned = uint16_t;
444 using Signed = int16_t;
446 using Wide = float;
447};
448template <>
450 using Unsigned = uint16_t;
451 using Signed = int16_t;
452 using Wide = float;
453};
454template <>
455struct Relations<float> {
456 using Unsigned = uint32_t;
457 using Signed = int32_t;
458 using Float = float;
459 using Wide = double;
461};
462template <>
463struct Relations<double> {
464 using Unsigned = uint64_t;
465 using Signed = int64_t;
466 using Float = double;
467 using Narrow = float;
468};
469
470template <size_t N>
472template <>
473struct TypeFromSize<1> {
474 using Unsigned = uint8_t;
475 using Signed = int8_t;
476};
477template <>
478struct TypeFromSize<2> {
479 using Unsigned = uint16_t;
480 using Signed = int16_t;
481};
482template <>
483struct TypeFromSize<4> {
484 using Unsigned = uint32_t;
485 using Signed = int32_t;
486 using Float = float;
487};
488template <>
489struct TypeFromSize<8> {
490 using Unsigned = uint64_t;
491 using Signed = int64_t;
492 using Float = double;
493};
494template <>
495struct TypeFromSize<16> {
497};
498
499} // namespace detail
500
501// Aliases for types of a different category, but the same size.
502template <typename T>
504template <typename T>
506template <typename T>
508
509// Aliases for types of the same category, but different size.
510template <typename T>
512template <typename T>
514
515// Obtain type from its size [bytes].
516template <size_t N>
518template <size_t N>
520template <size_t N>
522
523//------------------------------------------------------------------------------
524// Type traits
525
526template <typename T>
527HWY_API constexpr bool IsFloat() {
528 // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
529 // from a float, not compared.
530 return IsSame<T, float>() || IsSame<T, double>();
531}
532
533template <typename T>
534HWY_API constexpr bool IsSigned() {
535 return T(0) > T(-1);
536}
537template <>
538constexpr bool IsSigned<float16_t>() {
539 return true;
540}
541template <>
542constexpr bool IsSigned<bfloat16_t>() {
543 return true;
544}
545
546// Largest/smallest representable integer values.
547template <typename T>
548HWY_API constexpr T LimitsMax() {
549 static_assert(!IsFloat<T>(), "Only for integer types");
550 using TU = MakeUnsigned<T>;
551 return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
552 : static_cast<TU>(~0ull));
553}
554template <typename T>
555HWY_API constexpr T LimitsMin() {
556 static_assert(!IsFloat<T>(), "Only for integer types");
557 return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
558}
559
560// Largest/smallest representable value (integer or float). This naming avoids
561// confusion with numeric_limits<float>::min() (the smallest positive value).
562template <typename T>
563HWY_API constexpr T LowestValue() {
564 return LimitsMin<T>();
565}
566template <>
567constexpr float LowestValue<float>() {
568 return -3.402823466e+38F;
569}
570template <>
571constexpr double LowestValue<double>() {
572 return -1.7976931348623158e+308;
573}
574
575template <typename T>
576HWY_API constexpr T HighestValue() {
577 return LimitsMax<T>();
578}
579template <>
580constexpr float HighestValue<float>() {
581 return 3.402823466e+38F;
582}
583template <>
584constexpr double HighestValue<double>() {
585 return 1.7976931348623158e+308;
586}
587
// Returns width in bits of the mantissa field in IEEE binary32/64.
// The primary template exists only to reject other types at compile time.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}
602
603// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
604// the largest possible (biased) exponent field. Used by IsInf.
605template <typename T>
607 return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
608}
609
610// Returns bitmask of the sign bit in IEEE binary32/64.
611template <typename T>
613 return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
614}
615
616// Returns bitmask of the exponent field in IEEE binary32/64.
617template <typename T>
619 return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
620}
621
622// Returns bitmask of the mantissa field in IEEE binary32/64.
623template <typename T>
625 return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
626}
627
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
// The primary template exists only to reject other types at compile time.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
644
645// Returns width in bits of the exponent field in IEEE binary32/64.
646template <typename T>
647constexpr int ExponentBits() {
648 // Exponent := remaining bits after deducting sign and mantissa.
649 return 8 * sizeof(T) - 1 - MantissaBits<T>();
650}
651
652// Returns largest value of the biased exponent field in IEEE binary32/64,
653// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
654// This is expressed as a signed integer for more efficient comparison.
655template <typename T>
657 return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
658}
659
660//------------------------------------------------------------------------------
661// Helper functions
662
// Returns a / b rounded up, converted to the type of `a`. Intended for
// non-negative `a` and positive `b`; `a + b - 1` must not overflow.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}
667
668// Works for any `align`; if a power of two, compiler emits ADD+AND.
669constexpr inline size_t RoundUpTo(size_t what, size_t align) {
670 return DivCeil(what, align) * align;
671}
672
673// Undefined results for x == 0.
674HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
675#if HWY_COMPILER_MSVC
676 unsigned long index; // NOLINT
677 _BitScanForward(&index, x);
678 return index;
679#else // HWY_COMPILER_MSVC
680 return static_cast<size_t>(__builtin_ctz(x));
681#endif // HWY_COMPILER_MSVC
682}
683
684HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
685#if HWY_COMPILER_MSVC
686#if HWY_ARCH_X86_64
687 unsigned long index; // NOLINT
688 _BitScanForward64(&index, x);
689 return index;
690#else // HWY_ARCH_X86_64
691 // _BitScanForward64 not available
692 uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
693 unsigned long index; // NOLINT
694 if (lsb == 0) {
695 uint32_t msb = static_cast<uint32_t>(x >> 32u);
696 _BitScanForward(&index, msb);
697 return 32 + index;
698 } else {
699 _BitScanForward(&index, lsb);
700 return index;
701 }
702#endif // HWY_ARCH_X86_64
703#else // HWY_COMPILER_MSVC
704 return static_cast<size_t>(__builtin_ctzll(x));
705#endif // HWY_COMPILER_MSVC
706}
707
708// Undefined results for x == 0.
709HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
710#if HWY_COMPILER_MSVC
711 unsigned long index; // NOLINT
712 _BitScanReverse(&index, x);
713 return 31 - index;
714#else // HWY_COMPILER_MSVC
715 return static_cast<size_t>(__builtin_clz(x));
716#endif // HWY_COMPILER_MSVC
717}
718
719HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
720#if HWY_COMPILER_MSVC
721#if HWY_ARCH_X86_64
722 unsigned long index; // NOLINT
723 _BitScanReverse64(&index, x);
724 return 63 - index;
725#else // HWY_ARCH_X86_64
726 // _BitScanReverse64 not available
727 const uint32_t msb = static_cast<uint32_t>(x >> 32u);
728 unsigned long index; // NOLINT
729 if (msb == 0) {
730 const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
731 _BitScanReverse(&index, lsb);
732 return 63 - index;
733 } else {
734 _BitScanReverse(&index, msb);
735 return 31 - index;
736 }
737#endif // HWY_ARCH_X86_64
738#else // HWY_COMPILER_MSVC
739 return static_cast<size_t>(__builtin_clzll(x));
740#endif // HWY_COMPILER_MSVC
741}
742
743HWY_API size_t PopCount(uint64_t x) {
744#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
745 return static_cast<size_t>(__builtin_popcountll(x));
746 // This instruction has a separate feature flag, but is often called from
747 // non-SIMD code, so we don't want to require dynamic dispatch. It was first
748 // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
749 // for AVX, so check for that.
750#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
751 return _mm_popcnt_u64(x);
752#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
753 return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
754 _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
755#else
756 x -= ((x >> 1) & 0x5555555555555555ULL);
757 x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
758 x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
759 x += (x >> 8);
760 x += (x >> 16);
761 x += (x >> 32);
762 return static_cast<size_t>(x & 0x7Fu);
763#endif
764}
765
// Returns floor(log2(x)), computed by recursively halving x until it is 1.
// x must be positive: the recursion does not terminate for x == 0.
//
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}
775
// Returns ceil(log2(x)): 0 for x == 1, otherwise FloorLog2(x - 1) + 1.
// x must be positive.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}
782
783#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
784#pragma intrinsic(_umul128)
785#endif
786
787// 64 x 64 = 128 bit multiplication
788HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
789#if defined(__SIZEOF_INT128__)
790 __uint128_t product = (__uint128_t)a * (__uint128_t)b;
791 *upper = (uint64_t)(product >> 64);
792 return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
793#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
794 return _umul128(a, b, upper);
795#else
796 constexpr uint64_t kLo32 = 0xFFFFFFFFU;
797 const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
798 const uint64_t hi_lo = (a >> 32) * (b & kLo32);
799 const uint64_t lo_hi = (a & kLo32) * (b >> 32);
800 const uint64_t hi_hi = (a >> 32) * (b >> 32);
801 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
802 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
803 return (t << 32) | (lo_lo & kLo32);
804#endif
805}
806
807#if HWY_COMPILER_MSVC
808#pragma intrinsic(memcpy)
809#pragma intrinsic(memset)
810#endif
811
812// The source/destination must not overlap/alias.
813template <size_t kBytes, typename From, typename To>
814HWY_API void CopyBytes(const From* from, To* to) {
815#if HWY_COMPILER_MSVC
816 memcpy(to, from, kBytes);
817#else
818 __builtin_memcpy(to, from, kBytes);
819#endif
820}
821
822template <size_t kBytes, typename To>
823HWY_API void ZeroBytes(To* to) {
824#if HWY_COMPILER_MSVC
825 memset(to, 0, kBytes);
826#else
827 __builtin_memset(to, 0, kBytes);
828#endif
829}
830
832 uint32_t bits = bf.bits;
833 bits <<= 16;
834 float f;
835 CopyBytes<4>(&bits, &f);
836 return f;
837}
838
840 uint32_t bits;
841 CopyBytes<4>(&f, &bits);
842 bfloat16_t bf;
843 bf.bits = static_cast<uint16_t>(bits >> 16);
844 return bf;
845}
846
848 Abort(const char* file, int line, const char* format, ...);
849
850} // namespace hwy
851
852#endif // HIGHWAY_HWY_BASE_H_
#define HWY_RESTRICT
Definition: base.h:61
#define HWY_NORETURN
Definition: base.h:65
#define HWY_API
Definition: base.h:120
#define HWY_MAYBE_UNUSED
Definition: base.h:73
#define HWY_DLLEXPORT
Definition: highway_export.h:13
Definition: aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition: base.h:814
constexpr T MantissaEnd()
Definition: base.h:631
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:684
constexpr MakeSigned< T > MaxExponentTimes2()
Definition: base.h:606
constexpr MakeUnsigned< T > MantissaMask()
Definition: base.h:624
HWY_API float F32FromBF16(bfloat16_t bf)
Definition: base.h:831
HWY_API void ZeroBytes(To *to)
Definition: base.h:823
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:788
HWY_API bfloat16_t BF16FromF32(float f)
Definition: base.h:839
HWY_API constexpr T LimitsMin()
Definition: base.h:555
typename detail::TypeFromSize< N >::Float FloatFromSize
Definition: base.h:521
HWY_API constexpr T HighestValue()
Definition: base.h:576
typename RemoveConstT< T >::type RemoveConst
Definition: base.h:370
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:517
constexpr float HighestValue< float >()
Definition: base.h:580
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition: base.h:519
constexpr T1 DivCeil(T1 a, T2 b)
Definition: base.h:664
constexpr float MantissaEnd< float >()
Definition: base.h:636
double float64_t
Definition: base.h:258
HWY_API constexpr bool IsSame()
Definition: base.h:322
constexpr bool IsSigned< bfloat16_t >()
Definition: base.h:542
HWY_API constexpr bool IsSigned()
Definition: base.h:534
constexpr size_t FloorLog2(TI x)
Definition: base.h:770
constexpr MakeUnsigned< T > ExponentMask()
Definition: base.h:618
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:709
constexpr bool IsSigned< float16_t >()
Definition: base.h:538
constexpr double HighestValue< double >()
Definition: base.h:584
constexpr int MantissaBits< double >()
Definition: base.h:599
typename EnableIfT< Condition >::type EnableIf
Definition: base.h:309
static HWY_MAYBE_UNUSED bool operator>(const uint128_t &a, const uint128_t &b)
Definition: base.h:283
float float32_t
Definition: base.h:257
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:743
constexpr double MantissaEnd< double >()
Definition: base.h:640
constexpr int MantissaBits()
Definition: base.h:590
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:674
constexpr float LowestValue< float >()
Definition: base.h:567
constexpr MakeSigned< T > MaxExponentField()
Definition: base.h:656
constexpr size_t CeilLog2(TI x)
Definition: base.h:777
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:719
constexpr MakeUnsigned< T > SignMask()
Definition: base.h:612
constexpr double LowestValue< double >()
Definition: base.h:571
static HWY_MAYBE_UNUSED bool operator<(const uint128_t &a, const uint128_t &b)
Definition: base.h:278
HWY_API constexpr T LowestValue()
Definition: base.h:563
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize
Definition: base.h:209
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:503
typename detail::Relations< T >::Wide MakeWide
Definition: base.h:511
typename detail::Relations< T >::Float MakeFloat
Definition: base.h:507
HWY_API constexpr bool IsFloat()
Definition: base.h:527
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
constexpr int MantissaBits< float >()
Definition: base.h:595
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char *file
HWY_DLLEXPORT HWY_NORETURN void int const char * format
Definition: base.h:848
HWY_DLLEXPORT HWY_NORETURN void int line
Definition: base.h:848
HWY_API constexpr T LimitsMax()
Definition: base.h:548
constexpr size_t RoundUpTo(size_t what, size_t align)
Definition: base.h:669
typename detail::Relations< T >::Narrow MakeNarrow
Definition: base.h:513
constexpr int ExponentBits()
Definition: base.h:647
void type
Definition: base.h:305
Definition: base.h:302
Definition: base.h:312
@ value
Definition: base.h:313
Definition: base.h:271
uint64_t value
Definition: base.h:272
uint64_t key
Definition: base.h:273
T type
Definition: base.h:366
Definition: base.h:361
T type
Definition: base.h:362
Definition: base.h:358
Definition: base.h:251
uint16_t bits
Definition: base.h:252
int16_t Signed
Definition: base.h:451
float Wide
Definition: base.h:452
uint16_t Unsigned
Definition: base.h:450
double Float
Definition: base.h:466
uint64_t Unsigned
Definition: base.h:464
int64_t Signed
Definition: base.h:465
float Narrow
Definition: base.h:467
int16_t Signed
Definition: base.h:444
float Wide
Definition: base.h:446
uint16_t Unsigned
Definition: base.h:443
uint32_t Unsigned
Definition: base.h:456
double Wide
Definition: base.h:459
float Float
Definition: base.h:458
int32_t Signed
Definition: base.h:457
uint16_t Unsigned
Definition: base.h:400
int16_t Signed
Definition: base.h:401
int32_t Wide
Definition: base.h:402
int8_t Narrow
Definition: base.h:403
uint32_t Unsigned
Definition: base.h:415
int64_t Wide
Definition: base.h:418
float Float
Definition: base.h:417
int16_t Narrow
Definition: base.h:419
int32_t Signed
Definition: base.h:416
int32_t Narrow
Definition: base.h:434
double Float
Definition: base.h:433
uint64_t Unsigned
Definition: base.h:431
int64_t Signed
Definition: base.h:432
int16_t Wide
Definition: base.h:389
int8_t Signed
Definition: base.h:388
uint8_t Unsigned
Definition: base.h:387
uint64_t Narrow
Definition: base.h:439
uint8_t Narrow
Definition: base.h:396
int16_t Signed
Definition: base.h:394
uint32_t Wide
Definition: base.h:395
uint16_t Unsigned
Definition: base.h:393
uint32_t Unsigned
Definition: base.h:407
uint64_t Wide
Definition: base.h:410
uint16_t Narrow
Definition: base.h:411
float Float
Definition: base.h:409
int32_t Signed
Definition: base.h:408
uint32_t Narrow
Definition: base.h:427
int64_t Signed
Definition: base.h:424
uint64_t Unsigned
Definition: base.h:423
double Float
Definition: base.h:425
int8_t Signed
Definition: base.h:382
uint8_t Unsigned
Definition: base.h:381
uint16_t Wide
Definition: base.h:383
Definition: base.h:378
int8_t Signed
Definition: base.h:475
uint8_t Unsigned
Definition: base.h:474
int16_t Signed
Definition: base.h:480
uint16_t Unsigned
Definition: base.h:479
int32_t Signed
Definition: base.h:485
uint32_t Unsigned
Definition: base.h:484
float Float
Definition: base.h:486
double Float
Definition: base.h:492
int64_t Signed
Definition: base.h:491
uint64_t Unsigned
Definition: base.h:490
Definition: base.h:471
Definition: base.h:246
uint16_t bits
Definition: base.h:247
Definition: base.h:264
uint64_t lo
Definition: base.h:265
uint64_t hi
Definition: base.h:266