xref: /llvm-project/libc/src/string/memory_utils/op_x86.h (revision 1973270fc66680e6894c3ae9395a7e07e7b4d43c)
169090143SGuillaume Chatelet //===-- x86 implementation of memory function building blocks -------------===//
269090143SGuillaume Chatelet //
369090143SGuillaume Chatelet // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
469090143SGuillaume Chatelet // See https://llvm.org/LICENSE.txt for license information.
569090143SGuillaume Chatelet // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
669090143SGuillaume Chatelet //
769090143SGuillaume Chatelet //===----------------------------------------------------------------------===//
869090143SGuillaume Chatelet //
969090143SGuillaume Chatelet // This file provides x86 specific building blocks to compose memory functions.
1069090143SGuillaume Chatelet //
1169090143SGuillaume Chatelet //===----------------------------------------------------------------------===//
1269090143SGuillaume Chatelet #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
1369090143SGuillaume Chatelet #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
1469090143SGuillaume Chatelet 
155ff3ff33SPetr Hosek #include "src/__support/macros/config.h"
16f100ec25SGuillaume Chatelet #include "src/__support/macros/properties/architectures.h"
1769090143SGuillaume Chatelet 
1848ba7da9SGuillaume Chatelet #if defined(LIBC_TARGET_ARCH_IS_X86)
1969090143SGuillaume Chatelet 
2069090143SGuillaume Chatelet #include "src/__support/common.h"
2169090143SGuillaume Chatelet #include "src/string/memory_utils/op_builtin.h"
2269090143SGuillaume Chatelet #include "src/string/memory_utils/op_generic.h"
2369090143SGuillaume Chatelet 
24af029d38SMichael Jones #if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
25af029d38SMichael Jones     defined(__SSE2__)
2669090143SGuillaume Chatelet #include <immintrin.h>
27af029d38SMichael Jones #endif
28af029d38SMichael Jones 
2969090143SGuillaume Chatelet // Define fake functions to prevent the compiler from failing on undefined
30af029d38SMichael Jones // functions in case the CPU extension is not present.
31af029d38SMichael Jones #if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
32*1973270fSSchrodinger ZHU Yifan #undef _mm512_cmpneq_epi8_mask
3369090143SGuillaume Chatelet #define _mm512_cmpneq_epi8_mask(A, B) 0
34af029d38SMichael Jones #endif
35af029d38SMichael Jones #if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
36*1973270fSSchrodinger ZHU Yifan #undef _mm256_movemask_epi8
37310b619eSFangrui Song #define _mm256_movemask_epi8(A) 0
38af029d38SMichael Jones #endif
39af029d38SMichael Jones #if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
40*1973270fSSchrodinger ZHU Yifan #undef _mm_movemask_epi8
41af029d38SMichael Jones #define _mm_movemask_epi8(A) 0
42af029d38SMichael Jones #endif
4369090143SGuillaume Chatelet 
445ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL {
455ff3ff33SPetr Hosek namespace x86 {
4669090143SGuillaume Chatelet 
4769090143SGuillaume Chatelet // A set of constants to check compile time features.
// Compile-time feature flags: each is 'true' iff the corresponding ISA
// extension macro was defined by the compiler for this translation unit.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
5469090143SGuillaume Chatelet 
5569090143SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
5669090143SGuillaume Chatelet // Memcpy repmovsb implementation
struct Memcpy {
  // Copies 'count' bytes from 'src' to 'dst' with the x86 'rep movsb' string
  // instruction.
  // The "+D"/"+S"/"+c" read-write constraints pin dst/src/count to
  // RDI/RSI/RCX as the instruction requires; the "memory" clobber stops the
  // compiler from reordering or caching memory accesses across the copy.
  // Overlap semantics follow the hardware forward-copy behavior — callers are
  // expected to pass non-overlapping buffers (memcpy contract).
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
6269090143SGuillaume Chatelet 
635ff3ff33SPetr Hosek } // namespace x86
645ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL
655e32765cSGuillaume Chatelet 
665ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL {
675ff3ff33SPetr Hosek namespace generic {
681c814c99SGuillaume Chatelet 
6966a03295SVitaly Goldshteyn // Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than strictly necessary when the
// two buffers differ.
7266a03295SVitaly Goldshteyn template <typename T>
7366a03295SVitaly Goldshteyn LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
7466a03295SVitaly Goldshteyn   static_assert(cpp::is_integral_v<T>);
7566a03295SVitaly Goldshteyn   return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
7666a03295SVitaly Goldshteyn }
7766a03295SVitaly Goldshteyn 
781c814c99SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
791c814c99SGuillaume Chatelet // Specializations for uint16_t
801c814c99SGuillaume Chatelet template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
811c814c99SGuillaume Chatelet template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
821c814c99SGuillaume Chatelet   return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
831c814c99SGuillaume Chatelet }
841c814c99SGuillaume Chatelet template <>
851c814c99SGuillaume Chatelet LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
861c814c99SGuillaume Chatelet   return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
871c814c99SGuillaume Chatelet }
881c814c99SGuillaume Chatelet template <>
891c814c99SGuillaume Chatelet LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
901c814c99SGuillaume Chatelet   return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
911c814c99SGuillaume Chatelet          static_cast<int32_t>(load_be<uint16_t>(p2, offset));
921c814c99SGuillaume Chatelet }
931c814c99SGuillaume Chatelet template <>
941c814c99SGuillaume Chatelet LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
951c814c99SGuillaume Chatelet 
961c814c99SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
971c814c99SGuillaume Chatelet // Specializations for uint32_t
981c814c99SGuillaume Chatelet template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
991c814c99SGuillaume Chatelet template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
1001c814c99SGuillaume Chatelet   return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
1011c814c99SGuillaume Chatelet }
1021c814c99SGuillaume Chatelet template <>
1031c814c99SGuillaume Chatelet LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
1041c814c99SGuillaume Chatelet   return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
1051c814c99SGuillaume Chatelet }
1061c814c99SGuillaume Chatelet template <>
1071c814c99SGuillaume Chatelet LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
1081c814c99SGuillaume Chatelet   const auto a = load_be<uint32_t>(p1, offset);
1091c814c99SGuillaume Chatelet   const auto b = load_be<uint32_t>(p2, offset);
1101c814c99SGuillaume Chatelet   return cmp_uint32_t(a, b);
1111c814c99SGuillaume Chatelet }
1121c814c99SGuillaume Chatelet template <>
1131c814c99SGuillaume Chatelet LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
1141c814c99SGuillaume Chatelet 
1151c814c99SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
1161c814c99SGuillaume Chatelet // Specializations for uint64_t
1171c814c99SGuillaume Chatelet template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
1181c814c99SGuillaume Chatelet template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
1191c814c99SGuillaume Chatelet   return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
1201c814c99SGuillaume Chatelet }
1211c814c99SGuillaume Chatelet template <>
1221c814c99SGuillaume Chatelet LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
1231c814c99SGuillaume Chatelet   return !eq<uint64_t>(p1, p2, offset);
1241c814c99SGuillaume Chatelet }
1251c814c99SGuillaume Chatelet template <>
1261c814c99SGuillaume Chatelet LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
1271c814c99SGuillaume Chatelet template <>
1281c814c99SGuillaume Chatelet LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
1291c814c99SGuillaume Chatelet                                                size_t offset) {
1301c814c99SGuillaume Chatelet   const auto a = load_be<uint64_t>(p1, offset);
1311c814c99SGuillaume Chatelet   const auto b = load_be<uint64_t>(p2, offset);
1321c814c99SGuillaume Chatelet   return cmp_neq_uint64_t(a, b);
1331c814c99SGuillaume Chatelet }
1341c814c99SGuillaume Chatelet 
135bc4f3e31SGuillaume Chatelet // SIMD types are defined with attributes. e.g., '__m128i' is defined as
136bc4f3e31SGuillaume Chatelet // long long  __attribute__((__vector_size__(16), __aligned__(16)))
137bc4f3e31SGuillaume Chatelet // When we use these SIMD types in template specialization GCC complains:
138bc4f3e31SGuillaume Chatelet // "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
139bc4f3e31SGuillaume Chatelet // Therefore, we disable this warning in this file.
140bc4f3e31SGuillaume Chatelet #pragma GCC diagnostic push
141bc4f3e31SGuillaume Chatelet #pragma GCC diagnostic ignored "-Wignored-attributes"
142bc4f3e31SGuillaume Chatelet 
1431c814c99SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
1441c814c99SGuillaume Chatelet // Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
// XOR of the two 16-byte blocks: the result is all-zero iff they are equal.
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
  const __m128i lhs = load<__m128i>(p1, offset);
  const __m128i rhs = load<__m128i>(p2, offset);
  return _mm_xor_si128(lhs, rhs);
}
// Per-byte unsigned maximum, used to derive <=/>= byte masks below.
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
// Reverses the 16 bytes of the vector.
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
// One bit per byte position where 'max' == 'value', with bits ordered so the
// most significant bit corresponds to the lowest memory address (big-endian).
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
// True iff every bit of 'value' is zero.
LIBC_INLINE bool is_zero(__m128i value) {
  return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  // Branch-free: OR the head and tail differences, then test once for zero.
  const __m128i head_diff = load_and_xor_m128i(p1, p2, 0);
  const __m128i tail_diff = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
  return !is_zero(_mm_or_si128(head_diff, tail_diff));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const __m128i lhs = load<__m128i>(p1, offset);
  const __m128i rhs = load<__m128i>(p2, offset);
  const __m128i vmax = bytewise_max(lhs, rhs);
  // 'le' has a bit set at each byte position where rhs == max (i.e.
  // rhs >= lhs); 'ge' where lhs == max. Both masks are in big-endian order,
  // so their scalar difference has the sign memcmp requires.
  const uint16_t le = big_endian_cmp_mask(vmax, rhs);
  const uint16_t ge = big_endian_cmp_mask(vmax, lhs);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__
1911c814c99SGuillaume Chatelet 
1921c814c99SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
1931c814c99SGuillaume Chatelet // Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
// AVX (without AVX2) only provides 256-bit bitwise ops in the float domain,
// so XOR/OR go through __m256 casts; the casts are bit-pattern preserving.
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
// XOR of the two 32-byte blocks: the result is all-zero iff they are equal.
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
  return xor_m256i(load<__m256i>(p1, offset), load<__m256i>(p2, offset));
}
// True iff every bit of 'value' is zero.
LIBC_INLINE bool is_zero(__m256i value) {
  return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  // Branch-free: OR the head and tail differences, then test once for zero.
  const __m256i head_diff = load_and_xor_m256i(p1, p2, 0);
  const __m256i tail_diff = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
  return !is_zero(or_m256i(head_diff, tail_diff));
}
#endif // __AVX__
2271c814c99SGuillaume Chatelet 
#if defined(__AVX2__)
// Per-byte unsigned maximum, used to derive <=/>= byte masks in cmp_neq.
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
// Returns one bit per byte position where 'max' == 'value', ordered so the
// most significant bit corresponds to the lowest memory address, making the
// scalar result directly comparable in memcmp (big-endian) order.
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512BMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into an 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane leading to half-reversed vector mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15, //
                                               0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into an uint32_t.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
// memcmp-style comparison of two unequal 32-byte blocks.
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  // 'le' has a bit set at each byte position where b == max (i.e. b >= a);
  // 'ge' where a == max. Comparing the two big-endian masks as scalars
  // yields the memcmp sign.
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__
2791c814c99SGuillaume Chatelet 
2801c814c99SGuillaume Chatelet ///////////////////////////////////////////////////////////////////////////////
2811c814c99SGuillaume Chatelet // Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
// Per-byte unsigned maximum, used to derive <=/>= byte masks in cmp_neq.
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
// Returns one bit per byte position where 'max' == 'value', ordered so the
// most significant bit corresponds to the lowest memory address, making the
// scalar result directly comparable in memcmp (big-endian) order.
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512BMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable when clang version meets the fixed version.
#if false && defined(__AVX512VBMI__)
  // When AVX512BMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
  // So we only reverse groups of 8 bytes, these groups are necessarily within a
  // 16-byte lane.
  // zmm = | 16 bytes  | 16 bytes  | 16 bytes  | 16 bytes  |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits of each
  // byte are already reversed but the byte themselves should be reversed, this
  // is done by using a bswap instruction.
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));

#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  // Mask of differing bytes is zero iff the two 64-byte blocks are equal.
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
// XOR of the two 64-byte blocks: the result is all-zero iff they are equal.
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_xor_epi64(a, b);
}
// True iff every bit of 'value' is zero (the per-lane test mask is empty).
LIBC_INLINE bool is_zero(__m512i value) {
  return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  // Branch-free: OR the head and tail differences, then test once for zero.
  const __m512i head = load_and_xor_m512i(p1, p2, 0);
  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
  return !is_zero(_mm512_or_epi64(head, tail));
}
// memcmp-style comparison of two unequal 64-byte blocks.
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  // 'le' has a bit set at each byte position where b == max (i.e. b >= a);
  // 'ge' where a == max. Comparing the two big-endian masks as scalars
  // yields the memcmp sign.
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__
3671c814c99SGuillaume Chatelet 
368bc4f3e31SGuillaume Chatelet #pragma GCC diagnostic pop
369bc4f3e31SGuillaume Chatelet 
3705ff3ff33SPetr Hosek } // namespace generic
3715ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL
3721c814c99SGuillaume Chatelet 
37348ba7da9SGuillaume Chatelet #endif // LIBC_TARGET_ARCH_IS_X86
37469090143SGuillaume Chatelet 
37569090143SGuillaume Chatelet #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
376