//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define fake intrinsic macros so the compiler does not fail on intrinsics
// that are undefined when the corresponding CPU extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm512_cmpneq_epi8_mask
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm256_movemask_epi8
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm_movemask_epi8
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE_DECL {
namespace x86 {

// A set of constants to check compile time features.
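// Each constant is 'true' iff the corresponding compiler macro is defined,
// letting callers branch on CPU features with regular 'if constexpr' instead
// of preprocessor conditionals.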
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    // "rep movsb" copies RCX bytes from [RSI] to [RDI]; the "+D", "+S" and
    // "+c" constraints pin 'dst', 'src' and 'count' to those registers.
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};

} // namespace x86
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

// Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than necessary when the two buffers
// differ.
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
  static_assert(cpp::is_integral_v<T>);
  return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
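// These mirror the uint16_t specializations above: 'eq' and 'neq' use plain
// native-endian loads, while ordering comparisons load big-endian so that the
// byte at the lowest address is the most significant.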
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

// SIMD types are defined with attributes, e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16))).
// When we use these SIMD types in template specializations GCC complains:
// "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
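// (The specializations below use '__m128i', '__m256i' and '__m512i' as
// template arguments, which would otherwise trigger this warning under GCC.)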
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
LIBC_INLINE bool is_zero(__m128i value) {
  return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m128i head = load_and_xor_m128i(p1, p2, 0);
  const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
  return !is_zero(_mm_or_si128(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
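// A note on the 'le' / 'ge' masks above: big_endian_cmp_mask sets a bit where
// 'value' equals the bytewise max, and the byte reversal maps the byte at the
// lowest address to the most significant mask bit. Equal bytes set the same
// bit in both masks, so at the first differing byte exactly one mask has the
// higher bit set. For instance, if a[0] = 0x10 and b[0] = 0x20, then
// vmax[0] = 0x20: 'le' has its top bit set, 'ge' does not, and 'ge' - 'le' is
// negative as memcmp requires.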
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
  return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m256i head = load_and_xor_m256i(p1, p2, 0);
  const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
  return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
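  // (After reversal, the byte at the lowest address governs the most
  // significant mask bit, so two masks compare like big-endian integers.)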
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB: __m256i _mm256_permutexvar_epi8(__m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into an 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, leading to a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15,  //
                                               0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into an uint32_t.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
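  // E.g. with mask bytes m0..m31 (m0 at the lowest address), the movemask
  // above yields bit15..bit0 = m0..m15 and bit31..bit16 = m16..m31; rotating
  // by 16 bits swaps the halves so that bit31..bit0 = m0..m31.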
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable once the minimum supported clang version includes the fix.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB: __m512i _mm512_permutexvar_epi8(__m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
  // So we only reverse groups of 8 bytes; these groups are necessarily within
  // a 16-byte lane.
  // zmm = | 16 bytes  | 16 bytes  | 16 bytes  | 16 bytes  |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits of each
  // byte are already reversed, but the bytes themselves still need to be
  // reversed; this is done with a bswap instruction.
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));

#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
  return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m512i head = load_and_xor_m512i(p1, p2, 0);
  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
  return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_X86

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H