//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define fake functions to prevent the compiler from failing on undefined
// functions in case the CPU extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm512_cmpneq_epi8_mask
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm256_movemask_epi8
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm_movemask_epi8
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE_DECL {
namespace x86 {

// A set of constants to check compile time features.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};

} // namespace x86
} // namespace LIBC_NAMESPACE_DECL

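// Illustrative usage sketch (not part of this header): a memcpy strategy that
// decides 'rep movsb' is profitable can delegate the whole copy to the block
// above. The wrapper name below is hypothetical; the actual dispatch lives in
// the memcpy implementation files that include this header.
//
//   LIBC_INLINE void copy_with_repmovsb(void *dst, const void *src,
//                                       size_t count) {
//     // On CPUs with ERMSB/FSRM a single 'rep movsb' is typically
//     // competitive across a wide range of sizes.
//     x86::Memcpy::repmovsb(dst, src, count);
//   }
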
namespace LIBC_NAMESPACE_DECL {
namespace generic {

// Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than necessary when the two buffers
// differ.
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
  static_assert(cpp::is_integral_v<T>);
  return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

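// Worked example of the big-endian ordering trick used above (illustrative;
// the values are not from the source): memcmp("ab", "ba", 2) must be negative
// because 0x61 < 0x62 at the first byte. Loading both buffers as big-endian
// uint16_t yields 0x6162 and 0x6261, and the unsigned comparison
// 0x6162 < 0x6261 agrees with the bytewise order. A native little-endian load
// would instead give 0x6261 vs 0x6162 and compare the wrong way, which is why
// the scalar specializations go through 'load_be' before calling
// 'cmp_uint32_t' / 'cmp_neq_uint64_t'.
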
// SIMD types are defined with attributes, e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specializations GCC complains:
// "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
LIBC_INLINE bool is_zero(__m128i value) {
  return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m128i head = load_and_xor_m128i(p1, p2, 0);
  const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
  return !is_zero(_mm_or_si128(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
  return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
                                                       size_t count) {
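  // For sizeof(__m256i) <= count <= 2 * sizeof(__m256i) the two (possibly
  // overlapping) loads below cover every byte of [0, count), so the result is
  // non-zero iff the buffers differ somewhere in that range.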
  const __m256i head = load_and_xor_m256i(p1, p2, 0);
  const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
  return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into a 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, leading to a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15, //
                                               0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into a uint32_t.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper 16-bit halves. This is optimized into a
  // single 'rorx' instruction.
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
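  // Note on the returned mask: "big endian" here means the byte at the lowest
  // address maps to the most significant bit of the result. In cmp_neq below,
  // the two masks computed against the bytewise max therefore compare, as
  // plain unsigned integers, in the same order as memcmp would order the
  // original vectors.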
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable when the clang version meets the fixed version.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
  // So we only reverse groups of 8 bytes; these groups necessarily fall within
  // a 16-byte lane.
  // zmm = | 16 bytes  | 16 bytes  | 16 bytes  | 16 bytes  |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits of each
  // byte are already reversed, but the bytes themselves still need to be
  // reversed; this is done with a bswap instruction.
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));
#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
  return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m512i head = load_and_xor_m512i(p1, p2, 0);
  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
  return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_X86

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H