//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE_DECL {
namespace aarch64 {

LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store; we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in (4-byte) words.
  // So the next line checks whether the instruction is permitted and the
  // block size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

} // namespace neon
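
// Worked decoding of the dczid_el0 check above (illustrative value, not read
// from real hardware): with zva_val == 0b00100,
//   DZP = (zva_val >> 4) & 1  == 0  -> DC ZVA is permitted,
//   BS  =  zva_val & 0b1111   == 4  -> 2^4 = 16 words = 16 * 4 = 64 bytes.
//
// A minimal sketch of how a bzero/memset dispatcher might combine hasZva()
// with BzeroCacheLine. The function name is hypothetical and head-alignment
// handling is elided; it is not part of this file's API:
//
//   LIBC_INLINE void bzero_large(Ptr dst, size_t count) {
//     // 'dc zva' zeroes whole 64-byte blocks, so a real caller must first
//     // zero the unaligned head and advance dst/count to a 64-byte boundary.
//     if (neon::hasZva())
//       neon::BzeroCacheLine::loop_and_tail(dst, 0, count);
//     else
//       generic::Memset<generic_v512>::loop_and_tail(dst, 0, count);
//   }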

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to
      // 4 bytes in two steps. First, a saturating narrow from 2x64b to
      // 2x32b maps any nonzero 64-bit lane to a nonzero 32-bit lane.
      // Second, the max of the two 32-bit lanes yields a single nonzero
      // 32-bit value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // anbocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this
      // to a nonzero 32-bit value if a mismatch occurred.
      uint64x2_t anbocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t anbocpdq_reduced = vqmovn_u64(anbocpdq);
      return vmaxv_u32(anbocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
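
// A scalar model of the two-step reduction used above, as a hedged sketch
// (helper names are illustrative): vqmovn_u64 performs an unsigned saturating
// narrow, so any nonzero 64-bit lane becomes a nonzero 32-bit lane, and
// vmaxv_u32 folds the remaining lanes into one scalar that is nonzero iff the
// buffers differ.
//
//   uint32_t saturating_narrow(uint64_t lane) {
//     return lane > 0xffffffffu ? 0xffffffffu : static_cast<uint32_t>(lane);
//   }
//   // Models vmaxv_u32(vqmovn_u64(x)) for x = {lo, hi}.
//   uint32_t reduce_nonzero(uint64_t lo, uint64_t hi) {
//     uint32_t a = saturating_narrow(lo);
//     uint32_t b = saturating_narrow(hi);
//     return a > b ? a : b; // nonzero iff lo or hi is nonzero
//   }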

} // namespace aarch64
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}
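
// A hedged note on why the cmp shapes above differ: for uint16_t the two
// big-endian loads fit in 16 bits, so their int32_t difference cannot
// overflow and already carries the memcmp sign. Worked example:
//
//   // memcmp("aa", "ab", 2): load_be<uint16_t> yields 0x6161 and 0x6162,
//   // so cmp returns 0x6161 - 0x6162 == -1 (negative, first < second).
//
// For uint32_t and uint64_t the difference of two big-endian loads may not
// fit in an int32_t (e.g. 0xFFFFFFFF - 0x00000000), so explicit ordered
// comparisons are used instead of subtraction.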

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  // Compare the 16 bytes as two consecutive uint64_t words.
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  // Big-endian loads so that integer order matches memcmp's byte order.
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  // Same scheme as the uint8x16_t case, over 32 bytes (four uint64_t words).
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H