//===-- Memcpy implementation for x86_64 ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
#include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
#include <stdint.h> // SIZE_MAX

#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB

#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE instead.
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

namespace LIBC_NAMESPACE_DECL {

namespace x86 {

LIBC_INLINE_VAR constexpr size_t K_ONE_CACHELINE = 64;
LIBC_INLINE_VAR constexpr size_t K_TWO_CACHELINES = 2 * K_ONE_CACHELINE;
LIBC_INLINE_VAR constexpr size_t K_THREE_CACHELINES = 3 * K_ONE_CACHELINE;

LIBC_INLINE_VAR constexpr bool K_USE_SOFTWARE_PREFETCHING =
    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);
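
// Illustrative sanity checks for the constants above; they merely spell out
// the 64B cache line size assumed for current x86-64 cores, which the
// software-prefetching paths below use as their prefetch distance unit.
static_assert(K_ONE_CACHELINE == 64, "prefetch distances assume 64B lines");
static_assert(K_THREE_CACHELINES == 192, "three cache lines are 192B");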

// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only for
// sizes at or above a certain threshold. Defaults to "do not use rep;movsb".
#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
#endif
LIBC_INLINE_VAR constexpr size_t K_REP_MOVSB_THRESHOLD =
    LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;

} // namespace x86

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                            size_t count) {
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
}

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
                           size_t count) {
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  if (count < 256)
    return builtin::Memcpy<128>::head_tail(dst, src, count);
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}

[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
                                                         CPtr __restrict src,
                                                         size_t distance) {
  prefetch_to_local_cache(src + distance);
  prefetch_for_write(dst + distance);
}

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
  using namespace LIBC_NAMESPACE::x86;
  inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
  // Aligning 'dst' on a 32B boundary.
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  builtin::Memcpy<96>::block(dst, src);
  size_t offset = 96;
  // At this point:
  // - we copied between 96B and 128B,
  // - we prefetched cachelines at 'src + 64' and 'src + 128',
  // - 'dst' is 32B aligned,
  // - count >= 128.
  if (count < 352) {
    // Two cache lines at a time.
    while (offset + K_TWO_CACHELINES + 32 <= count) {
      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
      inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
      // Copy one cache line at a time to prevent the use of `rep;movsb`.
      for (size_t i = 0; i < 2; ++i, offset += K_ONE_CACHELINE)
        builtin::Memcpy<K_ONE_CACHELINE>::block_offset(dst, src, offset);
    }
  } else {
    // Three cache lines at a time.
    while (offset + K_THREE_CACHELINES + 32 <= count) {
      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
      inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
      inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
      // Copy one cache line at a time to prevent the use of `rep;movsb`.
      for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE)
        builtin::Memcpy<K_ONE_CACHELINE>::block_offset(dst, src, offset);
    }
  }
  // We don't use 'loop_and_tail_offset' because it assumes at least one
  // iteration of the loop.
  while (offset + 32 <= count) {
    builtin::Memcpy<32>::block_offset(dst, src, offset);
    offset += 32;
  }
  return builtin::Memcpy<32>::tail(dst, src, count);
}
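
// The AVX variant below follows the same scheme with 64B blocks, always
// batching three cache lines per iteration. Illustrative walk-through with an
// assumed post-alignment count of 500: starting at offset == 224, the
// three-cacheline loop runs once (224 + 192 + 64 <= 500) and leaves
// offset == 416; the single-block loop then copies one 64B block
// (416 + 64 <= 500), and the final `tail` covers the last 64B, i.e. bytes
// [436, 500).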

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                          CPtr __restrict src, size_t count) {
  using namespace LIBC_NAMESPACE::x86;
  inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
  inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
  if (count < 256)
    return builtin::Memcpy<128>::head_tail(dst, src, count);
  // Aligning 'dst' on a 32B boundary.
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  builtin::Memcpy<224>::block(dst, src);
  size_t offset = 224;
  // At this point:
  // - we copied between 224B and 256B,
  // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192',
  // - 'dst' is 32B aligned,
  // - count >= 256.
  while (offset + K_THREE_CACHELINES + 64 <= count) {
    // Three cache lines at a time.
    inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
    inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
    inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
    // Copy one cache line at a time to prevent the use of `rep;movsb`.
    for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE)
      builtin::Memcpy<K_ONE_CACHELINE>::block_offset(dst, src, offset);
  }
  // We don't use 'loop_and_tail_offset' because it assumes at least one
  // iteration of the loop.
  while (offset + 64 <= count) {
    builtin::Memcpy<64>::block_offset(dst, src, offset);
    offset += 64;
  }
  return builtin::Memcpy<64>::tail(dst, src, count);
}
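
// For reference when reading the size dispatch below: `Memcpy<N>::head_tail`
// copies the first N and the last N bytes, so for N <= count <= 2 * N the two
// blocks cover the whole range and overlap in the middle when count < 2 * N.
// Illustrative example (assumed count of 12):
//   builtin::Memcpy<8>::head_tail(dst, src, 12); // copies [0, 8) and [4, 12)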

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
#if defined(__AVX512F__)
  constexpr size_t VECTOR_SIZE = 64;
#elif defined(__AVX__)
  constexpr size_t VECTOR_SIZE = 32;
#elif defined(__SSE2__)
  constexpr size_t VECTOR_SIZE = 16;
#else
  constexpr size_t VECTOR_SIZE = 8;
#endif
  if (count == 0)
    return;
  if (count == 1)
    return builtin::Memcpy<1>::block(dst, src);
  if (count == 2)
    return builtin::Memcpy<2>::block(dst, src);
  if (count == 3)
    return builtin::Memcpy<3>::block(dst, src);
  if (count == 4)
    return builtin::Memcpy<4>::block(dst, src);
  if (count < 8)
    return builtin::Memcpy<4>::head_tail(dst, src, count);
  // If count is equal to a power of 2, we can handle it as head-tail
  // of both smaller size and larger size (head-tail are either
  // non-overlapping for smaller size, or completely collapsed
  // for larger size). It seems to be more profitable to do the copy
  // with the larger size, if it's natively supported (e.g. doing
  // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
  // But it's not profitable to use larger size if it's not natively
  // supported: we will both use more instructions and handle fewer
  // sizes in earlier branches.
  if (VECTOR_SIZE >= 16 ? count < 16 : count <= 16)
    return builtin::Memcpy<8>::head_tail(dst, src, count);
  if (VECTOR_SIZE >= 32 ? count < 32 : count <= 32)
    return builtin::Memcpy<16>::head_tail(dst, src, count);
  if (VECTOR_SIZE >= 64 ? count < 64 : count <= 64)
    return builtin::Memcpy<32>::head_tail(dst, src, count);
  if constexpr (x86::K_AVX) {
    if constexpr (x86::K_USE_SOFTWARE_PREFETCHING) {
      return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
    } else {
      return inline_memcpy_x86_avx_ge64(dst, src, count);
    }
  } else {
    if constexpr (x86::K_USE_SOFTWARE_PREFETCHING) {
      return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
    } else {
      return inline_memcpy_x86_sse2_ge64(dst, src, count);
    }
  }
}

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
  if constexpr (x86::K_REP_MOVSB_THRESHOLD == 0) {
    return x86::Memcpy::repmovsb(dst, src, count);
  } else if constexpr (x86::K_REP_MOVSB_THRESHOLD == SIZE_MAX) {
    return inline_memcpy_x86(dst, src, count);
  } else {
    if (LIBC_UNLIKELY(count >= x86::K_REP_MOVSB_THRESHOLD))
      return x86::Memcpy::repmovsb(dst, src, count);
    else
      return inline_memcpy_x86(dst, src, count);
  }
}

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H