// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend1  x4
#define dstend1  x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1
#define tmp1     x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
  add srcend1, src, count
  add dstend1, dstin, count
  cmp count, 128
  b.hi 7f // copy_long
  cmp count, 32
  b.hi 4f // copy32_128

  /* Small copies: 0..32 bytes. */
  cmp count, 16
  b.lo 0f // copy16
  ldp A_l, A_h, [src]
  ldp D_l, D_h, [srcend1, -16]
  stp A_l, A_h, [dstin]
  stp D_l, D_h, [dstend1, -16]
  ret

  /* Copy 8-15 bytes. */
0: // copy16
  tbz count, 3, 1f // copy8
  ldr A_l, [src]
  ldr A_h, [srcend1, -8]
  str A_l, [dstin]
  str A_h, [dstend1, -8]
  ret

  .p2align 3
  /* Copy 4-7 bytes. */
1: // copy8
  tbz count, 2, 2f // copy4
  ldr A_lw, [src]
  ldr B_lw, [srcend1, -4]
  str A_lw, [dstin]
  str B_lw, [dstend1, -4]
  ret

  /* Copy 0..3 bytes using a branchless sequence. */
2: // copy4
  cbz count, 3f // copy0
  lsr tmp1, count, 1
  ldrb A_lw, [src]
  ldrb C_lw, [srcend1, -1]
  ldrb B_lw, [src, tmp1]
  strb A_lw, [dstin]
  strb B_lw, [dstin, tmp1]
  strb C_lw, [dstend1, -1]
3: // copy0
  ret

  .p2align 4
  /* Medium copies: 33..128 bytes. */
4: // copy32_128
  ldp A_l, A_h, [src]
  ldp B_l, B_h, [src, 16]
  ldp C_l, C_h, [srcend1, -32]
  ldp D_l, D_h, [srcend1, -16]
  cmp count, 64
  b.hi 5f // copy128
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstend1, -32]
  stp D_l, D_h, [dstend1, -16]
  ret

  .p2align 4
  /* Copy 65..128 bytes. */
5: // copy128
  ldp E_l, E_h, [src, 32]
  ldp F_l, F_h, [src, 48]
  cmp count, 96
  b.ls 6f // copy96
  ldp G_l, G_h, [srcend1, -64]
  ldp H_l, H_h, [srcend1, -48]
  stp G_l, G_h, [dstend1, -64]
  stp H_l, H_h, [dstend1, -48]
6: // copy96
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp E_l, E_h, [dstin, 32]
  stp F_l, F_h, [dstin, 48]
  stp C_l, C_h, [dstend1, -32]
  stp D_l, D_h, [dstend1, -16]
  ret

  .p2align 4
  /* Copy more than 128 bytes. */
7: // copy_long
  /* Use backwards copy if there is an overlap. */
  sub tmp1, dstin, src
  cbz tmp1, 3b // copy0
  cmp tmp1, count
  b.lo 10f // copy_long_backwards

  /* Copy 16 bytes and then align dst to 16-byte alignment. */

  ldp D_l, D_h, [src]
  and tmp1, dstin, 15
  bic dst, dstin, 15
  sub src, src, tmp1
  add count, count, tmp1 /* Count is now 16 too large. */
  ldp A_l, A_h, [src, 16]
  stp D_l, D_h, [dstin]
  ldp B_l, B_h, [src, 32]
  ldp C_l, C_h, [src, 48]
  ldp D_l, D_h, [src, 64]!
  subs count, count, 128 + 16 /* Test and readjust count. */
  b.ls 9f // copy64_from_end
8: // loop64
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [src, 16]
  stp B_l, B_h, [dst, 32]
  ldp B_l, B_h, [src, 32]
  stp C_l, C_h, [dst, 48]
  ldp C_l, C_h, [src, 48]
  stp D_l, D_h, [dst, 64]!
  ldp D_l, D_h, [src, 64]!
  subs count, count, 64
  b.hi 8b // loop64

  /* Write the last iteration and copy 64 bytes from the end. */
9: // copy64_from_end
  ldp E_l, E_h, [srcend1, -64]
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [srcend1, -48]
  stp B_l, B_h, [dst, 32]
  ldp B_l, B_h, [srcend1, -32]
  stp C_l, C_h, [dst, 48]
  ldp C_l, C_h, [srcend1, -16]
  stp D_l, D_h, [dst, 64]
  stp E_l, E_h, [dstend1, -64]
  stp A_l, A_h, [dstend1, -48]
  stp B_l, B_h, [dstend1, -32]
  stp C_l, C_h, [dstend1, -16]
  ret

  .p2align 4

  /* Large backwards copy for overlapping copies.
     Copy 16 bytes and then align dst to 16-byte alignment. */
10: // copy_long_backwards
  ldp D_l, D_h, [srcend1, -16]
  and tmp1, dstend1, 15
  sub srcend1, srcend1, tmp1
  sub count, count, tmp1
  ldp A_l, A_h, [srcend1, -16]
  stp D_l, D_h, [dstend1, -16]
  ldp B_l, B_h, [srcend1, -32]
  ldp C_l, C_h, [srcend1, -48]
  ldp D_l, D_h, [srcend1, -64]!
  sub dstend1, dstend1, tmp1
  subs count, count, 128
  b.ls 12f // copy64_from_start

11: // loop64_backwards
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [srcend1, -16]
  stp B_l, B_h, [dstend1, -32]
  ldp B_l, B_h, [srcend1, -32]
  stp C_l, C_h, [dstend1, -48]
  ldp C_l, C_h, [srcend1, -48]
  stp D_l, D_h, [dstend1, -64]!
  ldp D_l, D_h, [srcend1, -64]!
  subs count, count, 64
  b.hi 11b // loop64_backwards

  /* Write the last iteration and copy 64 bytes from the start. */
12: // copy64_from_start
  ldp G_l, G_h, [src, 48]
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [src, 32]
  stp B_l, B_h, [dstend1, -32]
  ldp B_l, B_h, [src, 16]
  stp C_l, C_h, [dstend1, -48]
  ldp C_l, C_h, [src]
  stp D_l, D_h, [dstend1, -64]
  stp G_l, G_h, [dstin, 48]
  stp A_l, A_h, [dstin, 32]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstin]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)

// This version uses FP registers. Use this only on targets with them
#if defined(__aarch64__) && __ARM_FP != 0
//
// __arm_sc_memset
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
  mov z0.b, valw
#else
  bfi valw, valw, #8, #8
  bfi valw, valw, #16, #16
  bfi val, val, #32, #32
  fmov d0, val
  fmov v0.d[1], val
#endif
  add dstend2, dstin, count

  cmp count, 96
  b.hi 7f // set_long
  cmp count, 16
  b.hs 4f // set_medium
  mov val, v0.D[0]

  /* Set 0..15 bytes. */
  tbz count, 3, 1f
  str val, [dstin]
  str val, [dstend2, -8]
  ret
  nop
1: tbz count, 2, 2f
  str valw, [dstin]
  str valw, [dstend2, -4]
  ret
2: cbz count, 3f
  strb valw, [dstin]
  tbz count, 1, 3f
  strh valw, [dstend2, -2]
3: ret

  /* Set 17..96 bytes. */
4: // set_medium
  str q0, [dstin]
  tbnz count, 6, 6f // set96
  str q0, [dstend2, -16]
  tbz count, 5, 5f
  str q0, [dstin, 16]
  str q0, [dstend2, -32]
5: ret

  .p2align 4
  /* Set 64..96 bytes. Write 64 bytes from the start and
     32 bytes from the end. */
6: // set96
  str q0, [dstin, 16]
  stp q0, q0, [dstin, 32]
  stp q0, q0, [dstend2, -32]
  ret

  .p2align 4
7: // set_long
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
  cmp count, 160
  ccmp valw, 0, 0, hs
  b.ne 9f // no_zva

#ifndef SKIP_ZVA_CHECK
  mrs zva_val, dczid_el0
  and zva_val, zva_val, 31
  cmp zva_val, 4 /* ZVA size is 64 bytes. */
  b.ne 9f // no_zva
#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
  bic dst, dst, 63
  sub count, dstend2, dst /* Count is now 64 too large. */
  sub count, count, 128 /* Adjust count and bias for loop. */

  .p2align 4
8: // zva_loop
  add dst, dst, 64
  dc zva, dst
  subs count, count, 64
  b.hi 8b // zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret

9: // no_zva
  sub count, dstend2, dst /* Count is 16 too large. */
  sub dst, dst, 16 /* Dst is biased by -32. */
  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
10: // no_zva_loop
  stp q0, q0, [dst, 32]
  stp q0, q0, [dst, 64]!
  subs count, count, 64
  b.hi 10b // no_zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

#endif // __aarch64__
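
// Usage sketch (illustrative, not part of the original routines): the entry
// points above follow the standard memcpy/memmove/memset C signatures and are
// intended to be safe to call from SME streaming mode, where the regular libc
// routines may use instructions that are not streaming-compatible. A minimal
// C-level view is shown below as a comment so the file still assembles; the
// streaming attribute spelling follows the Arm C Language Extensions, and
// copy_in_streaming_mode is a hypothetical caller, not part of this library.
//
//   #include <stddef.h>
//
//   void *__arm_sc_memcpy(void *dst, const void *src, size_t n);
//   void *__arm_sc_memmove(void *dst, const void *src, size_t n);
//   void *__arm_sc_memset(void *dst, int c, size_t n);
//
//   // Hypothetical caller: in a streaming-compatible function, a plain
//   // memcpy call would typically be lowered by the compiler to
//   // __arm_sc_memcpy; calling it directly has the same effect.
//   void copy_in_streaming_mode(void *dst, const void *src, size_t n)
//       __arm_streaming_compatible {
//     __arm_sc_memcpy(dst, src, n);
//   }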