// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

#ifdef __aarch64__

#define L(l) .L ## l

//
//  __arm_sc_memcpy / __arm_sc_memmove
//

/* Register roles for the copy routine.
   dstin/src/count are the incoming arguments (x0, x1, x2).  dstin is never
   written, so x0 still holds the destination pointer on return.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend1	x4
#define dstend1	x5
/* Data registers.  A..F are plain temporaries (x6-x17).  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
/* G and H alias count/dst and src/srcend1.  They may only be used as data
   registers once the aliased values are no longer needed.  */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend1
/* tmp1 shares x14 with E_l; the two are never live at the same time.  */
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.

   In:       x0 = destination, x1 = source, x2 = byte count.
   Out:      x0 = destination (unchanged).
   Clobbers: x1-x17 and the condition flags.

   For up to 128 bytes every load is issued before the first store, which is
   what makes those paths safe for overlapping buffers without a direction
   check.
*/

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
	add	srcend1, src, count
	add	dstend1, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	/* 16..32 bytes: the first and the last 16 bytes, possibly overlapping.  */
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend1, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend1, -16]
	ret

	/* Copy 8-15 bytes: the first and the last 8 bytes.  Bit 3 of count is
	   clear for 0..7 bytes, which are handled below.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend1, -8]
	str	A_l, [dstin]
	str	A_h, [dstend1, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes: the first and the last 4 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend1, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend1, -4]
	ret

	/* Copy 0..3 bytes.  After the zero check, 1..3 bytes are handled without
	   further branches by copying the bytes at offsets 0, count / 2 and
	   count - 1, which together cover every byte for count = 1, 2 or 3.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend1, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend1, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  Load the first and the last 32 bytes up
	   front; 33..64 bytes need nothing more.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend1, -32]
	ldp	D_l, D_h, [srcend1, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend1, -32]
	stp	D_l, D_h, [dstend1, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	/* 97..128 bytes: also copy the 32 bytes at [end - 64, end - 32).
	   G and H overwrite count, dst, src and srcend1, none of which is
	   read again on this path.  */
	ldp	G_l, G_h, [srcend1, -64]
	ldp	H_l, H_h, [srcend1, -48]
	stp	G_l, G_h, [dstend1, -64]
	stp	H_l, H_h, [dstend1, -48]
L(copy96):
	/* First 64 bytes from the start, last 32 bytes from the end.  */
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend1, -32]
	stp	D_l, D_h, [dstend1, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  The unsigned test
	   (dstin - src) < count is true only when dstin lies inside the source
	   range, where a forward copy would overwrite source bytes before they
	   are read.  Identical pointers need no work at all.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
	/* Software pipelined: each iteration stores the 64 bytes loaded by the
	   previous one and loads the next 64 bytes.  */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend1, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend1, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend1, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend1, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend1, -64]
	stp	A_l, A_h, [dstend1, -48]
	stp	B_l, B_h, [dstend1, -32]
	stp	C_l, C_h, [dstend1, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.
	   This mirrors the forward path: it works down from the end pointers
	   and finishes with the first 64 bytes of the buffer.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend1, -16]
	and	tmp1, dstend1, 15
	sub	srcend1, srcend1, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend1, -16]
	stp	D_l, D_h, [dstend1, -16]
	ldp	B_l, B_h, [srcend1, -32]
	ldp	C_l, C_h, [srcend1, -48]
	ldp	D_l, D_h, [srcend1, -64]!
	sub	dstend1, dstend1, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend1, -16]
	ldp	A_l, A_h, [srcend1, -16]
	stp	B_l, B_h, [dstend1, -32]
	ldp	B_l, B_h, [srcend1, -32]
	stp	C_l, C_h, [dstend1, -48]
	ldp	C_l, C_h, [srcend1, -48]
	stp	D_l, D_h, [dstend1, -64]!
	ldp	D_l, D_h, [srcend1, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.
	   src and dstin still hold the original pointers on this path.  G
	   overwrites count and dst, which are not read again.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend1, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend1, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend1, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend1, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

/* The copy routine above handles overlapping buffers, so memmove is provided
   as an alias of the same entry point.  */
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)


//
//  __arm_sc_memset
//

/* Register roles for the fill routine.
   dstin/val/count are the incoming arguments (x0, x1, x2); only the low byte
   of val is used as the fill value.  dstin is never written, so x0 still
   holds the destination pointer on return.  */
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend2	x4
#define zva_val	x5

/* Fill count bytes starting at dstin with the low byte of val.

   In:       x0 = destination, x1 = fill value, x2 = byte count.
   Out:      x0 = destination (unchanged).
   Clobbers: x1-x5, vector register 0 (v0/q0, or z0 when built for SVE) and
             the condition flags.

   Fills are split into 0..15 bytes (scalar stores), 16..96 bytes (16-byte
   vector stores from both ends) and more than 96 bytes (64-byte loops, using
   DC ZVA for large zero fills when the zeroing block size is 64 bytes).
   Stores issued from the start and from the end of the buffer may overlap,
   which removes the need for a byte-granular tail loop.  */
DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
	/* Build the 16-byte fill pattern in q0.  */
#ifdef __ARM_FEATURE_SVE
	/* Broadcast the low byte of valw to every byte lane of z0; q0 is the
	   low 128 bits of z0.  */
	mov	z0.b, valw
#else
	/* Replicate the low byte across val with bit-field inserts
	   (8 -> 16 -> 32 -> 64 bits), then copy val into both halves of v0.  */
	bfi	valw, valw, #8, #8
	bfi	valw, valw, #16, #16
	bfi	val, val, #32, #32
	fmov	d0, val
	fmov	v0.d[1], val
#endif
	add	dstend2, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	/* Fetch the replicated 64-bit pattern for the scalar stores below.  On
	   the SVE path val has not been replicated yet.  */
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	/* 8..15 bytes: the first and the last 8 bytes.  */
	str	val, [dstin]
	str	val, [dstend2, -8]
	ret
	nop
	/* 4..7 bytes: the first and the last 4 bytes.  */
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend2, -4]
	ret
	/* 0..3 bytes: nothing for 0; otherwise the first byte, plus the last two
	   bytes when count is 2 or 3.  */
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend2, -2]
3:	ret

	/* Set 16..96 bytes.  Below 64 bytes: the first and the last 16 bytes,
	   plus the second and the second-to-last 16 bytes when bit 5 of count
	   is set (32..63 bytes).  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend2, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend2, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend2, -32]
	ret

	.p2align 4
	/* Set more than 96 bytes.  */
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	/* Take the DC ZVA path only when count >= 160 and the fill byte is zero.
	   If count < 160 the ccmp forces the flags to "not equal".  */
	cmp	count, 160
	ccmp	valw, 0, 0, hs
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	/* The low five bits of DCZID_EL0 hold the block size (bits 3:0, log2 of
	   the size in words) and the DC ZVA prohibited bit (bit 4), so a value of
	   exactly 4 means 64-byte blocks with DC ZVA permitted.  */
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	/* Together with the store at dstin above, fill everything up to the
	   64-byte boundary that the loop starts from.  */
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	sub	count, dstend2, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop.  */

	.p2align 4
	/* Zero one 64-byte block per iteration.  */
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva_loop)
	/* The last 64 bytes are always written from the end.  */
	stp	q0, q0, [dstend2, -64]
	stp	q0, q0, [dstend2, -32]
	ret

L(no_zva):
	sub	count, dstend2, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
	/* Store 64 bytes per iteration at dst + 32 .. dst + 95, then advance dst
	   by 64 through the pre-indexed second store.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	/* The last 64 bytes are always written from the end.  */
	stp	q0, q0, [dstend2, -64]
	stp	q0, q0, [dstend2, -32]
	ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)

#endif // __aarch64__