// xref: /freebsd-src/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S (revision 62987288060ff68c817b7056815aa9fb8ba8ecd7)
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

// Everything up to the matching #endif is assembled only when the
// preprocessor defines __aarch64__.
#ifdef __aarch64__

// L(name) pastes to .Lname.  The .L prefix marks an assembler-local label,
// so the branch targets used below are not exported as symbols.
#define L(l) .L ## l

//
//  __arm_sc_memcpy / __arm_sc_memmove
//
//  C-level view (NOTE(review): prototype taken from the ACLE SME support
//  routine names, confirm against the ACLE text):
//      void *__arm_sc_memcpy(void *dst, const void *src, size_t n);
//      void *__arm_sc_memmove(void *dst, const void *src, size_t n);
//
//  In:     x0 = destination, x1 = source, x2 = byte count
//  Out:    x0 = destination (x0 is never written by this routine)
//  Clobb:  may overwrite x1-x17 and the NZCV flags
//  Stack:  none; no FP/SIMD register is touched
//
//  Register roles:
//    dstin            destination as passed in, kept intact as return value
//    src, count       source pointer and remaining byte count
//    dst              16-byte aligned store cursor of the forward long copy
//    srcend1/dstend1  one past the last source / destination byte
//    A..F (_l/_h)     64-bit halves of the 16-byte chunks in flight
//    A_lw/B_lw/C_lw   32-bit views of A_l/B_l/C_l for sub-8-byte accesses
//    G, H             extra chunks that REUSE count/dst and src/srcend1;
//                     only valid once those values are no longer needed
//    tmp1             scratch; shares x14 with E_l
//

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend1  x4
#define dstend1  x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1
#define tmp1     x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Small and medium copies load every byte they need before the first store,
   which is why they need no overlap check.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
        add     srcend1, src, count     /* One past the last source byte.  */
        add     dstend1, dstin, count   /* One past the last dest byte.  */
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        /* 16..32 bytes: first 16 and last 16 bytes; the two chunks may
           overlap each other.  */
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes.  Count is below 16 here, so bit 3 alone tells
           whether at least 8 bytes remain.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  Count is below 8 here, so bit 2 selects.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  For 1..3 bytes the
           offsets 0, count / 2 and count - 1 together touch every byte.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  Load 32 bytes from each end.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        /* 97..128 bytes.  G and H overwrite count, dst, src and srcend1;
           no later instruction on this path reads those values.  */
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  tmp1 is the unsigned
           distance dstin - src: zero means source and destination are the
           same buffer (nothing to do); a value below count means dstin lies
           inside the source range, so a forward copy would overwrite source
           bytes before they are read.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15         /* Misalignment of dstin.  */
        bic     dst, dstin, 15          /* dst = dstin rounded down to 16.  */
        sub     src, src, tmp1          /* Keep src in step with dst.  */
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
        /* Each pass stores the 64 bytes loaded by the previous pass and
           loads the next 64; the pre-indexed accesses advance dst and src.  */
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.
           Mirror image of the forward path: srcend1 and dstend1 are the
           cursors and move downwards.  */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15       /* Misalignment of dstend1.  */
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!
        sub     dstend1, dstend1, tmp1  /* dstend1 is now 16-byte aligned.  */
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.
           G overwrites count and dst, which are not read again.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

/* memmove shares the entry point: the routine above already handles
   overlapping buffers.  */
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)


//
//  __arm_sc_memset
//
//  C-level view (NOTE(review): prototype taken from the ACLE SME support
//  routine name, confirm against the ACLE text):
//      void *__arm_sc_memset(void *dst, int c, size_t n);
//
//  In:     x0 = destination, w1 = fill value (only the low byte is used),
//          x2 = byte count
//  Out:    x0 = destination (x0 is never written by this routine)
//  Clobb:  may overwrite x1-x5, v0 (z0 when built with SVE), NZCV flags
//  Stack:  none
//
//  Register roles:
//    dstin     destination as passed in, kept intact as return value
//    val/valw  fill value, 64-bit and 32-bit view of x1
//    count     remaining byte count
//    dst       aligned store cursor for the long path
//    dstend2   one past the last destination byte
//    zva_val   scratch for the DCZID_EL0 probe
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
        /* Replicate the fill byte into all 16 bytes of q0.  */
#ifdef __ARM_FEATURE_SVE
        /* Broadcast the low byte of valw to every byte lane of z0.  */
        mov     z0.b, valw
#else
        /* Widen the byte to 16, 32, then 64 bits inside val and copy the
           result into both 64-bit halves of v0.  */
        bfi valw, valw, #8, #8
        bfi valw, valw, #16, #16
        bfi val, val, #32, #32
        fmov d0, val
        fmov v0.d[1], val
#endif
        add     dstend2, dstin, count   /* One past the last dest byte.  */

        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)
        /* Fetch the replicated pattern back into val; on the SVE path val
           has not been replicated yet.  */
        mov     val, v0.D[0]

        /* Set 0..15 bytes.  */
        tbz     count, 3, 1f
        /* 8..15 bytes: two 8-byte stores that may overlap.  */
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        /* NOTE(review): padding; presumably keeps the next label aligned.  */
        nop
        /* 4..7 bytes: two 4-byte stores that may overlap.  */
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
        /* 0..3 bytes: one byte at the start, then a halfword ending at
           dstend2 when count is 2 or 3.  */
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret

        /* Set 16..96 bytes.  */
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)      /* Bit 6 set: count is 64..96.  */
        str     q0, [dstend2, -16]
        tbz     count, 5, 1f            /* Bit 5 clear: 16..31, done.  */
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes.  Write 64 bytes from the start and
           32 bytes from the end.  */
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
        /* Set more than 96 bytes.  */
L(set_long):
        and     valw, valw, 255         /* Keep only the fill byte.  */
        bic     dst, dstin, 15          /* dst = dstin rounded down to 16.  */
        str     q0, [dstin]
        /* DC ZVA is only tried when count >= 160 and the fill byte is zero.
           When count is below 160 the ccmp forces the flags to "ne".  */
        cmp     count, 160
        ccmp    valw, 0, 0, hs
        b.ne    L(no_zva)

#ifndef SKIP_ZVA_CHECK
        /* Keep the low five bits of DCZID_EL0 (block size field plus the
           DC ZVA prohibited bit) and require the value 4.  */
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
        b.ne    L(no_zva)
#endif
        /* Fill up to the first 64-byte boundary with ordinary stores, then
           round dst down to that boundary.  */
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        sub     count, dstend2, dst      /* Count is now 64 too large.  */
        sub     count, count, 128       /* Adjust count and bias for loop.  */

        .p2align 4
        /* Zero one 64-byte block per iteration.  */
L(zva_loop):
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    L(zva_loop)
        /* Finish with 64 bytes written back from the end.  */
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

        /* Plain store loop: 64 bytes per iteration.  */
L(no_zva):
        sub     count, dstend2, dst      /* Count is 16 too large.  */
        sub     dst, dst, 16            /* Dst is biased by -32.  */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
L(no_zva_loop):
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!
        subs    count, count, 64
        b.hi    L(no_zva_loop)
        /* Finish with 64 bytes written back from the end.  */
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)

#endif // __aarch64__