/* xref: /freebsd-src/contrib/arm-optimized-routines/string/aarch64/memcpy.S
   (revision f3087bef11543b42e0d69b708f367097a4118d24) */
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

/* Register aliases.  Per AAPCS64, x0-x2 carry the three memcpy arguments
   and x0 doubles as the return value, so dstin is never modified.  */
#define dstin	x0	/* destination base (also the return value) */
#define src	x1	/* source cursor */
#define count	x2	/* byte count; reused as the loop counter */
#define dst	x3	/* 16-byte-aligned destination cursor (large copies) */
#define srcend	x4	/* src + count */
#define dstend	x5	/* dstin + count */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
/* G and H reuse argument registers whose original values are no longer
   needed by the time these aliases are loaded (tail of L(copy128) and
   L(copy64_from_start)).  */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
/* tmp1 aliases E_l (x14); their live ranges never overlap.  */
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  Two possibly-overlapping 16-byte
	   accesses cover 16..32 bytes without any further branching.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  The middle byte
	   at offset count/2 overlaps first or last for counts 1 and 2.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  tmp1 = dstin - src
	   (mod 2^64); the unsigned compare is below count only when
	   src < dstin < srcend, i.e. when a forward copy would overwrite
	   source bytes not yet read.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

	/* Software-pipelined: each iteration stores the 64 bytes loaded by
	   the previous one while loading the next 64.  */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)