/* Source: /freebsd-src/contrib/arm-optimized-routines/string/aarch64/memcmp.S
   (revision f3087bef11543b42e0d69b708f367097a4118d24)  */
/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

/* Registers holding the inputs on entry (buffer pointers and byte count)
   and the register the result is returned in.  result is the 32-bit view
   of the register that also holds src1.  */
#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

/* Scratch registers.  data1/data3 receive bytes loaded from src1 and
   data2/data4 the corresponding bytes loaded from src2; the "w" names are
   the 32-bit views of the same registers.  tmp aliases data4 (both are
   x6), so the two must never be live at the same time.  */
#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
/* One-past-the-end pointers of the two buffers (src + limit).  */
#define src1end	x7
#define src2end	x8


/* __memcmp_aarch64 - compare two memory regions byte by byte.

   In:   src1 (x0), src2 (x1) = start of the two buffers,
	 limit (x2) = number of bytes to compare.
   Out:  result (w0) = 0 if all bytes are equal; otherwise negative when
	 the first differing byte of src1 is lower (unsigned) than that of
	 src2, and positive when it is higher.
   Uses: x0-x8, v0-v5 and the condition flags.  The routine does not
	 touch the stack and calls nothing.

   Strategy by size:
     0-15 bytes    L(less16): overlapping 8- or 4-byte loads, or 2 + 1
		   bytes.
     16 bytes	   the first 16 bytes decide the result directly.
     17-32 bytes   first 16 bytes, then the (overlapping) last 16 bytes.
     33-159 bytes  L(loop32): scalar loop, up to 32 bytes per iteration,
		   finished by the last 16 bytes.
     160+ bytes    L(loop_align) / L(loop64): 64 bytes per iteration
		   using Advanced SIMD after aligning src2 to 16 bytes.

   ENTRY, END and L() are not defined in this file; presumably they are
   provided by asmdefs.h.  */
ENTRY (__memcmp_aarch64)
	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	/* The flags still hold the result of comparing limit with 16.  When
	   limit == 16 the first ccmp forces "ne" without comparing, so the
	   branch below is taken for every 16-byte compare as well as for
	   any mismatch in the first 16 bytes.  L(return2) then computes the
	   final result (which may be 0).  */
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* First 16 bytes are equal and limit > 16.  */
	add	src1end, src1, limit
	add	src2end, src2, limit
	/* Up to 32 bytes: only the tail is left.  160 or more: SIMD path.  */
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

	/* Scalar loop.  At the top of each iteration the bytes below
	   src1 + 16 are known to be equal and
	   limit == src1end - (src1 + 32).  */
	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	/* At most 16 bytes remain beyond src1 + 32: finish with the tail.  */
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
	/* Here limit == src1end - src1.  This label is also the re-entry
	   point after the SIMD loop.  Keep looping while more than 32 bytes
	   are left from src1.  */
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
	/* data1/data2 hold the first loaded pair and data3/data4 the second.
	   Keep the first pair if it differs, otherwise take the second.  */
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  On
	   little-endian the byte reversal moves the lowest-addressed byte to
	   the most significant position, so that the unsigned compare orders
	   the data by the first differing byte in memory.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	/* limit < 16.  Bit 3 of limit set means 8-15 bytes: compare the
	   first 8 and the last 8 bytes, which may overlap.  */
	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	/* limit < 8.  Bit 2 set means 4-7 bytes: compare the first 4 and the
	   last 4 bytes.  The 32-bit loads zero the upper halves, so the
	   64-bit compares at L(return2) remain valid.  */
	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

	/* limit < 4.  Bit 1 set means 2-3 bytes: compare the first halfword
	   and fall through to the odd byte only if it is equal.  */
L(less4):
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
	/* Bit 0 clear: nothing left to compare (this also covers limit == 0),
	   return 0.  Otherwise return the difference of the zero-extended
	   last bytes, so the result on this path is not limited to -1, 0
	   or 1.  */
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

	/* limit >= 160.  Compare bytes 16-31 first so that the alignment step
	   below can skip up to 16 bytes that have already been checked.  */
L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  tmp is
	   (src2 & 15) - 16, i.e. in [-16, -1], so subtracting it advances
	   both pointers by 1-16 bytes to the next 16-byte boundary of src2.
	   The extra 64 + 16 taken off limit is added back after the loop.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	/* Each iteration compares the 64 bytes at [src + 16, src + 80) and
	   advances both pointers by 64 through the writeback on the last
	   pair of loads.  The XORs are non-zero where the buffers differ;
	   three rounds of pairwise unsigned maximum fold the 64 XOR bytes
	   down to 8 bytes in d0, one per 8-byte chunk.  */
	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* Loop while bytes remain ("hi" from the subs above; the loads and
	   vector operations in between leave the flags alone) and no
	   difference was found (tmp == 0).  When no bytes remain the ccmp
	   forces "ne", which ends the loop.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  Undoing the
	   bias leaves limit == src1end - src1, as L(last64) expects.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  Each
	   byte of tmp stands for one 8-byte chunk of the 64-byte block; the
	   byte swaps order them so that clz finds the first differing chunk.
	   Rounded down to a multiple of 8, the clz result is 8 times that
	   chunk's index, which is also its byte offset within the block.
	   The block started at src1 - 48 because src1 has already been
	   advanced by 64, hence the -48.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* The chunk is known to differ, so the result is 1 or -1.  Writing
	   result overwrites the low half of src1, which is no longer needed
	   once the loads above are done.  */
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)