xref: /freebsd-src/contrib/cortex-strings/src/aarch64/strncmp.S (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57)
1*09a53ad8SAndrew Turner/* Copyright (c) 2013, Linaro Limited
2*09a53ad8SAndrew Turner   All rights reserved.
3*09a53ad8SAndrew Turner
4*09a53ad8SAndrew Turner   Redistribution and use in source and binary forms, with or without
5*09a53ad8SAndrew Turner   modification, are permitted provided that the following conditions are met:
6*09a53ad8SAndrew Turner       * Redistributions of source code must retain the above copyright
7*09a53ad8SAndrew Turner         notice, this list of conditions and the following disclaimer.
8*09a53ad8SAndrew Turner       * Redistributions in binary form must reproduce the above copyright
9*09a53ad8SAndrew Turner         notice, this list of conditions and the following disclaimer in the
10*09a53ad8SAndrew Turner         documentation and/or other materials provided with the distribution.
11*09a53ad8SAndrew Turner       * Neither the name of the Linaro nor the
12*09a53ad8SAndrew Turner         names of its contributors may be used to endorse or promote products
13*09a53ad8SAndrew Turner         derived from this software without specific prior written permission.
14*09a53ad8SAndrew Turner
15*09a53ad8SAndrew Turner   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16*09a53ad8SAndrew Turner   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17*09a53ad8SAndrew Turner   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18*09a53ad8SAndrew Turner   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19*09a53ad8SAndrew Turner   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20*09a53ad8SAndrew Turner   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21*09a53ad8SAndrew Turner   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22*09a53ad8SAndrew Turner   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23*09a53ad8SAndrew Turner   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*09a53ad8SAndrew Turner   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25*09a53ad8SAndrew Turner   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26*09a53ad8SAndrew Turner
27*09a53ad8SAndrew Turner/* Assumptions:
28*09a53ad8SAndrew Turner *
29*09a53ad8SAndrew Turner * ARMv8-a, AArch64
30*09a53ad8SAndrew Turner */
31*09a53ad8SAndrew Turner
32*09a53ad8SAndrew Turner	.macro def_fn f p2align=0
33*09a53ad8SAndrew Turner	.text
34*09a53ad8SAndrew Turner	.p2align \p2align
35*09a53ad8SAndrew Turner	.global \f
36*09a53ad8SAndrew Turner	.type \f, %function
37*09a53ad8SAndrew Turner\f:
38*09a53ad8SAndrew Turner	.endm
39*09a53ad8SAndrew Turner
40*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101
41*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f
42*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080
43*09a53ad8SAndrew Turner
44*09a53ad8SAndrew Turner/* Parameters and result.  */
45*09a53ad8SAndrew Turner#define src1		x0
46*09a53ad8SAndrew Turner#define src2		x1
47*09a53ad8SAndrew Turner#define limit		x2
48*09a53ad8SAndrew Turner#define result		x0
49*09a53ad8SAndrew Turner
50*09a53ad8SAndrew Turner/* Internal variables.  */
51*09a53ad8SAndrew Turner#define data1		x3
52*09a53ad8SAndrew Turner#define data1w		w3
53*09a53ad8SAndrew Turner#define data2		x4
54*09a53ad8SAndrew Turner#define data2w		w4
55*09a53ad8SAndrew Turner#define has_nul		x5
56*09a53ad8SAndrew Turner#define diff		x6
57*09a53ad8SAndrew Turner#define syndrome	x7
58*09a53ad8SAndrew Turner#define tmp1		x8
59*09a53ad8SAndrew Turner#define tmp2		x9
60*09a53ad8SAndrew Turner#define tmp3		x10
61*09a53ad8SAndrew Turner#define zeroones	x11
62*09a53ad8SAndrew Turner#define pos		x12
63*09a53ad8SAndrew Turner#define limit_wd	x13
64*09a53ad8SAndrew Turner#define mask		x14
65*09a53ad8SAndrew Turner#define endloop		x15
66*09a53ad8SAndrew Turner
67*09a53ad8SAndrew Turner	.text
68*09a53ad8SAndrew Turner	.p2align 6
69*09a53ad8SAndrew Turner	.rep 7
70*09a53ad8SAndrew Turner	nop	/* Pad so that the loop below fits a cache line.  */
71*09a53ad8SAndrew Turner	.endr
72*09a53ad8SAndrew Turnerdef_fn strncmp
73*09a53ad8SAndrew Turner	cbz	limit, .Lret0
74*09a53ad8SAndrew Turner	eor	tmp1, src1, src2
75*09a53ad8SAndrew Turner	mov	zeroones, #REP8_01
76*09a53ad8SAndrew Turner	tst	tmp1, #7
77*09a53ad8SAndrew Turner	b.ne	.Lmisaligned8
78*09a53ad8SAndrew Turner	ands	tmp1, src1, #7
79*09a53ad8SAndrew Turner	b.ne	.Lmutual_align
80*09a53ad8SAndrew Turner	/* Calculate the number of full and partial words -1.  */
81*09a53ad8SAndrew Turner	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
82*09a53ad8SAndrew Turner	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
83*09a53ad8SAndrew Turner
84*09a53ad8SAndrew Turner	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
85*09a53ad8SAndrew Turner	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
86*09a53ad8SAndrew Turner	   can be done in parallel across the entire word.  */
87*09a53ad8SAndrew Turner	/* Start of performance-critical section  -- one 64B cache line.  */
88*09a53ad8SAndrew Turner.Lloop_aligned:
89*09a53ad8SAndrew Turner	ldr	data1, [src1], #8
90*09a53ad8SAndrew Turner	ldr	data2, [src2], #8
91*09a53ad8SAndrew Turner.Lstart_realigned:
92*09a53ad8SAndrew Turner	subs	limit_wd, limit_wd, #1
93*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
94*09a53ad8SAndrew Turner	orr	tmp2, data1, #REP8_7f
95*09a53ad8SAndrew Turner	eor	diff, data1, data2	/* Non-zero if differences found.  */
96*09a53ad8SAndrew Turner	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
97*09a53ad8SAndrew Turner	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
98*09a53ad8SAndrew Turner	ccmp	endloop, #0, #0, eq
99*09a53ad8SAndrew Turner	b.eq	.Lloop_aligned
100*09a53ad8SAndrew Turner	/* End of performance-critical section  -- one 64B cache line.  */
101*09a53ad8SAndrew Turner
102*09a53ad8SAndrew Turner	/* Not reached the limit, must have found the end or a diff.  */
103*09a53ad8SAndrew Turner	tbz	limit_wd, #63, .Lnot_limit
104*09a53ad8SAndrew Turner
105*09a53ad8SAndrew Turner	/* Limit % 8 == 0 => all bytes significant.  */
106*09a53ad8SAndrew Turner	ands	limit, limit, #7
107*09a53ad8SAndrew Turner	b.eq	.Lnot_limit
108*09a53ad8SAndrew Turner
109*09a53ad8SAndrew Turner	lsl	limit, limit, #3	/* Bits -> bytes.  */
110*09a53ad8SAndrew Turner	mov	mask, #~0
111*09a53ad8SAndrew Turner#ifdef __AARCH64EB__
112*09a53ad8SAndrew Turner	lsr	mask, mask, limit
113*09a53ad8SAndrew Turner#else
114*09a53ad8SAndrew Turner	lsl	mask, mask, limit
115*09a53ad8SAndrew Turner#endif
116*09a53ad8SAndrew Turner	bic	data1, data1, mask
117*09a53ad8SAndrew Turner	bic	data2, data2, mask
118*09a53ad8SAndrew Turner
119*09a53ad8SAndrew Turner	/* Make sure that the NUL byte is marked in the syndrome.  */
120*09a53ad8SAndrew Turner	orr	has_nul, has_nul, mask
121*09a53ad8SAndrew Turner
122*09a53ad8SAndrew Turner.Lnot_limit:
123*09a53ad8SAndrew Turner	orr	syndrome, diff, has_nul
124*09a53ad8SAndrew Turner
125*09a53ad8SAndrew Turner#ifndef	__AARCH64EB__
126*09a53ad8SAndrew Turner	rev	syndrome, syndrome
127*09a53ad8SAndrew Turner	rev	data1, data1
128*09a53ad8SAndrew Turner	/* The MS-non-zero bit of the syndrome marks either the first bit
129*09a53ad8SAndrew Turner	   that is different, or the top bit of the first zero byte.
130*09a53ad8SAndrew Turner	   Shifting left now will bring the critical information into the
131*09a53ad8SAndrew Turner	   top bits.  */
132*09a53ad8SAndrew Turner	clz	pos, syndrome
133*09a53ad8SAndrew Turner	rev	data2, data2
134*09a53ad8SAndrew Turner	lsl	data1, data1, pos
135*09a53ad8SAndrew Turner	lsl	data2, data2, pos
136*09a53ad8SAndrew Turner	/* But we need to zero-extend (char is unsigned) the value and then
137*09a53ad8SAndrew Turner	   perform a signed 32-bit subtraction.  */
138*09a53ad8SAndrew Turner	lsr	data1, data1, #56
139*09a53ad8SAndrew Turner	sub	result, data1, data2, lsr #56
140*09a53ad8SAndrew Turner	ret
141*09a53ad8SAndrew Turner#else
142*09a53ad8SAndrew Turner	/* For big-endian we cannot use the trick with the syndrome value
143*09a53ad8SAndrew Turner	   as carry-propagation can corrupt the upper bits if the trailing
144*09a53ad8SAndrew Turner	   bytes in the string contain 0x01.  */
145*09a53ad8SAndrew Turner	/* However, if there is no NUL byte in the dword, we can generate
146*09a53ad8SAndrew Turner	   the result directly.  We can't just subtract the bytes as the
147*09a53ad8SAndrew Turner	   MSB might be significant.  */
148*09a53ad8SAndrew Turner	cbnz	has_nul, 1f
149*09a53ad8SAndrew Turner	cmp	data1, data2
150*09a53ad8SAndrew Turner	cset	result, ne
151*09a53ad8SAndrew Turner	cneg	result, result, lo
152*09a53ad8SAndrew Turner	ret
153*09a53ad8SAndrew Turner1:
154*09a53ad8SAndrew Turner	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
155*09a53ad8SAndrew Turner	rev	tmp3, data1
156*09a53ad8SAndrew Turner	sub	tmp1, tmp3, zeroones
157*09a53ad8SAndrew Turner	orr	tmp2, tmp3, #REP8_7f
158*09a53ad8SAndrew Turner	bic	has_nul, tmp1, tmp2
159*09a53ad8SAndrew Turner	rev	has_nul, has_nul
160*09a53ad8SAndrew Turner	orr	syndrome, diff, has_nul
161*09a53ad8SAndrew Turner	clz	pos, syndrome
162*09a53ad8SAndrew Turner	/* The MS-non-zero bit of the syndrome marks either the first bit
163*09a53ad8SAndrew Turner	   that is different, or the top bit of the first zero byte.
164*09a53ad8SAndrew Turner	   Shifting left now will bring the critical information into the
165*09a53ad8SAndrew Turner	   top bits.  */
166*09a53ad8SAndrew Turner	lsl	data1, data1, pos
167*09a53ad8SAndrew Turner	lsl	data2, data2, pos
168*09a53ad8SAndrew Turner	/* But we need to zero-extend (char is unsigned) the value and then
169*09a53ad8SAndrew Turner	   perform a signed 32-bit subtraction.  */
170*09a53ad8SAndrew Turner	lsr	data1, data1, #56
171*09a53ad8SAndrew Turner	sub	result, data1, data2, lsr #56
172*09a53ad8SAndrew Turner	ret
173*09a53ad8SAndrew Turner#endif
174*09a53ad8SAndrew Turner
175*09a53ad8SAndrew Turner.Lmutual_align:
176*09a53ad8SAndrew Turner	/* Sources are mutually aligned, but are not currently at an
177*09a53ad8SAndrew Turner	   alignment boundary.  Round down the addresses and then mask off
178*09a53ad8SAndrew Turner	   the bytes that precede the start point.
179*09a53ad8SAndrew Turner	   We also need to adjust the limit calculations, but without
180*09a53ad8SAndrew Turner	   overflowing if the limit is near ULONG_MAX.  */
181*09a53ad8SAndrew Turner	bic	src1, src1, #7
182*09a53ad8SAndrew Turner	bic	src2, src2, #7
183*09a53ad8SAndrew Turner	ldr	data1, [src1], #8
184*09a53ad8SAndrew Turner	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
185*09a53ad8SAndrew Turner	ldr	data2, [src2], #8
186*09a53ad8SAndrew Turner	mov	tmp2, #~0
187*09a53ad8SAndrew Turner	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
188*09a53ad8SAndrew Turner#ifdef __AARCH64EB__
189*09a53ad8SAndrew Turner	/* Big-endian.  Early bytes are at MSB.  */
190*09a53ad8SAndrew Turner	lsl	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
191*09a53ad8SAndrew Turner#else
192*09a53ad8SAndrew Turner	/* Little-endian.  Early bytes are at LSB.  */
193*09a53ad8SAndrew Turner	lsr	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
194*09a53ad8SAndrew Turner#endif
195*09a53ad8SAndrew Turner	and	tmp3, limit_wd, #7
196*09a53ad8SAndrew Turner	lsr	limit_wd, limit_wd, #3
197*09a53ad8SAndrew Turner	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
198*09a53ad8SAndrew Turner	add	limit, limit, tmp1
199*09a53ad8SAndrew Turner	add	tmp3, tmp3, tmp1
200*09a53ad8SAndrew Turner	orr	data1, data1, tmp2
201*09a53ad8SAndrew Turner	orr	data2, data2, tmp2
202*09a53ad8SAndrew Turner	add	limit_wd, limit_wd, tmp3, lsr #3
203*09a53ad8SAndrew Turner	b	.Lstart_realigned
204*09a53ad8SAndrew Turner
205*09a53ad8SAndrew Turner.Lret0:
206*09a53ad8SAndrew Turner	mov	result, #0
207*09a53ad8SAndrew Turner	ret
208*09a53ad8SAndrew Turner
209*09a53ad8SAndrew Turner	.p2align 6
210*09a53ad8SAndrew Turner.Lmisaligned8:
211*09a53ad8SAndrew Turner	sub	limit, limit, #1
212*09a53ad8SAndrew Turner1:
213*09a53ad8SAndrew Turner	/* Perhaps we can do better than this.  */
214*09a53ad8SAndrew Turner	ldrb	data1w, [src1], #1
215*09a53ad8SAndrew Turner	ldrb	data2w, [src2], #1
216*09a53ad8SAndrew Turner	subs	limit, limit, #1
217*09a53ad8SAndrew Turner	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
218*09a53ad8SAndrew Turner	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
219*09a53ad8SAndrew Turner	b.eq	1b
220*09a53ad8SAndrew Turner	sub	result, data1, data2
221*09a53ad8SAndrew Turner	ret
222*09a53ad8SAndrew Turner	.size strncmp, . - strncmp
223