xref: /freebsd-src/contrib/cortex-strings/src/aarch64/strcmp.S (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57)
1*09a53ad8SAndrew Turner/* Copyright (c) 2012, Linaro Limited
2*09a53ad8SAndrew Turner   All rights reserved.
3*09a53ad8SAndrew Turner
4*09a53ad8SAndrew Turner   Redistribution and use in source and binary forms, with or without
5*09a53ad8SAndrew Turner   modification, are permitted provided that the following conditions are met:
6*09a53ad8SAndrew Turner       * Redistributions of source code must retain the above copyright
7*09a53ad8SAndrew Turner         notice, this list of conditions and the following disclaimer.
8*09a53ad8SAndrew Turner       * Redistributions in binary form must reproduce the above copyright
9*09a53ad8SAndrew Turner         notice, this list of conditions and the following disclaimer in the
10*09a53ad8SAndrew Turner         documentation and/or other materials provided with the distribution.
11*09a53ad8SAndrew Turner       * Neither the name of the Linaro nor the
12*09a53ad8SAndrew Turner         names of its contributors may be used to endorse or promote products
13*09a53ad8SAndrew Turner         derived from this software without specific prior written permission.
14*09a53ad8SAndrew Turner
15*09a53ad8SAndrew Turner   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16*09a53ad8SAndrew Turner   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17*09a53ad8SAndrew Turner   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18*09a53ad8SAndrew Turner   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19*09a53ad8SAndrew Turner   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20*09a53ad8SAndrew Turner   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21*09a53ad8SAndrew Turner   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22*09a53ad8SAndrew Turner   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23*09a53ad8SAndrew Turner   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*09a53ad8SAndrew Turner   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25*09a53ad8SAndrew Turner   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26*09a53ad8SAndrew Turner
27*09a53ad8SAndrew Turner/* Assumptions:
28*09a53ad8SAndrew Turner *
29*09a53ad8SAndrew Turner * ARMv8-a, AArch64
30*09a53ad8SAndrew Turner */
31*09a53ad8SAndrew Turner
32*09a53ad8SAndrew Turner	.macro def_fn f p2align=0
33*09a53ad8SAndrew Turner	.text
34*09a53ad8SAndrew Turner	.p2align \p2align
35*09a53ad8SAndrew Turner	.global \f
36*09a53ad8SAndrew Turner	.type \f, %function
37*09a53ad8SAndrew Turner\f:
38*09a53ad8SAndrew Turner	.endm
39*09a53ad8SAndrew Turner
40*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101
41*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f
42*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080
43*09a53ad8SAndrew Turner
44*09a53ad8SAndrew Turner/* Parameters and result.  */
45*09a53ad8SAndrew Turner#define src1		x0
46*09a53ad8SAndrew Turner#define src2		x1
47*09a53ad8SAndrew Turner#define result		x0
48*09a53ad8SAndrew Turner
49*09a53ad8SAndrew Turner/* Internal variables.  */
50*09a53ad8SAndrew Turner#define data1		x2
51*09a53ad8SAndrew Turner#define data1w		w2
52*09a53ad8SAndrew Turner#define data2		x3
53*09a53ad8SAndrew Turner#define data2w		w3
54*09a53ad8SAndrew Turner#define has_nul		x4
55*09a53ad8SAndrew Turner#define diff		x5
56*09a53ad8SAndrew Turner#define syndrome	x6
57*09a53ad8SAndrew Turner#define tmp1		x7
58*09a53ad8SAndrew Turner#define tmp2		x8
59*09a53ad8SAndrew Turner#define tmp3		x9
60*09a53ad8SAndrew Turner#define zeroones	x10
61*09a53ad8SAndrew Turner#define pos		x11
62*09a53ad8SAndrew Turner
63*09a53ad8SAndrew Turner	/* Start of performance-critical section  -- one 64B cache line.  */
64*09a53ad8SAndrew Turnerdef_fn strcmp p2align=6
65*09a53ad8SAndrew Turner	eor	tmp1, src1, src2
66*09a53ad8SAndrew Turner	mov	zeroones, #REP8_01
67*09a53ad8SAndrew Turner	tst	tmp1, #7
68*09a53ad8SAndrew Turner	b.ne	.Lmisaligned8
69*09a53ad8SAndrew Turner	ands	tmp1, src1, #7
70*09a53ad8SAndrew Turner	b.ne	.Lmutual_align
71*09a53ad8SAndrew Turner	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
72*09a53ad8SAndrew Turner	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
73*09a53ad8SAndrew Turner	   can be done in parallel across the entire word.  */
74*09a53ad8SAndrew Turner.Lloop_aligned:
75*09a53ad8SAndrew Turner	ldr	data1, [src1], #8
76*09a53ad8SAndrew Turner	ldr	data2, [src2], #8
77*09a53ad8SAndrew Turner.Lstart_realigned:
78*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
79*09a53ad8SAndrew Turner	orr	tmp2, data1, #REP8_7f
80*09a53ad8SAndrew Turner	eor	diff, data1, data2	/* Non-zero if differences found.  */
81*09a53ad8SAndrew Turner	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
82*09a53ad8SAndrew Turner	orr	syndrome, diff, has_nul
83*09a53ad8SAndrew Turner	cbz	syndrome, .Lloop_aligned
84*09a53ad8SAndrew Turner	/* End of performance-critical section  -- one 64B cache line.  */
85*09a53ad8SAndrew Turner
86*09a53ad8SAndrew Turner#ifndef	__AARCH64EB__
87*09a53ad8SAndrew Turner	rev	syndrome, syndrome
88*09a53ad8SAndrew Turner	rev	data1, data1
89*09a53ad8SAndrew Turner	/* The MS-non-zero bit of the syndrome marks either the first bit
90*09a53ad8SAndrew Turner	   that is different, or the top bit of the first zero byte.
91*09a53ad8SAndrew Turner	   Shifting left now will bring the critical information into the
92*09a53ad8SAndrew Turner	   top bits.  */
93*09a53ad8SAndrew Turner	clz	pos, syndrome
94*09a53ad8SAndrew Turner	rev	data2, data2
95*09a53ad8SAndrew Turner	lsl	data1, data1, pos
96*09a53ad8SAndrew Turner	lsl	data2, data2, pos
97*09a53ad8SAndrew Turner	/* But we need to zero-extend (char is unsigned) the value and then
98*09a53ad8SAndrew Turner	   perform a signed 32-bit subtraction.  */
99*09a53ad8SAndrew Turner	lsr	data1, data1, #56
100*09a53ad8SAndrew Turner	sub	result, data1, data2, lsr #56
101*09a53ad8SAndrew Turner	ret
102*09a53ad8SAndrew Turner#else
103*09a53ad8SAndrew Turner	/* For big-endian we cannot use the trick with the syndrome value
104*09a53ad8SAndrew Turner	   as carry-propagation can corrupt the upper bits if the trailing
105*09a53ad8SAndrew Turner	   bytes in the string contain 0x01.  */
106*09a53ad8SAndrew Turner	/* However, if there is no NUL byte in the dword, we can generate
107*09a53ad8SAndrew Turner	   the result directly.  We can't just subtract the bytes as the
108*09a53ad8SAndrew Turner	   MSB might be significant.  */
109*09a53ad8SAndrew Turner	cbnz	has_nul, 1f
110*09a53ad8SAndrew Turner	cmp	data1, data2
111*09a53ad8SAndrew Turner	cset	result, ne
112*09a53ad8SAndrew Turner	cneg	result, result, lo
113*09a53ad8SAndrew Turner	ret
114*09a53ad8SAndrew Turner1:
115*09a53ad8SAndrew Turner	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
116*09a53ad8SAndrew Turner	rev	tmp3, data1
117*09a53ad8SAndrew Turner	sub	tmp1, tmp3, zeroones
118*09a53ad8SAndrew Turner	orr	tmp2, tmp3, #REP8_7f
119*09a53ad8SAndrew Turner	bic	has_nul, tmp1, tmp2
120*09a53ad8SAndrew Turner	rev	has_nul, has_nul
121*09a53ad8SAndrew Turner	orr	syndrome, diff, has_nul
122*09a53ad8SAndrew Turner	clz	pos, syndrome
123*09a53ad8SAndrew Turner	/* The MS-non-zero bit of the syndrome marks either the first bit
124*09a53ad8SAndrew Turner	   that is different, or the top bit of the first zero byte.
125*09a53ad8SAndrew Turner	   Shifting left now will bring the critical information into the
126*09a53ad8SAndrew Turner	   top bits.  */
127*09a53ad8SAndrew Turner	lsl	data1, data1, pos
128*09a53ad8SAndrew Turner	lsl	data2, data2, pos
129*09a53ad8SAndrew Turner	/* But we need to zero-extend (char is unsigned) the value and then
130*09a53ad8SAndrew Turner	   perform a signed 32-bit subtraction.  */
131*09a53ad8SAndrew Turner	lsr	data1, data1, #56
132*09a53ad8SAndrew Turner	sub	result, data1, data2, lsr #56
133*09a53ad8SAndrew Turner	ret
134*09a53ad8SAndrew Turner#endif
135*09a53ad8SAndrew Turner
136*09a53ad8SAndrew Turner.Lmutual_align:
137*09a53ad8SAndrew Turner	/* Sources are mutually aligned, but are not currently at an
138*09a53ad8SAndrew Turner	   alignment boundary.  Round down the addresses and then mask off
139*09a53ad8SAndrew Turner	   the bytes that preceed the start point.  */
140*09a53ad8SAndrew Turner	bic	src1, src1, #7
141*09a53ad8SAndrew Turner	bic	src2, src2, #7
142*09a53ad8SAndrew Turner	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
143*09a53ad8SAndrew Turner	ldr	data1, [src1], #8
144*09a53ad8SAndrew Turner	neg	tmp1, tmp1		/* Bits to alignment -64.  */
145*09a53ad8SAndrew Turner	ldr	data2, [src2], #8
146*09a53ad8SAndrew Turner	mov	tmp2, #~0
147*09a53ad8SAndrew Turner#ifdef __AARCH64EB__
148*09a53ad8SAndrew Turner	/* Big-endian.  Early bytes are at MSB.  */
149*09a53ad8SAndrew Turner	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
150*09a53ad8SAndrew Turner#else
151*09a53ad8SAndrew Turner	/* Little-endian.  Early bytes are at LSB.  */
152*09a53ad8SAndrew Turner	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
153*09a53ad8SAndrew Turner#endif
154*09a53ad8SAndrew Turner	orr	data1, data1, tmp2
155*09a53ad8SAndrew Turner	orr	data2, data2, tmp2
156*09a53ad8SAndrew Turner	b	.Lstart_realigned
157*09a53ad8SAndrew Turner
158*09a53ad8SAndrew Turner.Lmisaligned8:
159*09a53ad8SAndrew Turner	/* We can do better than this.  */
160*09a53ad8SAndrew Turner	ldrb	data1w, [src1], #1
161*09a53ad8SAndrew Turner	ldrb	data2w, [src2], #1
162*09a53ad8SAndrew Turner	cmp	data1w, #1
163*09a53ad8SAndrew Turner	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
164*09a53ad8SAndrew Turner	b.eq	.Lmisaligned8
165*09a53ad8SAndrew Turner	sub	result, data1, data2
166*09a53ad8SAndrew Turner	ret
167