xref: /freebsd-src/contrib/cortex-strings/src/aarch64/strlen.S (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57)
1*09a53ad8SAndrew Turner/* Copyright (c) 2013-2015, Linaro Limited
2*09a53ad8SAndrew Turner   All rights reserved.
3*09a53ad8SAndrew Turner
4*09a53ad8SAndrew Turner   Redistribution and use in source and binary forms, with or without
5*09a53ad8SAndrew Turner   modification, are permitted provided that the following conditions are met:
6*09a53ad8SAndrew Turner       * Redistributions of source code must retain the above copyright
7*09a53ad8SAndrew Turner	 notice, this list of conditions and the following disclaimer.
8*09a53ad8SAndrew Turner       * Redistributions in binary form must reproduce the above copyright
9*09a53ad8SAndrew Turner	 notice, this list of conditions and the following disclaimer in the
10*09a53ad8SAndrew Turner	 documentation and/or other materials provided with the distribution.
11*09a53ad8SAndrew Turner       * Neither the name of the Linaro nor the
12*09a53ad8SAndrew Turner	 names of its contributors may be used to endorse or promote products
13*09a53ad8SAndrew Turner	 derived from this software without specific prior written permission.
14*09a53ad8SAndrew Turner
15*09a53ad8SAndrew Turner   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16*09a53ad8SAndrew Turner   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17*09a53ad8SAndrew Turner   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18*09a53ad8SAndrew Turner   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19*09a53ad8SAndrew Turner   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20*09a53ad8SAndrew Turner   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21*09a53ad8SAndrew Turner   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22*09a53ad8SAndrew Turner   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23*09a53ad8SAndrew Turner   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*09a53ad8SAndrew Turner   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25*09a53ad8SAndrew Turner   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26*09a53ad8SAndrew Turner
27*09a53ad8SAndrew Turner/* Assumptions:
28*09a53ad8SAndrew Turner *
29*09a53ad8SAndrew Turner * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
30*09a53ad8SAndrew Turner */
31*09a53ad8SAndrew Turner
32*09a53ad8SAndrew Turner/* To test the page crossing code path more thoroughly, compile with
33*09a53ad8SAndrew Turner   -DTEST_PAGE_CROSS - this will force all calls through the slower
34*09a53ad8SAndrew Turner   entry path.  This option is not intended for production use.	 */
35*09a53ad8SAndrew Turner
36*09a53ad8SAndrew Turner/* Arguments and results.  */
37*09a53ad8SAndrew Turner#define srcin		x0
38*09a53ad8SAndrew Turner#define len		x0
39*09a53ad8SAndrew Turner
40*09a53ad8SAndrew Turner/* Locals and temporaries.  */
41*09a53ad8SAndrew Turner#define src		x1
42*09a53ad8SAndrew Turner#define data1		x2
43*09a53ad8SAndrew Turner#define data2		x3
44*09a53ad8SAndrew Turner#define has_nul1	x4
45*09a53ad8SAndrew Turner#define has_nul2	x5
46*09a53ad8SAndrew Turner#define tmp1		x4
47*09a53ad8SAndrew Turner#define tmp2		x5
48*09a53ad8SAndrew Turner#define tmp3		x6
49*09a53ad8SAndrew Turner#define tmp4		x7
50*09a53ad8SAndrew Turner#define zeroones	x8
51*09a53ad8SAndrew Turner
52*09a53ad8SAndrew Turner#define L(l) .L ## l
53*09a53ad8SAndrew Turner
54*09a53ad8SAndrew Turner	.macro def_fn f p2align=0
55*09a53ad8SAndrew Turner	.text
56*09a53ad8SAndrew Turner	.p2align \p2align
57*09a53ad8SAndrew Turner	.global \f
58*09a53ad8SAndrew Turner	.type \f, %function
59*09a53ad8SAndrew Turner\f:
60*09a53ad8SAndrew Turner	.endm
61*09a53ad8SAndrew Turner
62*09a53ad8SAndrew Turner	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
63*09a53ad8SAndrew Turner	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
64*09a53ad8SAndrew Turner	   can be done in parallel across the entire word. A faster check
65*09a53ad8SAndrew Turner	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
66*09a53ad8SAndrew Turner	   false hits for characters 129..255.	*/
67*09a53ad8SAndrew Turner
68*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101
69*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f
70*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080
71*09a53ad8SAndrew Turner
72*09a53ad8SAndrew Turner#ifdef TEST_PAGE_CROSS
73*09a53ad8SAndrew Turner# define MIN_PAGE_SIZE 15
74*09a53ad8SAndrew Turner#else
75*09a53ad8SAndrew Turner# define MIN_PAGE_SIZE 4096
76*09a53ad8SAndrew Turner#endif
77*09a53ad8SAndrew Turner
78*09a53ad8SAndrew Turner	/* Since strings are short on average, we check the first 16 bytes
79*09a53ad8SAndrew Turner	   of the string for a NUL character.  In order to do an unaligned ldp
80*09a53ad8SAndrew Turner	   safely we have to do a page cross check first.  If there is a NUL
81*09a53ad8SAndrew Turner	   byte we calculate the length from the 2 8-byte words using
82*09a53ad8SAndrew Turner	   conditional select to reduce branch mispredictions (it is unlikely
83*09a53ad8SAndrew Turner	   strlen will be repeatedly called on strings with the same length).
84*09a53ad8SAndrew Turner
85*09a53ad8SAndrew Turner	   If the string is longer than 16 bytes, we align src so don't need
86*09a53ad8SAndrew Turner	   further page cross checks, and process 32 bytes per iteration
87*09a53ad8SAndrew Turner	   using the fast NUL check.  If we encounter non-ASCII characters,
88*09a53ad8SAndrew Turner	   fallback to a second loop using the full NUL check.
89*09a53ad8SAndrew Turner
90*09a53ad8SAndrew Turner	   If the page cross check fails, we read 16 bytes from an aligned
91*09a53ad8SAndrew Turner	   address, remove any characters before the string, and continue
92*09a53ad8SAndrew Turner	   in the main loop using aligned loads.  Since strings crossing a
93*09a53ad8SAndrew Turner	   page in the first 16 bytes are rare (probability of
94*09a53ad8SAndrew Turner	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
95*09a53ad8SAndrew Turner
96*09a53ad8SAndrew Turner	   AArch64 systems have a minimum page size of 4k.  We don't bother
97*09a53ad8SAndrew Turner	   checking for larger page sizes - the cost of setting up the correct
98*09a53ad8SAndrew Turner	   page size is just not worth the extra gain from a small reduction in
99*09a53ad8SAndrew Turner	   the cases taking the slow path.  Note that we only care about
100*09a53ad8SAndrew Turner	   whether the first fetch, which may be misaligned, crosses a page
101*09a53ad8SAndrew Turner	   boundary.  */
102*09a53ad8SAndrew Turner
103*09a53ad8SAndrew Turnerdef_fn strlen p2align=6
104*09a53ad8SAndrew Turner	and	tmp1, srcin, MIN_PAGE_SIZE - 1
105*09a53ad8SAndrew Turner	mov	zeroones, REP8_01
106*09a53ad8SAndrew Turner	cmp	tmp1, MIN_PAGE_SIZE - 16
107*09a53ad8SAndrew Turner	b.gt	L(page_cross)
108*09a53ad8SAndrew Turner	ldp	data1, data2, [srcin]
109*09a53ad8SAndrew Turner#ifdef __AARCH64EB__
110*09a53ad8SAndrew Turner	/* For big-endian, carry propagation (if the final byte in the
111*09a53ad8SAndrew Turner	   string is 0x01) means we cannot use has_nul1/2 directly.
112*09a53ad8SAndrew Turner	   Since we expect strings to be small and early-exit,
113*09a53ad8SAndrew Turner	   byte-swap the data now so has_null1/2 will be correct.  */
114*09a53ad8SAndrew Turner	rev	data1, data1
115*09a53ad8SAndrew Turner	rev	data2, data2
116*09a53ad8SAndrew Turner#endif
117*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
118*09a53ad8SAndrew Turner	orr	tmp2, data1, REP8_7f
119*09a53ad8SAndrew Turner	sub	tmp3, data2, zeroones
120*09a53ad8SAndrew Turner	orr	tmp4, data2, REP8_7f
121*09a53ad8SAndrew Turner	bics	has_nul1, tmp1, tmp2
122*09a53ad8SAndrew Turner	bic	has_nul2, tmp3, tmp4
123*09a53ad8SAndrew Turner	ccmp	has_nul2, 0, 0, eq
124*09a53ad8SAndrew Turner	beq	L(main_loop_entry)
125*09a53ad8SAndrew Turner
126*09a53ad8SAndrew Turner	/* Enter with C = has_nul1 == 0.  */
127*09a53ad8SAndrew Turner	csel	has_nul1, has_nul1, has_nul2, cc
128*09a53ad8SAndrew Turner	mov	len, 8
129*09a53ad8SAndrew Turner	rev	has_nul1, has_nul1
130*09a53ad8SAndrew Turner	clz	tmp1, has_nul1
131*09a53ad8SAndrew Turner	csel	len, xzr, len, cc
132*09a53ad8SAndrew Turner	add	len, len, tmp1, lsr 3
133*09a53ad8SAndrew Turner	ret
134*09a53ad8SAndrew Turner
135*09a53ad8SAndrew Turner	/* The inner loop processes 32 bytes per iteration and uses the fast
136*09a53ad8SAndrew Turner	   NUL check.  If we encounter non-ASCII characters, use a second
137*09a53ad8SAndrew Turner	   loop with the accurate NUL check.  */
138*09a53ad8SAndrew Turner	.p2align 4
139*09a53ad8SAndrew TurnerL(main_loop_entry):
140*09a53ad8SAndrew Turner	bic	src, srcin, 15
141*09a53ad8SAndrew Turner	sub	src, src, 16
142*09a53ad8SAndrew TurnerL(main_loop):
143*09a53ad8SAndrew Turner	ldp	data1, data2, [src, 32]!
144*09a53ad8SAndrew Turner.Lpage_cross_entry:
145*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
146*09a53ad8SAndrew Turner	sub	tmp3, data2, zeroones
147*09a53ad8SAndrew Turner	orr	tmp2, tmp1, tmp3
148*09a53ad8SAndrew Turner	tst	tmp2, zeroones, lsl 7
149*09a53ad8SAndrew Turner	bne	1f
150*09a53ad8SAndrew Turner	ldp	data1, data2, [src, 16]
151*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
152*09a53ad8SAndrew Turner	sub	tmp3, data2, zeroones
153*09a53ad8SAndrew Turner	orr	tmp2, tmp1, tmp3
154*09a53ad8SAndrew Turner	tst	tmp2, zeroones, lsl 7
155*09a53ad8SAndrew Turner	beq	L(main_loop)
156*09a53ad8SAndrew Turner	add	src, src, 16
157*09a53ad8SAndrew Turner1:
158*09a53ad8SAndrew Turner	/* The fast check failed, so do the slower, accurate NUL check.	 */
159*09a53ad8SAndrew Turner	orr	tmp2, data1, REP8_7f
160*09a53ad8SAndrew Turner	orr	tmp4, data2, REP8_7f
161*09a53ad8SAndrew Turner	bics	has_nul1, tmp1, tmp2
162*09a53ad8SAndrew Turner	bic	has_nul2, tmp3, tmp4
163*09a53ad8SAndrew Turner	ccmp	has_nul2, 0, 0, eq
164*09a53ad8SAndrew Turner	beq	L(nonascii_loop)
165*09a53ad8SAndrew Turner
166*09a53ad8SAndrew Turner	/* Enter with C = has_nul1 == 0.  */
167*09a53ad8SAndrew TurnerL(tail):
168*09a53ad8SAndrew Turner#ifdef __AARCH64EB__
169*09a53ad8SAndrew Turner	/* For big-endian, carry propagation (if the final byte in the
170*09a53ad8SAndrew Turner	   string is 0x01) means we cannot use has_nul1/2 directly.  The
171*09a53ad8SAndrew Turner	   easiest way to get the correct byte is to byte-swap the data
172*09a53ad8SAndrew Turner	   and calculate the syndrome a second time.  */
173*09a53ad8SAndrew Turner	csel	data1, data1, data2, cc
174*09a53ad8SAndrew Turner	rev	data1, data1
175*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
176*09a53ad8SAndrew Turner	orr	tmp2, data1, REP8_7f
177*09a53ad8SAndrew Turner	bic	has_nul1, tmp1, tmp2
178*09a53ad8SAndrew Turner#else
179*09a53ad8SAndrew Turner	csel	has_nul1, has_nul1, has_nul2, cc
180*09a53ad8SAndrew Turner#endif
181*09a53ad8SAndrew Turner	sub	len, src, srcin
182*09a53ad8SAndrew Turner	rev	has_nul1, has_nul1
183*09a53ad8SAndrew Turner	add	tmp2, len, 8
184*09a53ad8SAndrew Turner	clz	tmp1, has_nul1
185*09a53ad8SAndrew Turner	csel	len, len, tmp2, cc
186*09a53ad8SAndrew Turner	add	len, len, tmp1, lsr 3
187*09a53ad8SAndrew Turner	ret
188*09a53ad8SAndrew Turner
189*09a53ad8SAndrew TurnerL(nonascii_loop):
190*09a53ad8SAndrew Turner	ldp	data1, data2, [src, 16]!
191*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
192*09a53ad8SAndrew Turner	orr	tmp2, data1, REP8_7f
193*09a53ad8SAndrew Turner	sub	tmp3, data2, zeroones
194*09a53ad8SAndrew Turner	orr	tmp4, data2, REP8_7f
195*09a53ad8SAndrew Turner	bics	has_nul1, tmp1, tmp2
196*09a53ad8SAndrew Turner	bic	has_nul2, tmp3, tmp4
197*09a53ad8SAndrew Turner	ccmp	has_nul2, 0, 0, eq
198*09a53ad8SAndrew Turner	bne	L(tail)
199*09a53ad8SAndrew Turner	ldp	data1, data2, [src, 16]!
200*09a53ad8SAndrew Turner	sub	tmp1, data1, zeroones
201*09a53ad8SAndrew Turner	orr	tmp2, data1, REP8_7f
202*09a53ad8SAndrew Turner	sub	tmp3, data2, zeroones
203*09a53ad8SAndrew Turner	orr	tmp4, data2, REP8_7f
204*09a53ad8SAndrew Turner	bics	has_nul1, tmp1, tmp2
205*09a53ad8SAndrew Turner	bic	has_nul2, tmp3, tmp4
206*09a53ad8SAndrew Turner	ccmp	has_nul2, 0, 0, eq
207*09a53ad8SAndrew Turner	beq	L(nonascii_loop)
208*09a53ad8SAndrew Turner	b	L(tail)
209*09a53ad8SAndrew Turner
210*09a53ad8SAndrew Turner	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
211*09a53ad8SAndrew Turner	   srcin to 0x7f, so we ignore any NUL bytes before the string.
212*09a53ad8SAndrew Turner	   Then continue in the aligned loop.  */
213*09a53ad8SAndrew TurnerL(page_cross):
214*09a53ad8SAndrew Turner	bic	src, srcin, 15
215*09a53ad8SAndrew Turner	ldp	data1, data2, [src]
216*09a53ad8SAndrew Turner	lsl	tmp1, srcin, 3
217*09a53ad8SAndrew Turner	mov	tmp4, -1
218*09a53ad8SAndrew Turner#ifdef __AARCH64EB__
219*09a53ad8SAndrew Turner	/* Big-endian.	Early bytes are at MSB.	 */
220*09a53ad8SAndrew Turner	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
221*09a53ad8SAndrew Turner#else
222*09a53ad8SAndrew Turner	/* Little-endian.  Early bytes are at LSB.  */
223*09a53ad8SAndrew Turner	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
224*09a53ad8SAndrew Turner#endif
225*09a53ad8SAndrew Turner	orr	tmp1, tmp1, REP8_80
226*09a53ad8SAndrew Turner	orn	data1, data1, tmp1
227*09a53ad8SAndrew Turner	orn	tmp2, data2, tmp1
228*09a53ad8SAndrew Turner	tst	srcin, 8
229*09a53ad8SAndrew Turner	csel	data1, data1, tmp4, eq
230*09a53ad8SAndrew Turner	csel	data2, data2, tmp2, eq
231*09a53ad8SAndrew Turner	b	L(page_cross_entry)
232*09a53ad8SAndrew Turner
233*09a53ad8SAndrew Turner	.size	strlen, . - strlen
234