xref: /freebsd-src/contrib/cortex-strings/src/aarch64/strchr.S (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57)
109a53ad8SAndrew Turner/*
209a53ad8SAndrew Turner   strchr - find a character in a string
309a53ad8SAndrew Turner
409a53ad8SAndrew Turner   Copyright (c) 2014, ARM Limited
509a53ad8SAndrew Turner   All rights Reserved.
609a53ad8SAndrew Turner
709a53ad8SAndrew Turner   Redistribution and use in source and binary forms, with or without
809a53ad8SAndrew Turner   modification, are permitted provided that the following conditions are met:
909a53ad8SAndrew Turner       * Redistributions of source code must retain the above copyright
1009a53ad8SAndrew Turner         notice, this list of conditions and the following disclaimer.
1109a53ad8SAndrew Turner       * Redistributions in binary form must reproduce the above copyright
1209a53ad8SAndrew Turner         notice, this list of conditions and the following disclaimer in the
1309a53ad8SAndrew Turner         documentation and/or other materials provided with the distribution.
1409a53ad8SAndrew Turner       * Neither the name of the company nor the names of its contributors
1509a53ad8SAndrew Turner         may be used to endorse or promote products derived from this
1609a53ad8SAndrew Turner         software without specific prior written permission.
1709a53ad8SAndrew Turner
1809a53ad8SAndrew Turner   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1909a53ad8SAndrew Turner   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
2009a53ad8SAndrew Turner   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
2109a53ad8SAndrew Turner   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
2209a53ad8SAndrew Turner   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
2309a53ad8SAndrew Turner   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
2409a53ad8SAndrew Turner   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
2509a53ad8SAndrew Turner   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
2609a53ad8SAndrew Turner   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2709a53ad8SAndrew Turner   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2809a53ad8SAndrew Turner   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
2909a53ad8SAndrew Turner
3009a53ad8SAndrew Turner/* Assumptions:
3109a53ad8SAndrew Turner *
3209a53ad8SAndrew Turner * ARMv8-a, AArch64
3309a53ad8SAndrew Turner * Neon Available.
3409a53ad8SAndrew Turner */
3509a53ad8SAndrew Turner
3609a53ad8SAndrew Turner/* Arguments and results.  */
3709a53ad8SAndrew Turner#define srcin		x0
3809a53ad8SAndrew Turner#define chrin		w1
3909a53ad8SAndrew Turner
4009a53ad8SAndrew Turner#define result		x0
4109a53ad8SAndrew Turner
4209a53ad8SAndrew Turner#define src		x2
4309a53ad8SAndrew Turner#define	tmp1		x3
4409a53ad8SAndrew Turner#define wtmp2		w4
4509a53ad8SAndrew Turner#define tmp3		x5
4609a53ad8SAndrew Turner
4709a53ad8SAndrew Turner#define vrepchr		v0
4809a53ad8SAndrew Turner#define vdata1		v1
4909a53ad8SAndrew Turner#define vdata2		v2
5009a53ad8SAndrew Turner#define vhas_nul1	v3
5109a53ad8SAndrew Turner#define vhas_nul2	v4
5209a53ad8SAndrew Turner#define vhas_chr1	v5
5309a53ad8SAndrew Turner#define vhas_chr2	v6
5409a53ad8SAndrew Turner#define vrepmask_0	v7
5509a53ad8SAndrew Turner#define vrepmask_c	v16
5609a53ad8SAndrew Turner#define vend1		v17
5709a53ad8SAndrew Turner#define vend2		v18
5809a53ad8SAndrew Turner
5909a53ad8SAndrew Turner/* Core algorithm.
6009a53ad8SAndrew Turner
6109a53ad8SAndrew Turner   For each 32-byte hunk we calculate a 64-bit syndrome value, with
6209a53ad8SAndrew Turner   two bits per byte (LSB is always in bits 0 and 1, for both big
6309a53ad8SAndrew Turner   and little-endian systems).  For each tuple, bit 0 is set iff
6409a53ad8SAndrew Turner   the relevant byte matched the requested character; bit 1 is set
6509a53ad8SAndrew Turner   iff the relevant byte matched the NUL end of string (we trigger
6609a53ad8SAndrew Turner   off bit0 for the special case of looking for NUL).  Since the bits
6709a53ad8SAndrew Turner   in the syndrome reflect exactly the order in which things occur
6809a53ad8SAndrew Turner   in the original string a count_trailing_zeros() operation will
6909a53ad8SAndrew Turner   identify exactly which byte is causing the termination, and why.  */
7009a53ad8SAndrew Turner
7109a53ad8SAndrew Turner/* Locals and temporaries.  */
7209a53ad8SAndrew Turner
7309a53ad8SAndrew Turner	.macro def_fn f p2align=0
7409a53ad8SAndrew Turner	.text
7509a53ad8SAndrew Turner	.p2align \p2align
7609a53ad8SAndrew Turner	.global \f
7709a53ad8SAndrew Turner	.type \f, %function
7809a53ad8SAndrew Turner\f:
7909a53ad8SAndrew Turner	.endm
8009a53ad8SAndrew Turner
81*27044e17SAndrew Turner	.macro def_alias f a
82*27044e17SAndrew Turner	.weak \a
83*27044e17SAndrew Turner	.set \a,\f
84*27044e17SAndrew Turner	.endm
85*27044e17SAndrew Turner
8609a53ad8SAndrew Turnerdef_fn strchr
87*27044e17SAndrew Turnerdef_alias strchr index
8809a53ad8SAndrew Turner	/* Magic constant 0x40100401 to allow us to identify which lane
8909a53ad8SAndrew Turner	   matches the requested byte.  Magic constant 0x80200802 used
9009a53ad8SAndrew Turner	   similarly for NUL termination.  */
9109a53ad8SAndrew Turner	mov	wtmp2, #0x0401
9209a53ad8SAndrew Turner	movk	wtmp2, #0x4010, lsl #16
9309a53ad8SAndrew Turner	dup	vrepchr.16b, chrin
9409a53ad8SAndrew Turner	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
9509a53ad8SAndrew Turner	dup	vrepmask_c.4s, wtmp2
9609a53ad8SAndrew Turner	ands	tmp1, srcin, #31
9709a53ad8SAndrew Turner	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
9809a53ad8SAndrew Turner	b.eq	.Lloop
9909a53ad8SAndrew Turner
10009a53ad8SAndrew Turner	/* Input string is not 32-byte aligned.  Rather than forcing
10109a53ad8SAndrew Turner	   the padding bytes to a safe value, we calculate the syndrome
10209a53ad8SAndrew Turner	   for all the bytes, but then mask off those bits of the
10309a53ad8SAndrew Turner	   syndrome that are related to the padding.  */
10409a53ad8SAndrew Turner	ld1	{vdata1.16b, vdata2.16b}, [src], #32
10509a53ad8SAndrew Turner	neg	tmp1, tmp1
10609a53ad8SAndrew Turner	cmeq	vhas_nul1.16b, vdata1.16b, #0
10709a53ad8SAndrew Turner	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
10809a53ad8SAndrew Turner	cmeq	vhas_nul2.16b, vdata2.16b, #0
10909a53ad8SAndrew Turner	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
11009a53ad8SAndrew Turner	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
11109a53ad8SAndrew Turner	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
11209a53ad8SAndrew Turner	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
11309a53ad8SAndrew Turner	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
11409a53ad8SAndrew Turner	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
11509a53ad8SAndrew Turner	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
11609a53ad8SAndrew Turner	lsl	tmp1, tmp1, #1
11709a53ad8SAndrew Turner	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
11809a53ad8SAndrew Turner	mov	tmp3, #~0
11909a53ad8SAndrew Turner	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
12009a53ad8SAndrew Turner	lsr	tmp1, tmp3, tmp1
12109a53ad8SAndrew Turner
122*27044e17SAndrew Turner	mov	tmp3, vend1.d[0]
12309a53ad8SAndrew Turner	bic	tmp1, tmp3, tmp1	// Mask padding bits.
12409a53ad8SAndrew Turner	cbnz	tmp1, .Ltail
12509a53ad8SAndrew Turner
12609a53ad8SAndrew Turner.Lloop:
12709a53ad8SAndrew Turner	ld1	{vdata1.16b, vdata2.16b}, [src], #32
12809a53ad8SAndrew Turner	cmeq	vhas_nul1.16b, vdata1.16b, #0
12909a53ad8SAndrew Turner	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
13009a53ad8SAndrew Turner	cmeq	vhas_nul2.16b, vdata2.16b, #0
13109a53ad8SAndrew Turner	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
13209a53ad8SAndrew Turner	/* Use a fast check for the termination condition.  */
13309a53ad8SAndrew Turner	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
13409a53ad8SAndrew Turner	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
13509a53ad8SAndrew Turner	orr	vend1.16b, vend1.16b, vend2.16b
13609a53ad8SAndrew Turner	addp	vend1.2d, vend1.2d, vend1.2d
137*27044e17SAndrew Turner	mov	tmp1, vend1.d[0]
13809a53ad8SAndrew Turner	cbz	tmp1, .Lloop
13909a53ad8SAndrew Turner
14009a53ad8SAndrew Turner	/* Termination condition found.  Now need to establish exactly why
14109a53ad8SAndrew Turner	   we terminated.  */
14209a53ad8SAndrew Turner	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
14309a53ad8SAndrew Turner	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
14409a53ad8SAndrew Turner	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
14509a53ad8SAndrew Turner	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
14609a53ad8SAndrew Turner	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
14709a53ad8SAndrew Turner	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
14809a53ad8SAndrew Turner	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
14909a53ad8SAndrew Turner	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
15009a53ad8SAndrew Turner
151*27044e17SAndrew Turner	mov	tmp1, vend1.d[0]
15209a53ad8SAndrew Turner.Ltail:
15309a53ad8SAndrew Turner	/* Count the trailing zeros, by bit reversing...  */
15409a53ad8SAndrew Turner	rbit	tmp1, tmp1
15509a53ad8SAndrew Turner	/* Re-bias source.  */
15609a53ad8SAndrew Turner	sub	src, src, #32
15709a53ad8SAndrew Turner	clz	tmp1, tmp1	/* And counting the leading zeros.  */
15809a53ad8SAndrew Turner	/* Tmp1 is even if the target charager was found first.  Otherwise
15909a53ad8SAndrew Turner	   we've found the end of string and we weren't looking for NUL.  */
16009a53ad8SAndrew Turner	tst	tmp1, #1
16109a53ad8SAndrew Turner	add	result, src, tmp1, lsr #1
16209a53ad8SAndrew Turner	csel	result, result, xzr, eq
16309a53ad8SAndrew Turner	ret
16409a53ad8SAndrew Turner
16509a53ad8SAndrew Turner	.size	strchr, . - strchr
166