/* Source: freebsd-src/contrib/cortex-strings/src/aarch64/strchrnul.S (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57) */
/*
   strchrnul - find a character or nul in a string

   Copyright (c) 2014, ARM Limited
   All rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results.
   srcin  - incoming string pointer (first argument register).
   chrin  - incoming character to search for (second argument register);
            only its low byte is used, since it is replicated into byte
            lanes.
   result - returned pointer; shares x0 with srcin.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Integer scratch registers.
   src   - read cursor, kept 32-byte aligned.
   tmp1  - alignment offset, then padding mask, then syndrome.
   wtmp2 - holds the 0x40100401 lane-identification constant.
   tmp3  - all-ones seed for the padding mask, then raw syndrome.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

/* Vector registers.
   vrepchr     - chrin replicated into all 16 byte lanes.
   vdata1/2    - the two 16-byte halves of the current 32-byte hunk.
   vhas_nul1/2 - per-byte "is NUL" compare results.
   vhas_chr1/2 - per-byte "is chrin" compare results (later merged with
                 the NUL results).
   vrepmask    - 0x40100401 replicated into each 32-bit lane.
   vend1       - reduction of the compare results (termination test and
                 syndrome).  */
#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */

/* Locals and temporaries.  */

/* def_fn f [p2align=N]
   Begin the definition of function symbol f: switch to .text, apply
   .p2align N (N defaults to 0), export f as a global symbol, mark it
   as a function for the object format, and emit its label.  The
   function body follows the macro invocation.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/*
 * char *strchrnul(const char *s, int c)
 *
 * In:     srcin (x0) = s, pointer to a NUL-terminated string.
 *         chrin (w1) = c; only the low byte takes part in the search.
 * Out:    result (x0) = address of the first byte in s that equals the
 *         low byte of c, or of the terminating NUL if there is none.
 * Writes: x2-x5, v0-v7, v16 and the condition flags.  No stack is used
 *         and no other function is called.
 *
 * The string is read in 32-byte aligned hunks, so bytes located before
 * srcin and after the terminator, but inside the same aligned hunk, are
 * loaded as well.  Bytes before srcin are excluded from the match via
 * the padding mask below.
 */
def_fn strchrnul
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  Its bytes are 0x01, 0x04,
	   0x10, 0x40, so after masking each of four neighbouring bytes
	   owns a distinct even bit and two pairwise adds pack them into
	   one byte without carries.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31	/* tmp1 = offset of srcin in its hunk.  */
	b.eq	.Lloop			/* Already aligned: no padding to mask.  */

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	/* tmp1 = -2 * offset.  Only the low six bits are used as the
	   shift amount below, i.e. 64 - 2 * offset.  */
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	/* tmp1 = low (2 * offset) bits set: the syndrome bits that belong
	   to the bytes in front of srcin.  */
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, .Ltail

.Lloop:
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  Every byte
	   lane is 0x00 or 0xff here, so the sum of the two 64-bit halves
	   is a multiple of 255 and cannot wrap around to exactly zero:
	   it is zero only when no lane matched.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend1.2d, vend1.2d, vend1.2d
	mov	tmp1, vend1.d[0]
	cbz	tmp1, .Lloop

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
.Ltail:
	/* Here tmp1 is a non-zero syndrome for the hunk that ends at src.  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source: the post-indexed load already stepped src past
	   the hunk the syndrome describes.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

	.size	strchrnul, . - strchrnul