/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014, ARM Limited
 * All rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the company nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results.  */
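/*
 * These correspond to the C prototype
 *	void *memchr(const void *s, int c, size_t n)
 * with the match address (or NULL) returned in x0.
 */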
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define	tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */
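
/*
 * For illustration only (not part of the upstream code): a rough scalar C
 * sketch of the syndrome idea for a single 32-byte chunk, assuming a
 * hypothetical pointer `chunk` to the chunk and the target byte in `c`:
 *
 *	uint64_t syndrome = 0;
 *	for (int i = 0; i < 32; i++)
 *		if (chunk[i] == c)
 *			syndrome |= 1ULL << (2 * i);
 *	if (syndrome != 0)
 *		return chunk + (__builtin_ctzll(syndrome) >> 1);
 *
 * The NEON code below builds an equivalent syndrome with vector compares
 * and pairwise additions instead of a byte loop.
 */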

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memchr
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, .Lzero_length
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
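	/*
	 * As bytes the constant reads 0x01, 0x04, 0x10, 0x40 repeated.
	 * After AND-ing it with the 0x00/0xff compare results, the two
	 * pairwise additions (addp) collapse four neighbouring match bytes
	 * into the even bits of a single syndrome byte, giving the two
	 * syndrome bits per input byte described above.
	 */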
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	.Lloop

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
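	/*
	 * tmp = soff - 32 is negative, so the adds above subtracts the
	 * number of useful bytes in this first block (32 - soff) from the
	 * requested count; its flags feed the b.ls below, which detects
	 * that the whole request fits inside this first block.
	 */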
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	.Lmasklast
	/* Have we found something already? */
	cbnz	synd, .Ltail

.Lloop:
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	.Lend
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
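	/*
	 * The orr + addp above fold the two compare vectors into one
	 * doubleword that is non-zero iff some byte in this chunk matched;
	 * the exact position is only recomputed at .Lend.
	 */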
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, .Lloop

.Lend:
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hi	.Ltail

.Lmasklast:
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
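	/*
	 * tmp is negative here, so neg of (tmp << 1) yields the positive
	 * shift count in one instruction; a count of 64 (fully used block)
	 * clears nothing because the variable shifts operate modulo 64,
	 * which is exactly what is wanted in that case.
	 */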
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

.Ltail:
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
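	/*
	 * The syndrome has two bits per byte, so the count is halved below
	 * to turn it into a byte offset within the block.
	 */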
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

.Lzero_length:
	mov	result, #0
	ret

	.size	memchr, . - memchr