109a53ad8SAndrew Turner/* 209a53ad8SAndrew Turner strchr - find a character in a string 309a53ad8SAndrew Turner 409a53ad8SAndrew Turner Copyright (c) 2014, ARM Limited 509a53ad8SAndrew Turner All rights Reserved. 609a53ad8SAndrew Turner 709a53ad8SAndrew Turner Redistribution and use in source and binary forms, with or without 809a53ad8SAndrew Turner modification, are permitted provided that the following conditions are met: 909a53ad8SAndrew Turner * Redistributions of source code must retain the above copyright 1009a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer. 1109a53ad8SAndrew Turner * Redistributions in binary form must reproduce the above copyright 1209a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer in the 1309a53ad8SAndrew Turner documentation and/or other materials provided with the distribution. 1409a53ad8SAndrew Turner * Neither the name of the company nor the names of its contributors 1509a53ad8SAndrew Turner may be used to endorse or promote products derived from this 1609a53ad8SAndrew Turner software without specific prior written permission. 1709a53ad8SAndrew Turner 1809a53ad8SAndrew Turner THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1909a53ad8SAndrew Turner "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 2009a53ad8SAndrew Turner LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 2109a53ad8SAndrew Turner A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 2209a53ad8SAndrew Turner HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 2309a53ad8SAndrew Turner SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 2409a53ad8SAndrew Turner LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 2509a53ad8SAndrew Turner DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 2609a53ad8SAndrew Turner THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 2709a53ad8SAndrew Turner (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 2809a53ad8SAndrew Turner OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 2909a53ad8SAndrew Turner 3009a53ad8SAndrew Turner/* Assumptions: 3109a53ad8SAndrew Turner * 3209a53ad8SAndrew Turner * ARMv8-a, AArch64 3309a53ad8SAndrew Turner * Neon Available. 3409a53ad8SAndrew Turner */ 3509a53ad8SAndrew Turner 3609a53ad8SAndrew Turner/* Arguments and results. */ 3709a53ad8SAndrew Turner#define srcin x0 3809a53ad8SAndrew Turner#define chrin w1 3909a53ad8SAndrew Turner 4009a53ad8SAndrew Turner#define result x0 4109a53ad8SAndrew Turner 4209a53ad8SAndrew Turner#define src x2 4309a53ad8SAndrew Turner#define tmp1 x3 4409a53ad8SAndrew Turner#define wtmp2 w4 4509a53ad8SAndrew Turner#define tmp3 x5 4609a53ad8SAndrew Turner 4709a53ad8SAndrew Turner#define vrepchr v0 4809a53ad8SAndrew Turner#define vdata1 v1 4909a53ad8SAndrew Turner#define vdata2 v2 5009a53ad8SAndrew Turner#define vhas_nul1 v3 5109a53ad8SAndrew Turner#define vhas_nul2 v4 5209a53ad8SAndrew Turner#define vhas_chr1 v5 5309a53ad8SAndrew Turner#define vhas_chr2 v6 5409a53ad8SAndrew Turner#define vrepmask_0 v7 5509a53ad8SAndrew Turner#define vrepmask_c v16 5609a53ad8SAndrew Turner#define vend1 v17 5709a53ad8SAndrew Turner#define vend2 v18 5809a53ad8SAndrew Turner 5909a53ad8SAndrew Turner/* Core algorithm. 6009a53ad8SAndrew Turner 6109a53ad8SAndrew Turner For each 32-byte hunk we calculate a 64-bit syndrome value, with 6209a53ad8SAndrew Turner two bits per byte (LSB is always in bits 0 and 1, for both big 6309a53ad8SAndrew Turner and little-endian systems). For each tuple, bit 0 is set iff 6409a53ad8SAndrew Turner the relevant byte matched the requested character; bit 1 is set 6509a53ad8SAndrew Turner iff the relevant byte matched the NUL end of string (we trigger 6609a53ad8SAndrew Turner off bit0 for the special case of looking for NUL). Since the bits 6709a53ad8SAndrew Turner in the syndrome reflect exactly the order in which things occur 6809a53ad8SAndrew Turner in the original string a count_trailing_zeros() operation will 6909a53ad8SAndrew Turner identify exactly which byte is causing the termination, and why. */ 7009a53ad8SAndrew Turner 7109a53ad8SAndrew Turner/* Locals and temporaries. */ 7209a53ad8SAndrew Turner 7309a53ad8SAndrew Turner .macro def_fn f p2align=0 7409a53ad8SAndrew Turner .text 7509a53ad8SAndrew Turner .p2align \p2align 7609a53ad8SAndrew Turner .global \f 7709a53ad8SAndrew Turner .type \f, %function 7809a53ad8SAndrew Turner\f: 7909a53ad8SAndrew Turner .endm 8009a53ad8SAndrew Turner 81*27044e17SAndrew Turner .macro def_alias f a 82*27044e17SAndrew Turner .weak \a 83*27044e17SAndrew Turner .set \a,\f 84*27044e17SAndrew Turner .endm 85*27044e17SAndrew Turner 8609a53ad8SAndrew Turnerdef_fn strchr 87*27044e17SAndrew Turnerdef_alias strchr index 8809a53ad8SAndrew Turner /* Magic constant 0x40100401 to allow us to identify which lane 8909a53ad8SAndrew Turner matches the requested byte. Magic constant 0x80200802 used 9009a53ad8SAndrew Turner similarly for NUL termination. */ 9109a53ad8SAndrew Turner mov wtmp2, #0x0401 9209a53ad8SAndrew Turner movk wtmp2, #0x4010, lsl #16 9309a53ad8SAndrew Turner dup vrepchr.16b, chrin 9409a53ad8SAndrew Turner bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ 9509a53ad8SAndrew Turner dup vrepmask_c.4s, wtmp2 9609a53ad8SAndrew Turner ands tmp1, srcin, #31 9709a53ad8SAndrew Turner add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ 9809a53ad8SAndrew Turner b.eq .Lloop 9909a53ad8SAndrew Turner 10009a53ad8SAndrew Turner /* Input string is not 32-byte aligned. Rather than forcing 10109a53ad8SAndrew Turner the padding bytes to a safe value, we calculate the syndrome 10209a53ad8SAndrew Turner for all the bytes, but then mask off those bits of the 10309a53ad8SAndrew Turner syndrome that are related to the padding. */ 10409a53ad8SAndrew Turner ld1 {vdata1.16b, vdata2.16b}, [src], #32 10509a53ad8SAndrew Turner neg tmp1, tmp1 10609a53ad8SAndrew Turner cmeq vhas_nul1.16b, vdata1.16b, #0 10709a53ad8SAndrew Turner cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 10809a53ad8SAndrew Turner cmeq vhas_nul2.16b, vdata2.16b, #0 10909a53ad8SAndrew Turner cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 11009a53ad8SAndrew Turner and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b 11109a53ad8SAndrew Turner and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b 11209a53ad8SAndrew Turner and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b 11309a53ad8SAndrew Turner and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b 11409a53ad8SAndrew Turner orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b 11509a53ad8SAndrew Turner orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b 11609a53ad8SAndrew Turner lsl tmp1, tmp1, #1 11709a53ad8SAndrew Turner addp vend1.16b, vend1.16b, vend2.16b // 256->128 11809a53ad8SAndrew Turner mov tmp3, #~0 11909a53ad8SAndrew Turner addp vend1.16b, vend1.16b, vend2.16b // 128->64 12009a53ad8SAndrew Turner lsr tmp1, tmp3, tmp1 12109a53ad8SAndrew Turner 122*27044e17SAndrew Turner mov tmp3, vend1.d[0] 12309a53ad8SAndrew Turner bic tmp1, tmp3, tmp1 // Mask padding bits. 12409a53ad8SAndrew Turner cbnz tmp1, .Ltail 12509a53ad8SAndrew Turner 12609a53ad8SAndrew Turner.Lloop: 12709a53ad8SAndrew Turner ld1 {vdata1.16b, vdata2.16b}, [src], #32 12809a53ad8SAndrew Turner cmeq vhas_nul1.16b, vdata1.16b, #0 12909a53ad8SAndrew Turner cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 13009a53ad8SAndrew Turner cmeq vhas_nul2.16b, vdata2.16b, #0 13109a53ad8SAndrew Turner cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 13209a53ad8SAndrew Turner /* Use a fast check for the termination condition. */ 13309a53ad8SAndrew Turner orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b 13409a53ad8SAndrew Turner orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b 13509a53ad8SAndrew Turner orr vend1.16b, vend1.16b, vend2.16b 13609a53ad8SAndrew Turner addp vend1.2d, vend1.2d, vend1.2d 137*27044e17SAndrew Turner mov tmp1, vend1.d[0] 13809a53ad8SAndrew Turner cbz tmp1, .Lloop 13909a53ad8SAndrew Turner 14009a53ad8SAndrew Turner /* Termination condition found. Now need to establish exactly why 14109a53ad8SAndrew Turner we terminated. */ 14209a53ad8SAndrew Turner and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b 14309a53ad8SAndrew Turner and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b 14409a53ad8SAndrew Turner and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b 14509a53ad8SAndrew Turner and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b 14609a53ad8SAndrew Turner orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b 14709a53ad8SAndrew Turner orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b 14809a53ad8SAndrew Turner addp vend1.16b, vend1.16b, vend2.16b // 256->128 14909a53ad8SAndrew Turner addp vend1.16b, vend1.16b, vend2.16b // 128->64 15009a53ad8SAndrew Turner 151*27044e17SAndrew Turner mov tmp1, vend1.d[0] 15209a53ad8SAndrew Turner.Ltail: 15309a53ad8SAndrew Turner /* Count the trailing zeros, by bit reversing... */ 15409a53ad8SAndrew Turner rbit tmp1, tmp1 15509a53ad8SAndrew Turner /* Re-bias source. */ 15609a53ad8SAndrew Turner sub src, src, #32 15709a53ad8SAndrew Turner clz tmp1, tmp1 /* And counting the leading zeros. */ 15809a53ad8SAndrew Turner /* Tmp1 is even if the target charager was found first. Otherwise 15909a53ad8SAndrew Turner we've found the end of string and we weren't looking for NUL. */ 16009a53ad8SAndrew Turner tst tmp1, #1 16109a53ad8SAndrew Turner add result, src, tmp1, lsr #1 16209a53ad8SAndrew Turner csel result, result, xzr, eq 16309a53ad8SAndrew Turner ret 16409a53ad8SAndrew Turner 16509a53ad8SAndrew Turner .size strchr, . - strchr 166