/*
   strchrnul - find a character or nul in a string

   Copyright (c) 2014, ARM Limited
   All rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results.  */
#define srcin		x0	/* In:  first byte of the string.  */
#define chrin		w1	/* In:  byte value to look for.  */

#define result		x0	/* Out: address of the terminating byte.  */

/* Scratch general-purpose registers.  */
#define cursor		x2	/* Address of the next 32-byte hunk to load.  */
#define scratch		x3	/* Misalignment, then padding mask, then syndrome.  */
#define wmagic		w4	/* Lane-identification constant (32-bit view).  */
#define aux		x5	/* All-ones pattern, then the raw head syndrome.  */

/* Vector registers.  */
#define vchr_splat	v0	/* chrin replicated into all 16 byte lanes.  */
#define vlo		v1	/* Bytes  0..15 of the current hunk.  */
#define vhi		v2	/* Bytes 16..31 of the current hunk.  */
#define vnul_lo		v3	/* 0xff where vlo holds a nul byte.  */
#define vnul_hi		v4	/* 0xff where vhi holds a nul byte.  */
#define vhit_lo		v5	/* 0xff where vlo holds chrin (later: chrin or nul).  */
#define vhit_hi		v6	/* 0xff where vhi holds chrin (later: chrin or nul).  */
#define vlane_mask	v7	/* Magic constant replicated into every 32-bit lane.  */
#define vsynd		v16	/* Reduction result / syndrome.  */

/* Named constants.  */
#define HUNK_SIZE	32	/* Bytes examined per iteration.  */
#define HUNK_MASK	31	/* HUNK_SIZE - 1: offset-within-hunk bits.  */
#define SYND_MAGIC_LO	0x0401	/* Low  half of 0x40100401.  */
#define SYND_MAGIC_HI	0x4010	/* High half of 0x40100401.  */

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */

/* def_fn NAME [p2align=N]
   Open a global function symbol NAME in .text, aligned to 2^N bytes
   (N defaults to 0), and emit its entry label.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* strchrnul
   In:     x0 (srcin) = start of the string
           w1 (chrin) = byte to search for; dup to .16b lanes uses only
                        its low 8 bits
   Out:    x0 (result) = address of the first byte at or after srcin that
                        equals chrin or is nul
   Writes: x2, x3, x4, x5, v0-v7, v16 and the condition flags; no stack use.
   Note:   memory is read in whole 32-byte aligned hunks, so bytes that lie
           before srcin or after the terminating byte inside the first/last
           hunk are loaded as well (and ignored).  */
def_fn strchrnul
	/* Build 0x40100401.  Replicated across the vector it gives the four
	   bytes of each 32-bit group the distinct bits 0, 2, 4 and 6, so
	   after the pairwise adds below every source byte owns its own
	   2-bit slot in the syndrome.  */
	mov	wmagic, #SYND_MAGIC_LO
	movk	wmagic, #SYND_MAGIC_HI, lsl #16
	dup	vchr_splat.16b, chrin
	bic	cursor, srcin, #HUNK_MASK	/* Round down to a hunk boundary.  */
	dup	vlane_mask.4s, wmagic
	ands	scratch, srcin, #HUNK_MASK	/* scratch = offset inside the hunk.  */
	b.eq	.Lscan				/* Already aligned: no head to mask.  */

	/* Unaligned start.  The padding bytes in front of srcin are not
	   forced to a harmless value; instead the syndrome is computed over
	   the complete hunk and the bits that belong to the padding are
	   cleared afterwards.  */
	ld1	{vlo.16b, vhi.16b}, [cursor], #HUNK_SIZE
	neg	scratch, scratch		/* -offset  */
	cmeq	vnul_lo.16b, vlo.16b, #0
	cmeq	vhit_lo.16b, vlo.16b, vchr_splat.16b
	cmeq	vnul_hi.16b, vhi.16b, #0
	cmeq	vhit_hi.16b, vhi.16b, vchr_splat.16b
	orr	vhit_lo.16b, vhit_lo.16b, vnul_lo.16b
	orr	vhit_hi.16b, vhit_hi.16b, vnul_hi.16b
	and	vhit_lo.16b, vhit_lo.16b, vlane_mask.16b
	and	vhit_hi.16b, vhit_hi.16b, vlane_mask.16b
	lsl	scratch, scratch, #1		/* -2 * offset  */
	addp	vsynd.16b, vhit_lo.16b, vhit_hi.16b	/* 256 -> 128 bits  */
	mov	aux, #-1
	addp	vsynd.16b, vsynd.16b, vsynd.16b		/* 128 -> 64 bits  */
	/* A register shift uses the count modulo 64, so this shifts right by
	   64 - 2 * offset and leaves exactly the low 2 * offset bits set:
	   the syndrome slots of the padding bytes.  */
	lsr	scratch, aux, scratch

	mov	aux, vsynd.d[0]
	bic	scratch, aux, scratch		/* Drop the padding slots.  */
	cbnz	scratch, .Llocate

.Lscan:
	ld1	{vlo.16b, vhi.16b}, [cursor], #HUNK_SIZE
	cmeq	vnul_lo.16b, vlo.16b, #0
	cmeq	vhit_lo.16b, vlo.16b, vchr_splat.16b
	cmeq	vnul_hi.16b, vhi.16b, #0
	cmeq	vhit_hi.16b, vhi.16b, vchr_splat.16b
	/* Cheap "anything found?" test: merge all four compare results and
	   fold the two 64-bit halves together.  Compare lanes are 0x00 or
	   0xff, so the folded value is zero only when no lane matched.  */
	orr	vhit_lo.16b, vhit_lo.16b, vnul_lo.16b
	orr	vhit_hi.16b, vhit_hi.16b, vnul_hi.16b
	orr	vsynd.16b, vhit_lo.16b, vhit_hi.16b
	addp	vsynd.2d, vsynd.2d, vsynd.2d
	mov	scratch, vsynd.d[0]
	cbz	scratch, .Lscan

	/* Something in this hunk ends the search.  Compute the full
	   syndrome to find out which byte it was.  */
	and	vhit_lo.16b, vhit_lo.16b, vlane_mask.16b
	and	vhit_hi.16b, vhit_hi.16b, vlane_mask.16b
	addp	vsynd.16b, vhit_lo.16b, vhit_hi.16b	/* 256 -> 128 bits  */
	addp	vsynd.16b, vsynd.16b, vsynd.16b		/* 128 -> 64 bits  */

	mov	scratch, vsynd.d[0]
.Llocate:
	/* Trailing-zero count = bit reverse followed by leading-zero count.  */
	rbit	scratch, scratch
	/* The post-indexed load already stepped past this hunk; step back.  */
	sub	cursor, cursor, #HUNK_SIZE
	clz	scratch, scratch
	/* Two syndrome bits per byte, so halve the bit index.  */
	add	result, cursor, scratch, lsr #1
	ret

	.size	strchrnul, . - strchrnul