109a53ad8SAndrew Turner/* 209a53ad8SAndrew Turner * memchr - find a character in a memory zone 309a53ad8SAndrew Turner * 409a53ad8SAndrew Turner * Copyright (c) 2014, ARM Limited 509a53ad8SAndrew Turner * All rights Reserved. 609a53ad8SAndrew Turner * 709a53ad8SAndrew Turner * Redistribution and use in source and binary forms, with or without 809a53ad8SAndrew Turner * modification, are permitted provided that the following conditions are met: 909a53ad8SAndrew Turner * * Redistributions of source code must retain the above copyright 1009a53ad8SAndrew Turner * notice, this list of conditions and the following disclaimer. 1109a53ad8SAndrew Turner * * Redistributions in binary form must reproduce the above copyright 1209a53ad8SAndrew Turner * notice, this list of conditions and the following disclaimer in the 1309a53ad8SAndrew Turner * documentation and/or other materials provided with the distribution. 1409a53ad8SAndrew Turner * * Neither the name of the company nor the names of its contributors 1509a53ad8SAndrew Turner * may be used to endorse or promote products derived from this 1609a53ad8SAndrew Turner * software without specific prior written permission. 1709a53ad8SAndrew Turner * 1809a53ad8SAndrew Turner * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1909a53ad8SAndrew Turner * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 2009a53ad8SAndrew Turner * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 2109a53ad8SAndrew Turner * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 2209a53ad8SAndrew Turner * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 2309a53ad8SAndrew Turner * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 2409a53ad8SAndrew Turner * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 2509a53ad8SAndrew Turner * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 2609a53ad8SAndrew Turner * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 2709a53ad8SAndrew Turner * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 2809a53ad8SAndrew Turner * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 2909a53ad8SAndrew Turner */ 3009a53ad8SAndrew Turner 3109a53ad8SAndrew Turner/* Assumptions: 3209a53ad8SAndrew Turner * 3309a53ad8SAndrew Turner * ARMv8-a, AArch64 3409a53ad8SAndrew Turner * Neon Available. 3509a53ad8SAndrew Turner */ 3609a53ad8SAndrew Turner 3709a53ad8SAndrew Turner/* Arguments and results. */ 3809a53ad8SAndrew Turner#define srcin x0 3909a53ad8SAndrew Turner#define chrin w1 4009a53ad8SAndrew Turner#define cntin x2 4109a53ad8SAndrew Turner 4209a53ad8SAndrew Turner#define result x0 4309a53ad8SAndrew Turner 4409a53ad8SAndrew Turner#define src x3 4509a53ad8SAndrew Turner#define tmp x4 4609a53ad8SAndrew Turner#define wtmp2 w5 4709a53ad8SAndrew Turner#define synd x6 4809a53ad8SAndrew Turner#define soff x9 4909a53ad8SAndrew Turner#define cntrem x10 5009a53ad8SAndrew Turner 5109a53ad8SAndrew Turner#define vrepchr v0 5209a53ad8SAndrew Turner#define vdata1 v1 5309a53ad8SAndrew Turner#define vdata2 v2 5409a53ad8SAndrew Turner#define vhas_chr1 v3 5509a53ad8SAndrew Turner#define vhas_chr2 v4 5609a53ad8SAndrew Turner#define vrepmask v5 5709a53ad8SAndrew Turner#define vend v6 5809a53ad8SAndrew Turner 5909a53ad8SAndrew Turner/* 6009a53ad8SAndrew Turner * Core algorithm: 6109a53ad8SAndrew Turner * 6209a53ad8SAndrew Turner * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits 6309a53ad8SAndrew Turner * per byte. For each tuple, bit 0 is set if the relevant byte matched the 6409a53ad8SAndrew Turner * requested character and bit 1 is not used (faster than using a 32bit 6509a53ad8SAndrew Turner * syndrome). Since the bits in the syndrome reflect exactly the order in which 6609a53ad8SAndrew Turner * things occur in the original string, counting trailing zeros allows to 6709a53ad8SAndrew Turner * identify exactly which byte has matched. 6809a53ad8SAndrew Turner */ 6909a53ad8SAndrew Turner 7009a53ad8SAndrew Turner .macro def_fn f p2align=0 7109a53ad8SAndrew Turner .text 7209a53ad8SAndrew Turner .p2align \p2align 7309a53ad8SAndrew Turner .global \f 7409a53ad8SAndrew Turner .type \f, %function 7509a53ad8SAndrew Turner\f: 7609a53ad8SAndrew Turner .endm 7709a53ad8SAndrew Turner 7809a53ad8SAndrew Turnerdef_fn memchr 7909a53ad8SAndrew Turner /* Do not dereference srcin if no bytes to compare. */ 8009a53ad8SAndrew Turner cbz cntin, .Lzero_length 8109a53ad8SAndrew Turner /* 8209a53ad8SAndrew Turner * Magic constant 0x40100401 allows us to identify which lane matches 8309a53ad8SAndrew Turner * the requested byte. 8409a53ad8SAndrew Turner */ 8509a53ad8SAndrew Turner mov wtmp2, #0x0401 8609a53ad8SAndrew Turner movk wtmp2, #0x4010, lsl #16 8709a53ad8SAndrew Turner dup vrepchr.16b, chrin 8809a53ad8SAndrew Turner /* Work with aligned 32-byte chunks */ 8909a53ad8SAndrew Turner bic src, srcin, #31 9009a53ad8SAndrew Turner dup vrepmask.4s, wtmp2 9109a53ad8SAndrew Turner ands soff, srcin, #31 9209a53ad8SAndrew Turner and cntrem, cntin, #31 9309a53ad8SAndrew Turner b.eq .Lloop 9409a53ad8SAndrew Turner 9509a53ad8SAndrew Turner /* 9609a53ad8SAndrew Turner * Input string is not 32-byte aligned. We calculate the syndrome 9709a53ad8SAndrew Turner * value for the aligned 32 bytes block containing the first bytes 9809a53ad8SAndrew Turner * and mask the irrelevant part. 9909a53ad8SAndrew Turner */ 10009a53ad8SAndrew Turner 10109a53ad8SAndrew Turner ld1 {vdata1.16b, vdata2.16b}, [src], #32 10209a53ad8SAndrew Turner sub tmp, soff, #32 10309a53ad8SAndrew Turner adds cntin, cntin, tmp 10409a53ad8SAndrew Turner cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 10509a53ad8SAndrew Turner cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 10609a53ad8SAndrew Turner and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 10709a53ad8SAndrew Turner and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 10809a53ad8SAndrew Turner addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ 10909a53ad8SAndrew Turner addp vend.16b, vend.16b, vend.16b /* 128->64 */ 110*27044e17SAndrew Turner mov synd, vend.d[0] 11109a53ad8SAndrew Turner /* Clear the soff*2 lower bits */ 11209a53ad8SAndrew Turner lsl tmp, soff, #1 11309a53ad8SAndrew Turner lsr synd, synd, tmp 11409a53ad8SAndrew Turner lsl synd, synd, tmp 11509a53ad8SAndrew Turner /* The first block can also be the last */ 11609a53ad8SAndrew Turner b.ls .Lmasklast 11709a53ad8SAndrew Turner /* Have we found something already? */ 11809a53ad8SAndrew Turner cbnz synd, .Ltail 11909a53ad8SAndrew Turner 12009a53ad8SAndrew Turner.Lloop: 12109a53ad8SAndrew Turner ld1 {vdata1.16b, vdata2.16b}, [src], #32 12209a53ad8SAndrew Turner subs cntin, cntin, #32 12309a53ad8SAndrew Turner cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 12409a53ad8SAndrew Turner cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 12509a53ad8SAndrew Turner /* If we're out of data we finish regardless of the result */ 12609a53ad8SAndrew Turner b.ls .Lend 12709a53ad8SAndrew Turner /* Use a fast check for the termination condition */ 12809a53ad8SAndrew Turner orr vend.16b, vhas_chr1.16b, vhas_chr2.16b 12909a53ad8SAndrew Turner addp vend.2d, vend.2d, vend.2d 130*27044e17SAndrew Turner mov synd, vend.d[0] 13109a53ad8SAndrew Turner /* We're not out of data, loop if we haven't found the character */ 13209a53ad8SAndrew Turner cbz synd, .Lloop 13309a53ad8SAndrew Turner 13409a53ad8SAndrew Turner.Lend: 13509a53ad8SAndrew Turner /* Termination condition found, let's calculate the syndrome value */ 13609a53ad8SAndrew Turner and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 13709a53ad8SAndrew Turner and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 13809a53ad8SAndrew Turner addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ 13909a53ad8SAndrew Turner addp vend.16b, vend.16b, vend.16b /* 128->64 */ 140*27044e17SAndrew Turner mov synd, vend.d[0] 14109a53ad8SAndrew Turner /* Only do the clear for the last possible block */ 14209a53ad8SAndrew Turner b.hi .Ltail 14309a53ad8SAndrew Turner 14409a53ad8SAndrew Turner.Lmasklast: 14509a53ad8SAndrew Turner /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ 14609a53ad8SAndrew Turner add tmp, cntrem, soff 14709a53ad8SAndrew Turner and tmp, tmp, #31 14809a53ad8SAndrew Turner sub tmp, tmp, #32 14909a53ad8SAndrew Turner neg tmp, tmp, lsl #1 15009a53ad8SAndrew Turner lsl synd, synd, tmp 15109a53ad8SAndrew Turner lsr synd, synd, tmp 15209a53ad8SAndrew Turner 15309a53ad8SAndrew Turner.Ltail: 15409a53ad8SAndrew Turner /* Count the trailing zeros using bit reversing */ 15509a53ad8SAndrew Turner rbit synd, synd 15609a53ad8SAndrew Turner /* Compensate the last post-increment */ 15709a53ad8SAndrew Turner sub src, src, #32 15809a53ad8SAndrew Turner /* Check that we have found a character */ 15909a53ad8SAndrew Turner cmp synd, #0 16009a53ad8SAndrew Turner /* And count the leading zeros */ 16109a53ad8SAndrew Turner clz synd, synd 16209a53ad8SAndrew Turner /* Compute the potential result */ 16309a53ad8SAndrew Turner add result, src, synd, lsr #1 16409a53ad8SAndrew Turner /* Select result or NULL */ 16509a53ad8SAndrew Turner csel result, xzr, result, eq 16609a53ad8SAndrew Turner ret 16709a53ad8SAndrew Turner 16809a53ad8SAndrew Turner.Lzero_length: 16909a53ad8SAndrew Turner mov result, #0 17009a53ad8SAndrew Turner ret 17109a53ad8SAndrew Turner 17209a53ad8SAndrew Turner .size memchr, . - memchr 173