1*09a53ad8SAndrew Turner/* Copyright (c) 2013-2015, Linaro Limited 2*09a53ad8SAndrew Turner All rights reserved. 3*09a53ad8SAndrew Turner 4*09a53ad8SAndrew Turner Redistribution and use in source and binary forms, with or without 5*09a53ad8SAndrew Turner modification, are permitted provided that the following conditions are met: 6*09a53ad8SAndrew Turner * Redistributions of source code must retain the above copyright 7*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer. 8*09a53ad8SAndrew Turner * Redistributions in binary form must reproduce the above copyright 9*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer in the 10*09a53ad8SAndrew Turner documentation and/or other materials provided with the distribution. 11*09a53ad8SAndrew Turner * Neither the name of the Linaro nor the 12*09a53ad8SAndrew Turner names of its contributors may be used to endorse or promote products 13*09a53ad8SAndrew Turner derived from this software without specific prior written permission. 14*09a53ad8SAndrew Turner 15*09a53ad8SAndrew Turner THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16*09a53ad8SAndrew Turner "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17*09a53ad8SAndrew Turner LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18*09a53ad8SAndrew Turner A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19*09a53ad8SAndrew Turner HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20*09a53ad8SAndrew Turner SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21*09a53ad8SAndrew Turner LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22*09a53ad8SAndrew Turner DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23*09a53ad8SAndrew Turner THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24*09a53ad8SAndrew Turner (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25*09a53ad8SAndrew Turner OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26*09a53ad8SAndrew Turner 27*09a53ad8SAndrew Turner/* Assumptions: 28*09a53ad8SAndrew Turner * 29*09a53ad8SAndrew Turner * ARMv8-a, AArch64, unaligned accesses, min page size 4k. 30*09a53ad8SAndrew Turner */ 31*09a53ad8SAndrew Turner 32*09a53ad8SAndrew Turner/* To test the page crossing code path more thoroughly, compile with 33*09a53ad8SAndrew Turner -DTEST_PAGE_CROSS - this will force all calls through the slower 34*09a53ad8SAndrew Turner entry path. This option is not intended for production use. */ 35*09a53ad8SAndrew Turner 36*09a53ad8SAndrew Turner/* Arguments and results. */ 37*09a53ad8SAndrew Turner#define srcin x0 38*09a53ad8SAndrew Turner#define len x0 39*09a53ad8SAndrew Turner 40*09a53ad8SAndrew Turner/* Locals and temporaries. */ 41*09a53ad8SAndrew Turner#define src x1 42*09a53ad8SAndrew Turner#define data1 x2 43*09a53ad8SAndrew Turner#define data2 x3 44*09a53ad8SAndrew Turner#define has_nul1 x4 45*09a53ad8SAndrew Turner#define has_nul2 x5 46*09a53ad8SAndrew Turner#define tmp1 x4 47*09a53ad8SAndrew Turner#define tmp2 x5 48*09a53ad8SAndrew Turner#define tmp3 x6 49*09a53ad8SAndrew Turner#define tmp4 x7 50*09a53ad8SAndrew Turner#define zeroones x8 51*09a53ad8SAndrew Turner 52*09a53ad8SAndrew Turner#define L(l) .L ## l 53*09a53ad8SAndrew Turner 54*09a53ad8SAndrew Turner .macro def_fn f p2align=0 55*09a53ad8SAndrew Turner .text 56*09a53ad8SAndrew Turner .p2align \p2align 57*09a53ad8SAndrew Turner .global \f 58*09a53ad8SAndrew Turner .type \f, %function 59*09a53ad8SAndrew Turner\f: 60*09a53ad8SAndrew Turner .endm 61*09a53ad8SAndrew Turner 62*09a53ad8SAndrew Turner /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 63*09a53ad8SAndrew Turner (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 64*09a53ad8SAndrew Turner can be done in parallel across the entire word. A faster check 65*09a53ad8SAndrew Turner (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives 66*09a53ad8SAndrew Turner false hits for characters 129..255. */ 67*09a53ad8SAndrew Turner 68*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101 69*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f 70*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080 71*09a53ad8SAndrew Turner 72*09a53ad8SAndrew Turner#ifdef TEST_PAGE_CROSS 73*09a53ad8SAndrew Turner# define MIN_PAGE_SIZE 15 74*09a53ad8SAndrew Turner#else 75*09a53ad8SAndrew Turner# define MIN_PAGE_SIZE 4096 76*09a53ad8SAndrew Turner#endif 77*09a53ad8SAndrew Turner 78*09a53ad8SAndrew Turner /* Since strings are short on average, we check the first 16 bytes 79*09a53ad8SAndrew Turner of the string for a NUL character. In order to do an unaligned ldp 80*09a53ad8SAndrew Turner safely we have to do a page cross check first. If there is a NUL 81*09a53ad8SAndrew Turner byte we calculate the length from the 2 8-byte words using 82*09a53ad8SAndrew Turner conditional select to reduce branch mispredictions (it is unlikely 83*09a53ad8SAndrew Turner strlen will be repeatedly called on strings with the same length). 84*09a53ad8SAndrew Turner 85*09a53ad8SAndrew Turner If the string is longer than 16 bytes, we align src so don't need 86*09a53ad8SAndrew Turner further page cross checks, and process 32 bytes per iteration 87*09a53ad8SAndrew Turner using the fast NUL check. If we encounter non-ASCII characters, 88*09a53ad8SAndrew Turner fallback to a second loop using the full NUL check. 89*09a53ad8SAndrew Turner 90*09a53ad8SAndrew Turner If the page cross check fails, we read 16 bytes from an aligned 91*09a53ad8SAndrew Turner address, remove any characters before the string, and continue 92*09a53ad8SAndrew Turner in the main loop using aligned loads. Since strings crossing a 93*09a53ad8SAndrew Turner page in the first 16 bytes are rare (probability of 94*09a53ad8SAndrew Turner 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. 95*09a53ad8SAndrew Turner 96*09a53ad8SAndrew Turner AArch64 systems have a minimum page size of 4k. We don't bother 97*09a53ad8SAndrew Turner checking for larger page sizes - the cost of setting up the correct 98*09a53ad8SAndrew Turner page size is just not worth the extra gain from a small reduction in 99*09a53ad8SAndrew Turner the cases taking the slow path. Note that we only care about 100*09a53ad8SAndrew Turner whether the first fetch, which may be misaligned, crosses a page 101*09a53ad8SAndrew Turner boundary. */ 102*09a53ad8SAndrew Turner 103*09a53ad8SAndrew Turnerdef_fn strlen p2align=6 104*09a53ad8SAndrew Turner and tmp1, srcin, MIN_PAGE_SIZE - 1 105*09a53ad8SAndrew Turner mov zeroones, REP8_01 106*09a53ad8SAndrew Turner cmp tmp1, MIN_PAGE_SIZE - 16 107*09a53ad8SAndrew Turner b.gt L(page_cross) 108*09a53ad8SAndrew Turner ldp data1, data2, [srcin] 109*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 110*09a53ad8SAndrew Turner /* For big-endian, carry propagation (if the final byte in the 111*09a53ad8SAndrew Turner string is 0x01) means we cannot use has_nul1/2 directly. 112*09a53ad8SAndrew Turner Since we expect strings to be small and early-exit, 113*09a53ad8SAndrew Turner byte-swap the data now so has_null1/2 will be correct. */ 114*09a53ad8SAndrew Turner rev data1, data1 115*09a53ad8SAndrew Turner rev data2, data2 116*09a53ad8SAndrew Turner#endif 117*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 118*09a53ad8SAndrew Turner orr tmp2, data1, REP8_7f 119*09a53ad8SAndrew Turner sub tmp3, data2, zeroones 120*09a53ad8SAndrew Turner orr tmp4, data2, REP8_7f 121*09a53ad8SAndrew Turner bics has_nul1, tmp1, tmp2 122*09a53ad8SAndrew Turner bic has_nul2, tmp3, tmp4 123*09a53ad8SAndrew Turner ccmp has_nul2, 0, 0, eq 124*09a53ad8SAndrew Turner beq L(main_loop_entry) 125*09a53ad8SAndrew Turner 126*09a53ad8SAndrew Turner /* Enter with C = has_nul1 == 0. */ 127*09a53ad8SAndrew Turner csel has_nul1, has_nul1, has_nul2, cc 128*09a53ad8SAndrew Turner mov len, 8 129*09a53ad8SAndrew Turner rev has_nul1, has_nul1 130*09a53ad8SAndrew Turner clz tmp1, has_nul1 131*09a53ad8SAndrew Turner csel len, xzr, len, cc 132*09a53ad8SAndrew Turner add len, len, tmp1, lsr 3 133*09a53ad8SAndrew Turner ret 134*09a53ad8SAndrew Turner 135*09a53ad8SAndrew Turner /* The inner loop processes 32 bytes per iteration and uses the fast 136*09a53ad8SAndrew Turner NUL check. If we encounter non-ASCII characters, use a second 137*09a53ad8SAndrew Turner loop with the accurate NUL check. */ 138*09a53ad8SAndrew Turner .p2align 4 139*09a53ad8SAndrew TurnerL(main_loop_entry): 140*09a53ad8SAndrew Turner bic src, srcin, 15 141*09a53ad8SAndrew Turner sub src, src, 16 142*09a53ad8SAndrew TurnerL(main_loop): 143*09a53ad8SAndrew Turner ldp data1, data2, [src, 32]! 144*09a53ad8SAndrew Turner.Lpage_cross_entry: 145*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 146*09a53ad8SAndrew Turner sub tmp3, data2, zeroones 147*09a53ad8SAndrew Turner orr tmp2, tmp1, tmp3 148*09a53ad8SAndrew Turner tst tmp2, zeroones, lsl 7 149*09a53ad8SAndrew Turner bne 1f 150*09a53ad8SAndrew Turner ldp data1, data2, [src, 16] 151*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 152*09a53ad8SAndrew Turner sub tmp3, data2, zeroones 153*09a53ad8SAndrew Turner orr tmp2, tmp1, tmp3 154*09a53ad8SAndrew Turner tst tmp2, zeroones, lsl 7 155*09a53ad8SAndrew Turner beq L(main_loop) 156*09a53ad8SAndrew Turner add src, src, 16 157*09a53ad8SAndrew Turner1: 158*09a53ad8SAndrew Turner /* The fast check failed, so do the slower, accurate NUL check. */ 159*09a53ad8SAndrew Turner orr tmp2, data1, REP8_7f 160*09a53ad8SAndrew Turner orr tmp4, data2, REP8_7f 161*09a53ad8SAndrew Turner bics has_nul1, tmp1, tmp2 162*09a53ad8SAndrew Turner bic has_nul2, tmp3, tmp4 163*09a53ad8SAndrew Turner ccmp has_nul2, 0, 0, eq 164*09a53ad8SAndrew Turner beq L(nonascii_loop) 165*09a53ad8SAndrew Turner 166*09a53ad8SAndrew Turner /* Enter with C = has_nul1 == 0. */ 167*09a53ad8SAndrew TurnerL(tail): 168*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 169*09a53ad8SAndrew Turner /* For big-endian, carry propagation (if the final byte in the 170*09a53ad8SAndrew Turner string is 0x01) means we cannot use has_nul1/2 directly. The 171*09a53ad8SAndrew Turner easiest way to get the correct byte is to byte-swap the data 172*09a53ad8SAndrew Turner and calculate the syndrome a second time. */ 173*09a53ad8SAndrew Turner csel data1, data1, data2, cc 174*09a53ad8SAndrew Turner rev data1, data1 175*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 176*09a53ad8SAndrew Turner orr tmp2, data1, REP8_7f 177*09a53ad8SAndrew Turner bic has_nul1, tmp1, tmp2 178*09a53ad8SAndrew Turner#else 179*09a53ad8SAndrew Turner csel has_nul1, has_nul1, has_nul2, cc 180*09a53ad8SAndrew Turner#endif 181*09a53ad8SAndrew Turner sub len, src, srcin 182*09a53ad8SAndrew Turner rev has_nul1, has_nul1 183*09a53ad8SAndrew Turner add tmp2, len, 8 184*09a53ad8SAndrew Turner clz tmp1, has_nul1 185*09a53ad8SAndrew Turner csel len, len, tmp2, cc 186*09a53ad8SAndrew Turner add len, len, tmp1, lsr 3 187*09a53ad8SAndrew Turner ret 188*09a53ad8SAndrew Turner 189*09a53ad8SAndrew TurnerL(nonascii_loop): 190*09a53ad8SAndrew Turner ldp data1, data2, [src, 16]! 191*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 192*09a53ad8SAndrew Turner orr tmp2, data1, REP8_7f 193*09a53ad8SAndrew Turner sub tmp3, data2, zeroones 194*09a53ad8SAndrew Turner orr tmp4, data2, REP8_7f 195*09a53ad8SAndrew Turner bics has_nul1, tmp1, tmp2 196*09a53ad8SAndrew Turner bic has_nul2, tmp3, tmp4 197*09a53ad8SAndrew Turner ccmp has_nul2, 0, 0, eq 198*09a53ad8SAndrew Turner bne L(tail) 199*09a53ad8SAndrew Turner ldp data1, data2, [src, 16]! 200*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 201*09a53ad8SAndrew Turner orr tmp2, data1, REP8_7f 202*09a53ad8SAndrew Turner sub tmp3, data2, zeroones 203*09a53ad8SAndrew Turner orr tmp4, data2, REP8_7f 204*09a53ad8SAndrew Turner bics has_nul1, tmp1, tmp2 205*09a53ad8SAndrew Turner bic has_nul2, tmp3, tmp4 206*09a53ad8SAndrew Turner ccmp has_nul2, 0, 0, eq 207*09a53ad8SAndrew Turner beq L(nonascii_loop) 208*09a53ad8SAndrew Turner b L(tail) 209*09a53ad8SAndrew Turner 210*09a53ad8SAndrew Turner /* Load 16 bytes from [srcin & ~15] and force the bytes that precede 211*09a53ad8SAndrew Turner srcin to 0x7f, so we ignore any NUL bytes before the string. 212*09a53ad8SAndrew Turner Then continue in the aligned loop. */ 213*09a53ad8SAndrew TurnerL(page_cross): 214*09a53ad8SAndrew Turner bic src, srcin, 15 215*09a53ad8SAndrew Turner ldp data1, data2, [src] 216*09a53ad8SAndrew Turner lsl tmp1, srcin, 3 217*09a53ad8SAndrew Turner mov tmp4, -1 218*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 219*09a53ad8SAndrew Turner /* Big-endian. Early bytes are at MSB. */ 220*09a53ad8SAndrew Turner lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 221*09a53ad8SAndrew Turner#else 222*09a53ad8SAndrew Turner /* Little-endian. Early bytes are at LSB. */ 223*09a53ad8SAndrew Turner lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 224*09a53ad8SAndrew Turner#endif 225*09a53ad8SAndrew Turner orr tmp1, tmp1, REP8_80 226*09a53ad8SAndrew Turner orn data1, data1, tmp1 227*09a53ad8SAndrew Turner orn tmp2, data2, tmp1 228*09a53ad8SAndrew Turner tst srcin, 8 229*09a53ad8SAndrew Turner csel data1, data1, tmp4, eq 230*09a53ad8SAndrew Turner csel data2, data2, tmp2, eq 231*09a53ad8SAndrew Turner b L(page_cross_entry) 232*09a53ad8SAndrew Turner 233*09a53ad8SAndrew Turner .size strlen, . - strlen 234