1*09a53ad8SAndrew Turner/* strnlen - calculate the length of a string with limit. 2*09a53ad8SAndrew Turner 3*09a53ad8SAndrew Turner Copyright (c) 2013, Linaro Limited 4*09a53ad8SAndrew Turner All rights reserved. 5*09a53ad8SAndrew Turner 6*09a53ad8SAndrew Turner Redistribution and use in source and binary forms, with or without 7*09a53ad8SAndrew Turner modification, are permitted provided that the following conditions are met: 8*09a53ad8SAndrew Turner * Redistributions of source code must retain the above copyright 9*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer. 10*09a53ad8SAndrew Turner * Redistributions in binary form must reproduce the above copyright 11*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer in the 12*09a53ad8SAndrew Turner documentation and/or other materials provided with the distribution. 13*09a53ad8SAndrew Turner * Neither the name of the Linaro nor the 14*09a53ad8SAndrew Turner names of its contributors may be used to endorse or promote products 15*09a53ad8SAndrew Turner derived from this software without specific prior written permission. 16*09a53ad8SAndrew Turner 17*09a53ad8SAndrew Turner THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18*09a53ad8SAndrew Turner "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19*09a53ad8SAndrew Turner LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20*09a53ad8SAndrew Turner A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21*09a53ad8SAndrew Turner HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22*09a53ad8SAndrew Turner SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23*09a53ad8SAndrew Turner LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24*09a53ad8SAndrew Turner DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25*09a53ad8SAndrew Turner THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26*09a53ad8SAndrew Turner (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27*09a53ad8SAndrew Turner OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 28*09a53ad8SAndrew Turner 29*09a53ad8SAndrew Turner/* Assumptions: 30*09a53ad8SAndrew Turner * 31*09a53ad8SAndrew Turner * ARMv8-a, AArch64 32*09a53ad8SAndrew Turner */ 33*09a53ad8SAndrew Turner 34*09a53ad8SAndrew Turner/* Arguments and results. */ 35*09a53ad8SAndrew Turner#define srcin x0 36*09a53ad8SAndrew Turner#define len x0 37*09a53ad8SAndrew Turner#define limit x1 38*09a53ad8SAndrew Turner 39*09a53ad8SAndrew Turner/* Locals and temporaries. */ 40*09a53ad8SAndrew Turner#define src x2 41*09a53ad8SAndrew Turner#define data1 x3 42*09a53ad8SAndrew Turner#define data2 x4 43*09a53ad8SAndrew Turner#define data2a x5 44*09a53ad8SAndrew Turner#define has_nul1 x6 45*09a53ad8SAndrew Turner#define has_nul2 x7 46*09a53ad8SAndrew Turner#define tmp1 x8 47*09a53ad8SAndrew Turner#define tmp2 x9 48*09a53ad8SAndrew Turner#define tmp3 x10 49*09a53ad8SAndrew Turner#define tmp4 x11 50*09a53ad8SAndrew Turner#define zeroones x12 51*09a53ad8SAndrew Turner#define pos x13 52*09a53ad8SAndrew Turner#define limit_wd x14 53*09a53ad8SAndrew Turner 54*09a53ad8SAndrew Turner .macro def_fn f p2align=0 55*09a53ad8SAndrew Turner .text 56*09a53ad8SAndrew Turner .p2align \p2align 57*09a53ad8SAndrew Turner .global \f 58*09a53ad8SAndrew Turner .type \f, %function 59*09a53ad8SAndrew Turner\f: 60*09a53ad8SAndrew Turner .endm 61*09a53ad8SAndrew Turner 62*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101 63*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f 64*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080 65*09a53ad8SAndrew Turner 66*09a53ad8SAndrew Turner .text 67*09a53ad8SAndrew Turner .p2align 6 68*09a53ad8SAndrew Turner.Lstart: 69*09a53ad8SAndrew Turner /* Pre-pad to ensure critical loop begins an icache line. */ 70*09a53ad8SAndrew Turner .rep 7 71*09a53ad8SAndrew Turner nop 72*09a53ad8SAndrew Turner .endr 73*09a53ad8SAndrew Turner /* Put this code here to avoid wasting more space with pre-padding. */ 74*09a53ad8SAndrew Turner.Lhit_limit: 75*09a53ad8SAndrew Turner mov len, limit 76*09a53ad8SAndrew Turner ret 77*09a53ad8SAndrew Turner 78*09a53ad8SAndrew Turnerdef_fn strnlen 79*09a53ad8SAndrew Turner cbz limit, .Lhit_limit 80*09a53ad8SAndrew Turner mov zeroones, #REP8_01 81*09a53ad8SAndrew Turner bic src, srcin, #15 82*09a53ad8SAndrew Turner ands tmp1, srcin, #15 83*09a53ad8SAndrew Turner b.ne .Lmisaligned 84*09a53ad8SAndrew Turner /* Calculate the number of full and partial words -1. */ 85*09a53ad8SAndrew Turner sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ 86*09a53ad8SAndrew Turner lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ 87*09a53ad8SAndrew Turner 88*09a53ad8SAndrew Turner /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 89*09a53ad8SAndrew Turner (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 90*09a53ad8SAndrew Turner can be done in parallel across the entire word. */ 91*09a53ad8SAndrew Turner /* The inner loop deals with two Dwords at a time. This has a 92*09a53ad8SAndrew Turner slightly higher start-up cost, but we should win quite quickly, 93*09a53ad8SAndrew Turner especially on cores with a high number of issue slots per 94*09a53ad8SAndrew Turner cycle, as we get much better parallelism out of the operations. */ 95*09a53ad8SAndrew Turner 96*09a53ad8SAndrew Turner /* Start of critial section -- keep to one 64Byte cache line. */ 97*09a53ad8SAndrew Turner.Lloop: 98*09a53ad8SAndrew Turner ldp data1, data2, [src], #16 99*09a53ad8SAndrew Turner.Lrealigned: 100*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 101*09a53ad8SAndrew Turner orr tmp2, data1, #REP8_7f 102*09a53ad8SAndrew Turner sub tmp3, data2, zeroones 103*09a53ad8SAndrew Turner orr tmp4, data2, #REP8_7f 104*09a53ad8SAndrew Turner bic has_nul1, tmp1, tmp2 105*09a53ad8SAndrew Turner bic has_nul2, tmp3, tmp4 106*09a53ad8SAndrew Turner subs limit_wd, limit_wd, #1 107*09a53ad8SAndrew Turner orr tmp1, has_nul1, has_nul2 108*09a53ad8SAndrew Turner ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ 109*09a53ad8SAndrew Turner b.eq .Lloop 110*09a53ad8SAndrew Turner /* End of critical section -- keep to one 64Byte cache line. */ 111*09a53ad8SAndrew Turner 112*09a53ad8SAndrew Turner orr tmp1, has_nul1, has_nul2 113*09a53ad8SAndrew Turner cbz tmp1, .Lhit_limit /* No null in final Qword. */ 114*09a53ad8SAndrew Turner 115*09a53ad8SAndrew Turner /* We know there's a null in the final Qword. The easiest thing 116*09a53ad8SAndrew Turner to do now is work out the length of the string and return 117*09a53ad8SAndrew Turner MIN (len, limit). */ 118*09a53ad8SAndrew Turner 119*09a53ad8SAndrew Turner sub len, src, srcin 120*09a53ad8SAndrew Turner cbz has_nul1, .Lnul_in_data2 121*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 122*09a53ad8SAndrew Turner mov data2, data1 123*09a53ad8SAndrew Turner#endif 124*09a53ad8SAndrew Turner sub len, len, #8 125*09a53ad8SAndrew Turner mov has_nul2, has_nul1 126*09a53ad8SAndrew Turner.Lnul_in_data2: 127*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 128*09a53ad8SAndrew Turner /* For big-endian, carry propagation (if the final byte in the 129*09a53ad8SAndrew Turner string is 0x01) means we cannot use has_nul directly. The 130*09a53ad8SAndrew Turner easiest way to get the correct byte is to byte-swap the data 131*09a53ad8SAndrew Turner and calculate the syndrome a second time. */ 132*09a53ad8SAndrew Turner rev data2, data2 133*09a53ad8SAndrew Turner sub tmp1, data2, zeroones 134*09a53ad8SAndrew Turner orr tmp2, data2, #REP8_7f 135*09a53ad8SAndrew Turner bic has_nul2, tmp1, tmp2 136*09a53ad8SAndrew Turner#endif 137*09a53ad8SAndrew Turner sub len, len, #8 138*09a53ad8SAndrew Turner rev has_nul2, has_nul2 139*09a53ad8SAndrew Turner clz pos, has_nul2 140*09a53ad8SAndrew Turner add len, len, pos, lsr #3 /* Bits to bytes. */ 141*09a53ad8SAndrew Turner cmp len, limit 142*09a53ad8SAndrew Turner csel len, len, limit, ls /* Return the lower value. */ 143*09a53ad8SAndrew Turner ret 144*09a53ad8SAndrew Turner 145*09a53ad8SAndrew Turner.Lmisaligned: 146*09a53ad8SAndrew Turner /* Deal with a partial first word. 147*09a53ad8SAndrew Turner We're doing two things in parallel here; 148*09a53ad8SAndrew Turner 1) Calculate the number of words (but avoiding overflow if 149*09a53ad8SAndrew Turner limit is near ULONG_MAX) - to do this we need to work out 150*09a53ad8SAndrew Turner limit + tmp1 - 1 as a 65-bit value before shifting it; 151*09a53ad8SAndrew Turner 2) Load and mask the initial data words - we force the bytes 152*09a53ad8SAndrew Turner before the ones we are interested in to 0xff - this ensures 153*09a53ad8SAndrew Turner early bytes will not hit any zero detection. */ 154*09a53ad8SAndrew Turner sub limit_wd, limit, #1 155*09a53ad8SAndrew Turner neg tmp4, tmp1 156*09a53ad8SAndrew Turner cmp tmp1, #8 157*09a53ad8SAndrew Turner 158*09a53ad8SAndrew Turner and tmp3, limit_wd, #15 159*09a53ad8SAndrew Turner lsr limit_wd, limit_wd, #4 160*09a53ad8SAndrew Turner mov tmp2, #~0 161*09a53ad8SAndrew Turner 162*09a53ad8SAndrew Turner ldp data1, data2, [src], #16 163*09a53ad8SAndrew Turner lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ 164*09a53ad8SAndrew Turner add tmp3, tmp3, tmp1 165*09a53ad8SAndrew Turner 166*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 167*09a53ad8SAndrew Turner /* Big-endian. Early bytes are at MSB. */ 168*09a53ad8SAndrew Turner lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ 169*09a53ad8SAndrew Turner#else 170*09a53ad8SAndrew Turner /* Little-endian. Early bytes are at LSB. */ 171*09a53ad8SAndrew Turner lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ 172*09a53ad8SAndrew Turner#endif 173*09a53ad8SAndrew Turner add limit_wd, limit_wd, tmp3, lsr #4 174*09a53ad8SAndrew Turner 175*09a53ad8SAndrew Turner orr data1, data1, tmp2 176*09a53ad8SAndrew Turner orr data2a, data2, tmp2 177*09a53ad8SAndrew Turner 178*09a53ad8SAndrew Turner csinv data1, data1, xzr, le 179*09a53ad8SAndrew Turner csel data2, data2, data2a, le 180*09a53ad8SAndrew Turner b .Lrealigned 181*09a53ad8SAndrew Turner .size strnlen, . - .Lstart /* Include pre-padding in size. */ 182