1*09a53ad8SAndrew Turner/* Copyright (c) 2013, Linaro Limited 2*09a53ad8SAndrew Turner All rights reserved. 3*09a53ad8SAndrew Turner 4*09a53ad8SAndrew Turner Redistribution and use in source and binary forms, with or without 5*09a53ad8SAndrew Turner modification, are permitted provided that the following conditions are met: 6*09a53ad8SAndrew Turner * Redistributions of source code must retain the above copyright 7*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer. 8*09a53ad8SAndrew Turner * Redistributions in binary form must reproduce the above copyright 9*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer in the 10*09a53ad8SAndrew Turner documentation and/or other materials provided with the distribution. 11*09a53ad8SAndrew Turner * Neither the name of the Linaro nor the 12*09a53ad8SAndrew Turner names of its contributors may be used to endorse or promote products 13*09a53ad8SAndrew Turner derived from this software without specific prior written permission. 14*09a53ad8SAndrew Turner 15*09a53ad8SAndrew Turner THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16*09a53ad8SAndrew Turner "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17*09a53ad8SAndrew Turner LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18*09a53ad8SAndrew Turner A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19*09a53ad8SAndrew Turner HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20*09a53ad8SAndrew Turner SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21*09a53ad8SAndrew Turner LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22*09a53ad8SAndrew Turner DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23*09a53ad8SAndrew Turner THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24*09a53ad8SAndrew Turner (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25*09a53ad8SAndrew Turner OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26*09a53ad8SAndrew Turner 27*09a53ad8SAndrew Turner/* Assumptions: 28*09a53ad8SAndrew Turner * 29*09a53ad8SAndrew Turner * ARMv8-a, AArch64 30*09a53ad8SAndrew Turner */ 31*09a53ad8SAndrew Turner 32*09a53ad8SAndrew Turner .macro def_fn f p2align=0 33*09a53ad8SAndrew Turner .text 34*09a53ad8SAndrew Turner .p2align \p2align 35*09a53ad8SAndrew Turner .global \f 36*09a53ad8SAndrew Turner .type \f, %function 37*09a53ad8SAndrew Turner\f: 38*09a53ad8SAndrew Turner .endm 39*09a53ad8SAndrew Turner 40*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101 41*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f 42*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080 43*09a53ad8SAndrew Turner 44*09a53ad8SAndrew Turner/* Parameters and result. */ 45*09a53ad8SAndrew Turner#define src1 x0 46*09a53ad8SAndrew Turner#define src2 x1 47*09a53ad8SAndrew Turner#define limit x2 48*09a53ad8SAndrew Turner#define result x0 49*09a53ad8SAndrew Turner 50*09a53ad8SAndrew Turner/* Internal variables. */ 51*09a53ad8SAndrew Turner#define data1 x3 52*09a53ad8SAndrew Turner#define data1w w3 53*09a53ad8SAndrew Turner#define data2 x4 54*09a53ad8SAndrew Turner#define data2w w4 55*09a53ad8SAndrew Turner#define has_nul x5 56*09a53ad8SAndrew Turner#define diff x6 57*09a53ad8SAndrew Turner#define syndrome x7 58*09a53ad8SAndrew Turner#define tmp1 x8 59*09a53ad8SAndrew Turner#define tmp2 x9 60*09a53ad8SAndrew Turner#define tmp3 x10 61*09a53ad8SAndrew Turner#define zeroones x11 62*09a53ad8SAndrew Turner#define pos x12 63*09a53ad8SAndrew Turner#define limit_wd x13 64*09a53ad8SAndrew Turner#define mask x14 65*09a53ad8SAndrew Turner#define endloop x15 66*09a53ad8SAndrew Turner 67*09a53ad8SAndrew Turner .text 68*09a53ad8SAndrew Turner .p2align 6 69*09a53ad8SAndrew Turner .rep 7 70*09a53ad8SAndrew Turner nop /* Pad so that the loop below fits a cache line. */ 71*09a53ad8SAndrew Turner .endr 72*09a53ad8SAndrew Turnerdef_fn strncmp 73*09a53ad8SAndrew Turner cbz limit, .Lret0 74*09a53ad8SAndrew Turner eor tmp1, src1, src2 75*09a53ad8SAndrew Turner mov zeroones, #REP8_01 76*09a53ad8SAndrew Turner tst tmp1, #7 77*09a53ad8SAndrew Turner b.ne .Lmisaligned8 78*09a53ad8SAndrew Turner ands tmp1, src1, #7 79*09a53ad8SAndrew Turner b.ne .Lmutual_align 80*09a53ad8SAndrew Turner /* Calculate the number of full and partial words -1. */ 81*09a53ad8SAndrew Turner sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 82*09a53ad8SAndrew Turner lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 83*09a53ad8SAndrew Turner 84*09a53ad8SAndrew Turner /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 85*09a53ad8SAndrew Turner (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 86*09a53ad8SAndrew Turner can be done in parallel across the entire word. */ 87*09a53ad8SAndrew Turner /* Start of performance-critical section -- one 64B cache line. */ 88*09a53ad8SAndrew Turner.Lloop_aligned: 89*09a53ad8SAndrew Turner ldr data1, [src1], #8 90*09a53ad8SAndrew Turner ldr data2, [src2], #8 91*09a53ad8SAndrew Turner.Lstart_realigned: 92*09a53ad8SAndrew Turner subs limit_wd, limit_wd, #1 93*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 94*09a53ad8SAndrew Turner orr tmp2, data1, #REP8_7f 95*09a53ad8SAndrew Turner eor diff, data1, data2 /* Non-zero if differences found. */ 96*09a53ad8SAndrew Turner csinv endloop, diff, xzr, pl /* Last Dword or differences. */ 97*09a53ad8SAndrew Turner bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 98*09a53ad8SAndrew Turner ccmp endloop, #0, #0, eq 99*09a53ad8SAndrew Turner b.eq .Lloop_aligned 100*09a53ad8SAndrew Turner /* End of performance-critical section -- one 64B cache line. */ 101*09a53ad8SAndrew Turner 102*09a53ad8SAndrew Turner /* Not reached the limit, must have found the end or a diff. */ 103*09a53ad8SAndrew Turner tbz limit_wd, #63, .Lnot_limit 104*09a53ad8SAndrew Turner 105*09a53ad8SAndrew Turner /* Limit % 8 == 0 => all bytes significant. */ 106*09a53ad8SAndrew Turner ands limit, limit, #7 107*09a53ad8SAndrew Turner b.eq .Lnot_limit 108*09a53ad8SAndrew Turner 109*09a53ad8SAndrew Turner lsl limit, limit, #3 /* Bits -> bytes. */ 110*09a53ad8SAndrew Turner mov mask, #~0 111*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 112*09a53ad8SAndrew Turner lsr mask, mask, limit 113*09a53ad8SAndrew Turner#else 114*09a53ad8SAndrew Turner lsl mask, mask, limit 115*09a53ad8SAndrew Turner#endif 116*09a53ad8SAndrew Turner bic data1, data1, mask 117*09a53ad8SAndrew Turner bic data2, data2, mask 118*09a53ad8SAndrew Turner 119*09a53ad8SAndrew Turner /* Make sure that the NUL byte is marked in the syndrome. */ 120*09a53ad8SAndrew Turner orr has_nul, has_nul, mask 121*09a53ad8SAndrew Turner 122*09a53ad8SAndrew Turner.Lnot_limit: 123*09a53ad8SAndrew Turner orr syndrome, diff, has_nul 124*09a53ad8SAndrew Turner 125*09a53ad8SAndrew Turner#ifndef __AARCH64EB__ 126*09a53ad8SAndrew Turner rev syndrome, syndrome 127*09a53ad8SAndrew Turner rev data1, data1 128*09a53ad8SAndrew Turner /* The MS-non-zero bit of the syndrome marks either the first bit 129*09a53ad8SAndrew Turner that is different, or the top bit of the first zero byte. 130*09a53ad8SAndrew Turner Shifting left now will bring the critical information into the 131*09a53ad8SAndrew Turner top bits. */ 132*09a53ad8SAndrew Turner clz pos, syndrome 133*09a53ad8SAndrew Turner rev data2, data2 134*09a53ad8SAndrew Turner lsl data1, data1, pos 135*09a53ad8SAndrew Turner lsl data2, data2, pos 136*09a53ad8SAndrew Turner /* But we need to zero-extend (char is unsigned) the value and then 137*09a53ad8SAndrew Turner perform a signed 32-bit subtraction. */ 138*09a53ad8SAndrew Turner lsr data1, data1, #56 139*09a53ad8SAndrew Turner sub result, data1, data2, lsr #56 140*09a53ad8SAndrew Turner ret 141*09a53ad8SAndrew Turner#else 142*09a53ad8SAndrew Turner /* For big-endian we cannot use the trick with the syndrome value 143*09a53ad8SAndrew Turner as carry-propagation can corrupt the upper bits if the trailing 144*09a53ad8SAndrew Turner bytes in the string contain 0x01. */ 145*09a53ad8SAndrew Turner /* However, if there is no NUL byte in the dword, we can generate 146*09a53ad8SAndrew Turner the result directly. We can't just subtract the bytes as the 147*09a53ad8SAndrew Turner MSB might be significant. */ 148*09a53ad8SAndrew Turner cbnz has_nul, 1f 149*09a53ad8SAndrew Turner cmp data1, data2 150*09a53ad8SAndrew Turner cset result, ne 151*09a53ad8SAndrew Turner cneg result, result, lo 152*09a53ad8SAndrew Turner ret 153*09a53ad8SAndrew Turner1: 154*09a53ad8SAndrew Turner /* Re-compute the NUL-byte detection, using a byte-reversed value. */ 155*09a53ad8SAndrew Turner rev tmp3, data1 156*09a53ad8SAndrew Turner sub tmp1, tmp3, zeroones 157*09a53ad8SAndrew Turner orr tmp2, tmp3, #REP8_7f 158*09a53ad8SAndrew Turner bic has_nul, tmp1, tmp2 159*09a53ad8SAndrew Turner rev has_nul, has_nul 160*09a53ad8SAndrew Turner orr syndrome, diff, has_nul 161*09a53ad8SAndrew Turner clz pos, syndrome 162*09a53ad8SAndrew Turner /* The MS-non-zero bit of the syndrome marks either the first bit 163*09a53ad8SAndrew Turner that is different, or the top bit of the first zero byte. 164*09a53ad8SAndrew Turner Shifting left now will bring the critical information into the 165*09a53ad8SAndrew Turner top bits. */ 166*09a53ad8SAndrew Turner lsl data1, data1, pos 167*09a53ad8SAndrew Turner lsl data2, data2, pos 168*09a53ad8SAndrew Turner /* But we need to zero-extend (char is unsigned) the value and then 169*09a53ad8SAndrew Turner perform a signed 32-bit subtraction. */ 170*09a53ad8SAndrew Turner lsr data1, data1, #56 171*09a53ad8SAndrew Turner sub result, data1, data2, lsr #56 172*09a53ad8SAndrew Turner ret 173*09a53ad8SAndrew Turner#endif 174*09a53ad8SAndrew Turner 175*09a53ad8SAndrew Turner.Lmutual_align: 176*09a53ad8SAndrew Turner /* Sources are mutually aligned, but are not currently at an 177*09a53ad8SAndrew Turner alignment boundary. Round down the addresses and then mask off 178*09a53ad8SAndrew Turner the bytes that precede the start point. 179*09a53ad8SAndrew Turner We also need to adjust the limit calculations, but without 180*09a53ad8SAndrew Turner overflowing if the limit is near ULONG_MAX. */ 181*09a53ad8SAndrew Turner bic src1, src1, #7 182*09a53ad8SAndrew Turner bic src2, src2, #7 183*09a53ad8SAndrew Turner ldr data1, [src1], #8 184*09a53ad8SAndrew Turner neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ 185*09a53ad8SAndrew Turner ldr data2, [src2], #8 186*09a53ad8SAndrew Turner mov tmp2, #~0 187*09a53ad8SAndrew Turner sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 188*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 189*09a53ad8SAndrew Turner /* Big-endian. Early bytes are at MSB. */ 190*09a53ad8SAndrew Turner lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ 191*09a53ad8SAndrew Turner#else 192*09a53ad8SAndrew Turner /* Little-endian. Early bytes are at LSB. */ 193*09a53ad8SAndrew Turner lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ 194*09a53ad8SAndrew Turner#endif 195*09a53ad8SAndrew Turner and tmp3, limit_wd, #7 196*09a53ad8SAndrew Turner lsr limit_wd, limit_wd, #3 197*09a53ad8SAndrew Turner /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ 198*09a53ad8SAndrew Turner add limit, limit, tmp1 199*09a53ad8SAndrew Turner add tmp3, tmp3, tmp1 200*09a53ad8SAndrew Turner orr data1, data1, tmp2 201*09a53ad8SAndrew Turner orr data2, data2, tmp2 202*09a53ad8SAndrew Turner add limit_wd, limit_wd, tmp3, lsr #3 203*09a53ad8SAndrew Turner b .Lstart_realigned 204*09a53ad8SAndrew Turner 205*09a53ad8SAndrew Turner.Lret0: 206*09a53ad8SAndrew Turner mov result, #0 207*09a53ad8SAndrew Turner ret 208*09a53ad8SAndrew Turner 209*09a53ad8SAndrew Turner .p2align 6 210*09a53ad8SAndrew Turner.Lmisaligned8: 211*09a53ad8SAndrew Turner sub limit, limit, #1 212*09a53ad8SAndrew Turner1: 213*09a53ad8SAndrew Turner /* Perhaps we can do better than this. */ 214*09a53ad8SAndrew Turner ldrb data1w, [src1], #1 215*09a53ad8SAndrew Turner ldrb data2w, [src2], #1 216*09a53ad8SAndrew Turner subs limit, limit, #1 217*09a53ad8SAndrew Turner ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ 218*09a53ad8SAndrew Turner ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 219*09a53ad8SAndrew Turner b.eq 1b 220*09a53ad8SAndrew Turner sub result, data1, data2 221*09a53ad8SAndrew Turner ret 222*09a53ad8SAndrew Turner .size strncmp, . - strncmp 223