1*09a53ad8SAndrew Turner/* Copyright (c) 2012, Linaro Limited 2*09a53ad8SAndrew Turner All rights reserved. 3*09a53ad8SAndrew Turner 4*09a53ad8SAndrew Turner Redistribution and use in source and binary forms, with or without 5*09a53ad8SAndrew Turner modification, are permitted provided that the following conditions are met: 6*09a53ad8SAndrew Turner * Redistributions of source code must retain the above copyright 7*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer. 8*09a53ad8SAndrew Turner * Redistributions in binary form must reproduce the above copyright 9*09a53ad8SAndrew Turner notice, this list of conditions and the following disclaimer in the 10*09a53ad8SAndrew Turner documentation and/or other materials provided with the distribution. 11*09a53ad8SAndrew Turner * Neither the name of the Linaro nor the 12*09a53ad8SAndrew Turner names of its contributors may be used to endorse or promote products 13*09a53ad8SAndrew Turner derived from this software without specific prior written permission. 14*09a53ad8SAndrew Turner 15*09a53ad8SAndrew Turner THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16*09a53ad8SAndrew Turner "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17*09a53ad8SAndrew Turner LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18*09a53ad8SAndrew Turner A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19*09a53ad8SAndrew Turner HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20*09a53ad8SAndrew Turner SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21*09a53ad8SAndrew Turner LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22*09a53ad8SAndrew Turner DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23*09a53ad8SAndrew Turner THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24*09a53ad8SAndrew Turner (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25*09a53ad8SAndrew Turner OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26*09a53ad8SAndrew Turner 27*09a53ad8SAndrew Turner/* Assumptions: 28*09a53ad8SAndrew Turner * 29*09a53ad8SAndrew Turner * ARMv8-a, AArch64 30*09a53ad8SAndrew Turner */ 31*09a53ad8SAndrew Turner 32*09a53ad8SAndrew Turner .macro def_fn f p2align=0 33*09a53ad8SAndrew Turner .text 34*09a53ad8SAndrew Turner .p2align \p2align 35*09a53ad8SAndrew Turner .global \f 36*09a53ad8SAndrew Turner .type \f, %function 37*09a53ad8SAndrew Turner\f: 38*09a53ad8SAndrew Turner .endm 39*09a53ad8SAndrew Turner 40*09a53ad8SAndrew Turner#define REP8_01 0x0101010101010101 41*09a53ad8SAndrew Turner#define REP8_7f 0x7f7f7f7f7f7f7f7f 42*09a53ad8SAndrew Turner#define REP8_80 0x8080808080808080 43*09a53ad8SAndrew Turner 44*09a53ad8SAndrew Turner/* Parameters and result. */ 45*09a53ad8SAndrew Turner#define src1 x0 46*09a53ad8SAndrew Turner#define src2 x1 47*09a53ad8SAndrew Turner#define result x0 48*09a53ad8SAndrew Turner 49*09a53ad8SAndrew Turner/* Internal variables. */ 50*09a53ad8SAndrew Turner#define data1 x2 51*09a53ad8SAndrew Turner#define data1w w2 52*09a53ad8SAndrew Turner#define data2 x3 53*09a53ad8SAndrew Turner#define data2w w3 54*09a53ad8SAndrew Turner#define has_nul x4 55*09a53ad8SAndrew Turner#define diff x5 56*09a53ad8SAndrew Turner#define syndrome x6 57*09a53ad8SAndrew Turner#define tmp1 x7 58*09a53ad8SAndrew Turner#define tmp2 x8 59*09a53ad8SAndrew Turner#define tmp3 x9 60*09a53ad8SAndrew Turner#define zeroones x10 61*09a53ad8SAndrew Turner#define pos x11 62*09a53ad8SAndrew Turner 63*09a53ad8SAndrew Turner /* Start of performance-critical section -- one 64B cache line. */ 64*09a53ad8SAndrew Turnerdef_fn strcmp p2align=6 65*09a53ad8SAndrew Turner eor tmp1, src1, src2 66*09a53ad8SAndrew Turner mov zeroones, #REP8_01 67*09a53ad8SAndrew Turner tst tmp1, #7 68*09a53ad8SAndrew Turner b.ne .Lmisaligned8 69*09a53ad8SAndrew Turner ands tmp1, src1, #7 70*09a53ad8SAndrew Turner b.ne .Lmutual_align 71*09a53ad8SAndrew Turner /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 72*09a53ad8SAndrew Turner (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 73*09a53ad8SAndrew Turner can be done in parallel across the entire word. */ 74*09a53ad8SAndrew Turner.Lloop_aligned: 75*09a53ad8SAndrew Turner ldr data1, [src1], #8 76*09a53ad8SAndrew Turner ldr data2, [src2], #8 77*09a53ad8SAndrew Turner.Lstart_realigned: 78*09a53ad8SAndrew Turner sub tmp1, data1, zeroones 79*09a53ad8SAndrew Turner orr tmp2, data1, #REP8_7f 80*09a53ad8SAndrew Turner eor diff, data1, data2 /* Non-zero if differences found. */ 81*09a53ad8SAndrew Turner bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 82*09a53ad8SAndrew Turner orr syndrome, diff, has_nul 83*09a53ad8SAndrew Turner cbz syndrome, .Lloop_aligned 84*09a53ad8SAndrew Turner /* End of performance-critical section -- one 64B cache line. */ 85*09a53ad8SAndrew Turner 86*09a53ad8SAndrew Turner#ifndef __AARCH64EB__ 87*09a53ad8SAndrew Turner rev syndrome, syndrome 88*09a53ad8SAndrew Turner rev data1, data1 89*09a53ad8SAndrew Turner /* The MS-non-zero bit of the syndrome marks either the first bit 90*09a53ad8SAndrew Turner that is different, or the top bit of the first zero byte. 91*09a53ad8SAndrew Turner Shifting left now will bring the critical information into the 92*09a53ad8SAndrew Turner top bits. */ 93*09a53ad8SAndrew Turner clz pos, syndrome 94*09a53ad8SAndrew Turner rev data2, data2 95*09a53ad8SAndrew Turner lsl data1, data1, pos 96*09a53ad8SAndrew Turner lsl data2, data2, pos 97*09a53ad8SAndrew Turner /* But we need to zero-extend (char is unsigned) the value and then 98*09a53ad8SAndrew Turner perform a signed 32-bit subtraction. */ 99*09a53ad8SAndrew Turner lsr data1, data1, #56 100*09a53ad8SAndrew Turner sub result, data1, data2, lsr #56 101*09a53ad8SAndrew Turner ret 102*09a53ad8SAndrew Turner#else 103*09a53ad8SAndrew Turner /* For big-endian we cannot use the trick with the syndrome value 104*09a53ad8SAndrew Turner as carry-propagation can corrupt the upper bits if the trailing 105*09a53ad8SAndrew Turner bytes in the string contain 0x01. */ 106*09a53ad8SAndrew Turner /* However, if there is no NUL byte in the dword, we can generate 107*09a53ad8SAndrew Turner the result directly. We can't just subtract the bytes as the 108*09a53ad8SAndrew Turner MSB might be significant. */ 109*09a53ad8SAndrew Turner cbnz has_nul, 1f 110*09a53ad8SAndrew Turner cmp data1, data2 111*09a53ad8SAndrew Turner cset result, ne 112*09a53ad8SAndrew Turner cneg result, result, lo 113*09a53ad8SAndrew Turner ret 114*09a53ad8SAndrew Turner1: 115*09a53ad8SAndrew Turner /* Re-compute the NUL-byte detection, using a byte-reversed value. */ 116*09a53ad8SAndrew Turner rev tmp3, data1 117*09a53ad8SAndrew Turner sub tmp1, tmp3, zeroones 118*09a53ad8SAndrew Turner orr tmp2, tmp3, #REP8_7f 119*09a53ad8SAndrew Turner bic has_nul, tmp1, tmp2 120*09a53ad8SAndrew Turner rev has_nul, has_nul 121*09a53ad8SAndrew Turner orr syndrome, diff, has_nul 122*09a53ad8SAndrew Turner clz pos, syndrome 123*09a53ad8SAndrew Turner /* The MS-non-zero bit of the syndrome marks either the first bit 124*09a53ad8SAndrew Turner that is different, or the top bit of the first zero byte. 125*09a53ad8SAndrew Turner Shifting left now will bring the critical information into the 126*09a53ad8SAndrew Turner top bits. */ 127*09a53ad8SAndrew Turner lsl data1, data1, pos 128*09a53ad8SAndrew Turner lsl data2, data2, pos 129*09a53ad8SAndrew Turner /* But we need to zero-extend (char is unsigned) the value and then 130*09a53ad8SAndrew Turner perform a signed 32-bit subtraction. */ 131*09a53ad8SAndrew Turner lsr data1, data1, #56 132*09a53ad8SAndrew Turner sub result, data1, data2, lsr #56 133*09a53ad8SAndrew Turner ret 134*09a53ad8SAndrew Turner#endif 135*09a53ad8SAndrew Turner 136*09a53ad8SAndrew Turner.Lmutual_align: 137*09a53ad8SAndrew Turner /* Sources are mutually aligned, but are not currently at an 138*09a53ad8SAndrew Turner alignment boundary. Round down the addresses and then mask off 139*09a53ad8SAndrew Turner the bytes that preceed the start point. */ 140*09a53ad8SAndrew Turner bic src1, src1, #7 141*09a53ad8SAndrew Turner bic src2, src2, #7 142*09a53ad8SAndrew Turner lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ 143*09a53ad8SAndrew Turner ldr data1, [src1], #8 144*09a53ad8SAndrew Turner neg tmp1, tmp1 /* Bits to alignment -64. */ 145*09a53ad8SAndrew Turner ldr data2, [src2], #8 146*09a53ad8SAndrew Turner mov tmp2, #~0 147*09a53ad8SAndrew Turner#ifdef __AARCH64EB__ 148*09a53ad8SAndrew Turner /* Big-endian. Early bytes are at MSB. */ 149*09a53ad8SAndrew Turner lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ 150*09a53ad8SAndrew Turner#else 151*09a53ad8SAndrew Turner /* Little-endian. Early bytes are at LSB. */ 152*09a53ad8SAndrew Turner lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ 153*09a53ad8SAndrew Turner#endif 154*09a53ad8SAndrew Turner orr data1, data1, tmp2 155*09a53ad8SAndrew Turner orr data2, data2, tmp2 156*09a53ad8SAndrew Turner b .Lstart_realigned 157*09a53ad8SAndrew Turner 158*09a53ad8SAndrew Turner.Lmisaligned8: 159*09a53ad8SAndrew Turner /* We can do better than this. */ 160*09a53ad8SAndrew Turner ldrb data1w, [src1], #1 161*09a53ad8SAndrew Turner ldrb data2w, [src2], #1 162*09a53ad8SAndrew Turner cmp data1w, #1 163*09a53ad8SAndrew Turner ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 164*09a53ad8SAndrew Turner b.eq .Lmisaligned8 165*09a53ad8SAndrew Turner sub result, data1, data2 166*09a53ad8SAndrew Turner ret 167