/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.

 * Neither the name of Intel Corporation nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef USE_AS_STRNCMP
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
   if the new counter > the old one or is 0.  */
#define UPDATE_STRNCMP_COUNTER				\
	/* calculate left number to compare */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	L(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	L(strcmp_exitz);			\
	mov	%r9, %r11

#else
/* Plain strcmp build: the length counter is never consulted, so the
   counter-update macro expands to nothing.  */
#define UPDATE_STRNCMP_COUNTER
#ifndef STRCMP
#define STRCMP strcmp
#endif
#endif

/* Compatibility shims so this file assembles both inside and outside a
   glibc-style build environment that predefines these macros.  */
#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
#define RETURN ret
	.section .text.ssse3,"ax",@progbits

/*
 * int STRCMP(const char *s1, const char *s2)          [strcmp build]
 * int STRCMP(const char *s1, const char *s2, size_t n) [USE_AS_STRNCMP build]
 *
 * ABI: System V AMD64.  %rdi = s1, %rsi = s2, %rdx = n (strncmp only).
 *
 * Register roles visible in this section of the file:
 *   %r11 - remaining byte budget (strncmp builds only; copied from %rdx)
 *   %rcx - byte offset/index used for the 16-byte block loads in the loops
 *   %r10 - page-boundary detector in the ashr_N loops: offset-within-4K
 *          page minus 4K, so it turns positive when a 16-byte load would
 *          cross into the next page
 *   %r8d - set to 0xffff when the two source pointers are swapped for the
 *          unaligned cases, so the exit path can restore the sign of the
 *          result (flag consumed outside this chunk)
 *   %r9d - in the ashr_N cases, the shift (byte position) carried into the
 *          less32bytes tail
 *   %xmm0 - zero / null-char detector, %xmm1/%xmm2 - data, %xmm3/%xmm4 -
 *          carried block for palignr merging
 *
 * NOTE(review): the exit paths (L(exit), L(less16bytes), L(less32bytes),
 * L(aftertail), L(strcmp_exitz), L(Byte0)) and L(unaligned_table) are
 * defined later in the file, beyond this section; the return-value
 * convention (presumably the usual <0/0/>0 in %eax) cannot be confirmed
 * from here.
 */
ENTRY (STRCMP)
/*
 * This implementation uses SSE to compare up to 16 bytes at a time.
 */
#ifdef USE_AS_STRNCMP
	test	%rdx, %rdx
	je	L(strcmp_exitz)		/* n == 0: strings compare equal */
	cmp	$1, %rdx
	je	L(Byte0)		/* n == 1: single-byte compare */
	mov	%rdx, %r11		/* %r11 = remaining byte budget */
#endif
	mov	%esi, %ecx
	mov	%edi, %eax
/* Use 64bit AND here to avoid long NOP padding.  */
	and	$0x3f, %rcx		/* rsi alignment in cache line */
	and	$0x3f, %rax		/* rdi alignment in cache line */
	cmp	$0x30, %ecx
	ja	L(crosscache)		/* rsi: 16-byte load will cross cache line */
	cmp	$0x30, %eax
	ja	L(crosscache)		/* rdi: 16-byte load will cross cache line */
	movlpd	(%rdi), %xmm1
	movlpd	(%rsi), %xmm2
	movhpd	8(%rdi), %xmm1
	movhpd	8(%rsi), %xmm2
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
	jnz	L(less16bytes)		/* If not, find different value or null char */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* finish comparison */
#endif
	add	$16, %rsi		/* prepare to search next 16 bytes */
	add	$16, %rdi		/* prepare to search next 16 bytes */

	/*
	 * Determine source and destination string offsets from 16-byte alignment.
	 * Use relative offset difference between the two to determine which case
	 * below to use.
	 */
	.p2align 4
L(crosscache):
	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
	mov	$0xffff, %edx			/* for equivalent offset */
	xor	%r8d, %r8d
	and	$0xf, %ecx			/* offset of rsi */
	and	$0xf, %eax			/* offset of rdi */
	cmp	%eax, %ecx
	je	L(ashr_0)			/* rsi and rdi relative offset same */
	ja	L(bigger)
	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
	xchg	%ecx, %eax
	xchg	%rsi, %rdi
L(bigger):
	/* index = 15 + rax_offset - rcx_offset, in [0,14]; dispatch through
	   the relative-offset jump table to the matching ashr_N case */
	lea	15(%rax), %r9
	sub	%rcx, %r9
	lea	L(unaligned_table)(%rip), %r10
	movslq	(%r10, %r9,4), %r9
	lea	(%r10, %r9), %r10
	jmp	*%r10				/* jump to corresponding case */

/*
 * The following cases will be handled by ashr_0
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(0~15)            n(0~15)          15(15+ n-n)        ashr_0
 */
	.p2align 4
L(ashr_0):

	movdqa	(%rsi), %xmm1
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
	pmovmskb %xmm1, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	/*
	 * edx must be the same with r9d if in left byte (16-rcx) is equal to
	 * the start from (16-rax) and no null char was seen.
	 */
	jne	L(less32bytes)		/* mismatch or null char */
	UPDATE_STRNCMP_COUNTER
	mov	$16, %rcx
	mov	$16, %r9
	pxor	%xmm0, %xmm0		/* clear xmm0, may have changed above */

	/*
	 * Now both strings are aligned at 16-byte boundary. Loop over strings
	 * checking 32-bytes per iteration.
	 */
	.p2align 4
L(loop_ashr_0):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)			/* mismatch or null char seen */

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	jmp	L(loop_ashr_0)

/*
 * The following cases will be handled by ashr_1
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(15)            n -15            0(15 +(n-15) - n)    ashr_1
 */
	.p2align 4
L(ashr_1):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$15, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$1, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	1(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_1):
	add	$16, %r10
	jg	L(nibble_ashr_1)	/* cross page boundary */

L(gobble_ashr_1):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_1)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_1)

	/*
	 * Nibble avoids loads across page boundary. This is to avoid a potential
	 * access into unmapped memory.
	 */
	.p2align 4
L(nibble_ashr_1):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffe, %edx
	jnz	L(ashr_1_exittail)	/* found null char */

#ifdef USE_AS_STRNCMP
	cmp	$14, %r11
	jbe	L(ashr_1_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_1)

	/*
	 * Once find null char, determine if there is a string mismatch
	 * before the null char.
	 */
	.p2align 4
L(ashr_1_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$1, %xmm0
	psrldq	$1, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_2
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(14~15)          n -14           1(15 +(n-14) - n)    ashr_2
 */
	.p2align 4
L(ashr_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$14, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$2, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	2(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_2):
	add	$16, %r10
	jg	L(nibble_ashr_2)

L(gobble_ashr_2):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_2)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_2)

	.p2align 4
L(nibble_ashr_2):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffc, %edx
	jnz	L(ashr_2_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$13, %r11
	jbe	L(ashr_2_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_2)

	.p2align 4
L(ashr_2_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$2, %xmm0
	psrldq	$2, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_3
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(13~15)          n -13           2(15 +(n-13) - n)    ashr_3
 */
	.p2align 4
L(ashr_3):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$13, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$3, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	3(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_3):
	add	$16, %r10
	jg	L(nibble_ashr_3)

L(gobble_ashr_3):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_3)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_3)

	.p2align 4
L(nibble_ashr_3):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff8, %edx
	jnz	L(ashr_3_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$12, %r11
	jbe	L(ashr_3_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_3)

	.p2align 4
L(ashr_3_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$3, %xmm0
	psrldq	$3, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_4
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(12~15)          n -12           3(15 +(n-12) - n)    ashr_4
 */
	.p2align 4
L(ashr_4):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$12, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$4, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	4(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_4):
	add	$16, %r10
	jg	L(nibble_ashr_4)

L(gobble_ashr_4):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_4)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_4)

	.p2align 4
L(nibble_ashr_4):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff0, %edx
	jnz	L(ashr_4_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$11, %r11
	jbe	L(ashr_4_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_4)

	.p2align 4
L(ashr_4_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$4, %xmm0
	psrldq	$4, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_5
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(11~15)          n - 11          4(15 +(n-11) - n)    ashr_5
 */
	.p2align 4
L(ashr_5):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$11, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$5, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	5(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_5):
	add	$16, %r10
	jg	L(nibble_ashr_5)

L(gobble_ashr_5):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_5)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_5)

	.p2align 4
L(nibble_ashr_5):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffe0, %edx
	jnz	L(ashr_5_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$10, %r11
	jbe	L(ashr_5_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_5)

	.p2align 4
L(ashr_5_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$5, %xmm0
	psrldq	$5, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_6
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(10~15)          n - 10          5(15 +(n-10) - n)    ashr_6
 */
	.p2align 4
L(ashr_6):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$10, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3
	/* ashr_6 main loop setup; same structure as the other ashr_* cases. */
	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$6, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_6):
	add	$16, %r10
	jg	L(nibble_ashr_6)

L(gobble_ashr_6):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $6. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_6)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_6)

	/* Page-cross path: scan unread tail bytes 6..15 for NUL first. */
	.p2align 4
L(nibble_ashr_6):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffc0, %edx		/* NUL among bytes 6..15? */
	jnz	L(ashr_6_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$9, %r11		/* <= 15-6 bytes may still be compared */
	jbe	L(ashr_6_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_6)

	.p2align 4
L(ashr_6_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$6, %xmm0		/* drop bytes already accounted for */
	psrldq	$6, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_7
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(9~15)           n - 9          6(15 +(n - 9) - n)     ashr_7
 */
	.p2align 4
L(ashr_7):
	/* First-block check, ashr_7 flavor: rdi block shifted by 16-7. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$9, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$7, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_7):
	add	$16, %r10
	jg	L(nibble_ashr_7)

L(gobble_ashr_7):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $7. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_7)

	/* Page-cross path: scan unread tail bytes 7..15 for NUL first. */
	.p2align 4
L(nibble_ashr_7):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff80, %edx		/* NUL among bytes 7..15? */
	jnz	L(ashr_7_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$8, %r11		/* <= 15-7 bytes may still be compared */
	jbe	L(ashr_7_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_7)

	.p2align 4
L(ashr_7_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$7, %xmm0		/* drop bytes already accounted for */
	psrldq	$7, %xmm3
	/* End of ashr_7 exit tail: join the common tail-compare code. */
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_8
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(8~15)           n - 8          7(15 +(n - 8) - n)     ashr_8
 */
	.p2align 4
L(ashr_8):
	/* First-block check, ashr_8 flavor: rdi block shifted by 16-8. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$8, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$8, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_8):
	add	$16, %r10
	jg	L(nibble_ashr_8)

L(gobble_ashr_8):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $8. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_8)

	/* Page-cross path: scan unread tail bytes 8..15 for NUL first. */
	.p2align 4
L(nibble_ashr_8):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff00, %edx		/* NUL among bytes 8..15? */
	jnz	L(ashr_8_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$7, %r11		/* <= 15-8 bytes may still be compared */
	jbe	L(ashr_8_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_8)

	.p2align 4
L(ashr_8_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$8, %xmm0		/* drop bytes already accounted for */
	psrldq	$8, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_9
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(7~15)           n - 7          8(15 +(n - 7) - n)     ashr_9
 */
	.p2align 4
L(ashr_9):
	/* First-block check, ashr_9 flavor: rdi block shifted by 16-9. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$7, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$9, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_9):
	add	$16, %r10
	jg	L(nibble_ashr_9)

L(gobble_ashr_9):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $9. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_9)	/* cross page boundary */

	/* Second unrolled copy of the ashr_9 main-loop body. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 known zero: no NUL seen so far */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	L(loop_ashr_9)

	/* Page-cross path: scan unread tail bytes 9..15 for NUL first. */
	.p2align 4
L(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx		/* NUL among bytes 9..15? */
	jnz	L(ashr_9_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$6, %r11		/* <= 15-9 bytes may still be compared */
	jbe	L(ashr_9_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_9)

	.p2align 4
L(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0		/* drop bytes already accounted for */
	psrldq	$9, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_10
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(6~15)           n - 6          9(15 +(n - 6) - n)     ashr_10
 */
	.p2align 4
L(ashr_10):
	/* First-block check, ashr_10 flavor: rdi block shifted by 16-10. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$6, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$10, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_10):
	add	$16, %r10
	jg	L(nibble_ashr_10)

L(gobble_ashr_10):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $10. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_10)

	/* Page-cross path: scan unread tail bytes 10..15 for NUL first. */
	.p2align 4
L(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx		/* NUL among bytes 10..15? */
	jnz	L(ashr_10_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$5, %r11		/* <= 15-10 bytes may still be compared */
	jbe	L(ashr_10_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_10)

	.p2align 4
L(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0		/* drop bytes already accounted for */
	psrldq	$10, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_11
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(5~15)           n - 5          10(15 +(n - 5) - n)    ashr_11
 */
	.p2align 4
L(ashr_11):
	/* First-block check, ashr_11 flavor: rdi block shifted by 16-11. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$5, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	/* ashr_11 first-block verdict and main loop. */
	jnz	L(less32bytes)		/* mismatch or NUL in the first block */
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$11, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_11):
	add	$16, %r10
	jg	L(nibble_ashr_11)

L(gobble_ashr_11):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $11. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_11)

	/* Page-cross path: scan unread tail bytes 11..15 for NUL first. */
	.p2align 4
L(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx		/* NUL among bytes 11..15? */
	jnz	L(ashr_11_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$4, %r11		/* <= 15-11 bytes may still be compared */
	jbe	L(ashr_11_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_11)

	.p2align 4
L(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0		/* drop bytes already accounted for */
	psrldq	$11, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_12
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(4~15)           n - 4          11(15 +(n - 4) - n)    ashr_12
 */
	.p2align 4
L(ashr_12):
	/* First-block check, ashr_12 flavor: rdi block shifted by 16-12. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$4, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$12, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_12):
	add	$16, %r10
	jg	L(nibble_ashr_12)

L(gobble_ashr_12):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $12. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	/* Tail of the ashr_12 second unrolled copy (inside USE_AS_STRNCMP). */
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_12)

	/* Page-cross path: scan unread tail bytes 12..15 for NUL first. */
	.p2align 4
L(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx		/* NUL among bytes 12..15? */
	jnz	L(ashr_12_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$3, %r11		/* <= 15-12 bytes may still be compared */
	jbe	L(ashr_12_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_12)

	.p2align 4
L(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0		/* drop bytes already accounted for */
	psrldq	$12, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_13
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(3~15)           n - 3          12(15 +(n - 3) - n)    ashr_13
 */
	.p2align 4
L(ashr_13):
	/* First-block check, ashr_13 flavor: rdi block shifted by 16-13. */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block */
	pslldq	$3, %xmm2
	pcmpeqb	%xmm1, %xmm2		/* equality mask */
	psubb	%xmm0, %xmm2		/* clear match bits at NUL positions */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* keep rdi block for palignr */

	UPDATE_STRNCMP_COUNTER		/* strncmp only: fix up remaining count in %r11 */

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$13, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_13):
	add	$16, %r10
	jg	L(nibble_ashr_13)

L(gobble_ashr_13):
	/* Two-way unrolled loop: 16 bytes per copy, merged with palignr $13. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw rdi load for next round */

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* NUL mask of rsi block (%xmm0 was zero) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0xffff = all equal, none NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* length limit exhausted */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_13)

	/* Page-cross path: scan unread tail bytes 13..15 for NUL first. */
	.p2align 4
L(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx		/* NUL among bytes 13..15? */
	jnz	L(ashr_13_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$2, %r11		/* <= 15-13 bytes may still be compared */
	jbe	L(ashr_13_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-cross trigger */
	jmp	L(gobble_ashr_13)

	.p2align 4
L(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0		/* drop bytes already accounted for */
	psrldq	$13, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_14
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
1644*8ddb146aSEd Maste * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 1645*8ddb146aSEd Maste */ 1646*8ddb146aSEd Maste .p2align 4 1647*8ddb146aSEd MasteL(ashr_14): 1648*8ddb146aSEd Maste pxor %xmm0, %xmm0 1649*8ddb146aSEd Maste movdqa (%rdi), %xmm2 1650*8ddb146aSEd Maste movdqa (%rsi), %xmm1 1651*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm0 1652*8ddb146aSEd Maste pslldq $2, %xmm2 1653*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm2 1654*8ddb146aSEd Maste psubb %xmm0, %xmm2 1655*8ddb146aSEd Maste pmovmskb %xmm2, %r9d 1656*8ddb146aSEd Maste shr %cl, %edx 1657*8ddb146aSEd Maste shr %cl, %r9d 1658*8ddb146aSEd Maste sub %r9d, %edx 1659*8ddb146aSEd Maste jnz L(less32bytes) 1660*8ddb146aSEd Maste movdqa (%rdi), %xmm3 1661*8ddb146aSEd Maste 1662*8ddb146aSEd Maste UPDATE_STRNCMP_COUNTER 1663*8ddb146aSEd Maste 1664*8ddb146aSEd Maste pxor %xmm0, %xmm0 1665*8ddb146aSEd Maste mov $16, %rcx /* index for loads */ 1666*8ddb146aSEd Maste mov $14, %r9d /* byte position left over from less32bytes case */ 1667*8ddb146aSEd Maste /* 1668*8ddb146aSEd Maste * Setup %r10 value allows us to detect crossing a page boundary. 1669*8ddb146aSEd Maste * When %r10 goes positive we have crossed a page boundary and 1670*8ddb146aSEd Maste * need to do a nibble. 
1671*8ddb146aSEd Maste */ 1672*8ddb146aSEd Maste lea 14(%rdi), %r10 1673*8ddb146aSEd Maste and $0xfff, %r10 /* offset into 4K page */ 1674*8ddb146aSEd Maste sub $0x1000, %r10 /* subtract 4K pagesize */ 1675*8ddb146aSEd Maste 1676*8ddb146aSEd Maste .p2align 4 1677*8ddb146aSEd MasteL(loop_ashr_14): 1678*8ddb146aSEd Maste add $16, %r10 1679*8ddb146aSEd Maste jg L(nibble_ashr_14) 1680*8ddb146aSEd Maste 1681*8ddb146aSEd MasteL(gobble_ashr_14): 1682*8ddb146aSEd Maste movdqa (%rsi, %rcx), %xmm1 1683*8ddb146aSEd Maste movdqa (%rdi, %rcx), %xmm2 1684*8ddb146aSEd Maste movdqa %xmm2, %xmm4 1685*8ddb146aSEd Maste 1686*8ddb146aSEd Maste palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ 1687*8ddb146aSEd Maste 1688*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm0 1689*8ddb146aSEd Maste pcmpeqb %xmm2, %xmm1 1690*8ddb146aSEd Maste psubb %xmm0, %xmm1 1691*8ddb146aSEd Maste pmovmskb %xmm1, %edx 1692*8ddb146aSEd Maste sub $0xffff, %edx 1693*8ddb146aSEd Maste jnz L(exit) 1694*8ddb146aSEd Maste 1695*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 1696*8ddb146aSEd Maste sub $16, %r11 1697*8ddb146aSEd Maste jbe L(strcmp_exitz) 1698*8ddb146aSEd Maste#endif 1699*8ddb146aSEd Maste 1700*8ddb146aSEd Maste add $16, %rcx 1701*8ddb146aSEd Maste movdqa %xmm4, %xmm3 1702*8ddb146aSEd Maste 1703*8ddb146aSEd Maste add $16, %r10 1704*8ddb146aSEd Maste jg L(nibble_ashr_14) /* cross page boundary */ 1705*8ddb146aSEd Maste 1706*8ddb146aSEd Maste movdqa (%rsi, %rcx), %xmm1 1707*8ddb146aSEd Maste movdqa (%rdi, %rcx), %xmm2 1708*8ddb146aSEd Maste movdqa %xmm2, %xmm4 1709*8ddb146aSEd Maste 1710*8ddb146aSEd Maste palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ 1711*8ddb146aSEd Maste 1712*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm0 1713*8ddb146aSEd Maste pcmpeqb %xmm2, %xmm1 1714*8ddb146aSEd Maste psubb %xmm0, %xmm1 1715*8ddb146aSEd Maste pmovmskb %xmm1, %edx 1716*8ddb146aSEd Maste sub $0xffff, %edx 1717*8ddb146aSEd Maste jnz L(exit) 1718*8ddb146aSEd Maste 1719*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 
1720*8ddb146aSEd Maste sub $16, %r11 1721*8ddb146aSEd Maste jbe L(strcmp_exitz) 1722*8ddb146aSEd Maste#endif 1723*8ddb146aSEd Maste 1724*8ddb146aSEd Maste add $16, %rcx 1725*8ddb146aSEd Maste movdqa %xmm4, %xmm3 1726*8ddb146aSEd Maste jmp L(loop_ashr_14) 1727*8ddb146aSEd Maste 1728*8ddb146aSEd Maste .p2align 4 1729*8ddb146aSEd MasteL(nibble_ashr_14): 1730*8ddb146aSEd Maste pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 1731*8ddb146aSEd Maste pmovmskb %xmm0, %edx 1732*8ddb146aSEd Maste test $0xc000, %edx 1733*8ddb146aSEd Maste jnz L(ashr_14_exittail) 1734*8ddb146aSEd Maste 1735*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 1736*8ddb146aSEd Maste cmp $1, %r11 1737*8ddb146aSEd Maste jbe L(ashr_14_exittail) 1738*8ddb146aSEd Maste#endif 1739*8ddb146aSEd Maste 1740*8ddb146aSEd Maste pxor %xmm0, %xmm0 1741*8ddb146aSEd Maste sub $0x1000, %r10 1742*8ddb146aSEd Maste jmp L(gobble_ashr_14) 1743*8ddb146aSEd Maste 1744*8ddb146aSEd Maste .p2align 4 1745*8ddb146aSEd MasteL(ashr_14_exittail): 1746*8ddb146aSEd Maste movdqa (%rsi, %rcx), %xmm1 1747*8ddb146aSEd Maste psrldq $14, %xmm0 1748*8ddb146aSEd Maste psrldq $14, %xmm3 1749*8ddb146aSEd Maste jmp L(aftertail) 1750*8ddb146aSEd Maste 1751*8ddb146aSEd Maste/* 1752*8ddb146aSEd Maste * The following cases will be handled by ashr_15 1753*8ddb146aSEd Maste * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 1754*8ddb146aSEd Maste * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 1755*8ddb146aSEd Maste */ 1756*8ddb146aSEd Maste .p2align 4 1757*8ddb146aSEd MasteL(ashr_15): 1758*8ddb146aSEd Maste pxor %xmm0, %xmm0 1759*8ddb146aSEd Maste movdqa (%rdi), %xmm2 1760*8ddb146aSEd Maste movdqa (%rsi), %xmm1 1761*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm0 1762*8ddb146aSEd Maste pslldq $1, %xmm2 1763*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm2 1764*8ddb146aSEd Maste psubb %xmm0, %xmm2 1765*8ddb146aSEd Maste pmovmskb %xmm2, %r9d 1766*8ddb146aSEd Maste shr %cl, %edx 1767*8ddb146aSEd Maste shr %cl, %r9d 1768*8ddb146aSEd Maste sub %r9d, %edx 
1769*8ddb146aSEd Maste jnz L(less32bytes) 1770*8ddb146aSEd Maste 1771*8ddb146aSEd Maste movdqa (%rdi), %xmm3 1772*8ddb146aSEd Maste 1773*8ddb146aSEd Maste UPDATE_STRNCMP_COUNTER 1774*8ddb146aSEd Maste 1775*8ddb146aSEd Maste pxor %xmm0, %xmm0 1776*8ddb146aSEd Maste mov $16, %rcx /* index for loads */ 1777*8ddb146aSEd Maste mov $15, %r9d /* byte position left over from less32bytes case */ 1778*8ddb146aSEd Maste /* 1779*8ddb146aSEd Maste * Setup %r10 value allows us to detect crossing a page boundary. 1780*8ddb146aSEd Maste * When %r10 goes positive we have crossed a page boundary and 1781*8ddb146aSEd Maste * need to do a nibble. 1782*8ddb146aSEd Maste */ 1783*8ddb146aSEd Maste lea 15(%rdi), %r10 1784*8ddb146aSEd Maste and $0xfff, %r10 /* offset into 4K page */ 1785*8ddb146aSEd Maste 1786*8ddb146aSEd Maste sub $0x1000, %r10 /* subtract 4K pagesize */ 1787*8ddb146aSEd Maste 1788*8ddb146aSEd Maste .p2align 4 1789*8ddb146aSEd MasteL(loop_ashr_15): 1790*8ddb146aSEd Maste add $16, %r10 1791*8ddb146aSEd Maste jg L(nibble_ashr_15) 1792*8ddb146aSEd Maste 1793*8ddb146aSEd MasteL(gobble_ashr_15): 1794*8ddb146aSEd Maste movdqa (%rsi, %rcx), %xmm1 1795*8ddb146aSEd Maste movdqa (%rdi, %rcx), %xmm2 1796*8ddb146aSEd Maste movdqa %xmm2, %xmm4 1797*8ddb146aSEd Maste 1798*8ddb146aSEd Maste palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ 1799*8ddb146aSEd Maste 1800*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm0 1801*8ddb146aSEd Maste pcmpeqb %xmm2, %xmm1 1802*8ddb146aSEd Maste psubb %xmm0, %xmm1 1803*8ddb146aSEd Maste pmovmskb %xmm1, %edx 1804*8ddb146aSEd Maste sub $0xffff, %edx 1805*8ddb146aSEd Maste jnz L(exit) 1806*8ddb146aSEd Maste 1807*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 1808*8ddb146aSEd Maste sub $16, %r11 1809*8ddb146aSEd Maste jbe L(strcmp_exitz) 1810*8ddb146aSEd Maste#endif 1811*8ddb146aSEd Maste 1812*8ddb146aSEd Maste add $16, %rcx 1813*8ddb146aSEd Maste movdqa %xmm4, %xmm3 1814*8ddb146aSEd Maste 1815*8ddb146aSEd Maste add $16, %r10 1816*8ddb146aSEd Maste jg 
L(nibble_ashr_15) /* cross page boundary */ 1817*8ddb146aSEd Maste 1818*8ddb146aSEd Maste movdqa (%rsi, %rcx), %xmm1 1819*8ddb146aSEd Maste movdqa (%rdi, %rcx), %xmm2 1820*8ddb146aSEd Maste movdqa %xmm2, %xmm4 1821*8ddb146aSEd Maste 1822*8ddb146aSEd Maste palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ 1823*8ddb146aSEd Maste 1824*8ddb146aSEd Maste pcmpeqb %xmm1, %xmm0 1825*8ddb146aSEd Maste pcmpeqb %xmm2, %xmm1 1826*8ddb146aSEd Maste psubb %xmm0, %xmm1 1827*8ddb146aSEd Maste pmovmskb %xmm1, %edx 1828*8ddb146aSEd Maste sub $0xffff, %edx 1829*8ddb146aSEd Maste jnz L(exit) 1830*8ddb146aSEd Maste 1831*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 1832*8ddb146aSEd Maste sub $16, %r11 1833*8ddb146aSEd Maste jbe L(strcmp_exitz) 1834*8ddb146aSEd Maste#endif 1835*8ddb146aSEd Maste 1836*8ddb146aSEd Maste add $16, %rcx 1837*8ddb146aSEd Maste movdqa %xmm4, %xmm3 1838*8ddb146aSEd Maste jmp L(loop_ashr_15) 1839*8ddb146aSEd Maste 1840*8ddb146aSEd Maste .p2align 4 1841*8ddb146aSEd MasteL(nibble_ashr_15): 1842*8ddb146aSEd Maste pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ 1843*8ddb146aSEd Maste pmovmskb %xmm0, %edx 1844*8ddb146aSEd Maste test $0x8000, %edx 1845*8ddb146aSEd Maste jnz L(ashr_15_exittail) 1846*8ddb146aSEd Maste 1847*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 1848*8ddb146aSEd Maste test %r11, %r11 1849*8ddb146aSEd Maste je L(ashr_15_exittail) 1850*8ddb146aSEd Maste#endif 1851*8ddb146aSEd Maste 1852*8ddb146aSEd Maste pxor %xmm0, %xmm0 1853*8ddb146aSEd Maste sub $0x1000, %r10 1854*8ddb146aSEd Maste jmp L(gobble_ashr_15) 1855*8ddb146aSEd Maste 1856*8ddb146aSEd Maste .p2align 4 1857*8ddb146aSEd MasteL(ashr_15_exittail): 1858*8ddb146aSEd Maste movdqa (%rsi, %rcx), %xmm1 1859*8ddb146aSEd Maste psrldq $15, %xmm3 1860*8ddb146aSEd Maste psrldq $15, %xmm0 1861*8ddb146aSEd Maste 1862*8ddb146aSEd Maste .p2align 4 1863*8ddb146aSEd MasteL(aftertail): 1864*8ddb146aSEd Maste pcmpeqb %xmm3, %xmm1 1865*8ddb146aSEd Maste psubb %xmm0, %xmm1 1866*8ddb146aSEd Maste pmovmskb %xmm1, 
%edx 1867*8ddb146aSEd Maste not %edx 1868*8ddb146aSEd Maste 1869*8ddb146aSEd Maste .p2align 4 1870*8ddb146aSEd MasteL(exit): 1871*8ddb146aSEd Maste lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ 1872*8ddb146aSEd MasteL(less32bytes): 1873*8ddb146aSEd Maste lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ 1874*8ddb146aSEd Maste lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ 1875*8ddb146aSEd Maste test %r8d, %r8d 1876*8ddb146aSEd Maste jz L(ret) 1877*8ddb146aSEd Maste xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ 1878*8ddb146aSEd Maste 1879*8ddb146aSEd Maste .p2align 4 1880*8ddb146aSEd MasteL(ret): 1881*8ddb146aSEd MasteL(less16bytes): 1882*8ddb146aSEd Maste bsf %rdx, %rdx /* find and store bit index in %rdx */ 1883*8ddb146aSEd Maste 1884*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP 1885*8ddb146aSEd Maste sub %rdx, %r11 1886*8ddb146aSEd Maste jbe L(strcmp_exitz) 1887*8ddb146aSEd Maste#endif 1888*8ddb146aSEd Maste movzbl (%rsi, %rdx), %ecx 1889*8ddb146aSEd Maste movzbl (%rdi, %rdx), %eax 1890*8ddb146aSEd Maste 1891*8ddb146aSEd Maste sub %ecx, %eax 1892*8ddb146aSEd Maste ret 1893*8ddb146aSEd Maste 1894*8ddb146aSEd MasteL(strcmp_exitz): 1895*8ddb146aSEd Maste xor %eax, %eax 1896*8ddb146aSEd Maste ret 1897*8ddb146aSEd Maste 1898*8ddb146aSEd Maste .p2align 4 1899*8ddb146aSEd MasteL(Byte0): 1900*8ddb146aSEd Maste movzbl (%rsi), %ecx 1901*8ddb146aSEd Maste movzbl (%rdi), %eax 1902*8ddb146aSEd Maste 1903*8ddb146aSEd Maste sub %ecx, %eax 1904*8ddb146aSEd Maste ret 1905*8ddb146aSEd MasteEND (STRCMP) 1906*8ddb146aSEd Maste 1907*8ddb146aSEd Maste .section .rodata,"a",@progbits 1908*8ddb146aSEd Maste .p2align 3 1909*8ddb146aSEd MasteL(unaligned_table): 1910*8ddb146aSEd Maste .int L(ashr_1) - L(unaligned_table) 1911*8ddb146aSEd Maste .int L(ashr_2) - L(unaligned_table) 1912*8ddb146aSEd Maste .int L(ashr_3) - L(unaligned_table) 1913*8ddb146aSEd Maste .int L(ashr_4) - L(unaligned_table) 
1914*8ddb146aSEd Maste .int L(ashr_5) - L(unaligned_table) 1915*8ddb146aSEd Maste .int L(ashr_6) - L(unaligned_table) 1916*8ddb146aSEd Maste .int L(ashr_7) - L(unaligned_table) 1917*8ddb146aSEd Maste .int L(ashr_8) - L(unaligned_table) 1918*8ddb146aSEd Maste .int L(ashr_9) - L(unaligned_table) 1919*8ddb146aSEd Maste .int L(ashr_10) - L(unaligned_table) 1920*8ddb146aSEd Maste .int L(ashr_11) - L(unaligned_table) 1921*8ddb146aSEd Maste .int L(ashr_12) - L(unaligned_table) 1922*8ddb146aSEd Maste .int L(ashr_13) - L(unaligned_table) 1923*8ddb146aSEd Maste .int L(ashr_14) - L(unaligned_table) 1924*8ddb146aSEd Maste .int L(ashr_15) - L(unaligned_table) 1925*8ddb146aSEd Maste .int L(ashr_0) - L(unaligned_table) 1926