/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and  use in source and binary  forms, with or
 * without  modification,  are   permitted  provided  that  the
 * following conditions are met:
 *
 * + Redistributions  of source  code  must  retain  the  above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following disclaimer.
 *
 * + Redistributions  in binary  form must reproduce  the above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following  disclaimer in  the  documentation and/or  other
 *   materials provided with the distribution.
 *
 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
 *   names  of  its contributors  may  be  used  to endorse  or
 *   promote  products  derived   from  this  software  without
 *   specific prior written permission.
 *
 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is  licensee's responsibility  to comply with  any export
 * regulations applicable in licensee's jurisdiction.
 */

	.file	"memcmp.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memcmp,function)

#include "SYS.h"
#include "cache.h"

/* All local labels live in a .memcmp* namespace private to this file. */
#define LABEL(s) .memcmp##s

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * ABI:	 SysV AMD64
 * In:	 %rdi = s1, %rsi = s2, %rdx = n (bytes remaining, updated as we go)
 * Out:	 %eax = 0 if equal, else (first differing byte of s1) -
 *	 (first differing byte of s2), both zero-extended (see 1loop/srcloop)
 * Clobb: %rcx, %r8-%r11, flags
 *
 * Strategy, chosen by the remaining length in %rdx:
 *   n <  8	byte-at-a-time loop (1loop)
 *   n <  32	8-byte loop (8loop)
 *   n <= 2048	unaligned 32-byte loop (32loop)
 *   n >  2048	align %rsi to 8 bytes (srcloop), then 64-byte loops bounded
 *		by the half-L1 size, prefetching 64-byte loops bounded by the
 *		half-L2 size, and finally a prefetching 128-byte loop.  The
 *		.amd64cache1half/.amd64cache2half thresholds are provided by
 *		cache.h (presumably half the L1/L2 sizes probed at startup --
 *		see cache.h for the exact definition).
 *
 * The wide loops detect "some byte differs" cheaply: they compute 64-bit
 * differences with sub and OR them together; a non-zero OR means a mismatch
 * somewhere in the chunk, and control falls back to the next-narrower loop
 * (128/pre/64 -> 32 -> 8 -> 1) which re-scans the current position until the
 * byte loop pins down the exact differing byte and computes the return value.
 */
	ENTRY(memcmp)	/* (const void *, const void*, size_t) */

LABEL(try1):
	cmp	$8, %rdx		/* at least one 8-byte chunk? */
	jae	LABEL(1after)

LABEL(1):				/* 1-byte */
	test	%rdx, %rdx		/* n == 0? */
	mov	$0, %eax		/* mov, not xor: must not clobber ZF */
	jz	LABEL(exit)		/* equal: return 0 */

LABEL(1loop):
	movzbl	(%rdi), %eax		/* zero-extended byte of s1 */
	movzbl	(%rsi), %ecx		/* zero-extended byte of s2 */
	sub	%ecx, %eax		/* %eax = *s1 - *s2 */
	jnz	LABEL(exit)		/* differ: return the difference */

	dec	%rdx			/* one byte consumed */

	lea	1 (%rdi), %rdi		/* lea: advance without touching ZF */
	lea	1 (%rsi), %rsi

	jnz	LABEL(1loop)		/* loop while bytes remain (ZF of dec) */

LABEL(exit):
	rep				/* "rep ret": two-byte return, the */
	ret				/* AMD-recommended branch-target idiom */

	.p2align 4

LABEL(1after):

LABEL(8try):
	cmp	$32, %rdx		/* at least one 32-byte chunk? */
	jae	LABEL(8after)

LABEL(8):				/* 8-byte */
	mov	%edx, %ecx		/* n < 2^32 on every path to here */
	shr	$3, %ecx		/* %ecx = number of 8-byte chunks */
	jz	LABEL(1)		/* none: finish with byte loop */

	.p2align 4

LABEL(8loop):
	mov	(%rsi), %rax
	cmp	(%rdi), %rax		/* quadwords equal? */
	jne	LABEL(1)		/* no: byte loop locates the mismatch */

	sub	$8, %rdx
	dec	%ecx

	lea	8 (%rsi), %rsi
	lea	8 (%rdi), %rdi

	jnz	LABEL(8loop)

LABEL(8skip):
	and	$7, %edx		/* leftover bytes after 8-byte chunks */
	jnz	LABEL(1)

	xor	%eax, %eax		/* all equal */
	ret

	.p2align 4

LABEL(8after):

LABEL(32try):
	cmp	$2048, %rdx		/* big enough for the aligned paths? */
	ja	LABEL(32after)

LABEL(32):				/* 32-byte */
	mov	%edx, %ecx
	shr	$5, %ecx		/* %ecx = number of 32-byte chunks */
	jz	LABEL(8)		/* none: drop to 8-byte loop */

	.p2align 4

LABEL(32loop):
	mov	(%rsi), %rax		/* load 32 bytes of s2 ... */
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	(%rdi), %rax		/* ... subtract 32 bytes of s1 */
	sub	8 (%rdi), %r8
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%rax, %r8		/* OR the differences: non-zero iff */
	or	%r9, %r10		/* any byte in the chunk differs */
	or	%r8, %r10
	jnz	LABEL(8)		/* mismatch: narrower loop finds it */

	sub	$32, %rdx
	dec	%ecx

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)

LABEL(32skip):
	and	$31, %edx		/* leftover bytes after 32-byte chunks */
	jnz	LABEL(8)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(32after):

	/* warm the cache-size variable before it is loaded at 64try */
	prefetchnta _sref_(.amd64cache1half) /* 3DNow: use prefetch */

LABEL(srctry):
	mov	%esi, %r8d	/* align by source */

	and	$7, %r8d		/* %r8d = s2 misalignment (0..7) */
	jz	LABEL(srcafter)		/* not unaligned */

LABEL(src):				/* align */
	lea	-8 (%r8, %rdx), %rdx	/* n -= (8 - misalign) bytes consumed */
	sub	$8, %r8d		/* count up from misalign-8 to 0 */


LABEL(srcloop):				/* byte-compare until s2 is aligned */
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax		/* %eax = *s1 - *s2 */
	jnz	LABEL(exit)

	inc	%r8d			/* counter hits 0 when aligned */

	lea	1 (%rdi), %rdi
	lea	1 (%rsi), %rsi

	jnz	LABEL(srcloop)

	.p2align 4

LABEL(srcafter):

LABEL(64try):
	mov	_sref_(.amd64cache1half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* %rcx = min(n, half-L1 size) */

LABEL(64):				/* 64-byte */
	shr	$6, %rcx		/* %rcx = number of 64-byte chunks */
	jz	LABEL(32)		/* none: drop to 32-byte loop */

	.p2align 4

LABEL(64loop):				/* two 32-byte halves per iteration */
	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	or	%r8, %rax

	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax		/* non-zero iff first half differs */
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r8
	or	%r8, %rax

	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10
	sub	48 (%rdi), %r9
	sub	56 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax		/* non-zero iff second half differs */
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(64loop)

LABEL(64skip):
	cmp	$2048, %rdx		/* still large: go to prefetch loop */
	ja	LABEL(64after)

	test	%edx, %edx		/* small remainder: 32-byte path */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):				/* 64-byte prefetching */
	mov	_sref_(.amd64cache2half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* %rcx = min(n, half-L2 size) */

	shr	$6, %rcx		/* %rcx = number of 64-byte chunks */
	jz	LABEL(preskip)

	/* first iteration is peeled ahead of the aligned loop body below */
	prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	(%rdi), %rax
	sub	8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax		/* non-zero iff first half differs */
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax		/* non-zero iff second half differs */
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx

	.p2align 4

LABEL(preloop):				/* as 64loop, plus NT prefetch ahead */
	prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	(%rdi), %rax
	sub	8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(preloop)


LABEL(preskip):
	cmp	$2048, %rdx		/* still large: 128-byte loop */
	ja	LABEL(preafter)

	test	%edx, %edx
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):				/* 128-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx		/* %rcx = number of 128-byte chunks */
	jz	LABEL(128skip)

	.p2align 4

LABEL(128loop):
	prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

	mov	(%rsi), %rax		/* first 64 bytes, as in 64loop */
	mov	8 (%rsi), %r8
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	32 (%rsi), %r8
	mov	40 (%rsi), %r9
	sub	32 (%rdi), %r8
	sub	40 (%rdi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax		/* non-zero iff bytes 0-63 differ */
	jnz	LABEL(32)

	prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
	prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */

	mov	64 (%rsi), %rax		/* second 64 bytes */
	mov	72 (%rsi), %r8
	sub	64 (%rdi), %rax
	sub	72 (%rdi), %r8
	mov	80 (%rsi), %r9
	mov	88 (%rsi), %r10
	sub	80 (%rdi), %r9
	sub	88 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	96 (%rsi), %r8
	mov	104 (%rsi), %r9
	sub	96 (%rdi), %r8
	sub	104 (%rdi), %r9
	mov	112 (%rsi), %r10
	mov	120 (%rsi), %r11
	sub	112 (%rdi), %r10
	sub	120 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax		/* non-zero iff bytes 64-127 differ */
	jnz	LABEL(32)

	sub	$128, %rdx
	dec	%rcx

	lea	128 (%rsi), %rsi
	lea	128 (%rdi), %rdi

	jnz	LABEL(128loop)

LABEL(128skip):
	and	$127, %edx		/* leftover after 128-byte chunks */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	SET_SIZE(memcmp)