xref: /illumos-gate/usr/src/lib/libc/amd64/gen/memcmp.S (revision 5d9d9091f564c198a760790b0bfa72c44e17912b)
1*5d9d9091SRichard Lowe/*
2*5d9d9091SRichard Lowe * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
3*5d9d9091SRichard Lowe * Use is subject to license terms.
4*5d9d9091SRichard Lowe */
5*5d9d9091SRichard Lowe
6*5d9d9091SRichard Lowe/*
7*5d9d9091SRichard Lowe * Copyright (c) 2002 Advanced Micro Devices, Inc.
8*5d9d9091SRichard Lowe *
9*5d9d9091SRichard Lowe * All rights reserved.
10*5d9d9091SRichard Lowe *
11*5d9d9091SRichard Lowe * Redistribution and  use in source and binary  forms, with or
12*5d9d9091SRichard Lowe * without  modification,  are   permitted  provided  that  the
13*5d9d9091SRichard Lowe * following conditions are met:
14*5d9d9091SRichard Lowe *
15*5d9d9091SRichard Lowe * + Redistributions  of source  code  must  retain  the  above
16*5d9d9091SRichard Lowe *   copyright  notice,   this  list  of   conditions  and  the
17*5d9d9091SRichard Lowe *   following disclaimer.
18*5d9d9091SRichard Lowe *
19*5d9d9091SRichard Lowe * + Redistributions  in binary  form must reproduce  the above
20*5d9d9091SRichard Lowe *   copyright  notice,   this  list  of   conditions  and  the
21*5d9d9091SRichard Lowe *   following  disclaimer in  the  documentation and/or  other
22*5d9d9091SRichard Lowe *   materials provided with the distribution.
23*5d9d9091SRichard Lowe *
24*5d9d9091SRichard Lowe * + Neither the  name of Advanced Micro Devices,  Inc. nor the
25*5d9d9091SRichard Lowe *   names  of  its contributors  may  be  used  to endorse  or
26*5d9d9091SRichard Lowe *   promote  products  derived   from  this  software  without
27*5d9d9091SRichard Lowe *   specific prior written permission.
28*5d9d9091SRichard Lowe *
29*5d9d9091SRichard Lowe * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
30*5d9d9091SRichard Lowe * CONTRIBUTORS ``AS IS'' AND  ANY EXPRESS OR IMPLIED WARRANTIES,
31*5d9d9091SRichard Lowe * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
32*5d9d9091SRichard Lowe * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
33*5d9d9091SRichard Lowe * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
34*5d9d9091SRichard Lowe * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
35*5d9d9091SRichard Lowe * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
36*5d9d9091SRichard Lowe * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
37*5d9d9091SRichard Lowe * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
38*5d9d9091SRichard Lowe * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
39*5d9d9091SRichard Lowe * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
40*5d9d9091SRichard Lowe * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
41*5d9d9091SRichard Lowe * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
42*5d9d9091SRichard Lowe * POSSIBILITY OF SUCH DAMAGE.
43*5d9d9091SRichard Lowe *
44*5d9d9091SRichard Lowe * It is  licensee's responsibility  to comply with  any export
45*5d9d9091SRichard Lowe * regulations applicable in licensee's jurisdiction.
46*5d9d9091SRichard Lowe */
47*5d9d9091SRichard Lowe
48*5d9d9091SRichard Lowe	.file	"memcmp.s"
49*5d9d9091SRichard Lowe
50*5d9d9091SRichard Lowe#include <sys/asm_linkage.h>
51*5d9d9091SRichard Lowe
52*5d9d9091SRichard Lowe	ANSI_PRAGMA_WEAK(memcmp,function)
53*5d9d9091SRichard Lowe
54*5d9d9091SRichard Lowe#include "SYS.h"
55*5d9d9091SRichard Lowe#include "cache.h"
56*5d9d9091SRichard Lowe
57*5d9d9091SRichard Lowe#define LABEL(s) .memcmp##s
58*5d9d9091SRichard Lowe
59*5d9d9091SRichard Lowe	ENTRY(memcmp)                 /* (const void *, const void*, size_t) */
60*5d9d9091SRichard Lowe
61*5d9d9091SRichard LoweLABEL(try1):
62*5d9d9091SRichard Lowe        cmp     $8, %rdx
63*5d9d9091SRichard Lowe        jae     LABEL(1after)
64*5d9d9091SRichard Lowe
65*5d9d9091SRichard LoweLABEL(1):                                /* 1-byte */
66*5d9d9091SRichard Lowe        test    %rdx, %rdx
67*5d9d9091SRichard Lowe        mov     $0, %eax
68*5d9d9091SRichard Lowe        jz      LABEL(exit)
69*5d9d9091SRichard Lowe
70*5d9d9091SRichard LoweLABEL(1loop):
71*5d9d9091SRichard Lowe        movzbl  (%rdi), %eax
72*5d9d9091SRichard Lowe        movzbl  (%rsi), %ecx
73*5d9d9091SRichard Lowe        sub     %ecx, %eax
74*5d9d9091SRichard Lowe        jnz     LABEL(exit)
75*5d9d9091SRichard Lowe
76*5d9d9091SRichard Lowe        dec     %rdx
77*5d9d9091SRichard Lowe
78*5d9d9091SRichard Lowe        lea     1 (%rdi), %rdi
79*5d9d9091SRichard Lowe        lea     1 (%rsi), %rsi
80*5d9d9091SRichard Lowe
81*5d9d9091SRichard Lowe        jnz     LABEL(1loop)
82*5d9d9091SRichard Lowe
83*5d9d9091SRichard LoweLABEL(exit):
84*5d9d9091SRichard Lowe        rep
85*5d9d9091SRichard Lowe        ret
86*5d9d9091SRichard Lowe
87*5d9d9091SRichard Lowe        .p2align 4
88*5d9d9091SRichard Lowe
89*5d9d9091SRichard LoweLABEL(1after):
90*5d9d9091SRichard Lowe
91*5d9d9091SRichard LoweLABEL(8try):
92*5d9d9091SRichard Lowe        cmp     $32, %rdx
93*5d9d9091SRichard Lowe        jae     LABEL(8after)
94*5d9d9091SRichard Lowe
95*5d9d9091SRichard LoweLABEL(8):                        /* 8-byte */
96*5d9d9091SRichard Lowe        mov     %edx, %ecx
97*5d9d9091SRichard Lowe        shr     $3, %ecx
98*5d9d9091SRichard Lowe        jz      LABEL(1)
99*5d9d9091SRichard Lowe
100*5d9d9091SRichard Lowe        .p2align 4
101*5d9d9091SRichard Lowe
102*5d9d9091SRichard LoweLABEL(8loop):
103*5d9d9091SRichard Lowe        mov     (%rsi), %rax
104*5d9d9091SRichard Lowe        cmp     (%rdi), %rax
105*5d9d9091SRichard Lowe        jne     LABEL(1)
106*5d9d9091SRichard Lowe
107*5d9d9091SRichard Lowe        sub     $8, %rdx
108*5d9d9091SRichard Lowe        dec     %ecx
109*5d9d9091SRichard Lowe
110*5d9d9091SRichard Lowe        lea     8 (%rsi), %rsi
111*5d9d9091SRichard Lowe        lea     8 (%rdi), %rdi
112*5d9d9091SRichard Lowe
113*5d9d9091SRichard Lowe        jnz     LABEL(8loop)
114*5d9d9091SRichard Lowe
115*5d9d9091SRichard LoweLABEL(8skip):
116*5d9d9091SRichard Lowe        and     $7, %edx
117*5d9d9091SRichard Lowe        jnz     LABEL(1)
118*5d9d9091SRichard Lowe
119*5d9d9091SRichard Lowe        xor     %eax, %eax
120*5d9d9091SRichard Lowe        ret
121*5d9d9091SRichard Lowe
122*5d9d9091SRichard Lowe        .p2align 4
123*5d9d9091SRichard Lowe
124*5d9d9091SRichard LoweLABEL(8after):
125*5d9d9091SRichard Lowe
126*5d9d9091SRichard LoweLABEL(32try):
127*5d9d9091SRichard Lowe        cmp     $2048, %rdx
128*5d9d9091SRichard Lowe        ja      LABEL(32after)
129*5d9d9091SRichard Lowe
130*5d9d9091SRichard LoweLABEL(32):                               /* 32-byte */
131*5d9d9091SRichard Lowe        mov     %edx, %ecx
132*5d9d9091SRichard Lowe        shr     $5, %ecx
133*5d9d9091SRichard Lowe        jz      LABEL(8)
134*5d9d9091SRichard Lowe
135*5d9d9091SRichard Lowe        .p2align 4
136*5d9d9091SRichard Lowe
137*5d9d9091SRichard LoweLABEL(32loop):
138*5d9d9091SRichard Lowe        mov        (%rsi), %rax
139*5d9d9091SRichard Lowe        mov      8 (%rsi),  %r8
140*5d9d9091SRichard Lowe        mov     16 (%rsi),  %r9
141*5d9d9091SRichard Lowe        mov     24 (%rsi), %r10
142*5d9d9091SRichard Lowe        sub        (%rdi), %rax
143*5d9d9091SRichard Lowe        sub      8 (%rdi),  %r8
144*5d9d9091SRichard Lowe        sub     16 (%rdi),  %r9
145*5d9d9091SRichard Lowe        sub     24 (%rdi), %r10
146*5d9d9091SRichard Lowe
147*5d9d9091SRichard Lowe        or      %rax,  %r8
148*5d9d9091SRichard Lowe        or       %r9, %r10
149*5d9d9091SRichard Lowe        or       %r8, %r10
150*5d9d9091SRichard Lowe        jnz     LABEL(8)
151*5d9d9091SRichard Lowe
152*5d9d9091SRichard Lowe        sub     $32, %rdx
153*5d9d9091SRichard Lowe        dec     %ecx
154*5d9d9091SRichard Lowe
155*5d9d9091SRichard Lowe        lea     32 (%rsi), %rsi
156*5d9d9091SRichard Lowe        lea     32 (%rdi), %rdi
157*5d9d9091SRichard Lowe
158*5d9d9091SRichard Lowe        jnz     LABEL(32loop)
159*5d9d9091SRichard Lowe
160*5d9d9091SRichard LoweLABEL(32skip):
161*5d9d9091SRichard Lowe        and     $31, %edx
162*5d9d9091SRichard Lowe        jnz     LABEL(8)
163*5d9d9091SRichard Lowe
164*5d9d9091SRichard Lowe        xor     %eax, %eax
165*5d9d9091SRichard Lowe        ret
166*5d9d9091SRichard Lowe
167*5d9d9091SRichard Lowe        .p2align 4
168*5d9d9091SRichard Lowe
169*5d9d9091SRichard LoweLABEL(32after):
170*5d9d9091SRichard Lowe
171*5d9d9091SRichard Lowe	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */
172*5d9d9091SRichard Lowe
173*5d9d9091SRichard LoweLABEL(srctry):
174*5d9d9091SRichard Lowe        mov     %esi, %r8d      /* align by source */
175*5d9d9091SRichard Lowe
176*5d9d9091SRichard Lowe        and     $7, %r8d
177*5d9d9091SRichard Lowe        jz      LABEL(srcafter)  /* not unaligned */
178*5d9d9091SRichard Lowe
179*5d9d9091SRichard LoweLABEL(src):                      /* align */
180*5d9d9091SRichard Lowe        lea     -8 (%r8, %rdx), %rdx
181*5d9d9091SRichard Lowe        sub     $8, %r8d
182*5d9d9091SRichard Lowe
183*5d9d9091SRichard Lowe
184*5d9d9091SRichard LoweLABEL(srcloop):
185*5d9d9091SRichard Lowe        movzbl  (%rdi), %eax
186*5d9d9091SRichard Lowe        movzbl  (%rsi), %ecx
187*5d9d9091SRichard Lowe        sub     %ecx, %eax
188*5d9d9091SRichard Lowe        jnz     LABEL(exit)
189*5d9d9091SRichard Lowe
190*5d9d9091SRichard Lowe        inc     %r8d
191*5d9d9091SRichard Lowe
192*5d9d9091SRichard Lowe        lea     1 (%rdi), %rdi
193*5d9d9091SRichard Lowe        lea     1 (%rsi), %rsi
194*5d9d9091SRichard Lowe
195*5d9d9091SRichard Lowe        jnz     LABEL(srcloop)
196*5d9d9091SRichard Lowe
197*5d9d9091SRichard Lowe        .p2align 4
198*5d9d9091SRichard Lowe
199*5d9d9091SRichard LoweLABEL(srcafter):
200*5d9d9091SRichard Lowe
201*5d9d9091SRichard LoweLABEL(64try):
202*5d9d9091SRichard Lowe        mov     _sref_(.amd64cache1half), %rcx
203*5d9d9091SRichard Lowe        cmp	%rdx, %rcx
204*5d9d9091SRichard Lowe        cmova   %rdx, %rcx
205*5d9d9091SRichard Lowe
206*5d9d9091SRichard LoweLABEL(64):                               /* 64-byte */
207*5d9d9091SRichard Lowe        shr     $6, %rcx
208*5d9d9091SRichard Lowe        jz      LABEL(32)
209*5d9d9091SRichard Lowe
210*5d9d9091SRichard Lowe        .p2align 4
211*5d9d9091SRichard Lowe
212*5d9d9091SRichard LoweLABEL(64loop):
213*5d9d9091SRichard Lowe        mov        (%rsi), %rax
214*5d9d9091SRichard Lowe        mov      8 (%rsi),  %r8
215*5d9d9091SRichard Lowe        sub        (%rdi), %rax
216*5d9d9091SRichard Lowe        sub      8 (%rdi),  %r8
217*5d9d9091SRichard Lowe        or      %r8,  %rax
218*5d9d9091SRichard Lowe
219*5d9d9091SRichard Lowe        mov     16 (%rsi),  %r9
220*5d9d9091SRichard Lowe        mov     24 (%rsi), %r10
221*5d9d9091SRichard Lowe        sub     16 (%rdi),  %r9
222*5d9d9091SRichard Lowe        sub     24 (%rdi), %r10
223*5d9d9091SRichard Lowe        or      %r10, %r9
224*5d9d9091SRichard Lowe
225*5d9d9091SRichard Lowe        or      %r9,  %rax
226*5d9d9091SRichard Lowe        jnz     LABEL(32)
227*5d9d9091SRichard Lowe
228*5d9d9091SRichard Lowe        mov     32 (%rsi), %rax
229*5d9d9091SRichard Lowe        mov     40 (%rsi),  %r8
230*5d9d9091SRichard Lowe        sub     32 (%rdi), %rax
231*5d9d9091SRichard Lowe        sub     40 (%rdi),  %r8
232*5d9d9091SRichard Lowe        or      %r8,  %rax
233*5d9d9091SRichard Lowe
234*5d9d9091SRichard Lowe        mov     48 (%rsi),  %r9
235*5d9d9091SRichard Lowe        mov     56 (%rsi), %r10
236*5d9d9091SRichard Lowe        sub     48 (%rdi),  %r9
237*5d9d9091SRichard Lowe        sub     56 (%rdi), %r10
238*5d9d9091SRichard Lowe        or      %r10, %r9
239*5d9d9091SRichard Lowe
240*5d9d9091SRichard Lowe        or      %r9,  %rax
241*5d9d9091SRichard Lowe        jnz    	LABEL(32)
242*5d9d9091SRichard Lowe
243*5d9d9091SRichard Lowe        lea     64 (%rsi), %rsi
244*5d9d9091SRichard Lowe        lea     64 (%rdi), %rdi
245*5d9d9091SRichard Lowe
246*5d9d9091SRichard Lowe        sub     $64, %rdx
247*5d9d9091SRichard Lowe        dec     %rcx
248*5d9d9091SRichard Lowe        jnz     LABEL(64loop)
249*5d9d9091SRichard Lowe
250*5d9d9091SRichard LoweLABEL(64skip):
251*5d9d9091SRichard Lowe        cmp     $2048, %rdx
252*5d9d9091SRichard Lowe        ja     LABEL(64after)
253*5d9d9091SRichard Lowe
254*5d9d9091SRichard Lowe        test    %edx, %edx
255*5d9d9091SRichard Lowe        jnz     LABEL(32)
256*5d9d9091SRichard Lowe
257*5d9d9091SRichard Lowe        xor     %eax, %eax
258*5d9d9091SRichard Lowe        ret
259*5d9d9091SRichard Lowe
260*5d9d9091SRichard Lowe        .p2align 4
261*5d9d9091SRichard Lowe
262*5d9d9091SRichard LoweLABEL(64after):
263*5d9d9091SRichard Lowe
264*5d9d9091SRichard LoweLABEL(pretry):
265*5d9d9091SRichard Lowe
266*5d9d9091SRichard LoweLABEL(pre):                              /* 64-byte prefetching */
267*5d9d9091SRichard Lowe        mov     _sref_(.amd64cache2half), %rcx
268*5d9d9091SRichard Lowe        cmp	%rdx, %rcx
269*5d9d9091SRichard Lowe        cmova   %rdx, %rcx
270*5d9d9091SRichard Lowe
271*5d9d9091SRichard Lowe        shr     $6, %rcx
272*5d9d9091SRichard Lowe        jz      LABEL(preskip)
273*5d9d9091SRichard Lowe
274*5d9d9091SRichard Lowe        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
275*5d9d9091SRichard Lowe        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */
276*5d9d9091SRichard Lowe
277*5d9d9091SRichard Lowe        mov        (%rsi), %rax
278*5d9d9091SRichard Lowe        mov      8 (%rsi), %r9
279*5d9d9091SRichard Lowe        mov     16 (%rsi), %r10
280*5d9d9091SRichard Lowe        mov     24 (%rsi), %r11
281*5d9d9091SRichard Lowe        sub        (%rdi), %rax
282*5d9d9091SRichard Lowe        sub      8 (%rdi), %r9
283*5d9d9091SRichard Lowe        sub     16 (%rdi), %r10
284*5d9d9091SRichard Lowe        sub     24 (%rdi), %r11
285*5d9d9091SRichard Lowe
286*5d9d9091SRichard Lowe        or       %r9, %rax
287*5d9d9091SRichard Lowe        or      %r11, %r10
288*5d9d9091SRichard Lowe        or      %r10, %rax
289*5d9d9091SRichard Lowe        jnz     LABEL(32)
290*5d9d9091SRichard Lowe
291*5d9d9091SRichard Lowe        mov     32 (%rsi), %rax
292*5d9d9091SRichard Lowe        mov     40 (%rsi), %r9
293*5d9d9091SRichard Lowe        mov     48 (%rsi), %r10
294*5d9d9091SRichard Lowe        mov     56 (%rsi), %r11
295*5d9d9091SRichard Lowe        sub     32 (%rdi), %rax
296*5d9d9091SRichard Lowe        sub     40 (%rdi), %r9
297*5d9d9091SRichard Lowe        sub     48 (%rdi), %r10
298*5d9d9091SRichard Lowe        sub     56 (%rdi), %r11
299*5d9d9091SRichard Lowe
300*5d9d9091SRichard Lowe        or       %r9, %rax
301*5d9d9091SRichard Lowe        or      %r11, %r10
302*5d9d9091SRichard Lowe        or      %r10, %rax
303*5d9d9091SRichard Lowe        jnz     LABEL(32)
304*5d9d9091SRichard Lowe
305*5d9d9091SRichard Lowe        lea     64 (%rsi), %rsi
306*5d9d9091SRichard Lowe        lea     64 (%rdi), %rdi
307*5d9d9091SRichard Lowe
308*5d9d9091SRichard Lowe        sub     $64, %rdx
309*5d9d9091SRichard Lowe        dec     %rcx
310*5d9d9091SRichard Lowe
311*5d9d9091SRichard Lowe        .p2align 4
312*5d9d9091SRichard Lowe
313*5d9d9091SRichard LoweLABEL(preloop):
314*5d9d9091SRichard Lowe        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
315*5d9d9091SRichard Lowe        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */
316*5d9d9091SRichard Lowe
317*5d9d9091SRichard Lowe        mov        (%rsi), %rax
318*5d9d9091SRichard Lowe        mov      8 (%rsi), %r9
319*5d9d9091SRichard Lowe        mov     16 (%rsi), %r10
320*5d9d9091SRichard Lowe        mov     24 (%rsi), %r11
321*5d9d9091SRichard Lowe        sub        (%rdi), %rax
322*5d9d9091SRichard Lowe        sub      8 (%rdi), %r9
323*5d9d9091SRichard Lowe        sub     16 (%rdi), %r10
324*5d9d9091SRichard Lowe        sub     24 (%rdi), %r11
325*5d9d9091SRichard Lowe
326*5d9d9091SRichard Lowe        or       %r9, %rax
327*5d9d9091SRichard Lowe        or      %r11, %r10
328*5d9d9091SRichard Lowe        or      %r10, %rax
329*5d9d9091SRichard Lowe        jnz     LABEL(32)
330*5d9d9091SRichard Lowe
331*5d9d9091SRichard Lowe        mov     32 (%rsi), %rax
332*5d9d9091SRichard Lowe        mov     40 (%rsi), %r9
333*5d9d9091SRichard Lowe        mov     48 (%rsi), %r10
334*5d9d9091SRichard Lowe        mov     56 (%rsi), %r11
335*5d9d9091SRichard Lowe        sub     32 (%rdi), %rax
336*5d9d9091SRichard Lowe        sub     40 (%rdi), %r9
337*5d9d9091SRichard Lowe        sub     48 (%rdi), %r10
338*5d9d9091SRichard Lowe        sub     56 (%rdi), %r11
339*5d9d9091SRichard Lowe
340*5d9d9091SRichard Lowe        or       %r9, %rax
341*5d9d9091SRichard Lowe        or      %r11, %r10
342*5d9d9091SRichard Lowe        or      %r10, %rax
343*5d9d9091SRichard Lowe        jnz     LABEL(32)
344*5d9d9091SRichard Lowe
345*5d9d9091SRichard Lowe        lea     64 (%rsi), %rsi
346*5d9d9091SRichard Lowe        lea     64 (%rdi), %rdi
347*5d9d9091SRichard Lowe
348*5d9d9091SRichard Lowe        sub     $64, %rdx
349*5d9d9091SRichard Lowe        dec     %rcx
350*5d9d9091SRichard Lowe        jnz     LABEL(preloop)
351*5d9d9091SRichard Lowe
352*5d9d9091SRichard Lowe
353*5d9d9091SRichard LoweLABEL(preskip):
354*5d9d9091SRichard Lowe        cmp     $2048, %rdx
355*5d9d9091SRichard Lowe        ja      LABEL(preafter)
356*5d9d9091SRichard Lowe
357*5d9d9091SRichard Lowe        test    %edx, %edx
358*5d9d9091SRichard Lowe        jnz     LABEL(32)
359*5d9d9091SRichard Lowe
360*5d9d9091SRichard Lowe        xor     %eax, %eax
361*5d9d9091SRichard Lowe        ret
362*5d9d9091SRichard Lowe
363*5d9d9091SRichard Lowe        .p2align 4
364*5d9d9091SRichard Lowe
365*5d9d9091SRichard LoweLABEL(preafter):
366*5d9d9091SRichard Lowe
367*5d9d9091SRichard LoweLABEL(128try):
368*5d9d9091SRichard Lowe
369*5d9d9091SRichard LoweLABEL(128):                              /* 128-byte */
370*5d9d9091SRichard Lowe        mov     %rdx, %rcx
371*5d9d9091SRichard Lowe        shr     $7, %rcx
372*5d9d9091SRichard Lowe        jz      LABEL(128skip)
373*5d9d9091SRichard Lowe
374*5d9d9091SRichard Lowe        .p2align 4
375*5d9d9091SRichard Lowe
376*5d9d9091SRichard LoweLABEL(128loop):
377*5d9d9091SRichard Lowe        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
378*5d9d9091SRichard Lowe        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */
379*5d9d9091SRichard Lowe
380*5d9d9091SRichard Lowe        mov        (%rsi), %rax
381*5d9d9091SRichard Lowe        mov      8 (%rsi), %r8
382*5d9d9091SRichard Lowe        sub        (%rdi), %rax
383*5d9d9091SRichard Lowe        sub      8 (%rdi), %r8
384*5d9d9091SRichard Lowe        mov     16 (%rsi), %r9
385*5d9d9091SRichard Lowe        mov     24 (%rsi), %r10
386*5d9d9091SRichard Lowe        sub     16 (%rdi), %r9
387*5d9d9091SRichard Lowe        sub     24 (%rdi), %r10
388*5d9d9091SRichard Lowe
389*5d9d9091SRichard Lowe        or       %r8, %rax
390*5d9d9091SRichard Lowe        or       %r9, %r10
391*5d9d9091SRichard Lowe        or      %r10, %rax
392*5d9d9091SRichard Lowe
393*5d9d9091SRichard Lowe        mov     32 (%rsi), %r8
394*5d9d9091SRichard Lowe        mov     40 (%rsi), %r9
395*5d9d9091SRichard Lowe        sub     32 (%rdi), %r8
396*5d9d9091SRichard Lowe        sub     40 (%rdi), %r9
397*5d9d9091SRichard Lowe        mov     48 (%rsi), %r10
398*5d9d9091SRichard Lowe        mov     56 (%rsi), %r11
399*5d9d9091SRichard Lowe        sub     48 (%rdi), %r10
400*5d9d9091SRichard Lowe        sub     56 (%rdi), %r11
401*5d9d9091SRichard Lowe
402*5d9d9091SRichard Lowe        or       %r9, %r8
403*5d9d9091SRichard Lowe        or      %r11, %r10
404*5d9d9091SRichard Lowe        or      %r10, %r8
405*5d9d9091SRichard Lowe
406*5d9d9091SRichard Lowe        or      %r8, %rax
407*5d9d9091SRichard Lowe        jnz     LABEL(32)
408*5d9d9091SRichard Lowe
409*5d9d9091SRichard Lowe        prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
410*5d9d9091SRichard Lowe        prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */
411*5d9d9091SRichard Lowe
412*5d9d9091SRichard Lowe        mov      64 (%rsi), %rax
413*5d9d9091SRichard Lowe        mov      72 (%rsi), %r8
414*5d9d9091SRichard Lowe        sub      64 (%rdi), %rax
415*5d9d9091SRichard Lowe        sub      72 (%rdi), %r8
416*5d9d9091SRichard Lowe        mov      80 (%rsi), %r9
417*5d9d9091SRichard Lowe        mov      88 (%rsi), %r10
418*5d9d9091SRichard Lowe        sub      80 (%rdi), %r9
419*5d9d9091SRichard Lowe        sub      88 (%rdi), %r10
420*5d9d9091SRichard Lowe
421*5d9d9091SRichard Lowe        or       %r8, %rax
422*5d9d9091SRichard Lowe        or       %r9, %r10
423*5d9d9091SRichard Lowe        or      %r10, %rax
424*5d9d9091SRichard Lowe
425*5d9d9091SRichard Lowe        mov      96 (%rsi), %r8
426*5d9d9091SRichard Lowe        mov     104 (%rsi), %r9
427*5d9d9091SRichard Lowe        sub      96 (%rdi), %r8
428*5d9d9091SRichard Lowe        sub     104 (%rdi), %r9
429*5d9d9091SRichard Lowe        mov     112 (%rsi), %r10
430*5d9d9091SRichard Lowe        mov     120 (%rsi), %r11
431*5d9d9091SRichard Lowe        sub     112 (%rdi), %r10
432*5d9d9091SRichard Lowe        sub     120 (%rdi), %r11
433*5d9d9091SRichard Lowe
434*5d9d9091SRichard Lowe        or       %r9, %r8
435*5d9d9091SRichard Lowe        or      %r11, %r10
436*5d9d9091SRichard Lowe        or      %r10, %r8
437*5d9d9091SRichard Lowe
438*5d9d9091SRichard Lowe        or      %r8, %rax
439*5d9d9091SRichard Lowe        jnz     LABEL(32)
440*5d9d9091SRichard Lowe
441*5d9d9091SRichard Lowe        sub     $128, %rdx
442*5d9d9091SRichard Lowe        dec     %rcx
443*5d9d9091SRichard Lowe
444*5d9d9091SRichard Lowe        lea     128 (%rsi), %rsi
445*5d9d9091SRichard Lowe        lea     128 (%rdi), %rdi
446*5d9d9091SRichard Lowe
447*5d9d9091SRichard Lowe        jnz     LABEL(128loop)
448*5d9d9091SRichard Lowe
449*5d9d9091SRichard LoweLABEL(128skip):
450*5d9d9091SRichard Lowe        and     $127, %edx
451*5d9d9091SRichard Lowe        jnz     LABEL(32)
452*5d9d9091SRichard Lowe
453*5d9d9091SRichard Lowe        xor     %eax, %eax
454*5d9d9091SRichard Lowe        ret
455*5d9d9091SRichard Lowe
456*5d9d9091SRichard Lowe	SET_SIZE(memcmp)
457