/* xref: /freebsd-src/contrib/bionic-x86_64-string/ssse3-strcmp-slm.S
   (revision 8ddb146abcdf061be9f2c0db7e391697dafad85c) */
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef USE_AS_STRNCMP
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
   if the new counter > the old one or is 0.
   %r9 = %r11 + %rcx - 16 is the number of bytes left to compare after
   accounting for the bytes already consumed in the current 16-byte
   chunk; if the subtraction wrapped, the unsigned compare below sees
   new > old and we bail out.  */
#define UPDATE_STRNCMP_COUNTER				\
	/* calculate left number to compare */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	L(strcmp_exitz);	/* counter wrapped: limit exhausted */ \
	test	%r9, %r9;				\
	je	L(strcmp_exitz);	/* exactly zero bytes left */ \
	mov	%r9, %r11

#else
/* Plain strcmp has no length limit, so the counter update is a no-op.  */
#define UPDATE_STRNCMP_COUNTER
#ifndef STRCMP
#define STRCMP		strcmp	/* default exported symbol name */
#endif
#endif
49*8ddb146aSEd Maste
/* Glue macros: supply defaults only when the surrounding build (e.g.
   glibc/bionic sysdep headers) has not already defined them.  */
#ifndef L
# define L(label)	.L##label	/* .L prefix keeps labels out of the symbol table */
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* Declare a global function symbol, align its entry to 16 bytes, and
   open its call-frame-information (unwind) region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* Close the unwind region and record the symbol's size for tools.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
#define RETURN ret
	/* Dedicated allocatable+executable section for this implementation.
	   NOTE(review): the .ssse3 suffix suggests per-ISA runtime dispatch
	   selects this variant — confirm against the build's linker glue.  */
	.section .text.ssse3,"ax",@progbits
78*8ddb146aSEd MasteENTRY (STRCMP)
79*8ddb146aSEd Maste/*
80*8ddb146aSEd Maste * This implementation uses SSE to compare up to 16 bytes at a time.
81*8ddb146aSEd Maste */
82*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
83*8ddb146aSEd Maste	test	%rdx, %rdx
84*8ddb146aSEd Maste	je	L(strcmp_exitz)
85*8ddb146aSEd Maste	cmp	$1, %rdx
86*8ddb146aSEd Maste	je	L(Byte0)
87*8ddb146aSEd Maste	mov	%rdx, %r11
88*8ddb146aSEd Maste#endif
89*8ddb146aSEd Maste	mov	%esi, %ecx
90*8ddb146aSEd Maste	mov	%edi, %eax
91*8ddb146aSEd Maste/* Use 64bit AND here to avoid long NOP padding.  */
92*8ddb146aSEd Maste	and	$0x3f, %rcx		/* rsi alignment in cache line */
93*8ddb146aSEd Maste	and	$0x3f, %rax		/* rdi alignment in cache line */
94*8ddb146aSEd Maste	cmp	$0x30, %ecx
95*8ddb146aSEd Maste	ja	L(crosscache)	/* rsi: 16-byte load will cross cache line */
96*8ddb146aSEd Maste	cmp	$0x30, %eax
97*8ddb146aSEd Maste	ja	L(crosscache)	/* rdi: 16-byte load will cross cache line */
98*8ddb146aSEd Maste	movlpd	(%rdi), %xmm1
99*8ddb146aSEd Maste	movlpd	(%rsi), %xmm2
100*8ddb146aSEd Maste	movhpd	8(%rdi), %xmm1
101*8ddb146aSEd Maste	movhpd	8(%rsi), %xmm2
102*8ddb146aSEd Maste	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
103*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
104*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
105*8ddb146aSEd Maste	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
106*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
107*8ddb146aSEd Maste	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
108*8ddb146aSEd Maste	jnz	L(less16bytes)	/* If not, find different value or null char */
109*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
110*8ddb146aSEd Maste	sub	$16, %r11
111*8ddb146aSEd Maste	jbe	L(strcmp_exitz)	/* finish comparision */
112*8ddb146aSEd Maste#endif
113*8ddb146aSEd Maste	add	$16, %rsi		/* prepare to search next 16 bytes */
114*8ddb146aSEd Maste	add	$16, %rdi		/* prepare to search next 16 bytes */
115*8ddb146aSEd Maste
116*8ddb146aSEd Maste	/*
117*8ddb146aSEd Maste	 * Determine source and destination string offsets from 16-byte alignment.
118*8ddb146aSEd Maste	 * Use relative offset difference between the two to determine which case
119*8ddb146aSEd Maste	 * below to use.
120*8ddb146aSEd Maste	 */
121*8ddb146aSEd Maste	.p2align 4
122*8ddb146aSEd MasteL(crosscache):
123*8ddb146aSEd Maste	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
124*8ddb146aSEd Maste	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
125*8ddb146aSEd Maste	mov	$0xffff, %edx			/* for equivalent offset */
126*8ddb146aSEd Maste	xor	%r8d, %r8d
127*8ddb146aSEd Maste	and	$0xf, %ecx			/* offset of rsi */
128*8ddb146aSEd Maste	and	$0xf, %eax			/* offset of rdi */
129*8ddb146aSEd Maste	cmp	%eax, %ecx
130*8ddb146aSEd Maste	je	L(ashr_0)			/* rsi and rdi relative offset same */
131*8ddb146aSEd Maste	ja	L(bigger)
132*8ddb146aSEd Maste	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
133*8ddb146aSEd Maste	xchg	%ecx, %eax
134*8ddb146aSEd Maste	xchg	%rsi, %rdi
135*8ddb146aSEd MasteL(bigger):
136*8ddb146aSEd Maste	lea	15(%rax), %r9
137*8ddb146aSEd Maste	sub	%rcx, %r9
138*8ddb146aSEd Maste	lea	L(unaligned_table)(%rip), %r10
139*8ddb146aSEd Maste	movslq	(%r10, %r9,4), %r9
140*8ddb146aSEd Maste	lea	(%r10, %r9), %r10
141*8ddb146aSEd Maste	jmp	*%r10				/* jump to corresponding case */
142*8ddb146aSEd Maste
143*8ddb146aSEd Maste/*
144*8ddb146aSEd Maste * The following cases will be handled by ashr_0
145*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
146*8ddb146aSEd Maste *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
147*8ddb146aSEd Maste */
148*8ddb146aSEd Maste	.p2align 4
149*8ddb146aSEd MasteL(ashr_0):
150*8ddb146aSEd Maste
151*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
152*8ddb146aSEd Maste	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
153*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
154*8ddb146aSEd Maste	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
155*8ddb146aSEd Maste	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
156*8ddb146aSEd Maste	pmovmskb %xmm1, %r9d
157*8ddb146aSEd Maste	shr	%cl, %edx			/* adjust 0xffff for offset */
158*8ddb146aSEd Maste	shr	%cl, %r9d			/* adjust for 16-byte offset */
159*8ddb146aSEd Maste	sub	%r9d, %edx
160*8ddb146aSEd Maste	/*
161*8ddb146aSEd Maste	 * edx must be the same with r9d if in left byte (16-rcx) is equal to
162*8ddb146aSEd Maste	 * the start from (16-rax) and no null char was seen.
163*8ddb146aSEd Maste	 */
164*8ddb146aSEd Maste	jne	L(less32bytes)		/* mismatch or null char */
165*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
166*8ddb146aSEd Maste	mov	$16, %rcx
167*8ddb146aSEd Maste	mov	$16, %r9
168*8ddb146aSEd Maste	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
169*8ddb146aSEd Maste
170*8ddb146aSEd Maste	/*
171*8ddb146aSEd Maste	 * Now both strings are aligned at 16-byte boundary. Loop over strings
172*8ddb146aSEd Maste	 * checking 32-bytes per iteration.
173*8ddb146aSEd Maste	 */
174*8ddb146aSEd Maste	.p2align 4
175*8ddb146aSEd MasteL(loop_ashr_0):
176*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
177*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
178*8ddb146aSEd Maste
179*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
180*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
181*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
182*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
183*8ddb146aSEd Maste	sub	$0xffff, %edx
184*8ddb146aSEd Maste	jnz	L(exit)		/* mismatch or null char seen */
185*8ddb146aSEd Maste
186*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
187*8ddb146aSEd Maste	sub	$16, %r11
188*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
189*8ddb146aSEd Maste#endif
190*8ddb146aSEd Maste	add	$16, %rcx
191*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
192*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
193*8ddb146aSEd Maste
194*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
195*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
196*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
197*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
198*8ddb146aSEd Maste	sub	$0xffff, %edx
199*8ddb146aSEd Maste	jnz	L(exit)
200*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
201*8ddb146aSEd Maste	sub	$16, %r11
202*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
203*8ddb146aSEd Maste#endif
204*8ddb146aSEd Maste	add	$16, %rcx
205*8ddb146aSEd Maste	jmp	L(loop_ashr_0)
206*8ddb146aSEd Maste
207*8ddb146aSEd Maste/*
208*8ddb146aSEd Maste * The following cases will be handled by ashr_1
209*8ddb146aSEd Maste * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
210*8ddb146aSEd Maste *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
211*8ddb146aSEd Maste */
212*8ddb146aSEd Maste	.p2align 4
213*8ddb146aSEd MasteL(ashr_1):
214*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
215*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
216*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
217*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
218*8ddb146aSEd Maste	pslldq	$15, %xmm2		/* shift first string to align with second */
219*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
220*8ddb146aSEd Maste	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
221*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
222*8ddb146aSEd Maste	shr	%cl, %edx		/* adjust 0xffff for offset */
223*8ddb146aSEd Maste	shr	%cl, %r9d		/* adjust for 16-byte offset */
224*8ddb146aSEd Maste	sub	%r9d, %edx
225*8ddb146aSEd Maste	jnz	L(less32bytes)	/* mismatch or null char seen */
226*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
227*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
228*8ddb146aSEd Maste
229*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
230*8ddb146aSEd Maste	mov	$16, %rcx		/* index for loads*/
231*8ddb146aSEd Maste	mov	$1, %r9d		/* byte position left over from less32bytes case */
232*8ddb146aSEd Maste	/*
233*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
234*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
235*8ddb146aSEd Maste	 * need to do a nibble.
236*8ddb146aSEd Maste	 */
237*8ddb146aSEd Maste	lea	1(%rdi), %r10
238*8ddb146aSEd Maste	and	$0xfff, %r10		/* offset into 4K page */
239*8ddb146aSEd Maste	sub	$0x1000, %r10		/* subtract 4K pagesize */
240*8ddb146aSEd Maste
241*8ddb146aSEd Maste	.p2align 4
242*8ddb146aSEd MasteL(loop_ashr_1):
243*8ddb146aSEd Maste	add	$16, %r10
244*8ddb146aSEd Maste	jg	L(nibble_ashr_1)	/* cross page boundary */
245*8ddb146aSEd Maste
246*8ddb146aSEd MasteL(gobble_ashr_1):
247*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
248*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
249*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4		 /* store for next cycle */
250*8ddb146aSEd Maste
251*8ddb146aSEd Maste	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
252*8ddb146aSEd Maste
253*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
254*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
255*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
256*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
257*8ddb146aSEd Maste	sub	$0xffff, %edx
258*8ddb146aSEd Maste	jnz	L(exit)
259*8ddb146aSEd Maste
260*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
261*8ddb146aSEd Maste	sub	$16, %r11
262*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
263*8ddb146aSEd Maste#endif
264*8ddb146aSEd Maste	add	$16, %rcx
265*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
266*8ddb146aSEd Maste
267*8ddb146aSEd Maste	add	$16, %r10
268*8ddb146aSEd Maste	jg	L(nibble_ashr_1)	/* cross page boundary */
269*8ddb146aSEd Maste
270*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
271*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
272*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4		/* store for next cycle */
273*8ddb146aSEd Maste
274*8ddb146aSEd Maste	palignr $1, %xmm3, %xmm2        /* merge into one 16byte value */
275*8ddb146aSEd Maste
276*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
277*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
278*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
279*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
280*8ddb146aSEd Maste	sub	$0xffff, %edx
281*8ddb146aSEd Maste	jnz	L(exit)
282*8ddb146aSEd Maste
283*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
284*8ddb146aSEd Maste	sub	$16, %r11
285*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
286*8ddb146aSEd Maste#endif
287*8ddb146aSEd Maste	add	$16, %rcx
288*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
289*8ddb146aSEd Maste	jmp	L(loop_ashr_1)
290*8ddb146aSEd Maste
291*8ddb146aSEd Maste	/*
292*8ddb146aSEd Maste	 * Nibble avoids loads across page boundary. This is to avoid a potential
293*8ddb146aSEd Maste	 * access into unmapped memory.
294*8ddb146aSEd Maste	 */
295*8ddb146aSEd Maste	.p2align 4
296*8ddb146aSEd MasteL(nibble_ashr_1):
297*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		 /* check nibble for null char*/
298*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
299*8ddb146aSEd Maste	test	$0xfffe, %edx
300*8ddb146aSEd Maste	jnz	L(ashr_1_exittail)	/* find null char*/
301*8ddb146aSEd Maste
302*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
303*8ddb146aSEd Maste	cmp	$14, %r11
304*8ddb146aSEd Maste	jbe	L(ashr_1_exittail)
305*8ddb146aSEd Maste#endif
306*8ddb146aSEd Maste
307*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
308*8ddb146aSEd Maste	sub	$0x1000, %r10		/* substract 4K from %r10 */
309*8ddb146aSEd Maste	jmp	L(gobble_ashr_1)
310*8ddb146aSEd Maste
311*8ddb146aSEd Maste	/*
312*8ddb146aSEd Maste	 * Once find null char, determine if there is a string mismatch
313*8ddb146aSEd Maste	 * before the null char.
314*8ddb146aSEd Maste	 */
315*8ddb146aSEd Maste	.p2align 4
316*8ddb146aSEd MasteL(ashr_1_exittail):
317*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
318*8ddb146aSEd Maste	psrldq	$1, %xmm0
319*8ddb146aSEd Maste	psrldq	$1, %xmm3
320*8ddb146aSEd Maste	jmp	L(aftertail)
321*8ddb146aSEd Maste
322*8ddb146aSEd Maste/*
323*8ddb146aSEd Maste * The following cases will be handled by ashr_2
324*8ddb146aSEd Maste * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
325*8ddb146aSEd Maste *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
326*8ddb146aSEd Maste */
327*8ddb146aSEd Maste	.p2align 4
328*8ddb146aSEd MasteL(ashr_2):
329*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
330*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
331*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
332*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
333*8ddb146aSEd Maste	pslldq	$14, %xmm2
334*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
335*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
336*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
337*8ddb146aSEd Maste	shr	%cl, %edx
338*8ddb146aSEd Maste	shr	%cl, %r9d
339*8ddb146aSEd Maste	sub	%r9d, %edx
340*8ddb146aSEd Maste	jnz	L(less32bytes)
341*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
342*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
343*8ddb146aSEd Maste
344*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
345*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
346*8ddb146aSEd Maste	mov	$2, %r9d	/* byte position left over from less32bytes case */
347*8ddb146aSEd Maste	/*
348*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
349*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
350*8ddb146aSEd Maste	 * need to do a nibble.
351*8ddb146aSEd Maste	 */
352*8ddb146aSEd Maste	lea	2(%rdi), %r10
353*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
354*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
355*8ddb146aSEd Maste
356*8ddb146aSEd Maste	.p2align 4
357*8ddb146aSEd MasteL(loop_ashr_2):
358*8ddb146aSEd Maste	add	$16, %r10
359*8ddb146aSEd Maste	jg	L(nibble_ashr_2)
360*8ddb146aSEd Maste
361*8ddb146aSEd MasteL(gobble_ashr_2):
362*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
363*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
364*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
365*8ddb146aSEd Maste
366*8ddb146aSEd Maste	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
367*8ddb146aSEd Maste
368*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
369*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
370*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
371*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
372*8ddb146aSEd Maste	sub	$0xffff, %edx
373*8ddb146aSEd Maste	jnz	L(exit)
374*8ddb146aSEd Maste
375*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
376*8ddb146aSEd Maste	sub	$16, %r11
377*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
378*8ddb146aSEd Maste#endif
379*8ddb146aSEd Maste
380*8ddb146aSEd Maste	add	$16, %rcx
381*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
382*8ddb146aSEd Maste
383*8ddb146aSEd Maste	add	$16, %r10
384*8ddb146aSEd Maste	jg	L(nibble_ashr_2)	/* cross page boundary */
385*8ddb146aSEd Maste
386*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
387*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
388*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
389*8ddb146aSEd Maste
390*8ddb146aSEd Maste	palignr $2, %xmm3, %xmm2        /* merge into one 16byte value */
391*8ddb146aSEd Maste
392*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
393*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
394*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
395*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
396*8ddb146aSEd Maste	sub	$0xffff, %edx
397*8ddb146aSEd Maste	jnz	L(exit)
398*8ddb146aSEd Maste
399*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
400*8ddb146aSEd Maste	sub	$16, %r11
401*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
402*8ddb146aSEd Maste#endif
403*8ddb146aSEd Maste
404*8ddb146aSEd Maste	add	$16, %rcx
405*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
406*8ddb146aSEd Maste	jmp	L(loop_ashr_2)
407*8ddb146aSEd Maste
408*8ddb146aSEd Maste	.p2align 4
409*8ddb146aSEd MasteL(nibble_ashr_2):
410*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
411*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
412*8ddb146aSEd Maste	test	$0xfffc, %edx
413*8ddb146aSEd Maste	jnz	L(ashr_2_exittail)
414*8ddb146aSEd Maste
415*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
416*8ddb146aSEd Maste	cmp	$13, %r11
417*8ddb146aSEd Maste	jbe	L(ashr_2_exittail)
418*8ddb146aSEd Maste#endif
419*8ddb146aSEd Maste
420*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
421*8ddb146aSEd Maste	sub	$0x1000, %r10
422*8ddb146aSEd Maste	jmp	L(gobble_ashr_2)
423*8ddb146aSEd Maste
424*8ddb146aSEd Maste	.p2align 4
425*8ddb146aSEd MasteL(ashr_2_exittail):
426*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
427*8ddb146aSEd Maste	psrldq	$2, %xmm0
428*8ddb146aSEd Maste	psrldq	$2, %xmm3
429*8ddb146aSEd Maste	jmp	L(aftertail)
430*8ddb146aSEd Maste
431*8ddb146aSEd Maste/*
432*8ddb146aSEd Maste * The following cases will be handled by ashr_3
433*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
434*8ddb146aSEd Maste *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
435*8ddb146aSEd Maste */
436*8ddb146aSEd Maste	.p2align 4
437*8ddb146aSEd MasteL(ashr_3):
438*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
439*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
440*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
441*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
442*8ddb146aSEd Maste	pslldq	$13, %xmm2
443*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
444*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
445*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
446*8ddb146aSEd Maste	shr	%cl, %edx
447*8ddb146aSEd Maste	shr	%cl, %r9d
448*8ddb146aSEd Maste	sub	%r9d, %edx
449*8ddb146aSEd Maste	jnz	L(less32bytes)
450*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
451*8ddb146aSEd Maste
452*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
453*8ddb146aSEd Maste
454*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
455*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
456*8ddb146aSEd Maste	mov	$3, %r9d	/* byte position left over from less32bytes case */
457*8ddb146aSEd Maste	/*
458*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
459*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
460*8ddb146aSEd Maste	 * need to do a nibble.
461*8ddb146aSEd Maste	 */
462*8ddb146aSEd Maste	lea	3(%rdi), %r10
463*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
464*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
465*8ddb146aSEd Maste
466*8ddb146aSEd Maste	.p2align 4
467*8ddb146aSEd MasteL(loop_ashr_3):
468*8ddb146aSEd Maste	add	$16, %r10
469*8ddb146aSEd Maste	jg	L(nibble_ashr_3)
470*8ddb146aSEd Maste
471*8ddb146aSEd MasteL(gobble_ashr_3):
472*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
473*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
474*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
475*8ddb146aSEd Maste
476*8ddb146aSEd Maste	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
477*8ddb146aSEd Maste
478*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
479*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
480*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
481*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
482*8ddb146aSEd Maste	sub	$0xffff, %edx
483*8ddb146aSEd Maste	jnz	L(exit)
484*8ddb146aSEd Maste
485*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
486*8ddb146aSEd Maste	sub	$16, %r11
487*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
488*8ddb146aSEd Maste#endif
489*8ddb146aSEd Maste
490*8ddb146aSEd Maste	add	$16, %rcx
491*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
492*8ddb146aSEd Maste
493*8ddb146aSEd Maste	add	$16, %r10
494*8ddb146aSEd Maste	jg	L(nibble_ashr_3)	/* cross page boundary */
495*8ddb146aSEd Maste
496*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
497*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
498*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
499*8ddb146aSEd Maste
500*8ddb146aSEd Maste	palignr $3, %xmm3, %xmm2        /* merge into one 16byte value */
501*8ddb146aSEd Maste
502*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
503*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
504*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
505*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
506*8ddb146aSEd Maste	sub	$0xffff, %edx
507*8ddb146aSEd Maste	jnz	L(exit)
508*8ddb146aSEd Maste
509*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
510*8ddb146aSEd Maste	sub	$16, %r11
511*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
512*8ddb146aSEd Maste#endif
513*8ddb146aSEd Maste
514*8ddb146aSEd Maste	add	$16, %rcx
515*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
516*8ddb146aSEd Maste	jmp	L(loop_ashr_3)
517*8ddb146aSEd Maste
518*8ddb146aSEd Maste	.p2align 4
519*8ddb146aSEd MasteL(nibble_ashr_3):
520*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
521*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
522*8ddb146aSEd Maste	test	$0xfff8, %edx
523*8ddb146aSEd Maste	jnz	L(ashr_3_exittail)
524*8ddb146aSEd Maste
525*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
526*8ddb146aSEd Maste	cmp	$12, %r11
527*8ddb146aSEd Maste	jbe	L(ashr_3_exittail)
528*8ddb146aSEd Maste#endif
529*8ddb146aSEd Maste
530*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
531*8ddb146aSEd Maste	sub	$0x1000, %r10
532*8ddb146aSEd Maste	jmp	L(gobble_ashr_3)
533*8ddb146aSEd Maste
534*8ddb146aSEd Maste	.p2align 4
535*8ddb146aSEd MasteL(ashr_3_exittail):
536*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
537*8ddb146aSEd Maste	psrldq	$3, %xmm0
538*8ddb146aSEd Maste	psrldq	$3, %xmm3
539*8ddb146aSEd Maste	jmp	L(aftertail)
540*8ddb146aSEd Maste
541*8ddb146aSEd Maste/*
542*8ddb146aSEd Maste * The following cases will be handled by ashr_4
543*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)  relative offset	 corresponding case
544*8ddb146aSEd Maste *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
545*8ddb146aSEd Maste */
546*8ddb146aSEd Maste	.p2align 4
547*8ddb146aSEd MasteL(ashr_4):
548*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
549*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
550*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
551*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
552*8ddb146aSEd Maste	pslldq	$12, %xmm2
553*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
554*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
555*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
556*8ddb146aSEd Maste	shr	%cl, %edx
557*8ddb146aSEd Maste	shr	%cl, %r9d
558*8ddb146aSEd Maste	sub	%r9d, %edx
559*8ddb146aSEd Maste	jnz	L(less32bytes)
560*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
561*8ddb146aSEd Maste
562*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
563*8ddb146aSEd Maste
564*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
565*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
566*8ddb146aSEd Maste	mov	$4, %r9d	/* byte position left over from less32bytes case */
567*8ddb146aSEd Maste	/*
568*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
569*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
570*8ddb146aSEd Maste	 * need to do a nibble.
571*8ddb146aSEd Maste	 */
572*8ddb146aSEd Maste	lea	4(%rdi), %r10
573*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
574*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
575*8ddb146aSEd Maste
576*8ddb146aSEd Maste	.p2align 4
577*8ddb146aSEd MasteL(loop_ashr_4):
578*8ddb146aSEd Maste	add	$16, %r10
579*8ddb146aSEd Maste	jg	L(nibble_ashr_4)
580*8ddb146aSEd Maste
581*8ddb146aSEd MasteL(gobble_ashr_4):
582*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
583*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
584*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
585*8ddb146aSEd Maste
586*8ddb146aSEd Maste	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
587*8ddb146aSEd Maste
588*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
589*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
590*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
591*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
592*8ddb146aSEd Maste	sub	$0xffff, %edx
593*8ddb146aSEd Maste	jnz	L(exit)
594*8ddb146aSEd Maste
595*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
596*8ddb146aSEd Maste	sub	$16, %r11
597*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
598*8ddb146aSEd Maste#endif
599*8ddb146aSEd Maste
600*8ddb146aSEd Maste	add	$16, %rcx
601*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
602*8ddb146aSEd Maste
603*8ddb146aSEd Maste	add	$16, %r10
604*8ddb146aSEd Maste	jg	L(nibble_ashr_4)	/* cross page boundary */
605*8ddb146aSEd Maste
606*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
607*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
608*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
609*8ddb146aSEd Maste
610*8ddb146aSEd Maste	palignr $4, %xmm3, %xmm2        /* merge into one 16byte value */
611*8ddb146aSEd Maste
612*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
613*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
614*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
615*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
616*8ddb146aSEd Maste	sub	$0xffff, %edx
617*8ddb146aSEd Maste	jnz	L(exit)
618*8ddb146aSEd Maste
619*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
620*8ddb146aSEd Maste	sub	$16, %r11
621*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
622*8ddb146aSEd Maste#endif
623*8ddb146aSEd Maste
624*8ddb146aSEd Maste	add	$16, %rcx
625*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
626*8ddb146aSEd Maste	jmp	L(loop_ashr_4)
627*8ddb146aSEd Maste
628*8ddb146aSEd Maste	.p2align 4
629*8ddb146aSEd MasteL(nibble_ashr_4):
630*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
631*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
632*8ddb146aSEd Maste	test	$0xfff0, %edx
633*8ddb146aSEd Maste	jnz	L(ashr_4_exittail)
634*8ddb146aSEd Maste
635*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
636*8ddb146aSEd Maste	cmp	$11, %r11
637*8ddb146aSEd Maste	jbe	L(ashr_4_exittail)
638*8ddb146aSEd Maste#endif
639*8ddb146aSEd Maste
640*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
641*8ddb146aSEd Maste	sub	$0x1000, %r10
642*8ddb146aSEd Maste	jmp	L(gobble_ashr_4)
643*8ddb146aSEd Maste
644*8ddb146aSEd Maste	.p2align 4
645*8ddb146aSEd MasteL(ashr_4_exittail):
646*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
647*8ddb146aSEd Maste	psrldq	$4, %xmm0
648*8ddb146aSEd Maste	psrldq	$4, %xmm3
649*8ddb146aSEd Maste	jmp	L(aftertail)
650*8ddb146aSEd Maste
651*8ddb146aSEd Maste/*
652*8ddb146aSEd Maste * The following cases will be handled by ashr_5
653*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
654*8ddb146aSEd Maste *        n(11~15)          n - 11      	  4(15 +(n-11) - n)         ashr_5
655*8ddb146aSEd Maste */
656*8ddb146aSEd Maste	.p2align 4
657*8ddb146aSEd MasteL(ashr_5):
658*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
659*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
660*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
661*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
662*8ddb146aSEd Maste	pslldq	$11, %xmm2
663*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
664*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
665*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
666*8ddb146aSEd Maste	shr	%cl, %edx
667*8ddb146aSEd Maste	shr	%cl, %r9d
668*8ddb146aSEd Maste	sub	%r9d, %edx
669*8ddb146aSEd Maste	jnz	L(less32bytes)
670*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
671*8ddb146aSEd Maste
672*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
673*8ddb146aSEd Maste
674*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
675*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
676*8ddb146aSEd Maste	mov	$5, %r9d	/* byte position left over from less32bytes case */
677*8ddb146aSEd Maste	/*
678*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
679*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
680*8ddb146aSEd Maste	 * need to do a nibble.
681*8ddb146aSEd Maste	 */
682*8ddb146aSEd Maste	lea	5(%rdi), %r10
683*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
684*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
685*8ddb146aSEd Maste
686*8ddb146aSEd Maste	.p2align 4
687*8ddb146aSEd MasteL(loop_ashr_5):
688*8ddb146aSEd Maste	add	$16, %r10
689*8ddb146aSEd Maste	jg	L(nibble_ashr_5)
690*8ddb146aSEd Maste
691*8ddb146aSEd MasteL(gobble_ashr_5):
692*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
693*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
694*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
695*8ddb146aSEd Maste
696*8ddb146aSEd Maste	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
697*8ddb146aSEd Maste
698*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
699*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
700*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
701*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
702*8ddb146aSEd Maste	sub	$0xffff, %edx
703*8ddb146aSEd Maste	jnz	L(exit)
704*8ddb146aSEd Maste
705*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
706*8ddb146aSEd Maste	sub	$16, %r11
707*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
708*8ddb146aSEd Maste#endif
709*8ddb146aSEd Maste
710*8ddb146aSEd Maste	add	$16, %rcx
711*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
712*8ddb146aSEd Maste
713*8ddb146aSEd Maste	add	$16, %r10
714*8ddb146aSEd Maste	jg	L(nibble_ashr_5)	/* cross page boundary */
715*8ddb146aSEd Maste
716*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
717*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
718*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
719*8ddb146aSEd Maste
720*8ddb146aSEd Maste	palignr $5, %xmm3, %xmm2        /* merge into one 16byte value */
721*8ddb146aSEd Maste
722*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
723*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
724*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
725*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
726*8ddb146aSEd Maste	sub	$0xffff, %edx
727*8ddb146aSEd Maste	jnz	L(exit)
728*8ddb146aSEd Maste
729*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
730*8ddb146aSEd Maste	sub	$16, %r11
731*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
732*8ddb146aSEd Maste#endif
733*8ddb146aSEd Maste
734*8ddb146aSEd Maste	add	$16, %rcx
735*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
736*8ddb146aSEd Maste	jmp	L(loop_ashr_5)
737*8ddb146aSEd Maste
738*8ddb146aSEd Maste	.p2align 4
739*8ddb146aSEd MasteL(nibble_ashr_5):
740*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
741*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
742*8ddb146aSEd Maste	test	$0xffe0, %edx
743*8ddb146aSEd Maste	jnz	L(ashr_5_exittail)
744*8ddb146aSEd Maste
745*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
746*8ddb146aSEd Maste	cmp	$10, %r11
747*8ddb146aSEd Maste	jbe	L(ashr_5_exittail)
748*8ddb146aSEd Maste#endif
749*8ddb146aSEd Maste
750*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
751*8ddb146aSEd Maste	sub	$0x1000, %r10
752*8ddb146aSEd Maste	jmp	L(gobble_ashr_5)
753*8ddb146aSEd Maste
754*8ddb146aSEd Maste	.p2align 4
755*8ddb146aSEd MasteL(ashr_5_exittail):
756*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
757*8ddb146aSEd Maste	psrldq	$5, %xmm0
758*8ddb146aSEd Maste	psrldq	$5, %xmm3
759*8ddb146aSEd Maste	jmp	L(aftertail)
760*8ddb146aSEd Maste
761*8ddb146aSEd Maste/*
762*8ddb146aSEd Maste * The following cases will be handled by ashr_6
763*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
764*8ddb146aSEd Maste *        n(10~15)          n - 10      	  5(15 +(n-10) - n)         ashr_6
765*8ddb146aSEd Maste */
766*8ddb146aSEd Maste	.p2align 4
767*8ddb146aSEd MasteL(ashr_6):
768*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
769*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
770*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
771*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
772*8ddb146aSEd Maste	pslldq	$10, %xmm2
773*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
774*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
775*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
776*8ddb146aSEd Maste	shr	%cl, %edx
777*8ddb146aSEd Maste	shr	%cl, %r9d
778*8ddb146aSEd Maste	sub	%r9d, %edx
779*8ddb146aSEd Maste	jnz	L(less32bytes)
780*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
781*8ddb146aSEd Maste
782*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
783*8ddb146aSEd Maste
784*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
785*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
786*8ddb146aSEd Maste	mov	$6, %r9d	/* byte position left over from less32bytes case */
787*8ddb146aSEd Maste	/*
788*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
789*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
790*8ddb146aSEd Maste	 * need to do a nibble.
791*8ddb146aSEd Maste	 */
792*8ddb146aSEd Maste	lea	6(%rdi), %r10
793*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
794*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
795*8ddb146aSEd Maste
796*8ddb146aSEd Maste	.p2align 4
797*8ddb146aSEd MasteL(loop_ashr_6):
798*8ddb146aSEd Maste	add	$16, %r10
799*8ddb146aSEd Maste	jg	L(nibble_ashr_6)
800*8ddb146aSEd Maste
801*8ddb146aSEd MasteL(gobble_ashr_6):
802*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
803*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
804*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
805*8ddb146aSEd Maste
806*8ddb146aSEd Maste	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
807*8ddb146aSEd Maste
808*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
809*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
810*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
811*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
812*8ddb146aSEd Maste	sub	$0xffff, %edx
813*8ddb146aSEd Maste	jnz	L(exit)
814*8ddb146aSEd Maste
815*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
816*8ddb146aSEd Maste	sub	$16, %r11
817*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
818*8ddb146aSEd Maste#endif
819*8ddb146aSEd Maste
820*8ddb146aSEd Maste	add	$16, %rcx
821*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
822*8ddb146aSEd Maste
823*8ddb146aSEd Maste	add	$16, %r10
824*8ddb146aSEd Maste	jg	L(nibble_ashr_6)	/* cross page boundary */
825*8ddb146aSEd Maste
826*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
827*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
828*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
829*8ddb146aSEd Maste
830*8ddb146aSEd Maste	palignr $6, %xmm3, %xmm2        /* merge into one 16byte value */
831*8ddb146aSEd Maste
832*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
833*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
834*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
835*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
836*8ddb146aSEd Maste	sub	$0xffff, %edx
837*8ddb146aSEd Maste	jnz	L(exit)
838*8ddb146aSEd Maste
839*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
840*8ddb146aSEd Maste	sub	$16, %r11
841*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
842*8ddb146aSEd Maste#endif
843*8ddb146aSEd Maste
844*8ddb146aSEd Maste	add	$16, %rcx
845*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
846*8ddb146aSEd Maste	jmp	L(loop_ashr_6)
847*8ddb146aSEd Maste
848*8ddb146aSEd Maste	.p2align 4
849*8ddb146aSEd MasteL(nibble_ashr_6):
850*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
851*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
852*8ddb146aSEd Maste	test	$0xffc0, %edx
853*8ddb146aSEd Maste	jnz	L(ashr_6_exittail)
854*8ddb146aSEd Maste
855*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
856*8ddb146aSEd Maste	cmp	$9, %r11
857*8ddb146aSEd Maste	jbe	L(ashr_6_exittail)
858*8ddb146aSEd Maste#endif
859*8ddb146aSEd Maste
860*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
861*8ddb146aSEd Maste	sub	$0x1000, %r10
862*8ddb146aSEd Maste	jmp	L(gobble_ashr_6)
863*8ddb146aSEd Maste
864*8ddb146aSEd Maste	.p2align 4
865*8ddb146aSEd MasteL(ashr_6_exittail):
866*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
867*8ddb146aSEd Maste	psrldq	$6, %xmm0
868*8ddb146aSEd Maste	psrldq	$6, %xmm3
869*8ddb146aSEd Maste	jmp	L(aftertail)
870*8ddb146aSEd Maste
871*8ddb146aSEd Maste/*
872*8ddb146aSEd Maste * The following cases will be handled by ashr_7
873*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset      corresponding case
874*8ddb146aSEd Maste *        n(9~15)          n - 9      	        6(15 +(n - 9) - n)         ashr_7
875*8ddb146aSEd Maste */
876*8ddb146aSEd Maste	.p2align 4
877*8ddb146aSEd MasteL(ashr_7):
878*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
879*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
880*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
881*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
882*8ddb146aSEd Maste	pslldq	$9, %xmm2
883*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
884*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
885*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
886*8ddb146aSEd Maste	shr	%cl, %edx
887*8ddb146aSEd Maste	shr	%cl, %r9d
888*8ddb146aSEd Maste	sub	%r9d, %edx
889*8ddb146aSEd Maste	jnz	L(less32bytes)
890*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
891*8ddb146aSEd Maste
892*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
893*8ddb146aSEd Maste
894*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
895*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
896*8ddb146aSEd Maste	mov	$7, %r9d	/* byte position left over from less32bytes case */
897*8ddb146aSEd Maste	/*
898*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
899*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
900*8ddb146aSEd Maste	 * need to do a nibble.
901*8ddb146aSEd Maste	 */
902*8ddb146aSEd Maste	lea	7(%rdi), %r10
903*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
904*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
905*8ddb146aSEd Maste
906*8ddb146aSEd Maste	.p2align 4
907*8ddb146aSEd MasteL(loop_ashr_7):
908*8ddb146aSEd Maste	add	$16, %r10
909*8ddb146aSEd Maste	jg	L(nibble_ashr_7)
910*8ddb146aSEd Maste
911*8ddb146aSEd MasteL(gobble_ashr_7):
912*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
913*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
914*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
915*8ddb146aSEd Maste
916*8ddb146aSEd Maste	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
917*8ddb146aSEd Maste
918*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
919*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
920*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
921*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
922*8ddb146aSEd Maste	sub	$0xffff, %edx
923*8ddb146aSEd Maste	jnz	L(exit)
924*8ddb146aSEd Maste
925*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
926*8ddb146aSEd Maste	sub	$16, %r11
927*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
928*8ddb146aSEd Maste#endif
929*8ddb146aSEd Maste
930*8ddb146aSEd Maste	add	$16, %rcx
931*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
932*8ddb146aSEd Maste
933*8ddb146aSEd Maste	add	$16, %r10
934*8ddb146aSEd Maste	jg	L(nibble_ashr_7)	/* cross page boundary */
935*8ddb146aSEd Maste
936*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
937*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
938*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
939*8ddb146aSEd Maste
940*8ddb146aSEd Maste	palignr $7, %xmm3, %xmm2        /* merge into one 16byte value */
941*8ddb146aSEd Maste
942*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
943*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
944*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
945*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
946*8ddb146aSEd Maste	sub	$0xffff, %edx
947*8ddb146aSEd Maste	jnz	L(exit)
948*8ddb146aSEd Maste
949*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
950*8ddb146aSEd Maste	sub	$16, %r11
951*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
952*8ddb146aSEd Maste#endif
953*8ddb146aSEd Maste
954*8ddb146aSEd Maste	add	$16, %rcx
955*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
956*8ddb146aSEd Maste	jmp	L(loop_ashr_7)
957*8ddb146aSEd Maste
958*8ddb146aSEd Maste	.p2align 4
959*8ddb146aSEd MasteL(nibble_ashr_7):
960*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
961*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
962*8ddb146aSEd Maste	test	$0xff80, %edx
963*8ddb146aSEd Maste	jnz	L(ashr_7_exittail)
964*8ddb146aSEd Maste
965*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
966*8ddb146aSEd Maste	cmp	$8, %r11
967*8ddb146aSEd Maste	jbe	L(ashr_7_exittail)
968*8ddb146aSEd Maste#endif
969*8ddb146aSEd Maste
970*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
971*8ddb146aSEd Maste	sub	$0x1000, %r10
972*8ddb146aSEd Maste	jmp	L(gobble_ashr_7)
973*8ddb146aSEd Maste
974*8ddb146aSEd Maste	.p2align 4
975*8ddb146aSEd MasteL(ashr_7_exittail):
976*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
977*8ddb146aSEd Maste	psrldq	$7, %xmm0
978*8ddb146aSEd Maste	psrldq	$7, %xmm3
979*8ddb146aSEd Maste	jmp	L(aftertail)
980*8ddb146aSEd Maste
981*8ddb146aSEd Maste/*
982*8ddb146aSEd Maste *  The following cases will be handled by ashr_8
983*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
984*8ddb146aSEd Maste *        n(8~15)          n - 8      	        7(15 +(n - 8) - n)         ashr_8
985*8ddb146aSEd Maste */
986*8ddb146aSEd Maste	.p2align 4
987*8ddb146aSEd MasteL(ashr_8):
988*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
989*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
990*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
991*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
992*8ddb146aSEd Maste	pslldq	$8, %xmm2
993*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
994*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
995*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
996*8ddb146aSEd Maste	shr	%cl, %edx
997*8ddb146aSEd Maste	shr	%cl, %r9d
998*8ddb146aSEd Maste	sub	%r9d, %edx
999*8ddb146aSEd Maste	jnz	L(less32bytes)
1000*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1001*8ddb146aSEd Maste
1002*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1003*8ddb146aSEd Maste
1004*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1005*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1006*8ddb146aSEd Maste	mov	$8, %r9d	/* byte position left over from less32bytes case */
1007*8ddb146aSEd Maste	/*
1008*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1009*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1010*8ddb146aSEd Maste	 * need to do a nibble.
1011*8ddb146aSEd Maste	 */
1012*8ddb146aSEd Maste	lea	8(%rdi), %r10
1013*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1014*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1015*8ddb146aSEd Maste
1016*8ddb146aSEd Maste	.p2align 4
1017*8ddb146aSEd MasteL(loop_ashr_8):
1018*8ddb146aSEd Maste	add	$16, %r10
1019*8ddb146aSEd Maste	jg	L(nibble_ashr_8)
1020*8ddb146aSEd Maste
1021*8ddb146aSEd MasteL(gobble_ashr_8):
1022*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1023*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1024*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1025*8ddb146aSEd Maste
1026*8ddb146aSEd Maste	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
1027*8ddb146aSEd Maste
1028*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1029*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1030*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1031*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1032*8ddb146aSEd Maste	sub	$0xffff, %edx
1033*8ddb146aSEd Maste	jnz	L(exit)
1034*8ddb146aSEd Maste
1035*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1036*8ddb146aSEd Maste	sub	$16, %r11
1037*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1038*8ddb146aSEd Maste#endif
1039*8ddb146aSEd Maste
1040*8ddb146aSEd Maste	add	$16, %rcx
1041*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1042*8ddb146aSEd Maste
1043*8ddb146aSEd Maste	add	$16, %r10
1044*8ddb146aSEd Maste	jg	L(nibble_ashr_8)	/* cross page boundary */
1045*8ddb146aSEd Maste
1046*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1047*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1048*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1049*8ddb146aSEd Maste
1050*8ddb146aSEd Maste	palignr $8, %xmm3, %xmm2        /* merge into one 16byte value */
1051*8ddb146aSEd Maste
1052*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1053*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1054*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1055*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1056*8ddb146aSEd Maste	sub	$0xffff, %edx
1057*8ddb146aSEd Maste	jnz	L(exit)
1058*8ddb146aSEd Maste
1059*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1060*8ddb146aSEd Maste	sub	$16, %r11
1061*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1062*8ddb146aSEd Maste#endif
1063*8ddb146aSEd Maste
1064*8ddb146aSEd Maste	add	$16, %rcx
1065*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1066*8ddb146aSEd Maste	jmp	L(loop_ashr_8)
1067*8ddb146aSEd Maste
1068*8ddb146aSEd Maste	.p2align 4
1069*8ddb146aSEd MasteL(nibble_ashr_8):
1070*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1071*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1072*8ddb146aSEd Maste	test	$0xff00, %edx
1073*8ddb146aSEd Maste	jnz	L(ashr_8_exittail)
1074*8ddb146aSEd Maste
1075*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1076*8ddb146aSEd Maste	cmp	$7, %r11
1077*8ddb146aSEd Maste	jbe	L(ashr_8_exittail)
1078*8ddb146aSEd Maste#endif
1079*8ddb146aSEd Maste
1080*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1081*8ddb146aSEd Maste	sub	$0x1000, %r10
1082*8ddb146aSEd Maste	jmp	L(gobble_ashr_8)
1083*8ddb146aSEd Maste
1084*8ddb146aSEd Maste	.p2align 4
1085*8ddb146aSEd MasteL(ashr_8_exittail):
1086*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1087*8ddb146aSEd Maste	psrldq	$8, %xmm0
1088*8ddb146aSEd Maste	psrldq	$8, %xmm3
1089*8ddb146aSEd Maste	jmp	L(aftertail)
1090*8ddb146aSEd Maste
1091*8ddb146aSEd Maste/*
1092*8ddb146aSEd Maste *  The following cases will be handled by ashr_9
1093*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1094*8ddb146aSEd Maste *        n(7~15)          n - 7      	        8(15 +(n - 7) - n)         ashr_9
1095*8ddb146aSEd Maste */
1096*8ddb146aSEd Maste	.p2align 4
1097*8ddb146aSEd MasteL(ashr_9):
1098*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1099*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1100*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1101*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1102*8ddb146aSEd Maste	pslldq	$7, %xmm2
1103*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1104*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1105*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1106*8ddb146aSEd Maste	shr	%cl, %edx
1107*8ddb146aSEd Maste	shr	%cl, %r9d
1108*8ddb146aSEd Maste	sub	%r9d, %edx
1109*8ddb146aSEd Maste	jnz	L(less32bytes)
1110*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1111*8ddb146aSEd Maste
1112*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1113*8ddb146aSEd Maste
1114*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1115*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1116*8ddb146aSEd Maste	mov	$9, %r9d	/* byte position left over from less32bytes case */
1117*8ddb146aSEd Maste	/*
1118*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1119*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1120*8ddb146aSEd Maste	 * need to do a nibble.
1121*8ddb146aSEd Maste	 */
1122*8ddb146aSEd Maste	lea	9(%rdi), %r10
1123*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1124*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1125*8ddb146aSEd Maste
1126*8ddb146aSEd Maste	.p2align 4
1127*8ddb146aSEd MasteL(loop_ashr_9):
1128*8ddb146aSEd Maste	add	$16, %r10
1129*8ddb146aSEd Maste	jg	L(nibble_ashr_9)
1130*8ddb146aSEd Maste
1131*8ddb146aSEd MasteL(gobble_ashr_9):
1132*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1133*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1134*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1135*8ddb146aSEd Maste
1136*8ddb146aSEd Maste	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
1137*8ddb146aSEd Maste
1138*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1139*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1140*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1141*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1142*8ddb146aSEd Maste	sub	$0xffff, %edx
1143*8ddb146aSEd Maste	jnz	L(exit)
1144*8ddb146aSEd Maste
1145*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1146*8ddb146aSEd Maste	sub	$16, %r11
1147*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1148*8ddb146aSEd Maste#endif
1149*8ddb146aSEd Maste
1150*8ddb146aSEd Maste	add	$16, %rcx
1151*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1152*8ddb146aSEd Maste
1153*8ddb146aSEd Maste	add	$16, %r10
1154*8ddb146aSEd Maste	jg	L(nibble_ashr_9)	/* cross page boundary */
1155*8ddb146aSEd Maste
1156*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1157*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1158*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1159*8ddb146aSEd Maste
1160*8ddb146aSEd Maste	palignr $9, %xmm3, %xmm2        /* merge into one 16byte value */
1161*8ddb146aSEd Maste
1162*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1163*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1164*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1165*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1166*8ddb146aSEd Maste	sub	$0xffff, %edx
1167*8ddb146aSEd Maste	jnz	L(exit)
1168*8ddb146aSEd Maste
1169*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1170*8ddb146aSEd Maste	sub	$16, %r11
1171*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1172*8ddb146aSEd Maste#endif
1173*8ddb146aSEd Maste
1174*8ddb146aSEd Maste	add	$16, %rcx
1175*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3		/* store for next cycle */
1176*8ddb146aSEd Maste	jmp	L(loop_ashr_9)
1177*8ddb146aSEd Maste
1178*8ddb146aSEd Maste	.p2align 4
1179*8ddb146aSEd MasteL(nibble_ashr_9):
1180*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1181*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1182*8ddb146aSEd Maste	test	$0xfe00, %edx
1183*8ddb146aSEd Maste	jnz	L(ashr_9_exittail)
1184*8ddb146aSEd Maste
1185*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1186*8ddb146aSEd Maste	cmp	$6, %r11
1187*8ddb146aSEd Maste	jbe	L(ashr_9_exittail)
1188*8ddb146aSEd Maste#endif
1189*8ddb146aSEd Maste
1190*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1191*8ddb146aSEd Maste	sub	$0x1000, %r10
1192*8ddb146aSEd Maste	jmp	L(gobble_ashr_9)
1193*8ddb146aSEd Maste
1194*8ddb146aSEd Maste	.p2align 4
1195*8ddb146aSEd MasteL(ashr_9_exittail):
1196*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1197*8ddb146aSEd Maste	psrldq	$9, %xmm0
1198*8ddb146aSEd Maste	psrldq	$9, %xmm3
1199*8ddb146aSEd Maste	jmp	L(aftertail)
1200*8ddb146aSEd Maste
1201*8ddb146aSEd Maste/*
1202*8ddb146aSEd Maste *  The following cases will be handled by ashr_10
1203*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1204*8ddb146aSEd Maste *        n(6~15)          n - 6      	        9(15 +(n - 6) - n)         ashr_10
1205*8ddb146aSEd Maste */
1206*8ddb146aSEd Maste	.p2align 4
1207*8ddb146aSEd MasteL(ashr_10):
1208*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1209*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1210*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1211*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1212*8ddb146aSEd Maste	pslldq	$6, %xmm2
1213*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1214*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1215*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1216*8ddb146aSEd Maste	shr	%cl, %edx
1217*8ddb146aSEd Maste	shr	%cl, %r9d
1218*8ddb146aSEd Maste	sub	%r9d, %edx
1219*8ddb146aSEd Maste	jnz	L(less32bytes)
1220*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1221*8ddb146aSEd Maste
1222*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1223*8ddb146aSEd Maste
1224*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1225*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1226*8ddb146aSEd Maste	mov	$10, %r9d	/* byte position left over from less32bytes case */
1227*8ddb146aSEd Maste	/*
1228*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1229*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1230*8ddb146aSEd Maste	 * need to do a nibble.
1231*8ddb146aSEd Maste	 */
1232*8ddb146aSEd Maste	lea	10(%rdi), %r10
1233*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1234*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1235*8ddb146aSEd Maste
1236*8ddb146aSEd Maste	.p2align 4
1237*8ddb146aSEd MasteL(loop_ashr_10):
1238*8ddb146aSEd Maste	add	$16, %r10
1239*8ddb146aSEd Maste	jg	L(nibble_ashr_10)
1240*8ddb146aSEd Maste
1241*8ddb146aSEd MasteL(gobble_ashr_10):
1242*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1243*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1244*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1245*8ddb146aSEd Maste
1246*8ddb146aSEd Maste	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
1247*8ddb146aSEd Maste
1248*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1249*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1250*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1251*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1252*8ddb146aSEd Maste	sub	$0xffff, %edx
1253*8ddb146aSEd Maste	jnz	L(exit)
1254*8ddb146aSEd Maste
1255*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1256*8ddb146aSEd Maste	sub	$16, %r11
1257*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1258*8ddb146aSEd Maste#endif
1259*8ddb146aSEd Maste
1260*8ddb146aSEd Maste	add	$16, %rcx
1261*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1262*8ddb146aSEd Maste
1263*8ddb146aSEd Maste	add	$16, %r10
1264*8ddb146aSEd Maste	jg	L(nibble_ashr_10)	/* cross page boundary */
1265*8ddb146aSEd Maste
1266*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1267*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1268*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1269*8ddb146aSEd Maste
1270*8ddb146aSEd Maste	palignr $10, %xmm3, %xmm2        /* merge into one 16byte value */
1271*8ddb146aSEd Maste
1272*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1273*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1274*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1275*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1276*8ddb146aSEd Maste	sub	$0xffff, %edx
1277*8ddb146aSEd Maste	jnz	L(exit)
1278*8ddb146aSEd Maste
1279*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1280*8ddb146aSEd Maste	sub	$16, %r11
1281*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1282*8ddb146aSEd Maste#endif
1283*8ddb146aSEd Maste
1284*8ddb146aSEd Maste	add	$16, %rcx
1285*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1286*8ddb146aSEd Maste	jmp	L(loop_ashr_10)
1287*8ddb146aSEd Maste
1288*8ddb146aSEd Maste	.p2align 4
1289*8ddb146aSEd MasteL(nibble_ashr_10):
1290*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1291*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1292*8ddb146aSEd Maste	test	$0xfc00, %edx
1293*8ddb146aSEd Maste	jnz	L(ashr_10_exittail)
1294*8ddb146aSEd Maste
1295*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1296*8ddb146aSEd Maste	cmp	$5, %r11
1297*8ddb146aSEd Maste	jbe	L(ashr_10_exittail)
1298*8ddb146aSEd Maste#endif
1299*8ddb146aSEd Maste
1300*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1301*8ddb146aSEd Maste	sub	$0x1000, %r10
1302*8ddb146aSEd Maste	jmp	L(gobble_ashr_10)
1303*8ddb146aSEd Maste
1304*8ddb146aSEd Maste	.p2align 4
1305*8ddb146aSEd MasteL(ashr_10_exittail):
1306*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1307*8ddb146aSEd Maste	psrldq	$10, %xmm0
1308*8ddb146aSEd Maste	psrldq	$10, %xmm3
1309*8ddb146aSEd Maste	jmp	L(aftertail)
1310*8ddb146aSEd Maste
1311*8ddb146aSEd Maste/*
1312*8ddb146aSEd Maste *  The following cases will be handled by ashr_11
1313*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1314*8ddb146aSEd Maste *        n(5~15)          n - 5      	        10(15 +(n - 5) - n)         ashr_11
1315*8ddb146aSEd Maste */
1316*8ddb146aSEd Maste	.p2align 4
1317*8ddb146aSEd MasteL(ashr_11):
1318*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1319*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1320*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1321*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1322*8ddb146aSEd Maste	pslldq	$5, %xmm2
1323*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1324*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1325*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1326*8ddb146aSEd Maste	shr	%cl, %edx
1327*8ddb146aSEd Maste	shr	%cl, %r9d
1328*8ddb146aSEd Maste	sub	%r9d, %edx
1329*8ddb146aSEd Maste	jnz	L(less32bytes)
1330*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1331*8ddb146aSEd Maste
1332*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1333*8ddb146aSEd Maste
1334*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1335*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1336*8ddb146aSEd Maste	mov	$11, %r9d	/* byte position left over from less32bytes case */
1337*8ddb146aSEd Maste	/*
1338*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1339*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1340*8ddb146aSEd Maste	 * need to do a nibble.
1341*8ddb146aSEd Maste	 */
1342*8ddb146aSEd Maste	lea	11(%rdi), %r10
1343*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1344*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1345*8ddb146aSEd Maste
1346*8ddb146aSEd Maste	.p2align 4
1347*8ddb146aSEd MasteL(loop_ashr_11):
1348*8ddb146aSEd Maste	add	$16, %r10
1349*8ddb146aSEd Maste	jg	L(nibble_ashr_11)
1350*8ddb146aSEd Maste
1351*8ddb146aSEd MasteL(gobble_ashr_11):
1352*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1353*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1354*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1355*8ddb146aSEd Maste
1356*8ddb146aSEd Maste	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
1357*8ddb146aSEd Maste
1358*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1359*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1360*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1361*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1362*8ddb146aSEd Maste	sub	$0xffff, %edx
1363*8ddb146aSEd Maste	jnz	L(exit)
1364*8ddb146aSEd Maste
1365*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1366*8ddb146aSEd Maste	sub	$16, %r11
1367*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1368*8ddb146aSEd Maste#endif
1369*8ddb146aSEd Maste
1370*8ddb146aSEd Maste	add	$16, %rcx
1371*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1372*8ddb146aSEd Maste
1373*8ddb146aSEd Maste	add	$16, %r10
1374*8ddb146aSEd Maste	jg	L(nibble_ashr_11)	/* cross page boundary */
1375*8ddb146aSEd Maste
1376*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1377*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1378*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1379*8ddb146aSEd Maste
1380*8ddb146aSEd Maste	palignr $11, %xmm3, %xmm2        /* merge into one 16byte value */
1381*8ddb146aSEd Maste
1382*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1383*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1384*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1385*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1386*8ddb146aSEd Maste	sub	$0xffff, %edx
1387*8ddb146aSEd Maste	jnz	L(exit)
1388*8ddb146aSEd Maste
1389*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1390*8ddb146aSEd Maste	sub	$16, %r11
1391*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1392*8ddb146aSEd Maste#endif
1393*8ddb146aSEd Maste
1394*8ddb146aSEd Maste	add	$16, %rcx
1395*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1396*8ddb146aSEd Maste	jmp	L(loop_ashr_11)
1397*8ddb146aSEd Maste
1398*8ddb146aSEd Maste	.p2align 4
1399*8ddb146aSEd MasteL(nibble_ashr_11):
1400*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1401*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1402*8ddb146aSEd Maste	test	$0xf800, %edx
1403*8ddb146aSEd Maste	jnz	L(ashr_11_exittail)
1404*8ddb146aSEd Maste
1405*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1406*8ddb146aSEd Maste	cmp	$4, %r11
1407*8ddb146aSEd Maste	jbe	L(ashr_11_exittail)
1408*8ddb146aSEd Maste#endif
1409*8ddb146aSEd Maste
1410*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1411*8ddb146aSEd Maste	sub	$0x1000, %r10
1412*8ddb146aSEd Maste	jmp	L(gobble_ashr_11)
1413*8ddb146aSEd Maste
1414*8ddb146aSEd Maste	.p2align 4
1415*8ddb146aSEd MasteL(ashr_11_exittail):
1416*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1417*8ddb146aSEd Maste	psrldq	$11, %xmm0
1418*8ddb146aSEd Maste	psrldq	$11, %xmm3
1419*8ddb146aSEd Maste	jmp	L(aftertail)
1420*8ddb146aSEd Maste
1421*8ddb146aSEd Maste/*
1422*8ddb146aSEd Maste *  The following cases will be handled by ashr_12
1423*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1424*8ddb146aSEd Maste *        n(4~15)          n - 4      	        11(15 +(n - 4) - n)         ashr_12
1425*8ddb146aSEd Maste */
1426*8ddb146aSEd Maste	.p2align 4
1427*8ddb146aSEd MasteL(ashr_12):
1428*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1429*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1430*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1431*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1432*8ddb146aSEd Maste	pslldq	$4, %xmm2
1433*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1434*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1435*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1436*8ddb146aSEd Maste	shr	%cl, %edx
1437*8ddb146aSEd Maste	shr	%cl, %r9d
1438*8ddb146aSEd Maste	sub	%r9d, %edx
1439*8ddb146aSEd Maste	jnz	L(less32bytes)
1440*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1441*8ddb146aSEd Maste
1442*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1443*8ddb146aSEd Maste
1444*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1445*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1446*8ddb146aSEd Maste	mov	$12, %r9d	/* byte position left over from less32bytes case */
1447*8ddb146aSEd Maste	/*
1448*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1449*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1450*8ddb146aSEd Maste	 * need to do a nibble.
1451*8ddb146aSEd Maste	 */
1452*8ddb146aSEd Maste	lea	12(%rdi), %r10
1453*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1454*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1455*8ddb146aSEd Maste
1456*8ddb146aSEd Maste	.p2align 4
1457*8ddb146aSEd MasteL(loop_ashr_12):
1458*8ddb146aSEd Maste	add	$16, %r10
1459*8ddb146aSEd Maste	jg	L(nibble_ashr_12)
1460*8ddb146aSEd Maste
1461*8ddb146aSEd MasteL(gobble_ashr_12):
1462*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1463*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1464*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1465*8ddb146aSEd Maste
1466*8ddb146aSEd Maste	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
1467*8ddb146aSEd Maste
1468*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1469*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1470*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1471*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1472*8ddb146aSEd Maste	sub	$0xffff, %edx
1473*8ddb146aSEd Maste	jnz	L(exit)
1474*8ddb146aSEd Maste
1475*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1476*8ddb146aSEd Maste	sub	$16, %r11
1477*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1478*8ddb146aSEd Maste#endif
1479*8ddb146aSEd Maste
1480*8ddb146aSEd Maste	add	$16, %rcx
1481*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1482*8ddb146aSEd Maste
1483*8ddb146aSEd Maste	add	$16, %r10
1484*8ddb146aSEd Maste	jg	L(nibble_ashr_12)	/* cross page boundary */
1485*8ddb146aSEd Maste
1486*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1487*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1488*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1489*8ddb146aSEd Maste
1490*8ddb146aSEd Maste	palignr $12, %xmm3, %xmm2        /* merge into one 16byte value */
1491*8ddb146aSEd Maste
1492*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1493*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1494*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1495*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1496*8ddb146aSEd Maste	sub	$0xffff, %edx
1497*8ddb146aSEd Maste	jnz	L(exit)
1498*8ddb146aSEd Maste
1499*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1500*8ddb146aSEd Maste	sub	$16, %r11
1501*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1502*8ddb146aSEd Maste#endif
1503*8ddb146aSEd Maste
1504*8ddb146aSEd Maste	add	$16, %rcx
1505*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1506*8ddb146aSEd Maste	jmp	L(loop_ashr_12)
1507*8ddb146aSEd Maste
1508*8ddb146aSEd Maste	.p2align 4
1509*8ddb146aSEd MasteL(nibble_ashr_12):
1510*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1511*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1512*8ddb146aSEd Maste	test	$0xf000, %edx
1513*8ddb146aSEd Maste	jnz	L(ashr_12_exittail)
1514*8ddb146aSEd Maste
1515*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1516*8ddb146aSEd Maste	cmp	$3, %r11
1517*8ddb146aSEd Maste	jbe	L(ashr_12_exittail)
1518*8ddb146aSEd Maste#endif
1519*8ddb146aSEd Maste
1520*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1521*8ddb146aSEd Maste	sub	$0x1000, %r10
1522*8ddb146aSEd Maste	jmp	L(gobble_ashr_12)
1523*8ddb146aSEd Maste
1524*8ddb146aSEd Maste	.p2align 4
1525*8ddb146aSEd MasteL(ashr_12_exittail):
1526*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1527*8ddb146aSEd Maste	psrldq	$12, %xmm0
1528*8ddb146aSEd Maste	psrldq	$12, %xmm3
1529*8ddb146aSEd Maste	jmp	L(aftertail)
1530*8ddb146aSEd Maste
1531*8ddb146aSEd Maste/*
1532*8ddb146aSEd Maste *  The following cases will be handled by ashr_13
1533*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1534*8ddb146aSEd Maste *        n(3~15)          n - 3      	        12(15 +(n - 3) - n)         ashr_13
1535*8ddb146aSEd Maste */
1536*8ddb146aSEd Maste	.p2align 4
1537*8ddb146aSEd MasteL(ashr_13):
1538*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1539*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1540*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1541*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1542*8ddb146aSEd Maste	pslldq	$3, %xmm2
1543*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1544*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1545*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1546*8ddb146aSEd Maste	shr	%cl, %edx
1547*8ddb146aSEd Maste	shr	%cl, %r9d
1548*8ddb146aSEd Maste	sub	%r9d, %edx
1549*8ddb146aSEd Maste	jnz	L(less32bytes)
1550*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1551*8ddb146aSEd Maste
1552*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1553*8ddb146aSEd Maste
1554*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1555*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1556*8ddb146aSEd Maste	mov	$13, %r9d	/* byte position left over from less32bytes case */
1557*8ddb146aSEd Maste	/*
1558*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1559*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1560*8ddb146aSEd Maste	 * need to do a nibble.
1561*8ddb146aSEd Maste	 */
1562*8ddb146aSEd Maste	lea	13(%rdi), %r10
1563*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1564*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1565*8ddb146aSEd Maste
1566*8ddb146aSEd Maste	.p2align 4
1567*8ddb146aSEd MasteL(loop_ashr_13):
1568*8ddb146aSEd Maste	add	$16, %r10
1569*8ddb146aSEd Maste	jg	L(nibble_ashr_13)
1570*8ddb146aSEd Maste
1571*8ddb146aSEd MasteL(gobble_ashr_13):
1572*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1573*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1574*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1575*8ddb146aSEd Maste
1576*8ddb146aSEd Maste	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
1577*8ddb146aSEd Maste
1578*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1579*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1580*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1581*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1582*8ddb146aSEd Maste	sub	$0xffff, %edx
1583*8ddb146aSEd Maste	jnz	L(exit)
1584*8ddb146aSEd Maste
1585*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1586*8ddb146aSEd Maste	sub	$16, %r11
1587*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1588*8ddb146aSEd Maste#endif
1589*8ddb146aSEd Maste
1590*8ddb146aSEd Maste	add	$16, %rcx
1591*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1592*8ddb146aSEd Maste
1593*8ddb146aSEd Maste	add	$16, %r10
1594*8ddb146aSEd Maste	jg	L(nibble_ashr_13)	/* cross page boundary */
1595*8ddb146aSEd Maste
1596*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1597*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1598*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1599*8ddb146aSEd Maste
1600*8ddb146aSEd Maste	palignr $13, %xmm3, %xmm2        /* merge into one 16byte value */
1601*8ddb146aSEd Maste
1602*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1603*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1604*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1605*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1606*8ddb146aSEd Maste	sub	$0xffff, %edx
1607*8ddb146aSEd Maste	jnz	L(exit)
1608*8ddb146aSEd Maste
1609*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1610*8ddb146aSEd Maste	sub	$16, %r11
1611*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1612*8ddb146aSEd Maste#endif
1613*8ddb146aSEd Maste
1614*8ddb146aSEd Maste	add	$16, %rcx
1615*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1616*8ddb146aSEd Maste	jmp	L(loop_ashr_13)
1617*8ddb146aSEd Maste
1618*8ddb146aSEd Maste	.p2align 4
1619*8ddb146aSEd MasteL(nibble_ashr_13):
1620*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1621*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1622*8ddb146aSEd Maste	test	$0xe000, %edx
1623*8ddb146aSEd Maste	jnz	L(ashr_13_exittail)
1624*8ddb146aSEd Maste
1625*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1626*8ddb146aSEd Maste	cmp	$2, %r11
1627*8ddb146aSEd Maste	jbe	L(ashr_13_exittail)
1628*8ddb146aSEd Maste#endif
1629*8ddb146aSEd Maste
1630*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1631*8ddb146aSEd Maste	sub	$0x1000, %r10
1632*8ddb146aSEd Maste	jmp	L(gobble_ashr_13)
1633*8ddb146aSEd Maste
1634*8ddb146aSEd Maste	.p2align 4
1635*8ddb146aSEd MasteL(ashr_13_exittail):
1636*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1637*8ddb146aSEd Maste	psrldq  $13, %xmm0
1638*8ddb146aSEd Maste	psrldq  $13, %xmm3
1639*8ddb146aSEd Maste	jmp	L(aftertail)
1640*8ddb146aSEd Maste
1641*8ddb146aSEd Maste/*
1642*8ddb146aSEd Maste *  The following cases will be handled by ashr_14
1643*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1644*8ddb146aSEd Maste *        n(2~15)          n - 2      	        13(15 +(n - 2) - n)         ashr_14
1645*8ddb146aSEd Maste */
1646*8ddb146aSEd Maste	.p2align 4
1647*8ddb146aSEd MasteL(ashr_14):
1648*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1649*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1650*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1651*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1652*8ddb146aSEd Maste	pslldq  $2, %xmm2
1653*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1654*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1655*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1656*8ddb146aSEd Maste	shr	%cl, %edx
1657*8ddb146aSEd Maste	shr	%cl, %r9d
1658*8ddb146aSEd Maste	sub	%r9d, %edx
1659*8ddb146aSEd Maste	jnz	L(less32bytes)
1660*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1661*8ddb146aSEd Maste
1662*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1663*8ddb146aSEd Maste
1664*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1665*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1666*8ddb146aSEd Maste	mov	$14, %r9d	/* byte position left over from less32bytes case */
1667*8ddb146aSEd Maste	/*
1668*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1669*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1670*8ddb146aSEd Maste	 * need to do a nibble.
1671*8ddb146aSEd Maste	 */
1672*8ddb146aSEd Maste	lea	14(%rdi), %r10
1673*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1674*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1675*8ddb146aSEd Maste
1676*8ddb146aSEd Maste	.p2align 4
1677*8ddb146aSEd MasteL(loop_ashr_14):
1678*8ddb146aSEd Maste	add	$16, %r10
1679*8ddb146aSEd Maste	jg	L(nibble_ashr_14)
1680*8ddb146aSEd Maste
1681*8ddb146aSEd MasteL(gobble_ashr_14):
1682*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1683*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1684*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1685*8ddb146aSEd Maste
1686*8ddb146aSEd Maste	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
1687*8ddb146aSEd Maste
1688*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1689*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1690*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1691*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1692*8ddb146aSEd Maste	sub	$0xffff, %edx
1693*8ddb146aSEd Maste	jnz	L(exit)
1694*8ddb146aSEd Maste
1695*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1696*8ddb146aSEd Maste	sub	$16, %r11
1697*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1698*8ddb146aSEd Maste#endif
1699*8ddb146aSEd Maste
1700*8ddb146aSEd Maste	add	$16, %rcx
1701*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1702*8ddb146aSEd Maste
1703*8ddb146aSEd Maste	add	$16, %r10
1704*8ddb146aSEd Maste	jg	L(nibble_ashr_14)	/* cross page boundary */
1705*8ddb146aSEd Maste
1706*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1707*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1708*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1709*8ddb146aSEd Maste
1710*8ddb146aSEd Maste	palignr $14, %xmm3, %xmm2        /* merge into one 16byte value */
1711*8ddb146aSEd Maste
1712*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1713*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1714*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1715*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1716*8ddb146aSEd Maste	sub	$0xffff, %edx
1717*8ddb146aSEd Maste	jnz	L(exit)
1718*8ddb146aSEd Maste
1719*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1720*8ddb146aSEd Maste	sub	$16, %r11
1721*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1722*8ddb146aSEd Maste#endif
1723*8ddb146aSEd Maste
1724*8ddb146aSEd Maste	add	$16, %rcx
1725*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1726*8ddb146aSEd Maste	jmp	L(loop_ashr_14)
1727*8ddb146aSEd Maste
1728*8ddb146aSEd Maste	.p2align 4
1729*8ddb146aSEd MasteL(nibble_ashr_14):
1730*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1731*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1732*8ddb146aSEd Maste	test	$0xc000, %edx
1733*8ddb146aSEd Maste	jnz	L(ashr_14_exittail)
1734*8ddb146aSEd Maste
1735*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1736*8ddb146aSEd Maste	cmp	$1, %r11
1737*8ddb146aSEd Maste	jbe	L(ashr_14_exittail)
1738*8ddb146aSEd Maste#endif
1739*8ddb146aSEd Maste
1740*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1741*8ddb146aSEd Maste	sub	$0x1000, %r10
1742*8ddb146aSEd Maste	jmp	L(gobble_ashr_14)
1743*8ddb146aSEd Maste
1744*8ddb146aSEd Maste	.p2align 4
1745*8ddb146aSEd MasteL(ashr_14_exittail):
1746*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1747*8ddb146aSEd Maste	psrldq	$14, %xmm0
1748*8ddb146aSEd Maste	psrldq	$14, %xmm3
1749*8ddb146aSEd Maste	jmp	L(aftertail)
1750*8ddb146aSEd Maste
1751*8ddb146aSEd Maste/*
1752*8ddb146aSEd Maste *  The following cases will be handled by ashr_15
1753*8ddb146aSEd Maste *  rcx(offset of rsi)  rax(offset of rdi)        relative offset	 corresponding case
1754*8ddb146aSEd Maste *        n(1~15)          n - 1      	        14(15 +(n - 1) - n)         ashr_15
1755*8ddb146aSEd Maste */
1756*8ddb146aSEd Maste	.p2align 4
1757*8ddb146aSEd MasteL(ashr_15):
1758*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1759*8ddb146aSEd Maste	movdqa	(%rdi), %xmm2
1760*8ddb146aSEd Maste	movdqa	(%rsi), %xmm1
1761*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1762*8ddb146aSEd Maste	pslldq	$1, %xmm2
1763*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm2
1764*8ddb146aSEd Maste	psubb	%xmm0, %xmm2
1765*8ddb146aSEd Maste	pmovmskb %xmm2, %r9d
1766*8ddb146aSEd Maste	shr	%cl, %edx
1767*8ddb146aSEd Maste	shr	%cl, %r9d
1768*8ddb146aSEd Maste	sub	%r9d, %edx
1769*8ddb146aSEd Maste	jnz	L(less32bytes)
1770*8ddb146aSEd Maste
1771*8ddb146aSEd Maste	movdqa	(%rdi), %xmm3
1772*8ddb146aSEd Maste
1773*8ddb146aSEd Maste	UPDATE_STRNCMP_COUNTER
1774*8ddb146aSEd Maste
1775*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1776*8ddb146aSEd Maste	mov	$16, %rcx	/* index for loads */
1777*8ddb146aSEd Maste	mov	$15, %r9d	/* byte position left over from less32bytes case */
1778*8ddb146aSEd Maste	/*
1779*8ddb146aSEd Maste	 * Setup %r10 value allows us to detect crossing a page boundary.
1780*8ddb146aSEd Maste	 * When %r10 goes positive we have crossed a page boundary and
1781*8ddb146aSEd Maste	 * need to do a nibble.
1782*8ddb146aSEd Maste	 */
1783*8ddb146aSEd Maste	lea	15(%rdi), %r10
1784*8ddb146aSEd Maste	and	$0xfff, %r10	/* offset into 4K page */
1785*8ddb146aSEd Maste
1786*8ddb146aSEd Maste	sub	$0x1000, %r10	/* subtract 4K pagesize */
1787*8ddb146aSEd Maste
1788*8ddb146aSEd Maste	.p2align 4
1789*8ddb146aSEd MasteL(loop_ashr_15):
1790*8ddb146aSEd Maste	add	$16, %r10
1791*8ddb146aSEd Maste	jg	L(nibble_ashr_15)
1792*8ddb146aSEd Maste
1793*8ddb146aSEd MasteL(gobble_ashr_15):
1794*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1795*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1796*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1797*8ddb146aSEd Maste
1798*8ddb146aSEd Maste	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
1799*8ddb146aSEd Maste
1800*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1801*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1802*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1803*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1804*8ddb146aSEd Maste	sub	$0xffff, %edx
1805*8ddb146aSEd Maste	jnz	L(exit)
1806*8ddb146aSEd Maste
1807*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1808*8ddb146aSEd Maste	sub	$16, %r11
1809*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1810*8ddb146aSEd Maste#endif
1811*8ddb146aSEd Maste
1812*8ddb146aSEd Maste	add	$16, %rcx
1813*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1814*8ddb146aSEd Maste
1815*8ddb146aSEd Maste	add	$16, %r10
1816*8ddb146aSEd Maste	jg	L(nibble_ashr_15)	/* cross page boundary */
1817*8ddb146aSEd Maste
1818*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1819*8ddb146aSEd Maste	movdqa	(%rdi, %rcx), %xmm2
1820*8ddb146aSEd Maste	movdqa	%xmm2, %xmm4
1821*8ddb146aSEd Maste
1822*8ddb146aSEd Maste	palignr $15, %xmm3, %xmm2        /* merge into one 16byte value */
1823*8ddb146aSEd Maste
1824*8ddb146aSEd Maste	pcmpeqb	%xmm1, %xmm0
1825*8ddb146aSEd Maste	pcmpeqb	%xmm2, %xmm1
1826*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1827*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1828*8ddb146aSEd Maste	sub	$0xffff, %edx
1829*8ddb146aSEd Maste	jnz	L(exit)
1830*8ddb146aSEd Maste
1831*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1832*8ddb146aSEd Maste	sub	$16, %r11
1833*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1834*8ddb146aSEd Maste#endif
1835*8ddb146aSEd Maste
1836*8ddb146aSEd Maste	add	$16, %rcx
1837*8ddb146aSEd Maste	movdqa	%xmm4, %xmm3
1838*8ddb146aSEd Maste	jmp	L(loop_ashr_15)
1839*8ddb146aSEd Maste
1840*8ddb146aSEd Maste	.p2align 4
1841*8ddb146aSEd MasteL(nibble_ashr_15):
1842*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
1843*8ddb146aSEd Maste	pmovmskb %xmm0, %edx
1844*8ddb146aSEd Maste	test	$0x8000, %edx
1845*8ddb146aSEd Maste	jnz	L(ashr_15_exittail)
1846*8ddb146aSEd Maste
1847*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1848*8ddb146aSEd Maste	test	%r11, %r11
1849*8ddb146aSEd Maste	je	L(ashr_15_exittail)
1850*8ddb146aSEd Maste#endif
1851*8ddb146aSEd Maste
1852*8ddb146aSEd Maste	pxor	%xmm0, %xmm0
1853*8ddb146aSEd Maste	sub	$0x1000, %r10
1854*8ddb146aSEd Maste	jmp	L(gobble_ashr_15)
1855*8ddb146aSEd Maste
1856*8ddb146aSEd Maste	.p2align 4
1857*8ddb146aSEd MasteL(ashr_15_exittail):
1858*8ddb146aSEd Maste	movdqa	(%rsi, %rcx), %xmm1
1859*8ddb146aSEd Maste	psrldq	$15, %xmm3
1860*8ddb146aSEd Maste	psrldq	$15, %xmm0
1861*8ddb146aSEd Maste
1862*8ddb146aSEd Maste	.p2align 4
1863*8ddb146aSEd MasteL(aftertail):
1864*8ddb146aSEd Maste	pcmpeqb	%xmm3, %xmm1
1865*8ddb146aSEd Maste	psubb	%xmm0, %xmm1
1866*8ddb146aSEd Maste	pmovmskb %xmm1, %edx
1867*8ddb146aSEd Maste	not	%edx
1868*8ddb146aSEd Maste
1869*8ddb146aSEd Maste	.p2align 4
1870*8ddb146aSEd MasteL(exit):
1871*8ddb146aSEd Maste	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
1872*8ddb146aSEd MasteL(less32bytes):
1873*8ddb146aSEd Maste	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
1874*8ddb146aSEd Maste	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
1875*8ddb146aSEd Maste	test	%r8d, %r8d
1876*8ddb146aSEd Maste	jz	L(ret)
1877*8ddb146aSEd Maste	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
1878*8ddb146aSEd Maste
1879*8ddb146aSEd Maste	.p2align 4
1880*8ddb146aSEd MasteL(ret):
1881*8ddb146aSEd MasteL(less16bytes):
1882*8ddb146aSEd Maste	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
1883*8ddb146aSEd Maste
1884*8ddb146aSEd Maste#ifdef USE_AS_STRNCMP
1885*8ddb146aSEd Maste	sub	%rdx, %r11
1886*8ddb146aSEd Maste	jbe	L(strcmp_exitz)
1887*8ddb146aSEd Maste#endif
1888*8ddb146aSEd Maste	movzbl	(%rsi, %rdx), %ecx
1889*8ddb146aSEd Maste	movzbl	(%rdi, %rdx), %eax
1890*8ddb146aSEd Maste
1891*8ddb146aSEd Maste	sub	%ecx, %eax
1892*8ddb146aSEd Maste	ret
1893*8ddb146aSEd Maste
1894*8ddb146aSEd MasteL(strcmp_exitz):
1895*8ddb146aSEd Maste	xor	%eax, %eax
1896*8ddb146aSEd Maste	ret
1897*8ddb146aSEd Maste
1898*8ddb146aSEd Maste	.p2align 4
1899*8ddb146aSEd MasteL(Byte0):
1900*8ddb146aSEd Maste	movzbl	(%rsi), %ecx
1901*8ddb146aSEd Maste	movzbl	(%rdi), %eax
1902*8ddb146aSEd Maste
1903*8ddb146aSEd Maste	sub	%ecx, %eax
1904*8ddb146aSEd Maste	ret
1905*8ddb146aSEd MasteEND (STRCMP)
1906*8ddb146aSEd Maste
1907*8ddb146aSEd Maste	.section .rodata,"a",@progbits
1908*8ddb146aSEd Maste	.p2align 3
1909*8ddb146aSEd MasteL(unaligned_table):
1910*8ddb146aSEd Maste	.int	L(ashr_1) - L(unaligned_table)
1911*8ddb146aSEd Maste	.int	L(ashr_2) - L(unaligned_table)
1912*8ddb146aSEd Maste	.int	L(ashr_3) - L(unaligned_table)
1913*8ddb146aSEd Maste	.int	L(ashr_4) - L(unaligned_table)
1914*8ddb146aSEd Maste	.int	L(ashr_5) - L(unaligned_table)
1915*8ddb146aSEd Maste	.int	L(ashr_6) - L(unaligned_table)
1916*8ddb146aSEd Maste	.int	L(ashr_7) - L(unaligned_table)
1917*8ddb146aSEd Maste	.int	L(ashr_8) - L(unaligned_table)
1918*8ddb146aSEd Maste	.int	L(ashr_9) - L(unaligned_table)
1919*8ddb146aSEd Maste	.int	L(ashr_10) - L(unaligned_table)
1920*8ddb146aSEd Maste	.int	L(ashr_11) - L(unaligned_table)
1921*8ddb146aSEd Maste	.int	L(ashr_12) - L(unaligned_table)
1922*8ddb146aSEd Maste	.int	L(ashr_13) - L(unaligned_table)
1923*8ddb146aSEd Maste	.int	L(ashr_14) - L(unaligned_table)
1924*8ddb146aSEd Maste	.int	L(ashr_15) - L(unaligned_table)
1925*8ddb146aSEd Maste	.int	L(ashr_0) - L(unaligned_table)
1926