xref: /onnv-gate/usr/src/lib/libc/amd64/gen/strcpy.s (revision 10583:1058268e7f53)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
210Sstevel@tonic-gate
/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */
260Sstevel@tonic-gate
/*
 *	str[n]cpy - copy [n] chars from second operand into first operand
 */
300Sstevel@tonic-gate#include "SYS.h"
31*10583SEdward.Gillett@Sun.COM#include "proc64_id.h"
320Sstevel@tonic-gate
/*
 * LABEL(s) builds a file-local label of the form .strcpy<s>. The empty
 * comment between the two parts is the traditional-cpp idiom for pasting
 * tokens together, so keep this definition free of trailing text.
 */
#define LABEL(s) .strcpy/**/s

/*
 * char *strcpy(char *dst, const char *src)
 * char *strncpy(char *dst, const char *src, size_t n)	[USE_AS_STRNCPY]
 *
 * In:	%rdi = dst, %rsi = src, %rdx = n (strncpy only)
 *
 * Register roles set up by this entry code for the copy loops it
 * dispatches to:
 *	%rax	original dst, saved for the return value
 *	%rsi	src rounded down to a 16-byte boundary
 *	%rdi	dst rounded to a 16-byte boundary (after the first 16 bytes
 *		have been copied with unaligned 8-byte moves)
 *	%r8	remaining byte count (strncpy only)
 *	%r9	offset of the current src position within (%rsi)
 *	%r10	16 - %r9, i.e. max src bytes remaining in the current dqword
 *	%xmm0	all zeroes, used as the comparand for null-byte checks
 *
 * The final dispatch indexes LABEL(unaligned_table), a table of 32-bit
 * offsets relative to the table itself, by the src offset (1..15).
 * Equal src/dest offsets go straight to LABEL(ashr_0).
 */
#ifdef USE_AS_STRNCPY
	ENTRY(strncpy)
	/*
	 * n is a full 64-bit size_t. Testing only %edx would treat every
	 * non-zero multiple of 4GB as a zero count and copy nothing.
	 */
	test	%rdx, %rdx
	jz	LABEL(strncpy_exitz)
	mov	%rdx, %r8			/* r8 = remaining count */
#else
	ENTRY(strcpy)				/* (char *, const char *) */
	xor	%edx, %edx			/* 32-bit form also clears the upper half */
#endif
	mov	%esi, %ecx
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
	and	$0xf, %rcx			/* rcx = src offset within its dqword */
	mov	%rdi, %rax			/* save destination address for return value */


	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
	pmovmskb %xmm0, %edx
	shr	%cl, %edx			/* adjust for offset from 16byte boundary */
	test	%edx, %edx			/* edx will be 0 if chars are non-null */
	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
#ifdef USE_AS_STRNCPY
	/*
	 * Check if the count is satisfied in first 16 bytes examined.
	 * r11 = count + src offset - 16. The comparison is deliberately
	 * signed: small counts make r11 negative.
	 */
	lea	-16(%r8, %rcx), %r11
	test	%r11, %r11
	jle	LABEL(less16bytes)
#endif
	mov	%rcx, %r9			/* rsi alignment offset */
	or	%edi, %ecx
	and	$0xf, %ecx			/* ZF set iff src and dest offsets are both 0 */
	lea	-16(%r9), %r10			/* lea leaves the flags from "and" intact */
	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */

	neg	%r10				/* max src bytes remaining in current dqword */

	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */

#ifdef USE_AS_STRNCPY
	/*
	 * If strncpy count <= 16 go to exit case
	 */
	sub	$16, %r8
	jbe	LABEL(less32bytes_strncpy_truncation)
#endif
	/*
	 * At least 16 bytes to copy to destination string. Move them now.
	 * Don't worry about alignment.
	 */
	mov	(%rsi, %r9), %rdx
	mov	%rdx, (%rdi)
	mov	8(%rsi, %r9), %rdx
	mov	%rdx, 8(%rdi)

	/*
	 * The destination may not be 16-byte aligned so far. Align rdi,
	 * re-calculate rsi and jump to the case matching the relative
	 * src/dest offset.
	 * 	rcx is offset of rsi
	 * 	rdx is offset of rdi
	 */
	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
	mov	%rax, %rdx			/* rax contains original rdi */
	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
#ifdef USE_AS_STRNCPY
	/*
	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
	 * (ie store twice) if destination was unaligned. Compensate here.
	 */
	add	%rdx, %r8			/* compensate for overlap */
#endif

	add	$16, %rdi			/* next 16 bytes for dest */

	/*
	 * align src to 16-byte boundary. Could be up or down depending on
	 * whether src offset - dest offset > 0 (up) or
	 *  src offset - dest offset < 0 (down).
	 */
	sub	%rdx, %r9			/* src offset - dest offset */

	lea	16(%r9, %rsi), %rsi		/* src position matching the new rdi */
	mov	%esi, %ecx			/* for new src offset */
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */

	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
	jz	LABEL(ashr_0)

#ifdef USE_AS_STRNCPY
	xor	%edx, %edx			/* In case unaligned_exit is taken */
#endif
	/*
	 * Jump to case corresponding to source/dest string relative offsets
	 * Index = (16 + (src offset - dest offset)) % 16
	 */
	lea	-16(%rcx), %r10
	mov	%rcx, %r9			/* r9 = new src offset */
	neg	%r10				/* max src bytes remaining in current dqword */
	lea	LABEL(unaligned_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx		/* signed 32-bit entry, relative to the table */
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx
141*10583SEdward.Gillett@Sun.COM
/*
 * ashr_0 handles the following cases:
 * 	src alignment offset = dest alignment offset
 *
 * Both pointers are 16-byte aligned here, so every move is an aligned
 * dqword. On entry %rcx is 0 and %xmm0 is all zeroes. %xmm0 stays zero
 * as long as no null byte is seen, so it can be reused as the comparand
 * without being cleared again. For strncpy, %r8 is the remaining count
 * and is checked before each 16-byte store.
 */
	.p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi), %xmm1		/* copy the first aligned dqword */
	movdqa	%xmm1, (%rdi)
	add	$16, %rdi
	add	$16, %rsi
	pcmpeqb	(%rsi), %xmm0		/* any null in the next src dqword? */
	pmovmskb %xmm0, %edx

	test	%edx, %edx		/* non-zero mask: a null byte was found */
	jnz	LABEL(aligned_16bytes)	/* exit tail */

	/*
	 * Main loop, unrolled four times. %rcx indexes both strings. Each
	 * step copies the dqword already known to be null-free and then
	 * checks the following one.
	 */
LABEL(ashr_0_loop):
	/* step 1 */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

	/* step 2 */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

	/* step 3 */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

	/* step 4: loop back while no null byte has been seen */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jz	LABEL(ashr_0_loop)
	jmp	LABEL(aligned_exit)
212*10583SEdward.Gillett@Sun.COM
213*10583SEdward.Gillett@Sun.COM
/*
 * ashr_15 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 15
 *
 * The entry code has already established that there is no null byte
 * from (%rsi + %r9) up to the end of that 16-byte src block.
 *
 * Each step builds one aligned dest dqword from the last byte of the
 * src dqword at (%rsi, %rcx) and the first 15 bytes of the one after it.
 * Two equivalent loops follow: one uses SSSE3 palignr, the other SSE2
 * shifts. Both are unrolled twice. %xmm0 stays all zeroes until a null
 * byte is found. For strncpy, %r8 is the remaining count and %r10 the
 * number of src bytes left in the current dqword.
 */
	.p2align 4
LABEL(ashr_15):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count satisfied within current src dqword? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_15_use_sse2)

	.p2align 4
LABEL(ashr_15_use_ssse3):
	/* step 1: check the next src dqword, then merge and store */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $15, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0f, 0x3a, 0x0f, 0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* step 2: same again for the following dqword */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $15, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0f, 0x3a, 0x0f, 0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_ssse3)

	.p2align 4
LABEL(ashr_15_use_sse2):
	/* step 1: check the next src dqword, then merge and store */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	(%rsi, %rcx), %xmm2		/* current src dqword */
	movdqa	16(%rsi, %rcx), %xmm3		/* next src dqword */

	pslldq	$1, %xmm3			/* first 15 bytes of next -> bytes 1..15 */
	psrldq	$15, %xmm2			/* last byte of current -> byte 0 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* step 2: same again for the following dqword */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	(%rsi, %rcx), %xmm2
	movdqa	16(%rsi, %rcx), %xmm3

	pslldq	$1, %xmm3
	psrldq	$15, %xmm2
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_sse2)
324*10583SEdward.Gillett@Sun.COM
325*10583SEdward.Gillett@Sun.COM
/*
 * ashr_14 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 14
 *
 * The entry code has already established that there is no null byte
 * from (%rsi + %r9) up to the end of that 16-byte src block.
 *
 * Each step builds one aligned dest dqword from the last 2 bytes of the
 * src dqword at (%rsi, %rcx) and the first 14 bytes of the one after it.
 * Two equivalent loops follow: one uses SSSE3 palignr, the other SSE2
 * shifts. Both are unrolled twice. %xmm0 stays all zeroes until a null
 * byte is found. For strncpy, %r8 is the remaining count and %r10 the
 * number of src bytes left in the current dqword.
 */
	.p2align 4
LABEL(ashr_14):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count satisfied within current src dqword? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_14_use_sse2)

	.p2align 4
LABEL(ashr_14_use_ssse3):
	/* step 1: check the next src dqword, then merge and store */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $14, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0f, 0x3a, 0x0f, 0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* step 2: same again for the following dqword */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $14, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0f, 0x3a, 0x0f, 0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_ssse3)

	.p2align 4
LABEL(ashr_14_use_sse2):
	/* step 1: check the next src dqword, then merge and store */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	(%rsi, %rcx), %xmm2		/* current src dqword */
	movdqa	16(%rsi, %rcx), %xmm3		/* next src dqword */

	pslldq	$2, %xmm3			/* first 14 bytes of next -> bytes 2..15 */
	psrldq	$14, %xmm2			/* last 2 bytes of current -> bytes 0..1 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* step 2: same again for the following dqword */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	(%rsi, %rcx), %xmm2
	movdqa	16(%rsi, %rcx), %xmm3

	pslldq	$2, %xmm3
	psrldq	$14, %xmm2
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_sse2)
436*10583SEdward.Gillett@Sun.COM
437*10583SEdward.Gillett@Sun.COM
/*
 * ashr_13 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 13
 *
 * The entry code has already established that there is no null byte
 * from (%rsi + %r9) up to the end of that 16-byte src block.
 *
 * Each step builds one aligned dest dqword from the last 3 bytes of the
 * src dqword at (%rsi, %rcx) and the first 13 bytes of the one after it.
 * Two equivalent loops follow: one uses SSSE3 palignr, the other SSE2
 * shifts. Both are unrolled twice. %xmm0 stays all zeroes until a null
 * byte is found. For strncpy, %r8 is the remaining count and %r10 the
 * number of src bytes left in the current dqword.
 */
	.p2align 4
LABEL(ashr_13):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* count satisfied within current src dqword? */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_13_use_sse2)

	.p2align 4
LABEL(ashr_13_use_ssse3):
	/* step 1: check the next src dqword, then merge and store */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $13, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0f, 0x3a, 0x0f, 0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* step 2: same again for the following dqword */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $13, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0f, 0x3a, 0x0f, 0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_ssse3)

	.p2align 4
LABEL(ashr_13_use_sse2):
	/* step 1: check the next src dqword, then merge and store */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	(%rsi, %rcx), %xmm2		/* current src dqword */
	movdqa	16(%rsi, %rcx), %xmm3		/* next src dqword */

	pslldq	$3, %xmm3			/* first 13 bytes of next -> bytes 3..15 */
	psrldq	$13, %xmm2			/* last 3 bytes of current -> bytes 0..2 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* step 2: same again for the following dqword */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	(%rsi, %rcx), %xmm2
	movdqa	16(%rsi, %rcx), %xmm3

	pslldq	$3, %xmm3
	psrldq	$13, %xmm2
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_sse2)
548*10583SEdward.Gillett@Sun.COM
549*10583SEdward.Gillett@Sun.COM
550*10583SEdward.Gillett@Sun.COM/*
551*10583SEdward.Gillett@Sun.COM * ashr_12 handles the following cases:
552*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 12
553*10583SEdward.Gillett@Sun.COM *
554*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
555*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
556*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_12 copy loops.
 *
 * Each round produces one aligned 16-byte destination chunk made of the top
 * 4 bytes of the source chunk at (%rsi, %rcx) followed by the low 12 bytes of
 * the chunk at 16(%rsi, %rcx).  The upper chunk is scanned for a null byte
 * before anything is stored; both loop bodies are unrolled twice.
 *
 * Registers as used in this block:
 *	%rsi, %rdi	source and destination bases.  All accesses are
 *			alignment-checking SSE forms, so (%rsi, %rcx) and
 *			(%rdi, %rcx) have to be 16-byte aligned.
 *	%rcx		byte offset of the current chunk, stepped by 16.
 *	%xmm0		null-byte detector.  NOTE(review): assumed to be all
 *			zero on entry (set up outside this block); a compare
 *			that finds no null byte leaves it zero again.
 *	%xmm2, %xmm3	lower and upper source chunks being merged.
 *	%edx		pmovmskb mask; non-zero means a null byte was seen.
 *	%r8, %r10	strncpy only.  NOTE(review): %r8 looks like the
 *			remaining byte budget and %r10 a bias prepared by the
 *			setup code - confirm there.
 */
	.p2align 4
LABEL(ashr_12):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_12_use_sse2)			/* USE_SSSE3 bit clear: SSE2 loop */

	/* SSSE3 loop: one palignr merges the two source chunks. */
	.p2align 4
LABEL(ashr_12_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	pcmpeqb	%xmm3, %xmm0				/* compare with %xmm0 (expected zero) */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * Hand-encoded SSSE3 instruction: 66 0F 3A 0F is palignr, ModRM 0x1c
	 * plus SIB 0x0e select (%rsi, %rcx) -> %xmm3, the last byte is the
	 * shift count (12).
	 */
	#palignr $12, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $12, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_ssse3)

	/* SSE2 loop: same merge built from two byte shifts and an OR. */
	.p2align 4
LABEL(ashr_12_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* scan upper source chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source chunk */

	psrldq	$12, %xmm2				/* top 4 bytes -> bottom */
	pslldq	$4, %xmm3				/* low 12 bytes -> top */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2
	pslldq	$4, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_sse2)
660*10583SEdward.Gillett@Sun.COM
661*10583SEdward.Gillett@Sun.COM
/*
 * ashr_11 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 11
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each round produces one aligned 16-byte destination chunk made of the top
 * 5 bytes of the source chunk at (%rsi, %rcx) followed by the low 11 bytes of
 * the chunk at 16(%rsi, %rcx).  The upper chunk is scanned for a null byte
 * before anything is stored; both loop bodies are unrolled twice.
 *
 * Registers as used in this block:
 *	%rsi, %rdi	source and destination bases.  All accesses are
 *			alignment-checking SSE forms, so (%rsi, %rcx) and
 *			(%rdi, %rcx) have to be 16-byte aligned.
 *	%rcx		byte offset of the current chunk, stepped by 16.
 *	%xmm0		null-byte detector.  NOTE(review): assumed to be all
 *			zero on entry (set up outside this block); a compare
 *			that finds no null byte leaves it zero again.
 *	%xmm2, %xmm3	lower and upper source chunks being merged.
 *	%edx		pmovmskb mask; non-zero means a null byte was seen.
 *	%r8, %r10	strncpy only.  NOTE(review): %r8 looks like the
 *			remaining byte budget and %r10 a bias prepared by the
 *			setup code - confirm there.
 */
	.p2align 4
LABEL(ashr_11):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_11_use_sse2)			/* USE_SSSE3 bit clear: SSE2 loop */

	/* SSSE3 loop: one palignr merges the two source chunks. */
	.p2align 4
LABEL(ashr_11_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	pcmpeqb	%xmm3, %xmm0				/* compare with %xmm0 (expected zero) */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * Hand-encoded SSSE3 instruction: 66 0F 3A 0F is palignr, ModRM 0x1c
	 * plus SIB 0x0e select (%rsi, %rcx) -> %xmm3, the last byte is the
	 * shift count (11).
	 */
	#palignr $11, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $11, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_ssse3)

	/* SSE2 loop: same merge built from two byte shifts and an OR. */
	.p2align 4
LABEL(ashr_11_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* scan upper source chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source chunk */

	psrldq	$11, %xmm2				/* top 5 bytes -> bottom */
	pslldq	$5, %xmm3				/* low 11 bytes -> top */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2
	pslldq	$5, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_sse2)
772*10583SEdward.Gillett@Sun.COM
773*10583SEdward.Gillett@Sun.COM
/*
 * ashr_10 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 10
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each round produces one aligned 16-byte destination chunk made of the top
 * 6 bytes of the source chunk at (%rsi, %rcx) followed by the low 10 bytes of
 * the chunk at 16(%rsi, %rcx).  The upper chunk is scanned for a null byte
 * before anything is stored; both loop bodies are unrolled twice.
 *
 * Registers as used in this block:
 *	%rsi, %rdi	source and destination bases.  All accesses are
 *			alignment-checking SSE forms, so (%rsi, %rcx) and
 *			(%rdi, %rcx) have to be 16-byte aligned.
 *	%rcx		byte offset of the current chunk, stepped by 16.
 *	%xmm0		null-byte detector.  NOTE(review): assumed to be all
 *			zero on entry (set up outside this block); a compare
 *			that finds no null byte leaves it zero again.
 *	%xmm2, %xmm3	lower and upper source chunks being merged.
 *	%edx		pmovmskb mask; non-zero means a null byte was seen.
 *	%r8, %r10	strncpy only.  NOTE(review): %r8 looks like the
 *			remaining byte budget and %r10 a bias prepared by the
 *			setup code - confirm there.
 */
	.p2align 4
LABEL(ashr_10):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_10_use_sse2)			/* USE_SSSE3 bit clear: SSE2 loop */

	/* SSSE3 loop: one palignr merges the two source chunks. */
	.p2align 4
LABEL(ashr_10_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	pcmpeqb	%xmm3, %xmm0				/* compare with %xmm0 (expected zero) */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * Hand-encoded SSSE3 instruction: 66 0F 3A 0F is palignr, ModRM 0x1c
	 * plus SIB 0x0e select (%rsi, %rcx) -> %xmm3, the last byte is the
	 * shift count (10).
	 */
	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_ssse3)

	/* SSE2 loop: same merge built from two byte shifts and an OR. */
	.p2align 4
LABEL(ashr_10_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* scan upper source chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source chunk */

	psrldq	$10, %xmm2				/* top 6 bytes -> bottom */
	pslldq	$6, %xmm3				/* low 10 bytes -> top */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$10, %xmm2
	pslldq	$6, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_sse2)
884*10583SEdward.Gillett@Sun.COM
885*10583SEdward.Gillett@Sun.COM
/*
 * ashr_9 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 9
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each round produces one aligned 16-byte destination chunk made of the top
 * 7 bytes of the source chunk at (%rsi, %rcx) followed by the low 9 bytes of
 * the chunk at 16(%rsi, %rcx).  The upper chunk is scanned for a null byte
 * before anything is stored; both loop bodies are unrolled twice.
 *
 * Registers as used in this block:
 *	%rsi, %rdi	source and destination bases.  All accesses are
 *			alignment-checking SSE forms, so (%rsi, %rcx) and
 *			(%rdi, %rcx) have to be 16-byte aligned.
 *	%rcx		byte offset of the current chunk, stepped by 16.
 *	%xmm0		null-byte detector.  NOTE(review): assumed to be all
 *			zero on entry (set up outside this block); a compare
 *			that finds no null byte leaves it zero again.
 *	%xmm2, %xmm3	lower and upper source chunks being merged.
 *	%edx		pmovmskb mask; non-zero means a null byte was seen.
 *	%r8, %r10	strncpy only.  NOTE(review): %r8 looks like the
 *			remaining byte budget and %r10 a bias prepared by the
 *			setup code - confirm there.
 */
	.p2align 4
LABEL(ashr_9):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_9_use_sse2)			/* USE_SSSE3 bit clear: SSE2 loop */

	/* SSSE3 loop: one palignr merges the two source chunks. */
	.p2align 4
LABEL(ashr_9_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	pcmpeqb	%xmm3, %xmm0				/* compare with %xmm0 (expected zero) */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * Hand-encoded SSSE3 instruction: 66 0F 3A 0F is palignr, ModRM 0x1c
	 * plus SIB 0x0e select (%rsi, %rcx) -> %xmm3, the last byte is the
	 * shift count (9).
	 */
	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_ssse3)

	/* SSE2 loop: same merge built from two byte shifts and an OR. */
	.p2align 4
LABEL(ashr_9_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* scan upper source chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source chunk */

	psrldq	$9, %xmm2				/* top 7 bytes -> bottom */
	pslldq	$7, %xmm3				/* low 9 bytes -> top */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$9, %xmm2
	pslldq	$7, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_sse2)
996*10583SEdward.Gillett@Sun.COM
997*10583SEdward.Gillett@Sun.COM
/*
 * ashr_8 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 8
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each round produces one aligned 16-byte destination chunk made of the top
 * 8 bytes of the source chunk at (%rsi, %rcx) followed by the low 8 bytes of
 * the chunk at 16(%rsi, %rcx).  The upper chunk is scanned for a null byte
 * before anything is stored; both loop bodies are unrolled twice.
 *
 * Registers as used in this block:
 *	%rsi, %rdi	source and destination bases.  All accesses are
 *			alignment-checking SSE forms, so (%rsi, %rcx) and
 *			(%rdi, %rcx) have to be 16-byte aligned.
 *	%rcx		byte offset of the current chunk, stepped by 16.
 *	%xmm0		null-byte detector.  NOTE(review): assumed to be all
 *			zero on entry (set up outside this block); a compare
 *			that finds no null byte leaves it zero again.
 *	%xmm2, %xmm3	lower and upper source chunks being merged.
 *	%edx		pmovmskb mask; non-zero means a null byte was seen.
 *	%r8, %r10	strncpy only.  NOTE(review): %r8 looks like the
 *			remaining byte budget and %r10 a bias prepared by the
 *			setup code - confirm there.
 */
	.p2align 4
LABEL(ashr_8):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_8_use_sse2)			/* USE_SSSE3 bit clear: SSE2 loop */

	/* SSSE3 loop: one palignr merges the two source chunks. */
	.p2align 4
LABEL(ashr_8_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	pcmpeqb	%xmm3, %xmm0				/* compare with %xmm0 (expected zero) */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * Hand-encoded SSSE3 instruction: 66 0F 3A 0F is palignr, ModRM 0x1c
	 * plus SIB 0x0e select (%rsi, %rcx) -> %xmm3, the last byte is the
	 * shift count (8).
	 */
	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_ssse3)

	/* SSE2 loop: same merge built from two byte shifts and an OR. */
	.p2align 4
LABEL(ashr_8_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* scan upper source chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source chunk */

	psrldq	$8, %xmm2				/* top 8 bytes -> bottom */
	pslldq	$8, %xmm3				/* low 8 bytes -> top */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$8, %xmm2
	pslldq	$8, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_sse2)
1108*10583SEdward.Gillett@Sun.COM
1109*10583SEdward.Gillett@Sun.COM
/*
 * ashr_7 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 7
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each round produces one aligned 16-byte destination chunk made of the top
 * 9 bytes of the source chunk at (%rsi, %rcx) followed by the low 7 bytes of
 * the chunk at 16(%rsi, %rcx).  The upper chunk is scanned for a null byte
 * before anything is stored; both loop bodies are unrolled twice.
 *
 * Registers as used in this block:
 *	%rsi, %rdi	source and destination bases.  All accesses are
 *			alignment-checking SSE forms, so (%rsi, %rcx) and
 *			(%rdi, %rcx) have to be 16-byte aligned.
 *	%rcx		byte offset of the current chunk, stepped by 16.
 *	%xmm0		null-byte detector.  NOTE(review): assumed to be all
 *			zero on entry (set up outside this block); a compare
 *			that finds no null byte leaves it zero again.
 *	%xmm2, %xmm3	lower and upper source chunks being merged.
 *	%edx		pmovmskb mask; non-zero means a null byte was seen.
 *	%r8, %r10	strncpy only.  NOTE(review): %r8 looks like the
 *			remaining byte budget and %r10 a bias prepared by the
 *			setup code - confirm there.
 */
	.p2align 4
LABEL(ashr_7):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_7_use_sse2)			/* USE_SSSE3 bit clear: SSE2 loop */

	/* SSSE3 loop: one palignr merges the two source chunks. */
	.p2align 4
LABEL(ashr_7_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	pcmpeqb	%xmm3, %xmm0				/* compare with %xmm0 (expected zero) */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * Hand-encoded SSSE3 instruction: 66 0F 3A 0F is palignr, ModRM 0x1c
	 * plus SIB 0x0e select (%rsi, %rcx) -> %xmm3, the last byte is the
	 * shift count (7).
	 */
	#palignr $7, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x07

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $7, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x07

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_ssse3)

	/* SSE2 loop: same merge built from two byte shifts and an OR. */
	.p2align 4
LABEL(ashr_7_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* scan upper source chunk */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* null byte in upper chunk */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8				/* account for this chunk */
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source chunk */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source chunk */

	psrldq	$7, %xmm2				/* top 9 bytes -> bottom */
	pslldq	$9, %xmm3				/* low 7 bytes -> top */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of merged chunk */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave when %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled copy of the round above. */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$7, %xmm2
	pslldq	$9, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_sse2)
1220*10583SEdward.Gillett@Sun.COM
12210Sstevel@tonic-gate
1222*10583SEdward.Gillett@Sun.COM/*
1223*10583SEdward.Gillett@Sun.COM * ashr_6 handles the following cases:
1224*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 6
1225*10583SEdward.Gillett@Sun.COM *
1226*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
1227*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
1228*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_6 entry point: clear the chunk index in %rcx and pick the loop.
 * In the strncpy build, jump to unaligned_exit first if %r8 <= %r10
 * (unsigned).  Then test the USE_SSSE3 bit of .memops_method: if it is
 * clear, jump to the SSE2 loop; if it is set, fall through into the SSSE3
 * loop that follows.
 */
1229*10583SEdward.Gillett@Sun.COM	.p2align 4
1230*10583SEdward.Gillett@Sun.COMLABEL(ashr_6):
1231*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx				/* clear index */
1232*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1233*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1234*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1235*10583SEdward.Gillett@Sun.COM#endif
1236*10583SEdward.Gillett@Sun.COM	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1237*10583SEdward.Gillett@Sun.COM	jz	LABEL(ashr_6_use_sse2)
1238*10583SEdward.Gillett@Sun.COM
/*
 * ashr_6, SSSE3 variant: copy loop unrolled two steps per pass.
 *
 * One step:
 *   - load the aligned source chunk at 16(%rsi,%rcx) into %xmm3;
 *   - pcmpeqb it into %xmm0 and jump to unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - palignr $6 against the chunk at (%rsi,%rcx): the store value is bytes
 *     6..15 of that chunk followed by bytes 0..5 of %xmm3;
 *   - store it to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1239*10583SEdward.Gillett@Sun.COM	.p2align 4
1240*10583SEdward.Gillett@Sun.COMLABEL(ashr_6_use_ssse3):
1241*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1242*10583SEdward.Gillett@Sun.COM	pcmpeqb	%xmm3, %xmm0
1243*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1244*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1245*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1246*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1247*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1248*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1249*10583SEdward.Gillett@Sun.COM#endif
1250*10583SEdward.Gillett@Sun.COM
1251*10583SEdward.Gillett@Sun.COM	#palignr $6, (%rsi, %rcx), %xmm3
	/*
	 * Raw encoding of the palignr named above: opcode 66 0F 3A 0F,
	 * ModRM 0x1c (reg = %xmm3, SIB byte follows), SIB 0x0e (base %rsi,
	 * index %rcx, scale 1), then the immediate byte count 0x06.
	 */
1252*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1253*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x06
1254*10583SEdward.Gillett@Sun.COM
1255*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1256*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1257*10583SEdward.Gillett@Sun.COM
1258*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1259*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1260*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1261*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1262*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1263*10583SEdward.Gillett@Sun.COM	pcmpeqb %xmm3, %xmm0
1264*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1265*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1266*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1267*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1268*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1269*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1270*10583SEdward.Gillett@Sun.COM#endif
1271*10583SEdward.Gillett@Sun.COM
1272*10583SEdward.Gillett@Sun.COM	#palignr $6, (%rsi, %rcx), %xmm3
1273*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1274*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x06
1275*10583SEdward.Gillett@Sun.COM
1276*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1277*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1278*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1279*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1280*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1281*10583SEdward.Gillett@Sun.COM#endif
1282*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_6_use_ssse3)
1283*10583SEdward.Gillett@Sun.COM
/*
 * ashr_6, SSE2 variant: copy loop unrolled two steps per pass.  The spliced
 * 16-byte store value is built with psrldq/pslldq/por.
 *
 * One step:
 *   - pcmpeqb the aligned chunk at 16(%rsi,%rcx) into %xmm0 and jump to
 *     unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - load 16(%rsi,%rcx) into %xmm3 and (%rsi,%rcx) into %xmm2;
 *   - psrldq $6 moves bytes 6..15 of %xmm2 to the bottom, pslldq $10 moves
 *     bytes 0..5 of %xmm3 to the top, and por merges the two pieces;
 *   - store the result to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1284*10583SEdward.Gillett@Sun.COM	.p2align 4
1285*10583SEdward.Gillett@Sun.COMLABEL(ashr_6_use_sse2):
1286*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1287*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1288*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1289*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1290*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1291*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1292*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1293*10583SEdward.Gillett@Sun.COM#endif
1294*10583SEdward.Gillett@Sun.COM
1295*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1296*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1297*10583SEdward.Gillett@Sun.COM
	/* 6 + 10 = 16: the two shifted pieces occupy disjoint byte lanes */
1298*10583SEdward.Gillett@Sun.COM	psrldq	$6, %xmm2
1299*10583SEdward.Gillett@Sun.COM	pslldq	$10, %xmm3
1300*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1301*10583SEdward.Gillett@Sun.COM
1302*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1303*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1304*10583SEdward.Gillett@Sun.COM
1305*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1306*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1307*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1308*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1309*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1310*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1311*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1312*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1313*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1314*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1315*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1316*10583SEdward.Gillett@Sun.COM#endif
1317*10583SEdward.Gillett@Sun.COM
1318*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1319*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1320*10583SEdward.Gillett@Sun.COM
1321*10583SEdward.Gillett@Sun.COM	psrldq	$6, %xmm2
1322*10583SEdward.Gillett@Sun.COM	pslldq	$10, %xmm3
1323*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1324*10583SEdward.Gillett@Sun.COM
1325*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1326*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1327*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1328*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1329*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1330*10583SEdward.Gillett@Sun.COM#endif
1331*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_6_use_sse2)
1332*10583SEdward.Gillett@Sun.COM
13330Sstevel@tonic-gate
1334*10583SEdward.Gillett@Sun.COM/*
1335*10583SEdward.Gillett@Sun.COM * ashr_5 handles the following cases:
1336*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 5
1337*10583SEdward.Gillett@Sun.COM *
1338*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
1339*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
1340*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_5 entry point: clear the chunk index in %rcx and pick the loop.
 * In the strncpy build, jump to unaligned_exit first if %r8 <= %r10
 * (unsigned).  Then test the USE_SSSE3 bit of .memops_method: if it is
 * clear, jump to the SSE2 loop; if it is set, fall through into the SSSE3
 * loop that follows.
 */
1341*10583SEdward.Gillett@Sun.COM	.p2align 4
1342*10583SEdward.Gillett@Sun.COMLABEL(ashr_5):
1343*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx				/* clear index */
1344*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1345*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1346*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1347*10583SEdward.Gillett@Sun.COM#endif
1348*10583SEdward.Gillett@Sun.COM	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1349*10583SEdward.Gillett@Sun.COM	jz	LABEL(ashr_5_use_sse2)
1350*10583SEdward.Gillett@Sun.COM
/*
 * ashr_5, SSSE3 variant: copy loop unrolled two steps per pass.
 *
 * One step:
 *   - load the aligned source chunk at 16(%rsi,%rcx) into %xmm3;
 *   - pcmpeqb it into %xmm0 and jump to unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - palignr $5 against the chunk at (%rsi,%rcx): the store value is bytes
 *     5..15 of that chunk followed by bytes 0..4 of %xmm3;
 *   - store it to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1351*10583SEdward.Gillett@Sun.COM	.p2align 4
1352*10583SEdward.Gillett@Sun.COMLABEL(ashr_5_use_ssse3):
1353*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1354*10583SEdward.Gillett@Sun.COM	pcmpeqb	%xmm3, %xmm0
1355*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1356*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1357*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1358*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1359*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1360*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1361*10583SEdward.Gillett@Sun.COM#endif
1362*10583SEdward.Gillett@Sun.COM
1363*10583SEdward.Gillett@Sun.COM	#palignr $5, (%rsi, %rcx), %xmm3
	/*
	 * Raw encoding of the palignr named above: opcode 66 0F 3A 0F,
	 * ModRM 0x1c (reg = %xmm3, SIB byte follows), SIB 0x0e (base %rsi,
	 * index %rcx, scale 1), then the immediate byte count 0x05.
	 */
1364*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1365*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x05
1366*10583SEdward.Gillett@Sun.COM
1367*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1368*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1369*10583SEdward.Gillett@Sun.COM
1370*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1371*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1372*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1373*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1374*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1375*10583SEdward.Gillett@Sun.COM	pcmpeqb %xmm3, %xmm0
1376*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1377*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1378*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1379*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1380*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1381*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1382*10583SEdward.Gillett@Sun.COM#endif
1383*10583SEdward.Gillett@Sun.COM
1384*10583SEdward.Gillett@Sun.COM	#palignr $5, (%rsi, %rcx), %xmm3
1385*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1386*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x05
1387*10583SEdward.Gillett@Sun.COM
1388*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1389*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1390*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1391*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1392*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1393*10583SEdward.Gillett@Sun.COM#endif
1394*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_5_use_ssse3)
1395*10583SEdward.Gillett@Sun.COM
/*
 * ashr_5, SSE2 variant: copy loop unrolled two steps per pass.  The spliced
 * 16-byte store value is built with psrldq/pslldq/por.
 *
 * One step:
 *   - pcmpeqb the aligned chunk at 16(%rsi,%rcx) into %xmm0 and jump to
 *     unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - load 16(%rsi,%rcx) into %xmm3 and (%rsi,%rcx) into %xmm2;
 *   - psrldq $5 moves bytes 5..15 of %xmm2 to the bottom, pslldq $11 moves
 *     bytes 0..4 of %xmm3 to the top, and por merges the two pieces;
 *   - store the result to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1396*10583SEdward.Gillett@Sun.COM	.p2align 4
1397*10583SEdward.Gillett@Sun.COMLABEL(ashr_5_use_sse2):
1398*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1399*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1400*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1401*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1402*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1403*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1404*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1405*10583SEdward.Gillett@Sun.COM#endif
1406*10583SEdward.Gillett@Sun.COM
1407*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1408*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1409*10583SEdward.Gillett@Sun.COM
	/* 5 + 11 = 16: the two shifted pieces occupy disjoint byte lanes */
1410*10583SEdward.Gillett@Sun.COM	psrldq	$5, %xmm2
1411*10583SEdward.Gillett@Sun.COM	pslldq	$11, %xmm3
1412*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1413*10583SEdward.Gillett@Sun.COM
1414*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1415*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1416*10583SEdward.Gillett@Sun.COM
1417*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1418*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1419*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1420*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1421*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1422*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1423*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1424*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1425*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1426*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1427*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1428*10583SEdward.Gillett@Sun.COM#endif
1429*10583SEdward.Gillett@Sun.COM
1430*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1431*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1432*10583SEdward.Gillett@Sun.COM
1433*10583SEdward.Gillett@Sun.COM	psrldq	$5, %xmm2
1434*10583SEdward.Gillett@Sun.COM	pslldq	$11, %xmm3
1435*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1436*10583SEdward.Gillett@Sun.COM
1437*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1438*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1439*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1440*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1441*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1442*10583SEdward.Gillett@Sun.COM#endif
1443*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_5_use_sse2)
1444*10583SEdward.Gillett@Sun.COM
1445*10583SEdward.Gillett@Sun.COM
1446*10583SEdward.Gillett@Sun.COM/*
1447*10583SEdward.Gillett@Sun.COM * ashr_4 handles the following cases:
1448*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 4
1449*10583SEdward.Gillett@Sun.COM *
1450*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
1451*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
1452*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_4 entry point: clear the chunk index in %rcx and pick the loop.
 * In the strncpy build, jump to unaligned_exit first if %r8 <= %r10
 * (unsigned).  Then test the USE_SSSE3 bit of .memops_method: if it is
 * clear, jump to the SSE2 loop; if it is set, fall through into the SSSE3
 * loop that follows.
 */
1453*10583SEdward.Gillett@Sun.COM	.p2align 4
1454*10583SEdward.Gillett@Sun.COMLABEL(ashr_4):
1455*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx				/* clear index */
1456*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1457*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1458*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1459*10583SEdward.Gillett@Sun.COM#endif
1460*10583SEdward.Gillett@Sun.COM	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1461*10583SEdward.Gillett@Sun.COM	jz	LABEL(ashr_4_use_sse2)
14620Sstevel@tonic-gate
14620Sstevel@tonic-gate
/*
 * ashr_4, SSSE3 variant: copy loop unrolled two steps per pass.
 *
 * One step:
 *   - load the aligned source chunk at 16(%rsi,%rcx) into %xmm3;
 *   - pcmpeqb it into %xmm0 and jump to unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - palignr $4 against the chunk at (%rsi,%rcx): the store value is bytes
 *     4..15 of that chunk followed by bytes 0..3 of %xmm3;
 *   - store it to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
14630Sstevel@tonic-gate	.p2align 4
1464*10583SEdward.Gillett@Sun.COMLABEL(ashr_4_use_ssse3):
1465*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1466*10583SEdward.Gillett@Sun.COM	pcmpeqb	%xmm3, %xmm0
1467*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1468*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1469*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1470*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1471*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1472*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1473*10583SEdward.Gillett@Sun.COM#endif
14740Sstevel@tonic-gate
1475*10583SEdward.Gillett@Sun.COM	#palignr $4, (%rsi, %rcx), %xmm3
	/*
	 * Raw encoding of the palignr named above: opcode 66 0F 3A 0F,
	 * ModRM 0x1c (reg = %xmm3, SIB byte follows), SIB 0x0e (base %rsi,
	 * index %rcx, scale 1), then the immediate byte count 0x04.
	 */
1476*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1477*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x04
1478*10583SEdward.Gillett@Sun.COM
1479*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1480*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1481*10583SEdward.Gillett@Sun.COM
1482*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1483*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1484*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1485*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1486*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1487*10583SEdward.Gillett@Sun.COM	pcmpeqb %xmm3, %xmm0
1488*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1489*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1490*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1491*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1492*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1493*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1494*10583SEdward.Gillett@Sun.COM#endif
1495*10583SEdward.Gillett@Sun.COM
1496*10583SEdward.Gillett@Sun.COM	#palignr $4, (%rsi, %rcx), %xmm3
1497*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1498*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x04
1499*10583SEdward.Gillett@Sun.COM
1500*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1501*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1502*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1503*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1504*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1505*10583SEdward.Gillett@Sun.COM#endif
1506*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_4_use_ssse3)
1507*10583SEdward.Gillett@Sun.COM
/*
 * ashr_4, SSE2 variant: copy loop unrolled two steps per pass.  The spliced
 * 16-byte store value is built with psrldq/pslldq/por.
 *
 * One step:
 *   - pcmpeqb the aligned chunk at 16(%rsi,%rcx) into %xmm0 and jump to
 *     unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - load 16(%rsi,%rcx) into %xmm3 and (%rsi,%rcx) into %xmm2;
 *   - psrldq $4 moves bytes 4..15 of %xmm2 to the bottom, pslldq $12 moves
 *     bytes 0..3 of %xmm3 to the top, and por merges the two pieces;
 *   - store the result to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1508*10583SEdward.Gillett@Sun.COM	.p2align 4
1509*10583SEdward.Gillett@Sun.COMLABEL(ashr_4_use_sse2):
1510*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1511*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1512*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1513*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1514*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1515*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1516*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1517*10583SEdward.Gillett@Sun.COM#endif
1518*10583SEdward.Gillett@Sun.COM
1519*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1520*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1521*10583SEdward.Gillett@Sun.COM
	/* 4 + 12 = 16: the two shifted pieces occupy disjoint byte lanes */
1522*10583SEdward.Gillett@Sun.COM	psrldq	$4, %xmm2
1523*10583SEdward.Gillett@Sun.COM	pslldq	$12, %xmm3
1524*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1525*10583SEdward.Gillett@Sun.COM
1526*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1527*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1528*10583SEdward.Gillett@Sun.COM
1529*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1530*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1531*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1532*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1533*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1534*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1535*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1536*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1537*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1538*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1539*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1540*10583SEdward.Gillett@Sun.COM#endif
1541*10583SEdward.Gillett@Sun.COM
1542*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1543*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1544*10583SEdward.Gillett@Sun.COM
1545*10583SEdward.Gillett@Sun.COM	psrldq	$4, %xmm2
1546*10583SEdward.Gillett@Sun.COM	pslldq	$12, %xmm3
1547*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1548*10583SEdward.Gillett@Sun.COM
1549*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1550*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1551*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1552*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1553*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1554*10583SEdward.Gillett@Sun.COM#endif
1555*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_4_use_sse2)
1556*10583SEdward.Gillett@Sun.COM
1557*10583SEdward.Gillett@Sun.COM
1558*10583SEdward.Gillett@Sun.COM/*
1559*10583SEdward.Gillett@Sun.COM * ashr_3 handles the following cases:
1560*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 3
1561*10583SEdward.Gillett@Sun.COM *
1562*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
1563*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
1564*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_3 entry point: clear the chunk index in %rcx and pick the loop.
 * In the strncpy build, jump to unaligned_exit first if %r8 <= %r10
 * (unsigned).  Then test the USE_SSSE3 bit of .memops_method: if it is
 * clear, jump to the SSE2 loop; if it is set, fall through into the SSSE3
 * loop that follows.
 */
1565*10583SEdward.Gillett@Sun.COM	.p2align 4
1566*10583SEdward.Gillett@Sun.COMLABEL(ashr_3):
1567*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx				/* clear index */
15680Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
1569*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1570*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1571*10583SEdward.Gillett@Sun.COM#endif
1572*10583SEdward.Gillett@Sun.COM	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1573*10583SEdward.Gillett@Sun.COM	jz	LABEL(ashr_3_use_sse2)
1574*10583SEdward.Gillett@Sun.COM
/*
 * ashr_3, SSSE3 variant: copy loop unrolled two steps per pass.
 *
 * One step:
 *   - load the aligned source chunk at 16(%rsi,%rcx) into %xmm3;
 *   - pcmpeqb it into %xmm0 and jump to unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - palignr $3 against the chunk at (%rsi,%rcx): the store value is bytes
 *     3..15 of that chunk followed by bytes 0..2 of %xmm3;
 *   - store it to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1575*10583SEdward.Gillett@Sun.COM	.p2align 4
1576*10583SEdward.Gillett@Sun.COMLABEL(ashr_3_use_ssse3):
1577*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1578*10583SEdward.Gillett@Sun.COM	pcmpeqb	%xmm3, %xmm0
1579*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1580*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1581*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1582*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1583*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1584*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1585*10583SEdward.Gillett@Sun.COM#endif
1586*10583SEdward.Gillett@Sun.COM
1587*10583SEdward.Gillett@Sun.COM	#palignr $3, (%rsi, %rcx), %xmm3
	/*
	 * Raw encoding of the palignr named above: opcode 66 0F 3A 0F,
	 * ModRM 0x1c (reg = %xmm3, SIB byte follows), SIB 0x0e (base %rsi,
	 * index %rcx, scale 1), then the immediate byte count 0x03.
	 */
1588*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1589*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x03
1590*10583SEdward.Gillett@Sun.COM
1591*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1592*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1593*10583SEdward.Gillett@Sun.COM
1594*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1595*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1596*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1597*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1598*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1599*10583SEdward.Gillett@Sun.COM	pcmpeqb %xmm3, %xmm0
1600*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1601*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1602*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1603*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1604*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1605*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1606*10583SEdward.Gillett@Sun.COM#endif
1607*10583SEdward.Gillett@Sun.COM
1608*10583SEdward.Gillett@Sun.COM	#palignr $3, (%rsi, %rcx), %xmm3
1609*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1610*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x03
1611*10583SEdward.Gillett@Sun.COM
1612*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1613*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1614*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1615*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1616*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1617*10583SEdward.Gillett@Sun.COM#endif
1618*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_3_use_ssse3)
1619*10583SEdward.Gillett@Sun.COM
/*
 * ashr_3, SSE2 variant: copy loop unrolled two steps per pass.  The spliced
 * 16-byte store value is built with psrldq/pslldq/por.
 *
 * One step:
 *   - pcmpeqb the aligned chunk at 16(%rsi,%rcx) into %xmm0 and jump to
 *     unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - load 16(%rsi,%rcx) into %xmm3 and (%rsi,%rcx) into %xmm2;
 *   - psrldq $3 moves bytes 3..15 of %xmm2 to the bottom, pslldq $13 moves
 *     bytes 0..2 of %xmm3 to the top, and por merges the two pieces;
 *   - store the result to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1620*10583SEdward.Gillett@Sun.COM	.p2align 4
1621*10583SEdward.Gillett@Sun.COMLABEL(ashr_3_use_sse2):
1622*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1623*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1624*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1625*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1626*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1627*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1628*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1629*10583SEdward.Gillett@Sun.COM#endif
1630*10583SEdward.Gillett@Sun.COM
1631*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1632*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1633*10583SEdward.Gillett@Sun.COM
	/* 3 + 13 = 16: the two shifted pieces occupy disjoint byte lanes */
1634*10583SEdward.Gillett@Sun.COM	psrldq	$3, %xmm2
1635*10583SEdward.Gillett@Sun.COM	pslldq	$13, %xmm3
1636*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1637*10583SEdward.Gillett@Sun.COM
1638*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1639*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1640*10583SEdward.Gillett@Sun.COM
1641*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1642*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1643*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1644*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1645*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1646*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1647*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1648*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1649*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1650*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1651*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1652*10583SEdward.Gillett@Sun.COM#endif
1653*10583SEdward.Gillett@Sun.COM
1654*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1655*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1656*10583SEdward.Gillett@Sun.COM
1657*10583SEdward.Gillett@Sun.COM	psrldq	$3, %xmm2
1658*10583SEdward.Gillett@Sun.COM	pslldq	$13, %xmm3
1659*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1660*10583SEdward.Gillett@Sun.COM
1661*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1662*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1663*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1664*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1665*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1666*10583SEdward.Gillett@Sun.COM#endif
1667*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_3_use_sse2)
1668*10583SEdward.Gillett@Sun.COM
1669*10583SEdward.Gillett@Sun.COM
1670*10583SEdward.Gillett@Sun.COM/*
1671*10583SEdward.Gillett@Sun.COM * ashr_2 handles the following cases:
1672*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 2
1673*10583SEdward.Gillett@Sun.COM *
1674*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
1675*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
1676*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_2 entry point: clear the chunk index in %rcx and pick the loop.
 * In the strncpy build, jump to unaligned_exit first if %r8 <= %r10
 * (unsigned).  Then test the USE_SSSE3 bit of .memops_method: if it is
 * clear, jump to the SSE2 loop; if it is set, fall through into the SSSE3
 * loop that follows.
 */
1677*10583SEdward.Gillett@Sun.COM	.p2align 4
1678*10583SEdward.Gillett@Sun.COMLABEL(ashr_2):
1679*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx				/* clear index */
1680*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1681*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1682*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1683*10583SEdward.Gillett@Sun.COM#endif
1684*10583SEdward.Gillett@Sun.COM	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1685*10583SEdward.Gillett@Sun.COM	jz	LABEL(ashr_2_use_sse2)
1686*10583SEdward.Gillett@Sun.COM
/*
 * ashr_2, SSSE3 variant: copy loop unrolled two steps per pass.
 *
 * One step:
 *   - load the aligned source chunk at 16(%rsi,%rcx) into %xmm3;
 *   - pcmpeqb it into %xmm0 and jump to unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - palignr $2 against the chunk at (%rsi,%rcx): the store value is bytes
 *     2..15 of that chunk followed by bytes 0..1 of %xmm3;
 *   - store it to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1687*10583SEdward.Gillett@Sun.COM	.p2align 4
1688*10583SEdward.Gillett@Sun.COMLABEL(ashr_2_use_ssse3):
1689*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1690*10583SEdward.Gillett@Sun.COM	pcmpeqb	%xmm3, %xmm0
1691*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1692*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1693*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1694*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1695*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1696*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
16970Sstevel@tonic-gate#endif
16980Sstevel@tonic-gate
1699*10583SEdward.Gillett@Sun.COM	#palignr $2, (%rsi, %rcx), %xmm3
	/*
	 * Raw encoding of the palignr named above: opcode 66 0F 3A 0F,
	 * ModRM 0x1c (reg = %xmm3, SIB byte follows), SIB 0x0e (base %rsi,
	 * index %rcx, scale 1), then the immediate byte count 0x02.
	 */
1700*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1701*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x02
1702*10583SEdward.Gillett@Sun.COM
1703*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1704*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1705*10583SEdward.Gillett@Sun.COM
1706*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1707*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1708*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1709*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1710*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1711*10583SEdward.Gillett@Sun.COM	pcmpeqb %xmm3, %xmm0
1712*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1713*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1714*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1715*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1716*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1717*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1718*10583SEdward.Gillett@Sun.COM#endif
1719*10583SEdward.Gillett@Sun.COM
1720*10583SEdward.Gillett@Sun.COM	#palignr $2, (%rsi, %rcx), %xmm3
1721*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1722*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x02
1723*10583SEdward.Gillett@Sun.COM
1724*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1725*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1726*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1727*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1728*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1729*10583SEdward.Gillett@Sun.COM#endif
1730*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_2_use_ssse3)
1731*10583SEdward.Gillett@Sun.COM
/*
 * ashr_2, SSE2 variant: copy loop unrolled two steps per pass.  The spliced
 * 16-byte store value is built with psrldq/pslldq/por.
 *
 * One step:
 *   - pcmpeqb the aligned chunk at 16(%rsi,%rcx) into %xmm0 and jump to
 *     unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - load 16(%rsi,%rcx) into %xmm3 and (%rsi,%rcx) into %xmm2;
 *   - psrldq $2 moves bytes 2..15 of %xmm2 to the bottom, pslldq $14 moves
 *     bytes 0..1 of %xmm3 to the top, and por merges the two pieces;
 *   - store the result to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1732*10583SEdward.Gillett@Sun.COM	.p2align 4
1733*10583SEdward.Gillett@Sun.COMLABEL(ashr_2_use_sse2):
1734*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1735*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1736*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1737*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1738*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1739*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1740*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1741*10583SEdward.Gillett@Sun.COM#endif
1742*10583SEdward.Gillett@Sun.COM
1743*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1744*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1745*10583SEdward.Gillett@Sun.COM
	/* 2 + 14 = 16: the two shifted pieces occupy disjoint byte lanes */
1746*10583SEdward.Gillett@Sun.COM	psrldq	$2, %xmm2
1747*10583SEdward.Gillett@Sun.COM	pslldq	$14, %xmm3
1748*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1749*10583SEdward.Gillett@Sun.COM
1750*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1751*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
17520Sstevel@tonic-gate
1753*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1754*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1755*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1756*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1757*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1758*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1759*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1760*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1761*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1762*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1763*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1764*10583SEdward.Gillett@Sun.COM#endif
1765*10583SEdward.Gillett@Sun.COM
1766*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1767*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1768*10583SEdward.Gillett@Sun.COM
1769*10583SEdward.Gillett@Sun.COM	psrldq	$2, %xmm2
1770*10583SEdward.Gillett@Sun.COM	pslldq	$14, %xmm3
1771*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1772*10583SEdward.Gillett@Sun.COM
1773*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1774*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1775*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1776*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1777*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1778*10583SEdward.Gillett@Sun.COM#endif
1779*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_2_use_sse2)
1780*10583SEdward.Gillett@Sun.COM
1781*10583SEdward.Gillett@Sun.COM
1782*10583SEdward.Gillett@Sun.COM/*
1783*10583SEdward.Gillett@Sun.COM * ashr_1 handles the following cases:
1784*10583SEdward.Gillett@Sun.COM * 	(16 + (src offset - dest offset)) % 16 = 1
1785*10583SEdward.Gillett@Sun.COM *
1786*10583SEdward.Gillett@Sun.COM * Based on above operation, start from (%r9 + rsi) to the left of this cache
1787*10583SEdward.Gillett@Sun.COM * bank, there is no null byte.
1788*10583SEdward.Gillett@Sun.COM */
/*
 * ashr_1 entry point: clear the chunk index in %rcx and pick the loop.
 * In the strncpy build, jump to unaligned_exit first if %r8 <= %r10
 * (unsigned).  Then test the USE_SSSE3 bit of .memops_method: if it is
 * clear, jump to the SSE2 loop; if it is set, fall through into the SSSE3
 * loop that follows.
 */
1789*10583SEdward.Gillett@Sun.COM	.p2align 4
1790*10583SEdward.Gillett@Sun.COMLABEL(ashr_1):
1791*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx				/* clear index */
1792*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1793*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1794*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1795*10583SEdward.Gillett@Sun.COM#endif
1796*10583SEdward.Gillett@Sun.COM	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1797*10583SEdward.Gillett@Sun.COM	jz	LABEL(ashr_1_use_sse2)
1798*10583SEdward.Gillett@Sun.COM
/*
 * ashr_1, SSSE3 variant: copy loop unrolled two steps per pass.
 *
 * One step:
 *   - load the aligned source chunk at 16(%rsi,%rcx) into %xmm3;
 *   - pcmpeqb it into %xmm0 and jump to unaligned_exit if any byte matched;
 *   - (strncpy build) subtract 16 from %r8, jumping to
 *     strncpy_truncation_unaligned if %r8 was <= 16;
 *   - palignr $1 against the chunk at (%rsi,%rcx): the store value is bytes
 *     1..15 of that chunk followed by byte 0 of %xmm3;
 *   - store it to (%rdi,%rcx) (aligned store) and add 16 to %rcx;
 *   - (strncpy build) jump to unaligned_exit if %r8 <= %r10 (unsigned).
 *
 * NOTE(review): %xmm0 is presumably all-zero on entry (set outside this
 * view), which would make the pcmpeqb test a NUL-byte search and leave
 * %xmm0 zero again whenever the loop continues -- confirm.
 * NOTE(review): %r8 looks like the remaining strncpy count and %r10 a
 * limit prepared by the caller -- confirm against the setup code.
 */
1799*10583SEdward.Gillett@Sun.COM	.p2align 4
1800*10583SEdward.Gillett@Sun.COMLABEL(ashr_1_use_ssse3):
1801*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1802*10583SEdward.Gillett@Sun.COM	pcmpeqb	%xmm3, %xmm0
1803*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1804*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1805*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1806*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1807*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1808*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1809*10583SEdward.Gillett@Sun.COM#endif
1810*10583SEdward.Gillett@Sun.COM
1811*10583SEdward.Gillett@Sun.COM	#palignr $1, (%rsi, %rcx), %xmm3
	/*
	 * Raw encoding of the palignr named above: opcode 66 0F 3A 0F,
	 * ModRM 0x1c (reg = %xmm3, SIB byte follows), SIB 0x0e (base %rsi,
	 * index %rcx, scale 1), then the immediate byte count 0x01.
	 */
1812*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1813*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x01
1814*10583SEdward.Gillett@Sun.COM
1815*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1816*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
18170Sstevel@tonic-gate
18187953SNobutomo.Nakano@Sun.COM#ifdef USE_AS_STRNCPY
1819*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1820*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1821*10583SEdward.Gillett@Sun.COM#endif
	/* Second unrolled step: identical to the first. */
1822*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1823*10583SEdward.Gillett@Sun.COM	pcmpeqb %xmm3, %xmm0
1824*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1825*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1826*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1827*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1828*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1829*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1830*10583SEdward.Gillett@Sun.COM#endif
1831*10583SEdward.Gillett@Sun.COM	#palignr $1, (%rsi, %rcx), %xmm3
1832*10583SEdward.Gillett@Sun.COM	.byte	0x66, 0x0F, 0x3A ,0x0F
1833*10583SEdward.Gillett@Sun.COM	.byte	0x1c, 0x0e, 0x01
1834*10583SEdward.Gillett@Sun.COM
1835*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1836*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1837*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1838*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1839*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1840*10583SEdward.Gillett@Sun.COM#endif
1841*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_1_use_ssse3)
1842*10583SEdward.Gillett@Sun.COM
1843*10583SEdward.Gillett@Sun.COM	.p2align 4
1844*10583SEdward.Gillett@Sun.COMLABEL(ashr_1_use_sse2):
1845*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1846*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1847*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1848*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1849*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1850*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1851*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1852*10583SEdward.Gillett@Sun.COM#endif
1853*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1854*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1855*10583SEdward.Gillett@Sun.COM
1856*10583SEdward.Gillett@Sun.COM	psrldq	$1, %xmm2
1857*10583SEdward.Gillett@Sun.COM	pslldq	$15, %xmm3
1858*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1859*10583SEdward.Gillett@Sun.COM
1860*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1861*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1862*10583SEdward.Gillett@Sun.COM
1863*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1864*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1865*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1866*10583SEdward.Gillett@Sun.COM#endif
1867*10583SEdward.Gillett@Sun.COM	pcmpeqb 16(%rsi, %rcx), %xmm0
1868*10583SEdward.Gillett@Sun.COM	pmovmskb %xmm0, %edx
1869*10583SEdward.Gillett@Sun.COM	test	%edx, %edx
1870*10583SEdward.Gillett@Sun.COM	jnz	LABEL(unaligned_exit)
1871*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1872*10583SEdward.Gillett@Sun.COM	sub	$16, %r8
1873*10583SEdward.Gillett@Sun.COM 	jbe	LABEL(strncpy_truncation_unaligned)
1874*10583SEdward.Gillett@Sun.COM#endif
1875*10583SEdward.Gillett@Sun.COM
1876*10583SEdward.Gillett@Sun.COM	movdqa	16(%rsi, %rcx), %xmm3
1877*10583SEdward.Gillett@Sun.COM	movdqa	(%rsi, %rcx), %xmm2
1878*10583SEdward.Gillett@Sun.COM
1879*10583SEdward.Gillett@Sun.COM	psrldq	$1, %xmm2
1880*10583SEdward.Gillett@Sun.COM	pslldq	$15, %xmm3
1881*10583SEdward.Gillett@Sun.COM	por	%xmm2, %xmm3
1882*10583SEdward.Gillett@Sun.COM
1883*10583SEdward.Gillett@Sun.COM	movdqa	%xmm3, (%rdi, %rcx)
1884*10583SEdward.Gillett@Sun.COM	add	$16, %rcx
1885*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1886*10583SEdward.Gillett@Sun.COM	cmp	%r10, %r8
1887*10583SEdward.Gillett@Sun.COM	jbe	LABEL(unaligned_exit)
1888*10583SEdward.Gillett@Sun.COM#endif
1889*10583SEdward.Gillett@Sun.COM	jmp	LABEL(ashr_1_use_sse2)
1890*10583SEdward.Gillett@Sun.COM
1891*10583SEdward.Gillett@Sun.COM
1892*10583SEdward.Gillett@Sun.COM	/*
1893*10583SEdward.Gillett@Sun.COM	 * Exit tail code:
1894*10583SEdward.Gillett@Sun.COM	 * Up to 32 bytes are copied in the case of strcpy.
1895*10583SEdward.Gillett@Sun.COM	 */
	/*
	 * The labels below form one fall-through chain; a later entry point
	 * skips the adjustments made by the earlier ones:
	 *	less32bytes	index (%rcx) = 0, then as unaligned_exit
	 *	unaligned_exit	%rsi += %r9 and %edx <<= (%r10 & 31)
	 *	aligned_exit	%rdi += %rcx
	 *	less16bytes	%rsi += %rcx
	 *	aligned_16bytes	dispatch on the bit mask in %edx
	 * The lowest set bit N of %edx selects the copy routine: through
	 * tail_table when USE_BSF is set in .memops_method, otherwise through
	 * the bit tests at LABEL(AMD_exit).
	 */
1896*10583SEdward.Gillett@Sun.COM	.p2align 4
1897*10583SEdward.Gillett@Sun.COMLABEL(less32bytes):
1898*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx
1899*10583SEdward.Gillett@Sun.COMLABEL(unaligned_exit):
1900*10583SEdward.Gillett@Sun.COM	add	%r9, %rsi		/* r9 holds offset of rsi */
1901*10583SEdward.Gillett@Sun.COM	mov	%rcx, %r9		/* park the index: %cl is needed as shift count */
1902*10583SEdward.Gillett@Sun.COM	mov	%r10, %rcx
1903*10583SEdward.Gillett@Sun.COM	shl	%cl, %edx		/* after shl, calculate the exact number to be filled */
1904*10583SEdward.Gillett@Sun.COM	mov	%r9, %rcx		/* restore the index */
1905*10583SEdward.Gillett@Sun.COM	.p2align 4
1906*10583SEdward.Gillett@Sun.COMLABEL(aligned_exit):
1907*10583SEdward.Gillett@Sun.COM	add	%rcx, %rdi		/* locate exact address for rdi */
1908*10583SEdward.Gillett@Sun.COMLABEL(less16bytes):
1909*10583SEdward.Gillett@Sun.COM	add	%rcx, %rsi		/* locate exact address for rsi */
1910*10583SEdward.Gillett@Sun.COMLABEL(aligned_16bytes):
1911*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1912*10583SEdward.Gillett@Sun.COM	/*
1913*10583SEdward.Gillett@Sun.COM	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
1914*10583SEdward.Gillett@Sun.COM	 * the strncpy count argument. We will copy to the null (inclusive)
1915*10583SEdward.Gillett@Sun.COM	 * or count whichever comes first.
1916*10583SEdward.Gillett@Sun.COM	 */
1917*10583SEdward.Gillett@Sun.COM	mov	$1, %r9d
1918*10583SEdward.Gillett@Sun.COM	lea	-1(%r8), %rcx
1919*10583SEdward.Gillett@Sun.COM	shl	%cl, %r9d		/* %r9d = 1 << ((%r8 - 1) & 31) */
1920*10583SEdward.Gillett@Sun.COM	cmp	$32, %r8
1921*10583SEdward.Gillett@Sun.COM	ja	LABEL(strncpy_tail)	/* %r8 > 32: bit would be outside the mask */
1922*10583SEdward.Gillett@Sun.COM	or	%r9d, %edx
1923*10583SEdward.Gillett@Sun.COMLABEL(strncpy_tail):
1924*10583SEdward.Gillett@Sun.COM#endif
1925*10583SEdward.Gillett@Sun.COM	/*
1926*10583SEdward.Gillett@Sun.COM	 * Check to see if BSF is fast on this processor. If not, use a
1927*10583SEdward.Gillett@Sun.COM	 * different exit tail.
1928*10583SEdward.Gillett@Sun.COM	 */
1929*10583SEdward.Gillett@Sun.COM	testb	$USE_BSF, .memops_method(%rip)
1930*10583SEdward.Gillett@Sun.COM	jz	LABEL(AMD_exit)
1931*10583SEdward.Gillett@Sun.COM	bsf	%rdx, %rcx		/* Find byte with null char */
	/*
	 * tail_table is read as an array of signed 32-bit offsets relative to
	 * the table's own address; entry %rcx gives the routine to jump to
	 * (presumably tail_<%rcx>; the table itself is defined elsewhere).
	 */
1932*10583SEdward.Gillett@Sun.COM	lea	LABEL(tail_table)(%rip), %r11
1933*10583SEdward.Gillett@Sun.COM	movslq	(%r11, %rcx, 4), %rcx
1934*10583SEdward.Gillett@Sun.COM	lea	(%r11, %rcx), %rcx
1935*10583SEdward.Gillett@Sun.COM	jmp	*%rcx
1936*10583SEdward.Gillett@Sun.COM
1937*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
1938*10583SEdward.Gillett@Sun.COM	/*
1939*10583SEdward.Gillett@Sun.COM	 * Count reached before null found.
1940*10583SEdward.Gillett@Sun.COM	 */
	/*
	 * Fall-through chain of entry points:
	 *	less32bytes_strncpy_truncation	index (%rcx) = 0
	 *	strncpy_truncation_unaligned	%rsi += %r9
	 *	strncpy_truncation_aligned	%rdi += %rcx, %rsi += %rcx
	 * The copy loops branch here right after "sub $16, %r8", so 16 is
	 * added back to recover the number of bytes still to copy.  That
	 * count minus one indexes tail_table (signed 32-bit offsets relative
	 * to the table address) to pick the routine that copies the bytes.
	 */
1941*10583SEdward.Gillett@Sun.COM	.p2align 4
1942*10583SEdward.Gillett@Sun.COMLABEL(less32bytes_strncpy_truncation):
1943*10583SEdward.Gillett@Sun.COM	xor	%ecx, %ecx
1944*10583SEdward.Gillett@Sun.COMLABEL(strncpy_truncation_unaligned):
1945*10583SEdward.Gillett@Sun.COM	add	%r9, %rsi		/* next src char to copy */
1946*10583SEdward.Gillett@Sun.COMLABEL(strncpy_truncation_aligned):
1947*10583SEdward.Gillett@Sun.COM	add	%rcx, %rdi
1948*10583SEdward.Gillett@Sun.COM	add	%rcx, %rsi
1949*10583SEdward.Gillett@Sun.COM	add	$16, %r8		/* compensation */
1950*10583SEdward.Gillett@Sun.COM	lea	-1(%r8), %rcx		/* table index = count - 1 */
1951*10583SEdward.Gillett@Sun.COM	lea	LABEL(tail_table)(%rip), %r11
1952*10583SEdward.Gillett@Sun.COM	movslq	(%r11, %rcx, 4), %rcx
1953*10583SEdward.Gillett@Sun.COM	lea	(%r11, %rcx), %rcx
1954*10583SEdward.Gillett@Sun.COM	jmp	*%rcx
1955*10583SEdward.Gillett@Sun.COM
	/*
	 * Return %rdi as the result without copying anything.  The branches
	 * that reach this label are outside this part of the file.
	 */
1956*10583SEdward.Gillett@Sun.COM	.p2align 4
1957*10583SEdward.Gillett@Sun.COMLABEL(strncpy_exitz):
1958*10583SEdward.Gillett@Sun.COM	mov	%rdi, %rax
1959*10583SEdward.Gillett@Sun.COM	ret
19607953SNobutomo.Nakano@Sun.COM#endif
19617953SNobutomo.Nakano@Sun.COM
19620Sstevel@tonic-gate	.p2align 4
	/*
	 * Exit dispatch used when USE_BSF is not set in .memops_method.
	 * Instead of bsf, the mask in %edx is examined one byte at a time and
	 * one bit at a time, lowest bit first; bit N set selects tail_N.
	 * When the low byte is non-zero and bits 0-6 are clear, bit 7 must be
	 * set, so control falls through into tail_7.
	 */
1963*10583SEdward.Gillett@Sun.COMLABEL(AMD_exit):
1964*10583SEdward.Gillett@Sun.COM	test	%dl, %dl
1965*10583SEdward.Gillett@Sun.COM	jz	LABEL(AMD_exit_more_8)	/* nothing in bits 0-7 */
1966*10583SEdward.Gillett@Sun.COM	test	$0x01, %dl
1967*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_0)
1968*10583SEdward.Gillett@Sun.COM	test	$0x02, %dl
1969*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_1)
1970*10583SEdward.Gillett@Sun.COM	test	$0x04, %dl
1971*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_2)
1972*10583SEdward.Gillett@Sun.COM	test	$0x08, %dl
1973*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_3)
1974*10583SEdward.Gillett@Sun.COM	test	$0x10, %dl
1975*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_4)
1976*10583SEdward.Gillett@Sun.COM	test	$0x20, %dl
1977*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_5)
1978*10583SEdward.Gillett@Sun.COM	test	$0x40, %dl
1979*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_6)
19800Sstevel@tonic-gate
	/*
	 * %rax is not written by the tail_N routines; presumably it already
	 * holds the return value when they are reached (TODO confirm at the
	 * function entry).
	 */
1981*10583SEdward.Gillett@Sun.COM	.p2align 4
1982*10583SEdward.Gillett@Sun.COMLABEL(tail_7):				/* 8 bytes */
1983*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx
1984*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
19850Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
1986*10583SEdward.Gillett@Sun.COM	mov	$8, %cl			/* bytes stored, read by strncpy_fill_tail */
1987*10583SEdward.Gillett@Sun.COM	sub	$8, %r8			/* %r8 = bytes left to null-fill */
1988*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
19890Sstevel@tonic-gate#endif
1990*10583SEdward.Gillett@Sun.COM	ret
19910Sstevel@tonic-gate
19920Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
1993*10583SEdward.Gillett@Sun.COM	/*
1994*10583SEdward.Gillett@Sun.COM	 * Null terminated src string shorter than count. Fill the rest of the
1995*10583SEdward.Gillett@Sun.COM	 * destination with null chars.
1996*10583SEdward.Gillett@Sun.COM	 */
	/*
	 * On entry, as set up by the tail_N routines that branch here:
	 *	%cl	number of bytes the routine just stored at (%rdi)
	 *	%r8	number of bytes still to be null-filled (non-zero)
	 *	%rax	value to return; parked in %rdx while %rax supplies the
	 *		zero fill pattern, and restored before ret
	 *
	 * The fill is %r8 / 8 quadword stores (rep stosq) followed by %r8 % 8
	 * single byte stores.  The quadword count is computed in the full
	 * 64-bit %rcx: a 32-bit shift would discard bits 32-63 of %r8 and
	 * leave part of the destination unfilled when 4GB or more remain.
	 * rep stosq is assumed to run with the direction flag clear.
	 */
1997*10583SEdward.Gillett@Sun.COM	.p2align 4
1998*10583SEdward.Gillett@Sun.COMLABEL(strncpy_fill_tail):
1999*10583SEdward.Gillett@Sun.COM	mov	%rax, %rdx		/* save the return value */
2000*10583SEdward.Gillett@Sun.COM	movzx	%cl, %rax		/* bytes already stored */
2001*10583SEdward.Gillett@Sun.COM	mov	%r8, %rcx
2002*10583SEdward.Gillett@Sun.COM	add	%rax, %rdi		/* first byte to fill */
2003*10583SEdward.Gillett@Sun.COM	xor	%eax, %eax		/* fill pattern: zero */
2004*10583SEdward.Gillett@Sun.COM	shr	$3, %rcx		/* whole quadwords, full 64-bit count */
2005*10583SEdward.Gillett@Sun.COM	jz	LABEL(strncpy_fill_less_8)
20060Sstevel@tonic-gate
2007*10583SEdward.Gillett@Sun.COM	rep	stosq			/* advances %rdi past the quadwords */
2008*10583SEdward.Gillett@Sun.COMLABEL(strncpy_fill_less_8):
2009*10583SEdward.Gillett@Sun.COM	mov	%r8, %rcx
2010*10583SEdward.Gillett@Sun.COM	and	$7, %rcx		/* leftover bytes, 0..7 */
2011*10583SEdward.Gillett@Sun.COM	jz	LABEL(strncpy_fill_return)
	/*
	 * Store the leftover zero bytes from the highest index down to 0.
	 * mov does not change the flags, so jnz still tests the sub result.
	 */
2012*10583SEdward.Gillett@Sun.COMLABEL(strncpy_fill_less_7):
2013*10583SEdward.Gillett@Sun.COM	sub	$1, %ecx
2014*10583SEdward.Gillett@Sun.COM	mov	%al, (%rdi, %rcx)
2015*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_less_7)
2016*10583SEdward.Gillett@Sun.COMLABEL(strncpy_fill_return):
2017*10583SEdward.Gillett@Sun.COM	mov	%rdx, %rax		/* restore the return value */
2018*10583SEdward.Gillett@Sun.COM	ret
20190Sstevel@tonic-gate#endif
20200Sstevel@tonic-gate
2021*10583SEdward.Gillett@Sun.COM	.p2align 4
2022*10583SEdward.Gillett@Sun.COMLABEL(tail_0):				/* 1 byte */
2023*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %cl
2024*10583SEdward.Gillett@Sun.COM	mov	%cl, (%rdi)
2025*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2026*10583SEdward.Gillett@Sun.COM	mov	$1, %cl			/* bytes stored, read by strncpy_fill_tail */
2027*10583SEdward.Gillett@Sun.COM	sub	$1, %r8			/* %r8 = bytes left to null-fill */
2028*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2029*10583SEdward.Gillett@Sun.COM#endif
2030*10583SEdward.Gillett@Sun.COM	ret
20310Sstevel@tonic-gate
2032*10583SEdward.Gillett@Sun.COM	.p2align 4
2033*10583SEdward.Gillett@Sun.COMLABEL(tail_1):				/* 2 bytes */
2034*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %cx
2035*10583SEdward.Gillett@Sun.COM	mov	%cx, (%rdi)
2036*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2037*10583SEdward.Gillett@Sun.COM	mov	$2, %cl			/* bytes stored, read by strncpy_fill_tail */
2038*10583SEdward.Gillett@Sun.COM	sub	$2, %r8			/* %r8 = bytes left to null-fill */
2039*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2040*10583SEdward.Gillett@Sun.COM#endif
2041*10583SEdward.Gillett@Sun.COM	ret
20420Sstevel@tonic-gate
2043*10583SEdward.Gillett@Sun.COM	.p2align 4
2044*10583SEdward.Gillett@Sun.COMLABEL(tail_2):				/* 3 bytes */
2045*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %cx		/* bytes 0-1 */
2046*10583SEdward.Gillett@Sun.COM	mov	%cx, (%rdi)
2047*10583SEdward.Gillett@Sun.COM	mov	1(%rsi), %cx		/* bytes 1-2; byte 1 is copied twice */
2048*10583SEdward.Gillett@Sun.COM	mov	%cx, 1(%rdi)
2049*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2050*10583SEdward.Gillett@Sun.COM	mov	$3, %cl			/* bytes stored, read by strncpy_fill_tail */
2051*10583SEdward.Gillett@Sun.COM	sub	$3, %r8			/* %r8 = bytes left to null-fill */
2052*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2053*10583SEdward.Gillett@Sun.COM#endif
2054*10583SEdward.Gillett@Sun.COM	ret
20550Sstevel@tonic-gate
2056*10583SEdward.Gillett@Sun.COM	.p2align 4
2057*10583SEdward.Gillett@Sun.COMLABEL(tail_3):				/* 4 bytes */
2058*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %ecx
2059*10583SEdward.Gillett@Sun.COM	mov	%ecx, (%rdi)
20600Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2061*10583SEdward.Gillett@Sun.COM	mov	$4, %cl			/* bytes stored, read by strncpy_fill_tail */
2062*10583SEdward.Gillett@Sun.COM	sub	$4, %r8			/* %r8 = bytes left to null-fill */
2063*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
20640Sstevel@tonic-gate#endif
2065*10583SEdward.Gillett@Sun.COM	ret
20660Sstevel@tonic-gate
2067*10583SEdward.Gillett@Sun.COM	.p2align 4
2068*10583SEdward.Gillett@Sun.COMLABEL(tail_4):				/* 5 bytes */
2069*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %ecx		/* bytes 0-3 */
2070*10583SEdward.Gillett@Sun.COM	mov	%ecx, (%rdi)
2071*10583SEdward.Gillett@Sun.COM	mov	1(%rsi), %edx		/* bytes 1-4; overlaps the first store */
2072*10583SEdward.Gillett@Sun.COM	mov	%edx, 1(%rdi)
2073*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2074*10583SEdward.Gillett@Sun.COM	mov	$5, %cl			/* bytes stored, read by strncpy_fill_tail */
2075*10583SEdward.Gillett@Sun.COM	sub	$5, %r8			/* %r8 = bytes left to null-fill */
2076*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2077*10583SEdward.Gillett@Sun.COM#endif
2078*10583SEdward.Gillett@Sun.COM	ret
20790Sstevel@tonic-gate
2080*10583SEdward.Gillett@Sun.COM	.p2align 4
2081*10583SEdward.Gillett@Sun.COMLABEL(tail_5):				/* 6 bytes */
2082*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %ecx		/* bytes 0-3 */
2083*10583SEdward.Gillett@Sun.COM	mov	%ecx, (%rdi)
2084*10583SEdward.Gillett@Sun.COM	mov	2(%rsi), %edx		/* bytes 2-5; overlaps the first store */
2085*10583SEdward.Gillett@Sun.COM	mov	%edx, 2(%rdi)
2086*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2087*10583SEdward.Gillett@Sun.COM	mov	$6, %cl			/* bytes stored, read by strncpy_fill_tail */
2088*10583SEdward.Gillett@Sun.COM	sub	$6, %r8			/* %r8 = bytes left to null-fill */
2089*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2090*10583SEdward.Gillett@Sun.COM#endif
2091*10583SEdward.Gillett@Sun.COM	ret
20920Sstevel@tonic-gate
2093*10583SEdward.Gillett@Sun.COM	.p2align 4
2094*10583SEdward.Gillett@Sun.COMLABEL(tail_6):				/* 7 bytes */
2095*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %ecx		/* bytes 0-3 */
2096*10583SEdward.Gillett@Sun.COM	mov	%ecx, (%rdi)
2097*10583SEdward.Gillett@Sun.COM	mov	3(%rsi), %edx		/* bytes 3-6; overlaps the first store */
2098*10583SEdward.Gillett@Sun.COM	mov	%edx,3(%rdi)
20990Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2100*10583SEdward.Gillett@Sun.COM	mov	$7, %cl			/* bytes stored, read by strncpy_fill_tail */
2101*10583SEdward.Gillett@Sun.COM	sub	$7, %r8			/* %r8 = bytes left to null-fill */
2102*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
21030Sstevel@tonic-gate#endif
2104*10583SEdward.Gillett@Sun.COM	ret
21050Sstevel@tonic-gate
2106*10583SEdward.Gillett@Sun.COM	.p2align 4
2107*10583SEdward.Gillett@Sun.COMLABEL(tail_8):				/* 9 bytes */
2108*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2109*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2110*10583SEdward.Gillett@Sun.COM	mov	5(%rsi), %edx		/* bytes 5-8; overlaps the first store */
2111*10583SEdward.Gillett@Sun.COM	mov	%edx, 5(%rdi)
2112*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2113*10583SEdward.Gillett@Sun.COM	mov	$9, %cl			/* bytes stored, read by strncpy_fill_tail */
2114*10583SEdward.Gillett@Sun.COM	sub	$9, %r8			/* %r8 = bytes left to null-fill */
2115*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2116*10583SEdward.Gillett@Sun.COM#endif
2117*10583SEdward.Gillett@Sun.COM	ret
21180Sstevel@tonic-gate
2119*10583SEdward.Gillett@Sun.COM	.p2align 4
	/*
	 * Bits 0-7 of the mask were clear: test bits 8-15 (in %dh), lowest
	 * first.  Bit 8+N set selects tail_<8+N>.  When %dh is non-zero and
	 * its bits 0-6 are clear, bit 15 must be set, so control falls
	 * through into tail_15.
	 */
2120*10583SEdward.Gillett@Sun.COMLABEL(AMD_exit_more_8):
2121*10583SEdward.Gillett@Sun.COM	test	%dh, %dh
2122*10583SEdward.Gillett@Sun.COM	jz	LABEL(AMD_exit_more_16)	/* nothing in bits 8-15 */
2123*10583SEdward.Gillett@Sun.COM	test	$0x01, %dh
2124*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_8)
2125*10583SEdward.Gillett@Sun.COM	test	$0x02, %dh
2126*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_9)
2127*10583SEdward.Gillett@Sun.COM	test	$0x04, %dh
2128*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_10)
2129*10583SEdward.Gillett@Sun.COM	test	$0x08, %dh
2130*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_11)
2131*10583SEdward.Gillett@Sun.COM	test	$0x10, %dh
2132*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_12)
2133*10583SEdward.Gillett@Sun.COM	test	$0x20, %dh
2134*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_13)
2135*10583SEdward.Gillett@Sun.COM	test	$0x40, %dh
2136*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_14)
21370Sstevel@tonic-gate
2138*10583SEdward.Gillett@Sun.COM	.p2align 4
2139*10583SEdward.Gillett@Sun.COMLABEL(tail_15):				/* 16 bytes */
2140*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx
2141*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2142*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx
2143*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2144*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2145*10583SEdward.Gillett@Sun.COM	mov	$16, %cl		/* bytes stored, read by strncpy_fill_tail */
2146*10583SEdward.Gillett@Sun.COM	sub	$16, %r8		/* %r8 = bytes left to null-fill */
2147*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2148*10583SEdward.Gillett@Sun.COM#endif
2149*10583SEdward.Gillett@Sun.COM	ret
21500Sstevel@tonic-gate
2151*10583SEdward.Gillett@Sun.COM	.p2align 4
2152*10583SEdward.Gillett@Sun.COMLABEL(tail_9):				/* 10 bytes */
2153*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2154*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2155*10583SEdward.Gillett@Sun.COM	mov	6(%rsi), %edx		/* bytes 6-9; overlaps the first store */
2156*10583SEdward.Gillett@Sun.COM	mov	%edx, 6(%rdi)
21570Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2158*10583SEdward.Gillett@Sun.COM	mov	$10, %cl		/* bytes stored, read by strncpy_fill_tail */
2159*10583SEdward.Gillett@Sun.COM	sub	$10, %r8		/* %r8 = bytes left to null-fill */
2160*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
21610Sstevel@tonic-gate#endif
2162*10583SEdward.Gillett@Sun.COM	ret
21630Sstevel@tonic-gate
2164*10583SEdward.Gillett@Sun.COM	.p2align 4
2165*10583SEdward.Gillett@Sun.COMLABEL(tail_10):				/* 11 bytes */
2166*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2167*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2168*10583SEdward.Gillett@Sun.COM	mov	7(%rsi), %edx		/* bytes 7-10; overlaps the first store */
2169*10583SEdward.Gillett@Sun.COM	mov	%edx, 7(%rdi)
2170*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2171*10583SEdward.Gillett@Sun.COM	mov	$11, %cl		/* bytes stored, read by strncpy_fill_tail */
2172*10583SEdward.Gillett@Sun.COM	sub	$11, %r8		/* %r8 = bytes left to null-fill */
2173*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2174*10583SEdward.Gillett@Sun.COM#endif
2175*10583SEdward.Gillett@Sun.COM	ret
21760Sstevel@tonic-gate
2177*10583SEdward.Gillett@Sun.COM	.p2align 4
2178*10583SEdward.Gillett@Sun.COMLABEL(tail_11):				/* 12 bytes */
2179*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2180*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2181*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %edx		/* bytes 8-11 */
2182*10583SEdward.Gillett@Sun.COM	mov	%edx, 8(%rdi)
2183*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2184*10583SEdward.Gillett@Sun.COM	mov	$12, %cl		/* bytes stored, read by strncpy_fill_tail */
2185*10583SEdward.Gillett@Sun.COM	sub	$12, %r8		/* %r8 = bytes left to null-fill */
2186*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2187*10583SEdward.Gillett@Sun.COM#endif
2188*10583SEdward.Gillett@Sun.COM	ret
21890Sstevel@tonic-gate
2190*10583SEdward.Gillett@Sun.COM	.p2align 4
2191*10583SEdward.Gillett@Sun.COMLABEL(tail_12):				/* 13 bytes */
2192*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2193*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2194*10583SEdward.Gillett@Sun.COM	mov	5(%rsi), %rcx		/* bytes 5-12; overlaps the first store */
2195*10583SEdward.Gillett@Sun.COM	mov	%rcx, 5(%rdi)
2196*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2197*10583SEdward.Gillett@Sun.COM	mov	$13, %cl		/* bytes stored, read by strncpy_fill_tail */
2198*10583SEdward.Gillett@Sun.COM	sub	$13, %r8		/* %r8 = bytes left to null-fill */
2199*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2200*10583SEdward.Gillett@Sun.COM#endif
2201*10583SEdward.Gillett@Sun.COM	ret
22020Sstevel@tonic-gate
2203*10583SEdward.Gillett@Sun.COM	.p2align 4
2204*10583SEdward.Gillett@Sun.COMLABEL(tail_13):				/* 14 bytes */
2205*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2206*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2207*10583SEdward.Gillett@Sun.COM	mov	6(%rsi), %rcx		/* bytes 6-13; overlaps the first store */
2208*10583SEdward.Gillett@Sun.COM	mov	%rcx, 6(%rdi)
2209*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2210*10583SEdward.Gillett@Sun.COM	mov	$14, %cl		/* bytes stored, read by strncpy_fill_tail */
2211*10583SEdward.Gillett@Sun.COM	sub	$14, %r8		/* %r8 = bytes left to null-fill */
2212*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2213*10583SEdward.Gillett@Sun.COM#endif
2214*10583SEdward.Gillett@Sun.COM	ret
22150Sstevel@tonic-gate
2216*10583SEdward.Gillett@Sun.COM	.p2align 4
2217*10583SEdward.Gillett@Sun.COMLABEL(tail_14):				/* 15 bytes */
2218*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2219*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2220*10583SEdward.Gillett@Sun.COM	mov	7(%rsi), %rcx		/* bytes 7-14; overlaps the first store */
2221*10583SEdward.Gillett@Sun.COM	mov	%rcx, 7(%rdi)
22220Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2223*10583SEdward.Gillett@Sun.COM	mov	$15, %cl		/* bytes stored, read by strncpy_fill_tail */
2224*10583SEdward.Gillett@Sun.COM	sub	$15, %r8		/* %r8 = bytes left to null-fill */
2225*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
22260Sstevel@tonic-gate#endif
2227*10583SEdward.Gillett@Sun.COM	ret
22280Sstevel@tonic-gate
2229*10583SEdward.Gillett@Sun.COM	.p2align 4
	/*
	 * Bits 0-15 of the mask were clear.  Shift bits 16-31 down into
	 * %dl/%dh and test bits 16-23 (now in %dl), lowest first.  Bit 16+N
	 * set selects tail_<16+N>.  When %dl is non-zero and its bits 0-6 are
	 * clear, bit 23 must be set, so control falls through into tail_23.
	 */
2230*10583SEdward.Gillett@Sun.COMLABEL(AMD_exit_more_16):
2231*10583SEdward.Gillett@Sun.COM	shr	$16, %edx
2232*10583SEdward.Gillett@Sun.COM	test	%dl, %dl
2233*10583SEdward.Gillett@Sun.COM	jz	LABEL(AMD_exit_more_24)	/* nothing in bits 16-23 */
2234*10583SEdward.Gillett@Sun.COM	test	$0x01, %dl
2235*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_16)
2236*10583SEdward.Gillett@Sun.COM	test	$0x02, %dl
2237*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_17)
2238*10583SEdward.Gillett@Sun.COM	test	$0x04, %dl
2239*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_18)
2240*10583SEdward.Gillett@Sun.COM	test	$0x08, %dl
2241*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_19)
2242*10583SEdward.Gillett@Sun.COM	test	$0x10, %dl
2243*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_20)
2244*10583SEdward.Gillett@Sun.COM	test	$0x20, %dl
2245*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_21)
2246*10583SEdward.Gillett@Sun.COM	test	$0x40, %dl
2247*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_22)
22480Sstevel@tonic-gate
2249*10583SEdward.Gillett@Sun.COM	.p2align 4
2250*10583SEdward.Gillett@Sun.COMLABEL(tail_23):				/* 24 bytes */
2251*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx
2252*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2253*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx
2254*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2255*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx
2256*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
22570Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2258*10583SEdward.Gillett@Sun.COM	mov	$24, %cl		/* bytes stored, read by strncpy_fill_tail */
2259*10583SEdward.Gillett@Sun.COM	sub	$24, %r8		/* %r8 = bytes left to null-fill */
2260*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
22610Sstevel@tonic-gate#endif
2262*10583SEdward.Gillett@Sun.COM	ret
22630Sstevel@tonic-gate
2264*10583SEdward.Gillett@Sun.COM	.p2align 4
2265*10583SEdward.Gillett@Sun.COMLABEL(tail_16):				/* 17 bytes */
2266*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2267*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2268*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2269*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2270*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %cl		/* byte 16 */
2271*10583SEdward.Gillett@Sun.COM	mov	%cl, 16(%rdi)
2272*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2273*10583SEdward.Gillett@Sun.COM	mov	$17, %cl		/* bytes stored, read by strncpy_fill_tail */
2274*10583SEdward.Gillett@Sun.COM	sub	$17, %r8		/* %r8 = bytes left to null-fill */
2275*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2276*10583SEdward.Gillett@Sun.COM#endif
2277*10583SEdward.Gillett@Sun.COM	ret
2278*10583SEdward.Gillett@Sun.COM
2279*10583SEdward.Gillett@Sun.COM	.p2align 4
2280*10583SEdward.Gillett@Sun.COMLABEL(tail_17):				/* 18 bytes */
2281*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2282*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2283*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2284*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2285*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %cx		/* bytes 16-17 */
2286*10583SEdward.Gillett@Sun.COM	mov	%cx, 16(%rdi)
2287*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2288*10583SEdward.Gillett@Sun.COM	mov	$18, %cl		/* bytes stored, read by strncpy_fill_tail */
2289*10583SEdward.Gillett@Sun.COM	sub	$18, %r8		/* %r8 = bytes left to null-fill */
2290*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2291*10583SEdward.Gillett@Sun.COM#endif
2292*10583SEdward.Gillett@Sun.COM	ret
22930Sstevel@tonic-gate
2294*10583SEdward.Gillett@Sun.COM	.p2align 4
2295*10583SEdward.Gillett@Sun.COMLABEL(tail_18):				/* 19 bytes */
2296*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2297*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2298*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2299*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2300*10583SEdward.Gillett@Sun.COM	mov	15(%rsi), %ecx		/* bytes 15-18; overlaps byte 15 */
2301*10583SEdward.Gillett@Sun.COM	mov	%ecx,15(%rdi)
23020Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2303*10583SEdward.Gillett@Sun.COM	mov	$19, %cl		/* bytes stored, read by strncpy_fill_tail */
2304*10583SEdward.Gillett@Sun.COM	sub	$19, %r8		/* %r8 = bytes left to null-fill */
2305*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2306*10583SEdward.Gillett@Sun.COM#endif
2307*10583SEdward.Gillett@Sun.COM	ret
23080Sstevel@tonic-gate
2309*10583SEdward.Gillett@Sun.COM	.p2align 4
2310*10583SEdward.Gillett@Sun.COMLABEL(tail_19):				/* 20 bytes */
2311*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2312*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2313*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2314*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2315*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %ecx		/* bytes 16-19 */
2316*10583SEdward.Gillett@Sun.COM	mov	%ecx, 16(%rdi)
2317*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2318*10583SEdward.Gillett@Sun.COM	mov	$20, %cl		/* bytes stored, read by strncpy_fill_tail */
2319*10583SEdward.Gillett@Sun.COM	sub	$20, %r8		/* %r8 = bytes left to null-fill */
2320*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
23210Sstevel@tonic-gate#endif
2322*10583SEdward.Gillett@Sun.COM	ret
23230Sstevel@tonic-gate
2324*10583SEdward.Gillett@Sun.COM	.p2align 4
2325*10583SEdward.Gillett@Sun.COMLABEL(tail_20):				/* 21 bytes */
2326*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2327*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2328*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2329*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2330*10583SEdward.Gillett@Sun.COM	mov	13(%rsi), %rcx		/* bytes 13-20; overlaps bytes 13-15 */
2331*10583SEdward.Gillett@Sun.COM	mov	%rcx, 13(%rdi)
2332*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2333*10583SEdward.Gillett@Sun.COM	mov	$21, %cl		/* bytes stored, read by strncpy_fill_tail */
2334*10583SEdward.Gillett@Sun.COM	sub	$21, %r8		/* %r8 = bytes left to null-fill */
2335*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2336*10583SEdward.Gillett@Sun.COM#endif
2337*10583SEdward.Gillett@Sun.COM	ret
23380Sstevel@tonic-gate
2339*10583SEdward.Gillett@Sun.COM	.p2align 4
2340*10583SEdward.Gillett@Sun.COMLABEL(tail_21):				/* 22 bytes */
2341*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2342*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2343*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2344*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2345*10583SEdward.Gillett@Sun.COM	mov	14(%rsi), %rcx		/* bytes 14-21; overlaps bytes 14-15 */
2346*10583SEdward.Gillett@Sun.COM	mov	%rcx, 14(%rdi)
23470Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2348*10583SEdward.Gillett@Sun.COM	mov	$22, %cl		/* bytes stored, read by strncpy_fill_tail */
2349*10583SEdward.Gillett@Sun.COM	sub	$22, %r8		/* %r8 = bytes left to null-fill */
2350*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
23510Sstevel@tonic-gate#endif
2352*10583SEdward.Gillett@Sun.COM	ret
23530Sstevel@tonic-gate
2354*10583SEdward.Gillett@Sun.COM	.p2align 4
2355*10583SEdward.Gillett@Sun.COMLABEL(tail_22):				/* 23 bytes */
2356*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2357*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2358*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2359*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2360*10583SEdward.Gillett@Sun.COM	mov	15(%rsi), %rcx		/* bytes 15-22; overlaps byte 15 */
2361*10583SEdward.Gillett@Sun.COM	mov	%rcx, 15(%rdi)
2362*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2363*10583SEdward.Gillett@Sun.COM	mov	$23, %cl		/* bytes stored, read by strncpy_fill_tail */
2364*10583SEdward.Gillett@Sun.COM	sub	$23, %r8		/* %r8 = bytes left to null-fill */
2365*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2366*10583SEdward.Gillett@Sun.COM#endif
2367*10583SEdward.Gillett@Sun.COM	ret
23680Sstevel@tonic-gate
2369*10583SEdward.Gillett@Sun.COM	.p2align 4
	/*
	 * Reached from AMD_exit_more_16 after its "shr $16, %edx", so %dh now
	 * holds original mask bits 24-31.  They are tested lowest first; bit
	 * 24+N set selects tail_<24+N>.  There is no test for %dh being zero:
	 * whenever bits 24-30 are clear control falls through into tail_31.
	 */
2370*10583SEdward.Gillett@Sun.COMLABEL(AMD_exit_more_24):
2371*10583SEdward.Gillett@Sun.COM	test	$0x01, %dh
2372*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_24)
2373*10583SEdward.Gillett@Sun.COM	test	$0x02, %dh
2374*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_25)
2375*10583SEdward.Gillett@Sun.COM	test	$0x04, %dh
2376*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_26)
2377*10583SEdward.Gillett@Sun.COM	test	$0x08, %dh
2378*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_27)
2379*10583SEdward.Gillett@Sun.COM	test	$0x10, %dh
2380*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_28)
2381*10583SEdward.Gillett@Sun.COM	test	$0x20, %dh
2382*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_29)
2383*10583SEdward.Gillett@Sun.COM	test	$0x40, %dh
2384*10583SEdward.Gillett@Sun.COM	jnz	LABEL(tail_30)
23850Sstevel@tonic-gate
2386*10583SEdward.Gillett@Sun.COM	.p2align 4
2387*10583SEdward.Gillett@Sun.COMLABEL(tail_31):				/* 32 bytes */
2388*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx
2389*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2390*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx
2391*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2392*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx
2393*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2394*10583SEdward.Gillett@Sun.COM	mov	24(%rsi), %rdx
2395*10583SEdward.Gillett@Sun.COM	mov	%rdx, 24(%rdi)
2396*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2397*10583SEdward.Gillett@Sun.COM	mov	$32, %cl		/* bytes stored, read by strncpy_fill_tail */
2398*10583SEdward.Gillett@Sun.COM	sub	$32, %r8		/* %r8 = bytes left to null-fill */
2399*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2400*10583SEdward.Gillett@Sun.COM#endif
2401*10583SEdward.Gillett@Sun.COM	ret
24020Sstevel@tonic-gate
2403*10583SEdward.Gillett@Sun.COM	.p2align 4
2404*10583SEdward.Gillett@Sun.COMLABEL(tail_24):				/* 25 bytes */
2405*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2406*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2407*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2408*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2409*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16-23 */
2410*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2411*10583SEdward.Gillett@Sun.COM	mov	21(%rsi), %edx		/* bytes 21-24; overlaps bytes 21-23 */
2412*10583SEdward.Gillett@Sun.COM	mov	%edx, 21(%rdi)
2413*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2414*10583SEdward.Gillett@Sun.COM	mov	$25, %cl		/* bytes stored, read by strncpy_fill_tail */
2415*10583SEdward.Gillett@Sun.COM	sub	$25, %r8		/* %r8 = bytes left to null-fill */
2416*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2417*10583SEdward.Gillett@Sun.COM#endif
2418*10583SEdward.Gillett@Sun.COM	ret
24190Sstevel@tonic-gate
2420*10583SEdward.Gillett@Sun.COM	.p2align 4
2421*10583SEdward.Gillett@Sun.COMLABEL(tail_25):				/* 26 bytes */
2422*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2423*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2424*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2425*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2426*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16-23 */
2427*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2428*10583SEdward.Gillett@Sun.COM	mov	22(%rsi), %edx		/* bytes 22-25; overlaps bytes 22-23 */
2429*10583SEdward.Gillett@Sun.COM	mov	%edx, 22(%rdi)
24300Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
2431*10583SEdward.Gillett@Sun.COM	mov	$26, %cl		/* bytes stored, read by strncpy_fill_tail */
2432*10583SEdward.Gillett@Sun.COM	sub	$26, %r8		/* %r8 = bytes left to null-fill */
2433*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2434*10583SEdward.Gillett@Sun.COM#endif
2435*10583SEdward.Gillett@Sun.COM	ret
24360Sstevel@tonic-gate
2437*10583SEdward.Gillett@Sun.COM	.p2align 4
2438*10583SEdward.Gillett@Sun.COMLABEL(tail_26):				/* 27 bytes */
2439*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2440*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2441*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2442*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2443*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16-23 */
2444*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2445*10583SEdward.Gillett@Sun.COM	mov	23(%rsi), %edx		/* bytes 23-26; overlaps byte 23 */
2446*10583SEdward.Gillett@Sun.COM	mov	%edx, 23(%rdi)
2447*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2448*10583SEdward.Gillett@Sun.COM	mov	$27, %cl		/* bytes stored, read by strncpy_fill_tail */
2449*10583SEdward.Gillett@Sun.COM	sub	$27, %r8		/* %r8 = bytes left to null-fill */
2450*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
24510Sstevel@tonic-gate#endif
2452*10583SEdward.Gillett@Sun.COM	ret
24530Sstevel@tonic-gate
2454*10583SEdward.Gillett@Sun.COM	.p2align 4
2455*10583SEdward.Gillett@Sun.COMLABEL(tail_27):				/* 28 bytes */
2456*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0-7 */
2457*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2458*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8-15 */
2459*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2460*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16-23 */
2461*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2462*10583SEdward.Gillett@Sun.COM	mov	24(%rsi), %edx		/* bytes 24-27 */
2463*10583SEdward.Gillett@Sun.COM	mov	%edx, 24(%rdi)
2464*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
2465*10583SEdward.Gillett@Sun.COM	mov	$28, %cl		/* bytes stored, read by strncpy_fill_tail */
2466*10583SEdward.Gillett@Sun.COM	sub	$28, %r8		/* %r8 = bytes left to null-fill */
2467*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2468*10583SEdward.Gillett@Sun.COM#endif
2469*10583SEdward.Gillett@Sun.COM	ret
24700Sstevel@tonic-gate
2471*10583SEdward.Gillett@Sun.COM	.p2align 4
/*
 * tail_28: copy exactly 29 bytes from (%rsi) to (%rdi) with four 8-byte
 * moves at offsets 0, 8, 16 and 21.  The last move re-copies bytes
 * 21..23 so that the tail needs no narrower moves.
 * Scratch registers written: %rcx, %rdx.
 * NOTE(review): presumably reached through tail_table when the last byte
 * to copy is at offset 28 -- the dispatch code is outside this chunk.
 */
2472*10583SEdward.Gillett@Sun.COMLABEL(tail_28):				/* 29 bytes */
2473*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0..7 */
2474*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2475*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8..15 */
2476*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2477*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16..23 */
2478*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2479*10583SEdward.Gillett@Sun.COM	mov	21(%rsi), %rdx		/* bytes 21..28 (overlaps 21..23) */
2480*10583SEdward.Gillett@Sun.COM	mov	%rdx, 21(%rdi)
2481*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
	/*
	 * strncpy build only: put the number of bytes just copied in %cl,
	 * subtract it from %r8, and branch to strncpy_fill_tail when the
	 * result is non-zero.
	 * NOTE(review): %r8 is presumably the remaining strncpy count and
	 * strncpy_fill_tail presumably pads the destination; both are set
	 * up outside this chunk -- confirm.
	 */
2482*10583SEdward.Gillett@Sun.COM	mov	$29, %cl
2483*10583SEdward.Gillett@Sun.COM	sub	$29, %r8
2484*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2485*10583SEdward.Gillett@Sun.COM#endif
2486*10583SEdward.Gillett@Sun.COM	ret
24870Sstevel@tonic-gate
2488*10583SEdward.Gillett@Sun.COM	.p2align 4
/*
 * tail_29: copy exactly 30 bytes from (%rsi) to (%rdi) with four 8-byte
 * moves at offsets 0, 8, 16 and 22.  The last move re-copies bytes
 * 22..23 so that the tail needs no narrower moves.
 * Scratch registers written: %rcx, %rdx.
 * NOTE(review): presumably reached through tail_table when the last byte
 * to copy is at offset 29 -- the dispatch code is outside this chunk.
 */
2489*10583SEdward.Gillett@Sun.COMLABEL(tail_29):				/* 30 bytes */
2490*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0..7 */
2491*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2492*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8..15 */
2493*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2494*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16..23 */
2495*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2496*10583SEdward.Gillett@Sun.COM	mov	22(%rsi), %rdx		/* bytes 22..29 (overlaps 22..23) */
2497*10583SEdward.Gillett@Sun.COM	mov	%rdx, 22(%rdi)
2498*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
	/*
	 * strncpy build only: put the number of bytes just copied in %cl,
	 * subtract it from %r8, and branch to strncpy_fill_tail when the
	 * result is non-zero.
	 * NOTE(review): %r8 is presumably the remaining strncpy count and
	 * strncpy_fill_tail presumably pads the destination; both are set
	 * up outside this chunk -- confirm.
	 */
2499*10583SEdward.Gillett@Sun.COM	mov	$30, %cl
2500*10583SEdward.Gillett@Sun.COM	sub	$30, %r8
2501*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2502*10583SEdward.Gillett@Sun.COM#endif
2503*10583SEdward.Gillett@Sun.COM	ret
25040Sstevel@tonic-gate
2505*10583SEdward.Gillett@Sun.COM	.p2align 4
/*
 * tail_30: copy exactly 31 bytes from (%rsi) to (%rdi) with four 8-byte
 * moves at offsets 0, 8, 16 and 23.  The last move re-copies byte 23 so
 * that the tail needs no narrower moves.
 * Scratch registers written: %rcx, %rdx.
 * NOTE(review): presumably reached through tail_table when the last byte
 * to copy is at offset 30 -- the dispatch code is outside this chunk.
 */
2506*10583SEdward.Gillett@Sun.COMLABEL(tail_30):				/* 31 bytes */
2507*10583SEdward.Gillett@Sun.COM	mov	(%rsi), %rcx		/* bytes 0..7 */
2508*10583SEdward.Gillett@Sun.COM	mov	%rcx, (%rdi)
2509*10583SEdward.Gillett@Sun.COM	mov	8(%rsi), %rdx		/* bytes 8..15 */
2510*10583SEdward.Gillett@Sun.COM	mov	%rdx, 8(%rdi)
2511*10583SEdward.Gillett@Sun.COM	mov	16(%rsi), %rcx		/* bytes 16..23 */
2512*10583SEdward.Gillett@Sun.COM	mov	%rcx, 16(%rdi)
2513*10583SEdward.Gillett@Sun.COM	mov	23(%rsi), %rdx		/* bytes 23..30 (overlaps 23) */
2514*10583SEdward.Gillett@Sun.COM	mov	%rdx, 23(%rdi)
2515*10583SEdward.Gillett@Sun.COM#ifdef USE_AS_STRNCPY
	/*
	 * strncpy build only: put the number of bytes just copied in %cl,
	 * subtract it from %r8, and branch to strncpy_fill_tail when the
	 * result is non-zero.
	 * NOTE(review): %r8 is presumably the remaining strncpy count and
	 * strncpy_fill_tail presumably pads the destination; both are set
	 * up outside this chunk -- confirm.
	 */
2516*10583SEdward.Gillett@Sun.COM	mov	$31, %cl
2517*10583SEdward.Gillett@Sun.COM	sub	$31, %r8
2518*10583SEdward.Gillett@Sun.COM	jnz	LABEL(strncpy_fill_tail)
2519*10583SEdward.Gillett@Sun.COM#endif
2520*10583SEdward.Gillett@Sun.COM	ret
25210Sstevel@tonic-gate
2522*10583SEdward.Gillett@Sun.COM	.pushsection .rodata
2523*10583SEdward.Gillett@Sun.COM	.p2align 4
/*
 * tail_table: 32 entries emitted with .int, one per tail_N handler
 * (N = 0..31), placed in .rodata and aligned to 16 bytes.  Each entry is
 * the distance from the start of tail_table to the handler, not an
 * absolute address, so a consumer must add the table's own address back
 * to an entry to obtain the handler address.  Entry N selects the
 * handler that copies N + 1 bytes (tail_0 = 1 byte ... tail_31 = 32
 * bytes).
 * NOTE(review): the code that indexes this table is outside this chunk;
 * presumably the index is the offset of the last byte to copy -- confirm.
 */
2524*10583SEdward.Gillett@Sun.COMLABEL(tail_table):
2525*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_0) - LABEL(tail_table)	/* 1 byte */
2526*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_1) - LABEL(tail_table)
2527*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_2) - LABEL(tail_table)
2528*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_3) - LABEL(tail_table)
2529*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_4) - LABEL(tail_table)
2530*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_5) - LABEL(tail_table)
2531*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_6) - LABEL(tail_table)
2532*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_7) - LABEL(tail_table)
2533*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_8) - LABEL(tail_table)
2534*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_9) - LABEL(tail_table)
2535*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_10) - LABEL(tail_table)
2536*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_11) - LABEL(tail_table)
2537*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_12) - LABEL(tail_table)
2538*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_13) - LABEL(tail_table)
2539*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_14) - LABEL(tail_table)
2540*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_15) - LABEL(tail_table)
2541*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_16) - LABEL(tail_table)
2542*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_17) - LABEL(tail_table)
2543*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_18) - LABEL(tail_table)
2544*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_19) - LABEL(tail_table)
2545*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_20) - LABEL(tail_table)
2546*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_21) - LABEL(tail_table)
2547*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_22) - LABEL(tail_table)
2548*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_23) - LABEL(tail_table)
2549*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_24) - LABEL(tail_table)
2550*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_25) - LABEL(tail_table)
2551*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_26) - LABEL(tail_table)
2552*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_27) - LABEL(tail_table)
2553*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_28) - LABEL(tail_table)
2554*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_29) - LABEL(tail_table)
2555*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_30) - LABEL(tail_table)
2556*10583SEdward.Gillett@Sun.COM	.int	LABEL(tail_31) - LABEL(tail_table)	/* 32 bytes */
25570Sstevel@tonic-gate
2558*10583SEdward.Gillett@Sun.COM	.p2align 4
/*
 * unaligned_table: 16 entries emitted with .int, one per ashr_N code
 * path (N = 0..15), aligned to 16 bytes.  As with tail_table, each entry
 * is the distance from the start of this table to the target label, not
 * an absolute address.  The .popsection at the end closes the .rodata
 * section opened by the earlier .pushsection.
 * NOTE(review): the ashr_N labels and the code that indexes this table
 * are outside this chunk; presumably N is a 0..15 byte misalignment
 * between source and destination -- confirm.
 */
2559*10583SEdward.Gillett@Sun.COMLABEL(unaligned_table):
2560*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_0) - LABEL(unaligned_table)
2561*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_1) - LABEL(unaligned_table)
2562*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_2) - LABEL(unaligned_table)
2563*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_3) - LABEL(unaligned_table)
2564*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_4) - LABEL(unaligned_table)
2565*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_5) - LABEL(unaligned_table)
2566*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_6) - LABEL(unaligned_table)
2567*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_7) - LABEL(unaligned_table)
2568*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_8) - LABEL(unaligned_table)
2569*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_9) - LABEL(unaligned_table)
2570*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_10) - LABEL(unaligned_table)
2571*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_11) - LABEL(unaligned_table)
2572*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_12) - LABEL(unaligned_table)
2573*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_13) - LABEL(unaligned_table)
2574*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_14) - LABEL(unaligned_table)
2575*10583SEdward.Gillett@Sun.COM	.int	LABEL(ashr_15) - LABEL(unaligned_table)
2576*10583SEdward.Gillett@Sun.COM	.popsection
25770Sstevel@tonic-gate
/*
 * Close the routine under the name it was built as: strncpy when
 * USE_AS_STRNCPY is defined, strcpy otherwise.
 * NOTE(review): SET_SIZE is a macro defined outside this file chunk;
 * presumably it emits the symbol-size directive for the named function
 * -- confirm against its definition.
 */
25780Sstevel@tonic-gate#ifdef USE_AS_STRNCPY
25790Sstevel@tonic-gate	SET_SIZE(strncpy)
25800Sstevel@tonic-gate#else
2581*10583SEdward.Gillett@Sun.COM	SET_SIZE(strcpy)			/* (char *, const char *) */
25820Sstevel@tonic-gate#endif
2583