/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

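/
/ Strategy: memmove falls through to the memcpy code whenever a forward
/ copy is safe, and only takes the backward (.CopyLeft) path when the
/ destination overlaps the source.  memcpy copies with rep smovl for
/ anything under 64 bytes, switches to 64-byte SSE loops for larger
/ copies, and uses non-temporal stores for copies of 64K and larger.
/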
	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

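	/
	/ Overlap check: a forward copy is safe unless the destination
	/ starts inside the source buffer (src < dst <= src + size - 1).
	/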
	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note:	cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

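	/
	/ Word-at-a-time copy for short moves: rep smovl moves the 4-byte
	/ words, then the unrolled byte copies below handle the 0-3 byte tail.
	/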
.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words


	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)	/ prefetch destination as well
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

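	/
	/ Copies of 64K and larger use non-temporal (movntps) stores so the
	/ data streams to memory without displacing the caches; the fence
	/ after the loop orders those weakly-ordered stores before returning.
	/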
	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ %eax = -dst & 15, the number of bytes
	andl	$15, %eax	/ needed to align dst on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

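	/
	/ Destination is 16 byte aligned but the source is not: use
	/ unaligned loads, again switching to non-temporal stores for
	/ copies of 64K and larger.
	/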
	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align	16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
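/ On entry %edx already holds src + size - 1, computed at the memmove
/ entry point above.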

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size <= 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/    do the byte copy
	cld				/    reset direction flag to LtoR
	popl	%edi			/  }
	popl	%esi			/  restore registers
	movl	4(%esp),%eax		/  set up return value
	ret				/  return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that the remaining
	subl	%ecx,%edx		/ copy is done on an aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)