xref: /minix3/common/lib/libc/arch/sparc64/string/memcpy.S (revision 84d9c625bfea59e274550651111ae9edfdc40fbd)
1*84d9c625SLionel Sambuc/*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/
2*84d9c625SLionel Sambuc
3*84d9c625SLionel Sambuc/*
4*84d9c625SLionel Sambuc * Copyright (c) 1996-2002 Eduardo Horvath
5*84d9c625SLionel Sambuc * All rights reserved.
6*84d9c625SLionel Sambuc *
7*84d9c625SLionel Sambuc * Redistribution and use in source and binary forms, with or without
8*84d9c625SLionel Sambuc * modification, are permitted provided that the following conditions
9*84d9c625SLionel Sambuc * are met:
10*84d9c625SLionel Sambuc * 1. Redistributions of source code must retain the above copyright
11*84d9c625SLionel Sambuc *    notice, this list of conditions and the following disclaimer.
12*84d9c625SLionel Sambuc *
13*84d9c625SLionel Sambuc * THIS SOFTWARE IS PROVIDED BY THE AUTHOR  ``AS IS'' AND
14*84d9c625SLionel Sambuc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15*84d9c625SLionel Sambuc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16*84d9c625SLionel Sambuc * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR  BE LIABLE
17*84d9c625SLionel Sambuc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18*84d9c625SLionel Sambuc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19*84d9c625SLionel Sambuc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20*84d9c625SLionel Sambuc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21*84d9c625SLionel Sambuc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22*84d9c625SLionel Sambuc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23*84d9c625SLionel Sambuc * SUCH DAMAGE.
24*84d9c625SLionel Sambuc *
25*84d9c625SLionel Sambuc */
26*84d9c625SLionel Sambuc#include "strmacros.h"
27*84d9c625SLionel Sambuc#if defined(LIBC_SCCS) && !defined(lint)
28*84d9c625SLionel SambucRCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
29*84d9c625SLionel Sambuc#endif  /* LIBC_SCCS and not lint */
30*84d9c625SLionel Sambuc
31*84d9c625SLionel Sambuc/*
32*84d9c625SLionel Sambuc * memcpy
33*84d9c625SLionel Sambuc * Assumes regions do not overlap;
34*84d9c625SLionel Sambuc *
35*84d9c625SLionel Sambuc * Must not use %g7 (see copyin/copyout above).
36*84d9c625SLionel Sambuc */
/*
 * memcpy(dest, src, size) / bcopy(src, dest, size) entry points.
 *
 * memcpy is entered with %o0 = dest and %o1 = src.  The three moves
 * below exchange the two pointers through %o3, so the code that follows
 * always runs with %o0 = src, %o1 = dest, %o2 = size.  When the bcopy
 * entry is assembled (non-kernel or rump kernel builds) it is placed
 * after the exchange, because its arguments already arrive in that order.
 */
37*84d9c625SLionel SambucENTRY(memcpy) /* dest, src, size */
38*84d9c625SLionel Sambuc	/*
39*84d9c625SLionel Sambuc	 * Swap args for bcopy.  Gcc generates calls to memcpy for
40*84d9c625SLionel Sambuc	 * structure assignments.
41*84d9c625SLionel Sambuc	 */
42*84d9c625SLionel Sambuc	mov	%o0, %o3
43*84d9c625SLionel Sambuc	mov	%o1, %o0
44*84d9c625SLionel Sambuc	mov	%o3, %o1
45*84d9c625SLionel Sambuc#if !defined(_KERNEL) || defined(_RUMPKERNEL)
46*84d9c625SLionel SambucENTRY(bcopy) /* src, dest, size */
47*84d9c625SLionel Sambuc#endif
/*
 * DEBUG-only call trace.  On arrival %o0 = src, %o1 = dest, %o2 = size.
 * In a (non-rump) kernel build the trace is printed only when bit 0x80
 * (PDB_COPY) of pmapdebug is set; otherwise control skips to label 3.
 * printf is called from a fresh register window, so the incoming
 * arguments stay intact in %i0-%i2 and are back in %o0-%o2 after the
 * restore.
 *
 * The format string reads "memcpy(dest<-src,size)", so dest (%i1) is
 * passed as the first pointer and src (%i0) as the second.
 */
48*84d9c625SLionel Sambuc#ifdef DEBUG
49*84d9c625SLionel Sambuc#if defined(_KERNEL) && !defined(_RUMPKERNEL)
50*84d9c625SLionel Sambuc	set	pmapdebug, %o4
51*84d9c625SLionel Sambuc	ld	[%o4], %o4
52*84d9c625SLionel Sambuc	btst	0x80, %o4	! PDB_COPY
53*84d9c625SLionel Sambuc	bz,pt	%icc, 3f
54*84d9c625SLionel Sambuc	 nop
55*84d9c625SLionel Sambuc#endif
56*84d9c625SLionel Sambuc	save	%sp, -CC64FSZ, %sp
57*84d9c625SLionel Sambuc	mov	%i1, %o1	! dest: first %p
58*84d9c625SLionel Sambuc	set	2f, %o0
59*84d9c625SLionel Sambuc	mov	%i0, %o2	! src: second %p
60*84d9c625SLionel Sambuc	call	printf
61*84d9c625SLionel Sambuc	 mov	%i2, %o3
62*84d9c625SLionel Sambuc!	ta	1; nop
63*84d9c625SLionel Sambuc	restore
64*84d9c625SLionel Sambuc	.data
65*84d9c625SLionel Sambuc2:	.asciz	"memcpy(%p<-%p,%x)\n"
66*84d9c625SLionel Sambuc	_ALIGN
67*84d9c625SLionel Sambuc	.text
68*84d9c625SLionel Sambuc3:
69*84d9c625SLionel Sambuc#endif
70*84d9c625SLionel Sambuc
/*
 * Size dispatch and small-copy path.
 * In: %o0 = src, %o1 = dest, %o2 = size.
 *
 * The first compare decides between the byte loop and label 2.  The
 * second compare (size vs 256) sits in the branch delay slot, so its
 * result is what the condition codes hold on arrival at label 2, where
 * the optional USE_BLOCK_STORE_LOAD branch consumes it.
 */
71*84d9c625SLionel Sambuc	cmp	%o2, BCOPY_SMALL
72*84d9c625SLionel Sambuc
73*84d9c625SLionel SambucLmemcpy_start:
74*84d9c625SLionel Sambuc	bge,pt	CCCR, 2f	! if >= this many, go be fancy.
75*84d9c625SLionel Sambuc	 cmp	%o2, 256
76*84d9c625SLionel Sambuc
/* %o5 keeps the original dest pointer; it is handed back in %o0 at label 1. */
77*84d9c625SLionel Sambuc	mov	%o1, %o5	! Save memcpy return value
78*84d9c625SLionel Sambuc	/*
79*84d9c625SLionel Sambuc	 * Not much to copy, just do it a byte at a time.
80*84d9c625SLionel Sambuc	 */
81*84d9c625SLionel Sambuc	deccc	%o2		! while (--len >= 0)
82*84d9c625SLionel Sambuc	bl	1f
83*84d9c625SLionel Sambuc	 .empty
/*
 * Byte loop: advance src, load the byte just passed, store it at dest,
 * then advance dest in the delay slot of the loop branch.
 * NOTE(review): assuming .empty emits no instruction, the delay slot of
 * the "bl 1f" above is the "inc %o0" below; that is harmless on the
 * size == 0 path because %o0 is overwritten before returning - confirm.
 */
84*84d9c625SLionel Sambuc0:
85*84d9c625SLionel Sambuc	inc	%o0
86*84d9c625SLionel Sambuc	ldsb	[%o0 - 1], %o4	!	(++dst)[-1] = *src++;
87*84d9c625SLionel Sambuc	stb	%o4, [%o1]
88*84d9c625SLionel Sambuc	deccc	%o2
89*84d9c625SLionel Sambuc	bge	0b
90*84d9c625SLionel Sambuc	 inc	%o1
/* Leaf return (no register window was opened): %o0 = original dest. */
91*84d9c625SLionel Sambuc1:
92*84d9c625SLionel Sambuc	retl
93*84d9c625SLionel Sambuc	 mov	%o5, %o0
94*84d9c625SLionel Sambuc	NOTREACHED
95*84d9c625SLionel Sambuc
96*84d9c625SLionel Sambuc	/*
97*84d9c625SLionel Sambuc	 * Plenty of data to copy, so try to do it optimally.
98*84d9c625SLionel Sambuc	 */
99*84d9c625SLionel Sambuc2:
/* Condition codes here still come from "cmp %o2, 256" in the delay slot above. */
100*84d9c625SLionel Sambuc#ifdef USE_BLOCK_STORE_LOAD
101*84d9c625SLionel Sambuc	! If it is big enough, use VIS instructions
102*84d9c625SLionel Sambuc	bge	Lmemcpy_block
103*84d9c625SLionel Sambuc	 nop
104*84d9c625SLionel Sambuc#endif /* USE_BLOCK_STORE_LOAD */
/*
 * Lmemcpy_fancy: open a register window and align the destination.
 *
 * After the save: %l0 = src cursor, %l1 = dest cursor, %l2 = bytes left
 * (copies of %i0, %i1, %i2).  Three stages follow, one per low bit of
 * the dest address (1, then 2, then 4).  A stage whose bit is set copies
 * that many bytes; the source is read with the widest loads its own
 * alignment allows and the pieces are merged in %l4 with shifts and ORs
 * (first-loaded piece in the most significant position).
 *
 * Most tests are placed in the delay slot of the preceding branch, so a
 * "btst" frequently belongs to the branch that follows the next label.
 *
 * NOTE(review): the three "ble ... Lmemcpy_finish" exits (marked XXXX)
 * leave the just-loaded data in %l4 without storing it, whereas
 * Lmemcpy_finish stores from %o0.  Presumably they are never taken
 * because this path is only reached with size >= BCOPY_SMALL - confirm.
 */
105*84d9c625SLionel SambucLmemcpy_fancy:
106*84d9c625SLionel Sambuc
107*84d9c625SLionel Sambuc	!!
108*84d9c625SLionel Sambuc	!! First align the output to a 8-byte entity
109*84d9c625SLionel Sambuc	!!
110*84d9c625SLionel Sambuc
111*84d9c625SLionel Sambuc	save	%sp, -CC64FSZ, %sp
112*84d9c625SLionel Sambuc
113*84d9c625SLionel Sambuc	mov	%i0, %l0
114*84d9c625SLionel Sambuc	mov	%i1, %l1
115*84d9c625SLionel Sambuc
116*84d9c625SLionel Sambuc	mov	%i2, %l2
/* Stage 1: dest bit 0.  The delay slot pre-tests dest bit 1 for stage 2. */
117*84d9c625SLionel Sambuc	btst	1, %l1
118*84d9c625SLionel Sambuc
119*84d9c625SLionel Sambuc	bz,pt	%icc, 4f
120*84d9c625SLionel Sambuc	 btst	2, %l1
121*84d9c625SLionel Sambuc	ldub	[%l0], %l4				! Load 1st byte
122*84d9c625SLionel Sambuc
123*84d9c625SLionel Sambuc	deccc	1, %l2
124*84d9c625SLionel Sambuc	ble,pn	CCCR, Lmemcpy_finish			! XXXX
125*84d9c625SLionel Sambuc	 inc	1, %l0
126*84d9c625SLionel Sambuc
127*84d9c625SLionel Sambuc	stb	%l4, [%l1]				! Store 1st byte
128*84d9c625SLionel Sambuc	inc	1, %l1					! Update address
/* dest changed, so bit 1 is tested again for stage 2. */
129*84d9c625SLionel Sambuc	btst	2, %l1
/* Stage 2: dest bit 1.  The delay slot tests whether src is odd. */
130*84d9c625SLionel Sambuc4:
131*84d9c625SLionel Sambuc	bz,pt	%icc, 4f
132*84d9c625SLionel Sambuc
133*84d9c625SLionel Sambuc	 btst	1, %l0
/* Even src: one halfword load (annulled delay slot).  Odd src: two byte loads. */
134*84d9c625SLionel Sambuc	bz,a	1f
135*84d9c625SLionel Sambuc	 lduh	[%l0], %l4				! Load short
136*84d9c625SLionel Sambuc
137*84d9c625SLionel Sambuc	ldub	[%l0], %l4				! Load bytes
138*84d9c625SLionel Sambuc
139*84d9c625SLionel Sambuc	ldub	[%l0+1], %l3
140*84d9c625SLionel Sambuc	sllx	%l4, 8, %l4
141*84d9c625SLionel Sambuc	or	%l3, %l4, %l4
142*84d9c625SLionel Sambuc
143*84d9c625SLionel Sambuc1:
144*84d9c625SLionel Sambuc	deccc	2, %l2
145*84d9c625SLionel Sambuc	ble,pn	CCCR, Lmemcpy_finish			! XXXX
146*84d9c625SLionel Sambuc	 inc	2, %l0
147*84d9c625SLionel Sambuc	sth	%l4, [%l1]				! Store 1st short
148*84d9c625SLionel Sambuc
149*84d9c625SLionel Sambuc	inc	2, %l1
/* Stage 3: dest bit 2.  The delay slot tests the low two bits of src. */
150*84d9c625SLionel Sambuc4:
151*84d9c625SLionel Sambuc	btst	4, %l1
152*84d9c625SLionel Sambuc	bz,pt	CCCR, 4f
153*84d9c625SLionel Sambuc
154*84d9c625SLionel Sambuc	 btst	3, %l0
/* src 4-byte aligned: a single word load. */
155*84d9c625SLionel Sambuc	bz,a,pt	CCCR, 1f
156*84d9c625SLionel Sambuc	 lduw	[%l0], %l4				! Load word -1
157*84d9c625SLionel Sambuc
/* src 2-byte aligned: halfword + halfword (continues at label 2). */
158*84d9c625SLionel Sambuc	btst	1, %l0
159*84d9c625SLionel Sambuc	bz,a,pt	%icc, 2f
160*84d9c625SLionel Sambuc	 lduh	[%l0], %l4
161*84d9c625SLionel Sambuc
/* src odd: byte + halfword (at the now even src+1) + byte. */
162*84d9c625SLionel Sambuc	ldub	[%l0], %l4
163*84d9c625SLionel Sambuc
164*84d9c625SLionel Sambuc	lduh	[%l0+1], %l3
165*84d9c625SLionel Sambuc	sllx	%l4, 16, %l4
166*84d9c625SLionel Sambuc	or	%l4, %l3, %l4
167*84d9c625SLionel Sambuc
168*84d9c625SLionel Sambuc	ldub	[%l0+3], %l3
169*84d9c625SLionel Sambuc	sllx	%l4, 8, %l4
170*84d9c625SLionel Sambuc	ba,pt	%icc, 1f
171*84d9c625SLionel Sambuc	 or	%l4, %l3, %l4
172*84d9c625SLionel Sambuc
173*84d9c625SLionel Sambuc2:
174*84d9c625SLionel Sambuc	lduh	[%l0+2], %l3
175*84d9c625SLionel Sambuc	sllx	%l4, 16, %l4
176*84d9c625SLionel Sambuc	or	%l4, %l3, %l4
177*84d9c625SLionel Sambuc
178*84d9c625SLionel Sambuc1:
179*84d9c625SLionel Sambuc	deccc	4, %l2
180*84d9c625SLionel Sambuc	ble,pn	CCCR, Lmemcpy_finish		! XXXX
181*84d9c625SLionel Sambuc	 inc	4, %l0
182*84d9c625SLionel Sambuc
183*84d9c625SLionel Sambuc	st	%l4, [%l1]				! Store word
184*84d9c625SLionel Sambuc	inc	4, %l1
185*84d9c625SLionel Sambuc4:
186*84d9c625SLionel Sambuc	!!
187*84d9c625SLionel Sambuc	!! We are now 64-bit (8-byte) aligned in the dest.
188*84d9c625SLionel Sambuc	!!
/*
 * Lmemcpy_common: set up the shift-and-merge copy.
 *
 * %l4 = offset of src inside its 8-byte word, converted to bits in the
 *       delay slot of the brz (left-shift amount).
 * %l0 = src rounded down to an 8-byte boundary.
 * %l3 = 64 - %l4, masked with 0x38 (right-shift amount for the
 *       following word).
 * %o0 = first source word, already shifted left by %l4.
 *
 * A zero offset means src and dest are mutually aligned and the
 * no-shift path is taken.  %l2 is biased by -12*8 here; a negative
 * result skips the unrolled loop and continues at the next label 2,
 * which removes the bias again.
 */
189*84d9c625SLionel SambucLmemcpy_common:
190*84d9c625SLionel Sambuc
191*84d9c625SLionel Sambuc	and	%l0, 7, %l4				! Shift amount
192*84d9c625SLionel Sambuc	andn	%l0, 7, %l0				! Source addr
193*84d9c625SLionel Sambuc
194*84d9c625SLionel Sambuc	brz,pt	%l4, Lmemcpy_noshift8			! No shift version...
195*84d9c625SLionel Sambuc
196*84d9c625SLionel Sambuc	 sllx	%l4, 3, %l4				! In bits
197*84d9c625SLionel Sambuc	mov	8<<3, %l3
198*84d9c625SLionel Sambuc
199*84d9c625SLionel Sambuc	ldx	[%l0], %o0				! Load word -1
200*84d9c625SLionel Sambuc	sub	%l3, %l4, %l3				! Reverse shift
201*84d9c625SLionel Sambuc	deccc	12*8, %l2				! Have enough room?
202*84d9c625SLionel Sambuc
203*84d9c625SLionel Sambuc	sllx	%o0, %l4, %o0
204*84d9c625SLionel Sambuc	bl,pn	CCCR, 2f
205*84d9c625SLionel Sambuc	 and	%l3, 0x38, %l3
/*
 * Lmemcpy_unrolled8: main loop for a misaligned source, six 8-byte
 * stores per iteration.
 *
 * Loop invariant at label 1: %o0 holds a source word already shifted
 * left by %l4; %o1-%o5 hold the next five source words unshifted.
 * Each step forms one output word as
 *     (previous word << %l4) | (next word >> %l3)
 * in %g6 (with %g1 as scratch), stores it at %l1, and reloads the
 * register it has just consumed from six words further on.
 * %l0 advances by 6*8 at the top, %l1 by 6*8 in the loop-branch delay
 * slot, and %l2 drops by 6*8; the loop repeats while %l2 stays >= 0.
 */
206*84d9c625SLionel SambucLmemcpy_unrolled8:
207*84d9c625SLionel Sambuc
208*84d9c625SLionel Sambuc	/*
209*84d9c625SLionel Sambuc	 * This is about as close to optimal as you can get, since
210*84d9c625SLionel Sambuc	 * the shifts require EU0 and cannot be paired, and you have
211*84d9c625SLionel Sambuc	 * 3 dependent operations on the data.
212*84d9c625SLionel Sambuc	 */
213*84d9c625SLionel Sambuc
214*84d9c625SLionel Sambuc!	ldx	[%l0+0*8], %o0				! Already done
215*84d9c625SLionel Sambuc!	sllx	%o0, %l4, %o0				! Already done
216*84d9c625SLionel Sambuc	ldx	[%l0+1*8], %o1
217*84d9c625SLionel Sambuc	ldx	[%l0+2*8], %o2
218*84d9c625SLionel Sambuc	ldx	[%l0+3*8], %o3
219*84d9c625SLionel Sambuc	ldx	[%l0+4*8], %o4
/* Branch over any padding the .align inserts; the last preload rides in the delay slot. */
220*84d9c625SLionel Sambuc	ba,pt	%icc, 1f
221*84d9c625SLionel Sambuc	 ldx	[%l0+5*8], %o5
222*84d9c625SLionel Sambuc	.align	8
223*84d9c625SLionel Sambuc1:
224*84d9c625SLionel Sambuc	srlx	%o1, %l3, %g1
225*84d9c625SLionel Sambuc	inc	6*8, %l0
226*84d9c625SLionel Sambuc
227*84d9c625SLionel Sambuc	sllx	%o1, %l4, %o1
228*84d9c625SLionel Sambuc	or	%g1, %o0, %g6
229*84d9c625SLionel Sambuc	ldx	[%l0+0*8], %o0
230*84d9c625SLionel Sambuc
231*84d9c625SLionel Sambuc	stx	%g6, [%l1+0*8]
232*84d9c625SLionel Sambuc	srlx	%o2, %l3, %g1
233*84d9c625SLionel Sambuc
234*84d9c625SLionel Sambuc	sllx	%o2, %l4, %o2
235*84d9c625SLionel Sambuc	or	%g1, %o1, %g6
236*84d9c625SLionel Sambuc	ldx	[%l0+1*8], %o1
237*84d9c625SLionel Sambuc
238*84d9c625SLionel Sambuc	stx	%g6, [%l1+1*8]
239*84d9c625SLionel Sambuc	srlx	%o3, %l3, %g1
240*84d9c625SLionel Sambuc
241*84d9c625SLionel Sambuc	sllx	%o3, %l4, %o3
242*84d9c625SLionel Sambuc	or	%g1, %o2, %g6
243*84d9c625SLionel Sambuc	ldx	[%l0+2*8], %o2
244*84d9c625SLionel Sambuc
245*84d9c625SLionel Sambuc	stx	%g6, [%l1+2*8]
246*84d9c625SLionel Sambuc	srlx	%o4, %l3, %g1
247*84d9c625SLionel Sambuc
248*84d9c625SLionel Sambuc	sllx	%o4, %l4, %o4
249*84d9c625SLionel Sambuc	or	%g1, %o3, %g6
250*84d9c625SLionel Sambuc	ldx	[%l0+3*8], %o3
251*84d9c625SLionel Sambuc
252*84d9c625SLionel Sambuc	stx	%g6, [%l1+3*8]
253*84d9c625SLionel Sambuc	srlx	%o5, %l3, %g1
254*84d9c625SLionel Sambuc
255*84d9c625SLionel Sambuc	sllx	%o5, %l4, %o5
256*84d9c625SLionel Sambuc	or	%g1, %o4, %g6
257*84d9c625SLionel Sambuc	ldx	[%l0+4*8], %o4
258*84d9c625SLionel Sambuc
259*84d9c625SLionel Sambuc	stx	%g6, [%l1+4*8]
260*84d9c625SLionel Sambuc	srlx	%o0, %l3, %g1
/* This deccc sets the condition codes the bge at the bottom tests; nothing in between changes them. */
261*84d9c625SLionel Sambuc	deccc	6*8, %l2				! Have enough room?
262*84d9c625SLionel Sambuc
263*84d9c625SLionel Sambuc	sllx	%o0, %l4, %o0				! Next loop
264*84d9c625SLionel Sambuc	or	%g1, %o5, %g6
265*84d9c625SLionel Sambuc	ldx	[%l0+5*8], %o5
266*84d9c625SLionel Sambuc
267*84d9c625SLionel Sambuc	stx	%g6, [%l1+5*8]
268*84d9c625SLionel Sambuc	bge,pt	CCCR, 1b
269*84d9c625SLionel Sambuc	 inc	6*8, %l1
270*84d9c625SLionel Sambuc
/*
 * Lmemcpy_unrolled8_cleanup: drain the registers left loaded by the
 * unrolled loop.
 *
 * Five more output words are produced from %o0-%o5 with the same
 * shift-and-merge step as the loop, but with no further loads.  The
 * still unconsumed, left-shifted remainder of %o5 is moved to %o0 for
 * the code that follows.  %l0 and %l1 each advance by 5*8 and %l2 is
 * reduced by 5*8 to match.
 */
271*84d9c625SLionel SambucLmemcpy_unrolled8_cleanup:
272*84d9c625SLionel Sambuc	!!
273*84d9c625SLionel Sambuc	!! Finished 8 byte block, unload the regs.
274*84d9c625SLionel Sambuc	!!
275*84d9c625SLionel Sambuc	srlx	%o1, %l3, %g1
276*84d9c625SLionel Sambuc	inc	5*8, %l0
277*84d9c625SLionel Sambuc
278*84d9c625SLionel Sambuc	sllx	%o1, %l4, %o1
279*84d9c625SLionel Sambuc	or	%g1, %o0, %g6
280*84d9c625SLionel Sambuc
281*84d9c625SLionel Sambuc	stx	%g6, [%l1+0*8]
282*84d9c625SLionel Sambuc	srlx	%o2, %l3, %g1
283*84d9c625SLionel Sambuc
284*84d9c625SLionel Sambuc	sllx	%o2, %l4, %o2
285*84d9c625SLionel Sambuc	or	%g1, %o1, %g6
286*84d9c625SLionel Sambuc
287*84d9c625SLionel Sambuc	stx	%g6, [%l1+1*8]
288*84d9c625SLionel Sambuc	srlx	%o3, %l3, %g1
289*84d9c625SLionel Sambuc
290*84d9c625SLionel Sambuc	sllx	%o3, %l4, %o3
291*84d9c625SLionel Sambuc	or	%g1, %o2, %g6
292*84d9c625SLionel Sambuc
293*84d9c625SLionel Sambuc	stx	%g6, [%l1+2*8]
294*84d9c625SLionel Sambuc	srlx	%o4, %l3, %g1
295*84d9c625SLionel Sambuc
296*84d9c625SLionel Sambuc	sllx	%o4, %l4, %o4
297*84d9c625SLionel Sambuc	or	%g1, %o3, %g6
298*84d9c625SLionel Sambuc
299*84d9c625SLionel Sambuc	stx	%g6, [%l1+3*8]
300*84d9c625SLionel Sambuc	srlx	%o5, %l3, %g1
301*84d9c625SLionel Sambuc
302*84d9c625SLionel Sambuc	sllx	%o5, %l4, %o5
303*84d9c625SLionel Sambuc	or	%g1, %o4, %g6
304*84d9c625SLionel Sambuc
305*84d9c625SLionel Sambuc	stx	%g6, [%l1+4*8]
306*84d9c625SLionel Sambuc	inc	5*8, %l1
307*84d9c625SLionel Sambuc
308*84d9c625SLionel Sambuc	mov	%o5, %o0				! Save our unused data
309*84d9c625SLionel Sambuc	dec	5*8, %l2
/*
 * Label 2 / Lmemcpy_aligned8: misaligned-source copy, one 8-byte word
 * per iteration.
 *
 * Entered with %l2 still biased by -12*8; the inccc removes the bias.
 * A zero result goes straight to Lmemcpy_complete.  The "deccc 8" is
 * the delay slot of that bz (it runs either way) and pre-decrements
 * %l2 for the loop.
 *
 * Loop at label 1: load the next source word into %o1, merge
 * (%o1 >> %l3) with the pending %o0, store 8 bytes, and keep
 * (%o1 << %l4) as the new pending %o0 (done in the delay slot of the
 * loop branch).  The first "ldx" is also the delay slot of the
 * "bl ... Lmemcpy_finish" just before label 1.
 *
 * After the loop, if the low three bits of %l2 are zero the copy is
 * done.  Otherwise %g6 = (%l3 / 8) - 8 is compared with %l2: if %o0 does
 * not already hold enough bytes, one more source word is loaded and
 * merged before going to Lmemcpy_finish.
 */
310*84d9c625SLionel Sambuc2:
311*84d9c625SLionel Sambuc	inccc	12*8, %l2
312*84d9c625SLionel Sambuc	bz,pn	%icc, Lmemcpy_complete
313*84d9c625SLionel Sambuc
314*84d9c625SLionel Sambuc	!! Unrolled 8 times
315*84d9c625SLionel SambucLmemcpy_aligned8:
316*84d9c625SLionel Sambuc!	ldx	[%l0], %o0				! Already done
317*84d9c625SLionel Sambuc!	sllx	%o0, %l4, %o0				! Shift high word
318*84d9c625SLionel Sambuc
319*84d9c625SLionel Sambuc	 deccc	8, %l2					! Pre-decrement
320*84d9c625SLionel Sambuc	bl,pn	CCCR, Lmemcpy_finish
321*84d9c625SLionel Sambuc1:
322*84d9c625SLionel Sambuc	ldx	[%l0+8], %o1				! Load word 0
323*84d9c625SLionel Sambuc	inc	8, %l0
324*84d9c625SLionel Sambuc
325*84d9c625SLionel Sambuc	srlx	%o1, %l3, %g6
326*84d9c625SLionel Sambuc	or	%g6, %o0, %g6				! Combine
327*84d9c625SLionel Sambuc
328*84d9c625SLionel Sambuc	stx	%g6, [%l1]				! Store result
329*84d9c625SLionel Sambuc	 inc	8, %l1
330*84d9c625SLionel Sambuc
331*84d9c625SLionel Sambuc	deccc	8, %l2
332*84d9c625SLionel Sambuc	bge,pn	CCCR, 1b
333*84d9c625SLionel Sambuc	 sllx	%o1, %l4, %o0
334*84d9c625SLionel Sambuc
335*84d9c625SLionel Sambuc	btst	7, %l2					! Done?
336*84d9c625SLionel Sambuc	bz,pt	CCCR, Lmemcpy_complete
337*84d9c625SLionel Sambuc
338*84d9c625SLionel Sambuc	!!
339*84d9c625SLionel Sambuc	!! Loadup the last dregs into %o0 and shift it into place
340*84d9c625SLionel Sambuc	!!
341*84d9c625SLionel Sambuc	 srlx	%l3, 3, %g6				! # bytes in %o0
342*84d9c625SLionel Sambuc	dec	8, %g6					!  - 8
343*84d9c625SLionel Sambuc	!! n-8 - (by - 8) -> n - by
344*84d9c625SLionel Sambuc	subcc	%l2, %g6, %g0				! # bytes we need
345*84d9c625SLionel Sambuc	ble,pt	%icc, Lmemcpy_finish
346*84d9c625SLionel Sambuc	 nop
347*84d9c625SLionel Sambuc	ldx	[%l0+8], %o1				! Need another word
348*84d9c625SLionel Sambuc	srlx	%o1, %l3, %o1
349*84d9c625SLionel Sambuc	ba,pt	%icc, Lmemcpy_finish
350*84d9c625SLionel Sambuc	 or	%o0, %o1, %o0				! All loaded up.
351*84d9c625SLionel Sambuc
/*
 * Lmemcpy_noshift8: copy when src needs no shifting (reached from
 * Lmemcpy_common when the src offset within its 8-byte word is zero).
 *
 * %l0 = src, %l1 = dest, %l2 = bytes left.
 *
 * 1. %l2 is biased by -6*8.  While it stays >= 0 the first loop moves
 *    six 8-byte words per iteration through %o0-%o5.
 * 2. Label 2 removes the bias; the second loop moves one word per
 *    iteration through %g6 and stops when fewer than 8 bytes remain
 *    (bl) or exactly none do (bg not taken).
 * 3. If the low three bits of %l2 are zero the copy is complete (%l4
 *    is cleared in the delay slot either way); otherwise the next
 *    source word is loaded into %o0 and control falls through into
 *    Lmemcpy_finish to store the leading bytes of it.
 */
352*84d9c625SLionel SambucLmemcpy_noshift8:
353*84d9c625SLionel Sambuc	deccc	6*8, %l2				! Have enough room?
354*84d9c625SLionel Sambuc	bl,pn	CCCR, 2f
355*84d9c625SLionel Sambuc	 nop
/* Branch over any padding the .align inserts. */
356*84d9c625SLionel Sambuc	ba,pt	%icc, 1f
357*84d9c625SLionel Sambuc	 nop
358*84d9c625SLionel Sambuc	.align	32
359*84d9c625SLionel Sambuc1:
360*84d9c625SLionel Sambuc	ldx	[%l0+0*8], %o0
361*84d9c625SLionel Sambuc	ldx	[%l0+1*8], %o1
362*84d9c625SLionel Sambuc	ldx	[%l0+2*8], %o2
363*84d9c625SLionel Sambuc	stx	%o0, [%l1+0*8]
364*84d9c625SLionel Sambuc	stx	%o1, [%l1+1*8]
365*84d9c625SLionel Sambuc	stx	%o2, [%l1+2*8]
366*84d9c625SLionel Sambuc
367*84d9c625SLionel Sambuc
368*84d9c625SLionel Sambuc	ldx	[%l0+3*8], %o3
369*84d9c625SLionel Sambuc	ldx	[%l0+4*8], %o4
370*84d9c625SLionel Sambuc	ldx	[%l0+5*8], %o5
371*84d9c625SLionel Sambuc	inc	6*8, %l0
372*84d9c625SLionel Sambuc	stx	%o3, [%l1+3*8]
/* Sets the condition codes tested by the bge below; the stores between do not change them. */
373*84d9c625SLionel Sambuc	deccc	6*8, %l2
374*84d9c625SLionel Sambuc	stx	%o4, [%l1+4*8]
375*84d9c625SLionel Sambuc	stx	%o5, [%l1+5*8]
376*84d9c625SLionel Sambuc	bge,pt	CCCR, 1b
377*84d9c625SLionel Sambuc	 inc	6*8, %l1
378*84d9c625SLionel Sambuc2:
379*84d9c625SLionel Sambuc	inc	6*8, %l2
380*84d9c625SLionel Sambuc1:
381*84d9c625SLionel Sambuc	deccc	8, %l2
382*84d9c625SLionel Sambuc	bl,pn	%icc, 1f				! < 0 --> sub word
383*84d9c625SLionel Sambuc	 nop
384*84d9c625SLionel Sambuc	ldx	[%l0], %g6
385*84d9c625SLionel Sambuc	inc	8, %l0
386*84d9c625SLionel Sambuc	stx	%g6, [%l1]
/* Still tests the result of the "deccc 8" above. */
387*84d9c625SLionel Sambuc	bg,pt	%icc, 1b				! Exactly 0 --> done
388*84d9c625SLionel Sambuc	 inc	8, %l1
389*84d9c625SLionel Sambuc1:
390*84d9c625SLionel Sambuc	btst	7, %l2					! Done?
391*84d9c625SLionel Sambuc	bz,pt	CCCR, Lmemcpy_complete
392*84d9c625SLionel Sambuc	 clr	%l4
393*84d9c625SLionel Sambuc	ldx	[%l0], %o0
/*
 * Lmemcpy_finish: store the final partial word.
 *
 * In:  %o0 = pending data, with the next byte to store in the most
 *            significant position
 *      %l1 = dest cursor
 *      %l2 = count; only "is zero", "equals 8" and bits 4, 2 and 1 are
 *            examined
 *
 * Zero means nothing is left; exactly 8 is handled with one stx in an
 * annulled delay slot.  Otherwise bits 4, 2 and 1 of %l2 select a
 * word, halfword and byte store in that order.  Each stage's shift
 * sits in the branch delay slot, so it runs whether or not the store
 * is skipped: a skipped stage leaves the leading bytes narrowed in
 * %g6, while a taken stage reloads %g6 from %o0 so the following
 * stage sees the bytes after the ones just written.
 */
394*84d9c625SLionel SambucLmemcpy_finish:
395*84d9c625SLionel Sambuc
396*84d9c625SLionel Sambuc	brz,pn	%l2, 2f					! 100% complete?
397*84d9c625SLionel Sambuc	 cmp	%l2, 8					! Exactly 8 bytes?
398*84d9c625SLionel Sambuc	bz,a,pn	CCCR, 2f
399*84d9c625SLionel Sambuc	 stx	%o0, [%l1]
400*84d9c625SLionel Sambuc
/* 4-byte stage: %g6 = upper half of %o0. */
401*84d9c625SLionel Sambuc	btst	4, %l2					! Word store?
402*84d9c625SLionel Sambuc	bz	CCCR, 1f
403*84d9c625SLionel Sambuc	 srlx	%o0, 32, %g6				! Shift high word down
404*84d9c625SLionel Sambuc	stw	%g6, [%l1]
405*84d9c625SLionel Sambuc	inc	4, %l1
406*84d9c625SLionel Sambuc	mov	%o0, %g6				! Operate on the low bits
/* 2-byte stage: works on the value carried over in %g6. */
407*84d9c625SLionel Sambuc1:
408*84d9c625SLionel Sambuc	btst	2, %l2
409*84d9c625SLionel Sambuc	mov	%g6, %o0
410*84d9c625SLionel Sambuc	bz	1f
411*84d9c625SLionel Sambuc	 srlx	%o0, 16, %g6
412*84d9c625SLionel Sambuc
413*84d9c625SLionel Sambuc	sth	%g6, [%l1]				! Store short
414*84d9c625SLionel Sambuc	inc	2, %l1
415*84d9c625SLionel Sambuc	mov	%o0, %g6				! Operate on low bytes
/* 1-byte stage. */
416*84d9c625SLionel Sambuc1:
417*84d9c625SLionel Sambuc	mov	%g6, %o0
418*84d9c625SLionel Sambuc	btst	1, %l2					! Byte aligned?
419*84d9c625SLionel Sambuc	bz	2f
420*84d9c625SLionel Sambuc	 srlx	%o0, 8, %g6
421*84d9c625SLionel Sambuc
422*84d9c625SLionel Sambuc	stb	%g6, [%l1]				! Store last byte
423*84d9c625SLionel Sambuc	inc	1, %l1					! Update address
424*84d9c625SLionel Sambuc2:
/*
 * Lmemcpy_complete: common exit of the windowed copy paths.
 *
 * The "#if 0" section is disabled.  If enabled it would re-read both
 * buffers byte by byte from %i0 and %i1 over %i2 bytes and, on the
 * first mismatch, print two diagnostic lines and execute "ta 1".
 *
 * The live code returns to the caller; "restore %i1, %g0, %o0" closes
 * the register window opened by the save in Lmemcpy_fancy and places
 * %i1 (the dest pointer as it was at that save) in the caller's %o0.
 */
425*84d9c625SLionel SambucLmemcpy_complete:
426*84d9c625SLionel Sambuc#if 0
427*84d9c625SLionel Sambuc	!!
428*84d9c625SLionel Sambuc	!! verify copy success.
429*84d9c625SLionel Sambuc	!!
430*84d9c625SLionel Sambuc
431*84d9c625SLionel Sambuc	mov	%i0, %o2
432*84d9c625SLionel Sambuc	mov	%i1, %o4
433*84d9c625SLionel Sambuc	mov	%i2, %l4
434*84d9c625SLionel Sambuc0:
435*84d9c625SLionel Sambuc	ldub	[%o2], %o1
436*84d9c625SLionel Sambuc	inc	%o2
437*84d9c625SLionel Sambuc	ldub	[%o4], %o3
438*84d9c625SLionel Sambuc	inc	%o4
439*84d9c625SLionel Sambuc	cmp	%o3, %o1
440*84d9c625SLionel Sambuc	bnz	1f
441*84d9c625SLionel Sambuc	 dec	%l4
442*84d9c625SLionel Sambuc	brnz	%l4, 0b
443*84d9c625SLionel Sambuc	 nop
444*84d9c625SLionel Sambuc	ba	2f
445*84d9c625SLionel Sambuc	 nop
446*84d9c625SLionel Sambuc
447*84d9c625SLionel Sambuc1:
448*84d9c625SLionel Sambuc	set	0f, %o0
449*84d9c625SLionel Sambuc	call	printf
450*84d9c625SLionel Sambuc	 sub	%i2, %l4, %o5
451*84d9c625SLionel Sambuc	set	1f, %o0
452*84d9c625SLionel Sambuc	mov	%i0, %o2
453*84d9c625SLionel Sambuc	mov	%i1, %o1
454*84d9c625SLionel Sambuc	call	printf
455*84d9c625SLionel Sambuc	 mov	%i2, %o3
456*84d9c625SLionel Sambuc	ta	1
457*84d9c625SLionel Sambuc	.data
458*84d9c625SLionel Sambuc0:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
459*84d9c625SLionel Sambuc1:	.asciz	"memcpy(%p, %p, %lx)\n"
460*84d9c625SLionel Sambuc	.align 8
461*84d9c625SLionel Sambuc	.text
462*84d9c625SLionel Sambuc2:
463*84d9c625SLionel Sambuc#endif
464*84d9c625SLionel Sambuc	ret
465*84d9c625SLionel Sambuc	 restore %i1, %g0, %o0
466*84d9c625SLionel Sambuc
467*84d9c625SLionel Sambuc#ifdef USE_BLOCK_STORE_LOAD
468*84d9c625SLionel Sambuc
469*84d9c625SLionel Sambuc/*
470*84d9c625SLionel Sambuc * Block copy.  Useful for >256 byte copies.
471*84d9c625SLionel Sambuc *
472*84d9c625SLionel Sambuc * Benchmarking has shown this always seems to be slower than
473*84d9c625SLionel Sambuc * the integer version, so this is disabled.  Maybe someone will
474*84d9c625SLionel Sambuc * figure out why sometime.
475*84d9c625SLionel Sambuc */
476*84d9c625SLionel Sambuc
477*84d9c625SLionel SambucLmemcpy_block:
478*84d9c625SLionel Sambuc	sethi	%hi(block_disable), %o3
479*84d9c625SLionel Sambuc	ldx	[ %o3 + %lo(block_disable) ], %o3
480*84d9c625SLionel Sambuc	brnz,pn	%o3, Lmemcpy_fancy
481*84d9c625SLionel Sambuc	!! Make sure our trap table is installed
482*84d9c625SLionel Sambuc	set	_C_LABEL(trapbase), %o5
483*84d9c625SLionel Sambuc	rdpr	%tba, %o3
484*84d9c625SLionel Sambuc	sub	%o3, %o5, %o3
485*84d9c625SLionel Sambuc	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
486*84d9c625SLionel Sambuc	 nop
487*84d9c625SLionel Sambuc#if defined(_KERNEL) && !defined(_RUMPKERNEL)
488*84d9c625SLionel Sambuc/*
489*84d9c625SLionel Sambuc * Kernel:
490*84d9c625SLionel Sambuc *
491*84d9c625SLionel Sambuc * Here we use VIS instructions to do a block copy.
492*84d9c625SLionel Sambuc * But before we can do that we need to save and enable the FPU.
493*84d9c625SLionel Sambuc * The last owner of the FPU registers is fplwp, and
494*84d9c625SLionel Sambuc * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
495*84d9c625SLionel Sambuc * null, call savefpstate() with it to store our current fp state.
496*84d9c625SLionel Sambuc *
497*84d9c625SLionel Sambuc * Next, allocate an aligned fpstate on the stack.  We will properly
498*84d9c625SLionel Sambuc * nest calls on a particular stack so this should not be a problem.
499*84d9c625SLionel Sambuc *
500*84d9c625SLionel Sambuc * Now we grab either curlwp (or if we're on the interrupt stack
501*84d9c625SLionel Sambuc * lwp0).  We stash its existing fpstate in a local register and
502*84d9c625SLionel Sambuc * put our new fpstate in curlwp->p_md.md_fpstate.  We point
503*84d9c625SLionel Sambuc * fplwp at curlwp (or lwp0) and enable the FPU.
504*84d9c625SLionel Sambuc *
505*84d9c625SLionel Sambuc * If we are ever preempted, our FPU state will be saved in our
506*84d9c625SLionel Sambuc * fpstate.  Then, when we're resumed and we take an FPDISABLED
507*84d9c625SLionel Sambuc * trap, the trap handler will be able to fish our FPU state out
508*84d9c625SLionel Sambuc * of curlwp (or lwp0).
509*84d9c625SLionel Sambuc *
510*84d9c625SLionel Sambuc * On exiting this routine we undo the damage: restore the original
511*84d9c625SLionel Sambuc * pointer to curlwp->p_md.md_fpstate, clear our fplwp, and disable
512*84d9c625SLionel Sambuc * the FPU.
513*84d9c625SLionel Sambuc *
514*84d9c625SLionel Sambuc *
515*84d9c625SLionel Sambuc * Register usage, Kernel only (after save):
516*84d9c625SLionel Sambuc *
517*84d9c625SLionel Sambuc * %i0		src
518*84d9c625SLionel Sambuc * %i1		dest
519*84d9c625SLionel Sambuc * %i2		size
520*84d9c625SLionel Sambuc *
521*84d9c625SLionel Sambuc * %l0		XXXX DEBUG old fpstate
522*84d9c625SLionel Sambuc * %l1		fplwp (hi bits only)
523*84d9c625SLionel Sambuc * %l2		orig fplwp
524*84d9c625SLionel Sambuc * %l3		orig fpstate
525*84d9c625SLionel Sambuc * %l5		curlwp
526*84d9c625SLionel Sambuc * %l6		old fpstate
527*84d9c625SLionel Sambuc *
528*84d9c625SLionel Sambuc * Register usage, Kernel and user:
529*84d9c625SLionel Sambuc *
530*84d9c625SLionel Sambuc * %g1		dest (retval for memcpy)
531*84d9c625SLionel Sambuc *
532*84d9c625SLionel Sambuc * %o0		src
533*84d9c625SLionel Sambuc * %o1		dest
534*84d9c625SLionel Sambuc * %o2		end dest
535*84d9c625SLionel Sambuc * %o5		last safe fetchable address
536*84d9c625SLionel Sambuc */
537*84d9c625SLionel Sambuc
538*84d9c625SLionel Sambuc	ENABLE_FPU(0)
539*84d9c625SLionel Sambuc
540*84d9c625SLionel Sambuc	mov	%i0, %o0				! Src addr.
541*84d9c625SLionel Sambuc	mov	%i1, %o1				! Store our dest ptr here.
542*84d9c625SLionel Sambuc	mov	%i2, %o2				! Len counter
543*84d9c625SLionel Sambuc#endif	/* _KERNEL */
544*84d9c625SLionel Sambuc
545*84d9c625SLionel Sambuc	!!
546*84d9c625SLionel Sambuc	!! First align the output to a 64-bit entity
547*84d9c625SLionel Sambuc	!!
548*84d9c625SLionel Sambuc
549*84d9c625SLionel Sambuc	mov	%o1, %g1				! memcpy retval
550*84d9c625SLionel Sambuc	add	%o0, %o2, %o5				! End of source block
551*84d9c625SLionel Sambuc
552*84d9c625SLionel Sambuc	andn	%o0, 7, %o3				! Start of block
553*84d9c625SLionel Sambuc	dec	%o5
554*84d9c625SLionel Sambuc	fzero	%f0
555*84d9c625SLionel Sambuc
556*84d9c625SLionel Sambuc	andn	%o5, BLOCK_ALIGN, %o5			! Last safe addr.
557*84d9c625SLionel Sambuc	ldd	[%o3], %f2				! Load 1st word
558*84d9c625SLionel Sambuc
559*84d9c625SLionel Sambuc	dec	8, %o3					! Move %o3 1 word back
560*84d9c625SLionel Sambuc	btst	1, %o1
561*84d9c625SLionel Sambuc	bz	4f
562*84d9c625SLionel Sambuc
563*84d9c625SLionel Sambuc	 mov	-7, %o4					! Lowest src addr possible
564*84d9c625SLionel Sambuc	alignaddr %o0, %o4, %o4				! Base addr for load.
565*84d9c625SLionel Sambuc
566*84d9c625SLionel Sambuc	cmp	%o3, %o4
567*84d9c625SLionel Sambuc	be,pt	CCCR, 1f				! Already loaded?
568*84d9c625SLionel Sambuc	 mov	%o4, %o3
569*84d9c625SLionel Sambuc	fmovd	%f2, %f0				! No. Shift
570*84d9c625SLionel Sambuc	ldd	[%o3+8], %f2				! And load
571*84d9c625SLionel Sambuc1:
572*84d9c625SLionel Sambuc
573*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f4			! Isolate 1st byte
574*84d9c625SLionel Sambuc
575*84d9c625SLionel Sambuc	stda	%f4, [%o1] ASI_FL8_P			! Store 1st byte
576*84d9c625SLionel Sambuc	inc	1, %o1					! Update address
577*84d9c625SLionel Sambuc	inc	1, %o0
578*84d9c625SLionel Sambuc	dec	1, %o2
579*84d9c625SLionel Sambuc4:
580*84d9c625SLionel Sambuc	btst	2, %o1
581*84d9c625SLionel Sambuc	bz	4f
582*84d9c625SLionel Sambuc
583*84d9c625SLionel Sambuc	 mov	-6, %o4					! Calculate src - 6
584*84d9c625SLionel Sambuc	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
585*84d9c625SLionel Sambuc
586*84d9c625SLionel Sambuc	cmp	%o3, %o4				! Addresses same?
587*84d9c625SLionel Sambuc	be,pt	CCCR, 1f
588*84d9c625SLionel Sambuc	 mov	%o4, %o3
589*84d9c625SLionel Sambuc	fmovd	%f2, %f0				! Shuffle data
590*84d9c625SLionel Sambuc	ldd	[%o3+8], %f2				! Load word 0
591*84d9c625SLionel Sambuc1:
592*84d9c625SLionel Sambuc	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
593*84d9c625SLionel Sambuc
594*84d9c625SLionel Sambuc	stda	%f4, [%o1] ASI_FL16_P			! Store 1st short
595*84d9c625SLionel Sambuc	dec	2, %o2
596*84d9c625SLionel Sambuc	inc	2, %o1
597*84d9c625SLionel Sambuc	inc	2, %o0
598*84d9c625SLionel Sambuc4:
599*84d9c625SLionel Sambuc	brz,pn	%o2, Lmemcpy_blockfinish			! XXXX
600*84d9c625SLionel Sambuc
601*84d9c625SLionel Sambuc	 btst	4, %o1
602*84d9c625SLionel Sambuc	bz	4f
603*84d9c625SLionel Sambuc
604*84d9c625SLionel Sambuc	mov	-4, %o4
605*84d9c625SLionel Sambuc	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
606*84d9c625SLionel Sambuc
607*84d9c625SLionel Sambuc	cmp	%o3, %o4				! Addresses same?
608*84d9c625SLionel Sambuc	beq,pt	CCCR, 1f
609*84d9c625SLionel Sambuc	 mov	%o4, %o3
610*84d9c625SLionel Sambuc	fmovd	%f2, %f0				! Shuffle data
611*84d9c625SLionel Sambuc	ldd	[%o3+8], %f2				! Load word 0
612*84d9c625SLionel Sambuc1:
613*84d9c625SLionel Sambuc	faligndata %f0, %f2, %f4			! Move 1st short low part of f8
614*84d9c625SLionel Sambuc
615*84d9c625SLionel Sambuc	st	%f5, [%o1]				! Store word
616*84d9c625SLionel Sambuc	dec	4, %o2
617*84d9c625SLionel Sambuc	inc	4, %o1
618*84d9c625SLionel Sambuc	inc	4, %o0
619*84d9c625SLionel Sambuc4:
620*84d9c625SLionel Sambuc	brz,pn	%o2, Lmemcpy_blockfinish			! XXXX
621*84d9c625SLionel Sambuc	!!
622*84d9c625SLionel Sambuc	!! We are now 64-bit aligned in the dest.
623*84d9c625SLionel Sambuc	!!
624*84d9c625SLionel SambucLmemcpy_block_common:
625*84d9c625SLionel Sambuc
626*84d9c625SLionel Sambuc	 mov	-0, %o4
627*84d9c625SLionel Sambuc	alignaddr %o0, %o4, %o4				! base - shift
628*84d9c625SLionel Sambuc
629*84d9c625SLionel Sambuc	cmp	%o3, %o4				! Addresses same?
630*84d9c625SLionel Sambuc	beq,pt	CCCR, 1f
631*84d9c625SLionel Sambuc	 mov	%o4, %o3
632*84d9c625SLionel Sambuc	fmovd	%f2, %f0				! Shuffle data
633*84d9c625SLionel Sambuc	ldd	[%o3+8], %f2				! Load word 0
634*84d9c625SLionel Sambuc1:
635*84d9c625SLionel Sambuc	add	%o3, 8, %o0				! now use %o0 for src
636*84d9c625SLionel Sambuc
637*84d9c625SLionel Sambuc	!!
638*84d9c625SLionel Sambuc	!! Continue until our dest is block aligned
639*84d9c625SLionel Sambuc	!!
640*84d9c625SLionel SambucLmemcpy_block_aligned8:
641*84d9c625SLionel Sambuc1:
642*84d9c625SLionel Sambuc	brz	%o2, Lmemcpy_blockfinish
643*84d9c625SLionel Sambuc	 btst	BLOCK_ALIGN, %o1			! Block aligned?
644*84d9c625SLionel Sambuc	bz	1f
645*84d9c625SLionel Sambuc
646*84d9c625SLionel Sambuc	 faligndata %f0, %f2, %f4			! Generate result
647*84d9c625SLionel Sambuc	deccc	8, %o2
648*84d9c625SLionel Sambuc	ble,pn	%icc, Lmemcpy_blockfinish		! Should never happen
649*84d9c625SLionel Sambuc	 fmovd	%f4, %f48
650*84d9c625SLionel Sambuc
651*84d9c625SLionel Sambuc	std	%f4, [%o1]				! Store result
652*84d9c625SLionel Sambuc	inc	8, %o1
653*84d9c625SLionel Sambuc
654*84d9c625SLionel Sambuc	fmovd	%f2, %f0
655*84d9c625SLionel Sambuc	inc	8, %o0
656*84d9c625SLionel Sambuc	ba,pt	%xcc, 1b				! Not yet.
657*84d9c625SLionel Sambuc	 ldd	[%o0], %f2				! Load next part
658*84d9c625SLionel SambucLmemcpy_block_aligned64:
659*84d9c625SLionel Sambuc1:
660*84d9c625SLionel Sambuc
661*84d9c625SLionel Sambuc/*
662*84d9c625SLionel Sambuc * 64-byte aligned -- ready for block operations.
663*84d9c625SLionel Sambuc *
664*84d9c625SLionel Sambuc * Here we have the destination block aligned, but the
665*84d9c625SLionel Sambuc * source pointer may not be.  Sub-word alignment will
666*84d9c625SLionel Sambuc * be handled by faligndata instructions.  But the source
667*84d9c625SLionel Sambuc * can still be potentially aligned to 8 different words
668*84d9c625SLionel Sambuc * in our 64-byte block, so we have 8 different copy routines.
669*84d9c625SLionel Sambuc *
670*84d9c625SLionel Sambuc * Once we figure out our source alignment, we branch
671*84d9c625SLionel Sambuc * to the appropriate copy routine, which sets up the
672*84d9c625SLionel Sambuc * alignment for faligndata and loads (sets) the values
673*84d9c625SLionel Sambuc * into the source registers and does the copy loop.
674*84d9c625SLionel Sambuc *
675*84d9c625SLionel Sambuc * When we're down to less than 1 block to store, we
676*84d9c625SLionel Sambuc * exit the copy loop and execute cleanup code.
677*84d9c625SLionel Sambuc *
678*84d9c625SLionel Sambuc * Block loads and stores are not properly interlocked.
679*84d9c625SLionel Sambuc * Stores save one reg/cycle, so you can start overwriting
680*84d9c625SLionel Sambuc * registers the cycle after the store is issued.
681*84d9c625SLionel Sambuc *
682*84d9c625SLionel Sambuc * Block loads require a block load to a different register
683*84d9c625SLionel Sambuc * block or a membar #Sync before accessing the loaded
684*84d9c625SLionel Sambuc * data.
685*84d9c625SLionel Sambuc *
686*84d9c625SLionel Sambuc * Since the faligndata instructions may be offset as far
687*84d9c625SLionel Sambuc * as 7 registers into a block (if you are shifting source
688*84d9c625SLionel Sambuc * 7 -> dest 0), you need 3 source register blocks for full
689*84d9c625SLionel Sambuc * performance: one you are copying, one you are loading,
690*84d9c625SLionel Sambuc * and one for interlocking.  Otherwise, we would need to
691*84d9c625SLionel Sambuc * sprinkle the code with membar #Sync and lose the advantage
692*84d9c625SLionel Sambuc * of running faligndata in parallel with block stores.  This
693*84d9c625SLionel Sambuc * means we are fetching a full 128 bytes ahead of the stores.
694*84d9c625SLionel Sambuc * We need to make sure the prefetch does not inadvertently
695*84d9c625SLionel Sambuc * cross a page boundary and fault on data that we will never
696*84d9c625SLionel Sambuc * store.
697*84d9c625SLionel Sambuc *
698*84d9c625SLionel Sambuc */
/*
 * Dispatch on the position of the source pointer inside its block.
 *
 * %o3 = (%o0 & BLOCK_ALIGN) >> 3 is the doubleword offset (0..7) and
 * selects the copy routine L100 (offset 0) through L107 (offset 7).
 *
 * Live version: a three-level bit test on %o3 (bits 4, 2, 1).  Every
 * btst sits in the delay slot of the branch before it, so each
 * conditional branch consumes the condition codes produced in the
 * delay slot of the previous branch.
 *
 * Disabled version: a PC-relative jump into a table with one 8-byte
 * entry (ba,a + nop) per doubleword offset.
 */
699*84d9c625SLionel Sambuc#if 1
700*84d9c625SLionel Sambuc	and	%o0, BLOCK_ALIGN, %o3
701*84d9c625SLionel Sambuc	srax	%o3, 3, %o3				! Isolate the offset
702*84d9c625SLionel Sambuc
703*84d9c625SLionel Sambuc	brz	%o3, L100				! 0->0
704*84d9c625SLionel Sambuc	 btst	4, %o3
705*84d9c625SLionel Sambuc	bnz	%xcc, 4f	/* bit 2 set: offset is 4..7 */
706*84d9c625SLionel Sambuc	 btst	2, %o3
707*84d9c625SLionel Sambuc	bnz	%xcc, 2f	/* bit 1 set: offset is 2 or 3 */
708*84d9c625SLionel Sambuc	 btst	1, %o3
709*84d9c625SLionel Sambuc	ba,pt	%xcc, L101				! 0->1
710*84d9c625SLionel Sambuc	 nop	/* XXX spitfire bug */
711*84d9c625SLionel Sambuc2:
712*84d9c625SLionel Sambuc	bz	%xcc, L102				! 0->2
713*84d9c625SLionel Sambuc	 nop
714*84d9c625SLionel Sambuc	ba,pt	%xcc, L103				! 0->3
715*84d9c625SLionel Sambuc	 nop	/* XXX spitfire bug */
716*84d9c625SLionel Sambuc4:
717*84d9c625SLionel Sambuc	bnz	%xcc, 2f	/* tests the btst 2 result: offset is 6 or 7 */
718*84d9c625SLionel Sambuc	 btst	1, %o3
719*84d9c625SLionel Sambuc	bz	%xcc, L104				! 0->4
720*84d9c625SLionel Sambuc	 nop
721*84d9c625SLionel Sambuc	ba,pt	%xcc, L105				! 0->5
722*84d9c625SLionel Sambuc	 nop	/* XXX spitfire bug */
723*84d9c625SLionel Sambuc2:
724*84d9c625SLionel Sambuc	bz	%xcc, L106				! 0->6
725*84d9c625SLionel Sambuc	 nop
726*84d9c625SLionel Sambuc	ba,pt	%xcc, L107				! 0->7
727*84d9c625SLionel Sambuc	 nop	/* XXX spitfire bug */
728*84d9c625SLionel Sambuc#else
729*84d9c625SLionel Sambuc
730*84d9c625SLionel Sambuc	!!
731*84d9c625SLionel Sambuc	!! Isolate the word offset, which just happens to be
732*84d9c625SLionel Sambuc	!! the slot in our jump table.
733*84d9c625SLionel Sambuc	!!
734*84d9c625SLionel Sambuc	!! This is 6 insns, most of which cannot be paired,
735*84d9c625SLionel Sambuc	!! which is about the same as the above version.
736*84d9c625SLionel Sambuc	!!
	/*
	 * NOTE(review): %o4 receives the address of the rd itself while the
	 * table distance is measured from label 1 (one instruction later),
	 * so the computed target is 4 bytes before the selected entry, i.e.
	 * the nop ahead of it, which then falls through into the ba,a.
	 * Confirm that this is intended before enabling this variant.
	 */
737*84d9c625SLionel Sambuc	rd	%pc, %o4
738*84d9c625SLionel Sambuc1:
739*84d9c625SLionel Sambuc	and	%o0, 0x38, %o3	/* bits 3-5: doubleword offset * 8 = byte offset of the table entry */
740*84d9c625SLionel Sambuc	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
741*84d9c625SLionel Sambuc	jmpl	%o4 + %o3, %g0
742*84d9c625SLionel Sambuc	 nop
743*84d9c625SLionel Sambuc
744*84d9c625SLionel Sambuc	!!
745*84d9c625SLionel Sambuc	!! Jump table
746*84d9c625SLionel Sambuc	!!
747*84d9c625SLionel Sambuc
748*84d9c625SLionel SambucLmemcpy_block_jmp:
749*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L100
750*84d9c625SLionel Sambuc	 nop
751*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L101
752*84d9c625SLionel Sambuc	 nop
753*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L102
754*84d9c625SLionel Sambuc	 nop
755*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L103
756*84d9c625SLionel Sambuc	 nop
757*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L104
758*84d9c625SLionel Sambuc	 nop
759*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L105
760*84d9c625SLionel Sambuc	 nop
761*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L106
762*84d9c625SLionel Sambuc	 nop
763*84d9c625SLionel Sambuc	ba,a,pt	%xcc, L107
764*84d9c625SLionel Sambuc	 nop
765*84d9c625SLionel Sambuc#endif
766*84d9c625SLionel Sambuc
767*84d9c625SLionel Sambuc	!!
768*84d9c625SLionel Sambuc	!! Source is block aligned.
769*84d9c625SLionel Sambuc	!!
770*84d9c625SLionel Sambuc	!! Just load a block and go.
771*84d9c625SLionel Sambuc	!!
/*
 * L100: copy loop for a source at doubleword offset 0 in its block.
 *
 * Registers as used below:
 *   %o0  address of the next source block load (advanced by BLOCK_SIZE)
 *   %o1  address of the next destination block store
 *   %o2  bytes still to store; once it is <= 0 the loop leaves for
 *        Lmemcpy_blockdone
 *   %o5  load limit: a block load is issued only while %o0 <= %o5,
 *        otherwise a membar #Sync runs in its place (presumably the last
 *        source address that is safe to block-load -- TODO confirm
 *        against the setup code outside this section)
 *   %f0  expected to hold the doubleword carried in from before the
 *        first block; it is copied to %f62 because the first block load
 *        overwrites %f0-%f14
 *
 * The loop at 3: is unrolled three times.  The block-load target rotates
 * through %f0-%f14, %f16-%f30 and %f48-%f62, while faligndata assembles
 * every output block in %f32-%f46, which stda then stores at %o1.
 *
 * The guards are annulled branches: when the branch is taken the ldda in
 * the delay slot executes and the membar is skipped; when it is not
 * taken the ldda is annulled and the membar #Sync executes.
 *
 * NOTE(review): the address compares are evaluated on %icc, i.e. on the
 * low 32 bits of %o0 - %o5 -- confirm that this is intended.
 */
772*84d9c625SLionel SambucL100:
773*84d9c625SLionel Sambuc#ifdef RETURN_NAME
774*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
775*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
776*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
777*84d9c625SLionel Sambuc1:
778*84d9c625SLionel Sambuc	.asciz	"L100"
779*84d9c625SLionel Sambuc	.align	8
780*84d9c625SLionel Sambuc2:
781*84d9c625SLionel Sambuc#endif
782*84d9c625SLionel Sambuc	fmovd	%f0 , %f62	/* save the carried-in doubleword */
783*84d9c625SLionel Sambuc	ldda	[%o0] ASI_BLK_P, %f0	/* first block -> %f0-%f14 */
784*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
785*84d9c625SLionel Sambuc	cmp	%o0, %o5
786*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 3f	/* within the limit: load the second block too */
787*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
788*84d9c625SLionel Sambuc	ba,pt	%icc, 3f	/* no second load: synchronize instead */
789*84d9c625SLionel Sambuc	 membar #Sync
790*84d9c625SLionel Sambuc
791*84d9c625SLionel Sambuc	.align	32					! ICache align.
792*84d9c625SLionel Sambuc3:
793*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f32
794*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
795*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f34
796*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
797*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f36
798*84d9c625SLionel Sambuc	cmp	%o0, %o5
799*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f38
800*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f40
801*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f42
802*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f44
803*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
804*84d9c625SLionel Sambuc	 faligndata	%f12, %f14, %f46
805*84d9c625SLionel Sambuc
806*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
807*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
808*84d9c625SLionel Sambuc	membar	#Sync
809*84d9c625SLionel Sambuc2:
810*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE	/* store the block assembled above */
811*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f32
812*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
813*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f34
814*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
815*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f36
816*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
817*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f38
818*84d9c625SLionel Sambuc	cmp	%o0, %o5
819*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f40
820*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f42
821*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f44
822*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
823*84d9c625SLionel Sambuc	 faligndata	%f28, %f30, %f46
824*84d9c625SLionel Sambuc
825*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
826*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
827*84d9c625SLionel Sambuc	membar	#Sync
828*84d9c625SLionel Sambuc2:
829*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
830*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f32
831*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
832*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f34
833*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
834*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f36
835*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
836*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f38
837*84d9c625SLionel Sambuc	cmp	%o0, %o5
838*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f40
839*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f42
840*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f44
841*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
842*84d9c625SLionel Sambuc	 faligndata	%f60, %f62, %f46
843*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
844*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16			! Increment is at top
845*84d9c625SLionel Sambuc	membar	#Sync
846*84d9c625SLionel Sambuc2:
847*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
848*84d9c625SLionel Sambuc	ba	3b
849*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
850*84d9c625SLionel Sambuc
851*84d9c625SLionel Sambuc	!!
852*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+8
853*84d9c625SLionel Sambuc	!!
854*84d9c625SLionel Sambuc	!! We need to load almost 1 complete block by hand.
855*84d9c625SLionel Sambuc	!!
/*
 * L101: copy loop for a source at doubleword offset 1 in its block.
 *
 * %f0 is used as already loaded on entry.  The seven ldd instructions
 * fill %f2-%f14 and advance %o0 by 8 each; from then on %o0 is the
 * address used for block loads.  %o1 is the store address, %o2 the
 * remaining byte count and %o5 the limit %o0 is compared against before
 * each block load.
 *
 * Each of the three unrolled stages
 *   - runs seven faligndata on the current register bank,
 *   - issues the guarded block load (ldda in an annulled delay slot,
 *     executed only when %o0 <= %o5; otherwise membar #Sync runs),
 *   - produces the eighth output doubleword in the brlez delay slot,
 *     pairing the last register of this bank with the first of the next,
 *   - stores %f32-%f46 at %o1 unless %o2 has dropped to zero or below.
 * The banks rotate %f0-%f14 -> %f16-%f30 -> %f48-%f62.
 */
856*84d9c625SLionel SambucL101:
857*84d9c625SLionel Sambuc#ifdef RETURN_NAME
858*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
859*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
860*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
861*84d9c625SLionel Sambuc1:
862*84d9c625SLionel Sambuc	.asciz	"L101"
863*84d9c625SLionel Sambuc	.align	8
864*84d9c625SLionel Sambuc2:
865*84d9c625SLionel Sambuc#endif
866*84d9c625SLionel Sambuc!	fmovd	%f0, %f0				! Hoist fmovd
867*84d9c625SLionel Sambuc	ldd	[%o0], %f2
868*84d9c625SLionel Sambuc	inc	8, %o0
869*84d9c625SLionel Sambuc	ldd	[%o0], %f4
870*84d9c625SLionel Sambuc	inc	8, %o0
871*84d9c625SLionel Sambuc	ldd	[%o0], %f6
872*84d9c625SLionel Sambuc	inc	8, %o0
873*84d9c625SLionel Sambuc	ldd	[%o0], %f8
874*84d9c625SLionel Sambuc	inc	8, %o0
875*84d9c625SLionel Sambuc	ldd	[%o0], %f10
876*84d9c625SLionel Sambuc	inc	8, %o0
877*84d9c625SLionel Sambuc	ldd	[%o0], %f12
878*84d9c625SLionel Sambuc	inc	8, %o0
879*84d9c625SLionel Sambuc	ldd	[%o0], %f14
880*84d9c625SLionel Sambuc	inc	8, %o0
881*84d9c625SLionel Sambuc
882*84d9c625SLionel Sambuc	cmp	%o0, %o5
883*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 3f	/* within the limit: block-load %f16-%f30 */
884*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
885*84d9c625SLionel Sambuc	membar #Sync
886*84d9c625SLionel Sambuc3:
887*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f32
888*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
889*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f34
890*84d9c625SLionel Sambuc	cmp	%o0, %o5
891*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f36
892*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
893*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f38
894*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f40
895*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f42
896*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f44
897*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
898*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
899*84d9c625SLionel Sambuc	membar	#Sync
900*84d9c625SLionel Sambuc2:
901*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
902*84d9c625SLionel Sambuc	 faligndata	%f14, %f16, %f46
903*84d9c625SLionel Sambuc
904*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
905*84d9c625SLionel Sambuc
906*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f32
907*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
908*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f34
909*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
910*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f36
911*84d9c625SLionel Sambuc	cmp	%o0, %o5
912*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f38
913*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
914*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f40
915*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f42
916*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f44
917*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
918*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
919*84d9c625SLionel Sambuc	membar	#Sync
920*84d9c625SLionel Sambuc2:
921*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
922*84d9c625SLionel Sambuc	 faligndata	%f30, %f48, %f46
923*84d9c625SLionel Sambuc
924*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
925*84d9c625SLionel Sambuc
926*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f32
927*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
928*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f34
929*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
930*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f36
931*84d9c625SLionel Sambuc	cmp	%o0, %o5
932*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f38
933*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
934*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f40
935*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f42
936*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f44
937*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
938*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
939*84d9c625SLionel Sambuc	membar	#Sync
940*84d9c625SLionel Sambuc2:
941*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
942*84d9c625SLionel Sambuc	 faligndata	%f62, %f0, %f46
943*84d9c625SLionel Sambuc
944*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
945*84d9c625SLionel Sambuc	ba	3b
946*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
947*84d9c625SLionel Sambuc
948*84d9c625SLionel Sambuc	!!
949*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+16
950*84d9c625SLionel Sambuc	!!
951*84d9c625SLionel Sambuc	!! We need to load 6 doubles by hand.
952*84d9c625SLionel Sambuc	!!
/*
 * L102: copy loop for a source at doubleword offset 2 in its block.
 *
 * The doubleword already held in %f0 is moved to %f2, and six ldd
 * instructions fill %f4-%f14 while advancing %o0 by 8 each.  After that
 * %o0 is the block-load address, %o1 the store address, %o2 the
 * remaining byte count and %o5 the limit checked before each block load.
 *
 * Each of the three unrolled stages runs six faligndata on the current
 * bank, issues the guarded block load (or membar #Sync when %o0 > %o5),
 * and then produces the last two output doublewords from registers that
 * straddle into the next bank, the final one in the brlez delay slot.
 * %f32-%f46 are stored at %o1 unless %o2 has dropped to zero or below.
 * The banks rotate %f0-%f14 -> %f16-%f30 -> %f48-%f62.
 */
953*84d9c625SLionel SambucL102:
954*84d9c625SLionel Sambuc#ifdef RETURN_NAME
955*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
956*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
957*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
958*84d9c625SLionel Sambuc1:
959*84d9c625SLionel Sambuc	.asciz	"L102"
960*84d9c625SLionel Sambuc	.align	8
961*84d9c625SLionel Sambuc2:
962*84d9c625SLionel Sambuc#endif
963*84d9c625SLionel Sambuc	ldd	[%o0], %f4
964*84d9c625SLionel Sambuc	inc	8, %o0
965*84d9c625SLionel Sambuc	fmovd	%f0, %f2				! Hoist fmovd
966*84d9c625SLionel Sambuc	ldd	[%o0], %f6
967*84d9c625SLionel Sambuc	inc	8, %o0
968*84d9c625SLionel Sambuc
969*84d9c625SLionel Sambuc	ldd	[%o0], %f8
970*84d9c625SLionel Sambuc	inc	8, %o0
971*84d9c625SLionel Sambuc	ldd	[%o0], %f10
972*84d9c625SLionel Sambuc	inc	8, %o0
973*84d9c625SLionel Sambuc	ldd	[%o0], %f12
974*84d9c625SLionel Sambuc	inc	8, %o0
975*84d9c625SLionel Sambuc	ldd	[%o0], %f14
976*84d9c625SLionel Sambuc	inc	8, %o0
977*84d9c625SLionel Sambuc
978*84d9c625SLionel Sambuc	cmp	%o0, %o5
979*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 3f	/* within the limit: block-load %f16-%f30 */
980*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
981*84d9c625SLionel Sambuc	membar #Sync
982*84d9c625SLionel Sambuc3:
983*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f32
984*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
985*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f34
986*84d9c625SLionel Sambuc	cmp	%o0, %o5
987*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f36
988*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
989*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f38
990*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f40
991*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f42
992*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
993*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
994*84d9c625SLionel Sambuc	membar	#Sync
995*84d9c625SLionel Sambuc2:
996*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f44
997*84d9c625SLionel Sambuc
998*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
999*84d9c625SLionel Sambuc	 faligndata	%f16, %f18, %f46
1000*84d9c625SLionel Sambuc
1001*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1002*84d9c625SLionel Sambuc
1003*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f32
1004*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1005*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f34
1006*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1007*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f36
1008*84d9c625SLionel Sambuc	cmp	%o0, %o5
1009*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f38
1010*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1011*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f40
1012*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f42
1013*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1014*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
1015*84d9c625SLionel Sambuc	membar	#Sync
1016*84d9c625SLionel Sambuc2:
1017*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f44
1018*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1019*84d9c625SLionel Sambuc	 faligndata	%f48, %f50, %f46
1020*84d9c625SLionel Sambuc
1021*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1022*84d9c625SLionel Sambuc
1023*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f32
1024*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1025*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f34
1026*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1027*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f36
1028*84d9c625SLionel Sambuc	cmp	%o0, %o5
1029*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f38
1030*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1031*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f40
1032*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f42
1033*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1034*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1035*84d9c625SLionel Sambuc	membar	#Sync
1036*84d9c625SLionel Sambuc2:
1037*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f44
1038*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1039*84d9c625SLionel Sambuc	 faligndata	%f0, %f2, %f46
1040*84d9c625SLionel Sambuc
1041*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1042*84d9c625SLionel Sambuc	ba	3b
1043*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
1044*84d9c625SLionel Sambuc
1045*84d9c625SLionel Sambuc	!!
1046*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+24
1047*84d9c625SLionel Sambuc	!!
1048*84d9c625SLionel Sambuc	!! We need to load 5 doubles by hand.
1049*84d9c625SLionel Sambuc	!!
/*
 * L103: copy loop for a source at doubleword offset 3 in its block.
 *
 * The doubleword already held in %f0 is moved to %f4, and five ldd
 * instructions fill %f6-%f14 while advancing %o0 by 8 each.  After that
 * %o0 is the block-load address, %o1 the store address, %o2 the
 * remaining byte count and %o5 the limit checked before each block load.
 *
 * Each of the three unrolled stages runs five faligndata on the current
 * bank, issues the guarded block load (ldda in an annulled delay slot,
 * executed only when %o0 <= %o5; otherwise membar #Sync runs), and then
 * produces the last three output doublewords from registers that
 * straddle into the next bank, the final one in the brlez delay slot.
 * %f32-%f46 are stored at %o1 unless %o2 has dropped to zero or below.
 * The banks rotate %f0-%f14 -> %f16-%f30 -> %f48-%f62.
 *
 * All load guards compare addresses and therefore use the unsigned
 * bleu condition.
 */
1050*84d9c625SLionel SambucL103:
1051*84d9c625SLionel Sambuc#ifdef RETURN_NAME
1052*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
1053*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
1054*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
1055*84d9c625SLionel Sambuc1:
1056*84d9c625SLionel Sambuc	.asciz	"L103"
1057*84d9c625SLionel Sambuc	.align	8
1058*84d9c625SLionel Sambuc2:
1059*84d9c625SLionel Sambuc#endif
1060*84d9c625SLionel Sambuc	fmovd	%f0, %f4
1061*84d9c625SLionel Sambuc	ldd	[%o0], %f6
1062*84d9c625SLionel Sambuc	inc	8, %o0
1063*84d9c625SLionel Sambuc	ldd	[%o0], %f8
1064*84d9c625SLionel Sambuc	inc	8, %o0
1065*84d9c625SLionel Sambuc	ldd	[%o0], %f10
1066*84d9c625SLionel Sambuc	inc	8, %o0
1067*84d9c625SLionel Sambuc	ldd	[%o0], %f12
1068*84d9c625SLionel Sambuc	inc	8, %o0
1069*84d9c625SLionel Sambuc	ldd	[%o0], %f14
1070*84d9c625SLionel Sambuc	inc	8, %o0
1071*84d9c625SLionel Sambuc
1072*84d9c625SLionel Sambuc	cmp	%o0, %o5
1073*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f	/* within the limit: block-load %f16-%f30 */
1074*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1075*84d9c625SLionel Sambuc	membar #Sync
1076*84d9c625SLionel Sambuc2:
1077*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1078*84d9c625SLionel Sambuc3:
1079*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f32
1080*84d9c625SLionel Sambuc	cmp	%o0, %o5
1081*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f34
1082*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1083*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f36
1084*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f38
1085*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f40
1086*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1087*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
1088*84d9c625SLionel Sambuc	membar	#Sync
1089*84d9c625SLionel Sambuc2:
1090*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f42
1091*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1092*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f44
1093*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
1094*84d9c625SLionel Sambuc	 faligndata	%f18, %f20, %f46
1095*84d9c625SLionel Sambuc
1096*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1097*84d9c625SLionel Sambuc
1098*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f32
1099*84d9c625SLionel Sambuc	cmp	%o0, %o5
1100*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f34
1101*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1102*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f36
1103*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1104*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f38
1105*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f40
1106*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f	/* unsigned: %o0 and %o5 are addresses (was signed ble) */
1107*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
1108*84d9c625SLionel Sambuc	membar	#Sync
1109*84d9c625SLionel Sambuc2:
1110*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f42
1111*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1112*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f44
1113*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1114*84d9c625SLionel Sambuc	 faligndata	%f50, %f52, %f46
1115*84d9c625SLionel Sambuc
1116*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1117*84d9c625SLionel Sambuc
1118*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f32
1119*84d9c625SLionel Sambuc	cmp	%o0, %o5
1120*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f34
1121*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1122*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f36
1123*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f38
1124*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1125*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f40
1126*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1127*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1128*84d9c625SLionel Sambuc	membar	#Sync
1129*84d9c625SLionel Sambuc2:
1130*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f42
1131*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1132*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f44
1133*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1134*84d9c625SLionel Sambuc	 faligndata	%f2, %f4, %f46
1135*84d9c625SLionel Sambuc
1136*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1137*84d9c625SLionel Sambuc	ba	3b
1138*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
1139*84d9c625SLionel Sambuc
1140*84d9c625SLionel Sambuc	!!
1141*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+32
1142*84d9c625SLionel Sambuc	!!
1143*84d9c625SLionel Sambuc	!! We need to load 4 doubles by hand.
1144*84d9c625SLionel Sambuc	!!
/*
 * L104: copy loop for a source at doubleword offset 4 in its block.
 *
 * The doubleword already held in %f0 is moved to %f6, and four ldd
 * instructions fill %f8-%f14 while advancing %o0 by 8 each.  After that
 * %o0 is the block-load address, %o1 the store address, %o2 the
 * remaining byte count and %o5 the limit checked before each block load.
 *
 * Each of the three unrolled stages runs four faligndata on the current
 * bank, issues the guarded block load (ldda in an annulled delay slot,
 * executed only when %o0 <= %o5; otherwise membar #Sync runs), and then
 * produces the last four output doublewords from registers that straddle
 * into the next bank, the final one in the brlez delay slot.
 * %f32-%f46 are stored at %o1 unless %o2 has dropped to zero or below.
 * The banks rotate %f0-%f14 -> %f16-%f30 -> %f48-%f62.
 */
1145*84d9c625SLionel SambucL104:
1146*84d9c625SLionel Sambuc#ifdef RETURN_NAME
1147*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
1148*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
1149*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
1150*84d9c625SLionel Sambuc1:
1151*84d9c625SLionel Sambuc	.asciz	"L104"
1152*84d9c625SLionel Sambuc	.align	8
1153*84d9c625SLionel Sambuc2:
1154*84d9c625SLionel Sambuc#endif
1155*84d9c625SLionel Sambuc	fmovd	%f0, %f6
1156*84d9c625SLionel Sambuc	ldd	[%o0], %f8
1157*84d9c625SLionel Sambuc	inc	8, %o0
1158*84d9c625SLionel Sambuc	ldd	[%o0], %f10
1159*84d9c625SLionel Sambuc	inc	8, %o0
1160*84d9c625SLionel Sambuc	ldd	[%o0], %f12
1161*84d9c625SLionel Sambuc	inc	8, %o0
1162*84d9c625SLionel Sambuc	ldd	[%o0], %f14
1163*84d9c625SLionel Sambuc	inc	8, %o0
1164*84d9c625SLionel Sambuc
1165*84d9c625SLionel Sambuc	cmp	%o0, %o5
1166*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f	/* within the limit: block-load %f16-%f30 */
1167*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1168*84d9c625SLionel Sambuc	membar #Sync
1169*84d9c625SLionel Sambuc2:
1170*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1171*84d9c625SLionel Sambuc3:
1172*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f32
1173*84d9c625SLionel Sambuc	cmp	%o0, %o5
1174*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f34
1175*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1176*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f36
1177*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f38
1178*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1179*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
1180*84d9c625SLionel Sambuc	membar	#Sync
1181*84d9c625SLionel Sambuc2:
1182*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f40
1183*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f42
1184*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1185*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f44
1186*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
1187*84d9c625SLionel Sambuc	 faligndata	%f20, %f22, %f46
1188*84d9c625SLionel Sambuc
1189*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1190*84d9c625SLionel Sambuc
1191*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f32
1192*84d9c625SLionel Sambuc	cmp	%o0, %o5
1193*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f34
1194*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f36
1195*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1196*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f38
1197*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1198*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
1199*84d9c625SLionel Sambuc	membar	#Sync
1200*84d9c625SLionel Sambuc2:
1201*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f40
1202*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1203*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f42
1204*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1205*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f44
1206*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1207*84d9c625SLionel Sambuc	 faligndata	%f52, %f54, %f46
1208*84d9c625SLionel Sambuc
1209*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1210*84d9c625SLionel Sambuc
1211*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f32
1212*84d9c625SLionel Sambuc	cmp	%o0, %o5
1213*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f34
1214*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f36
1215*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1216*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f38
1217*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1218*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1219*84d9c625SLionel Sambuc	membar	#Sync
1220*84d9c625SLionel Sambuc2:
1221*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f40
1222*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1223*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f42
1224*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1225*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f44
1226*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1227*84d9c625SLionel Sambuc	 faligndata	%f4, %f6, %f46
1228*84d9c625SLionel Sambuc
1229*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1230*84d9c625SLionel Sambuc	ba	3b
1231*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
1232*84d9c625SLionel Sambuc
1233*84d9c625SLionel Sambuc	!!
1234*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+40
1235*84d9c625SLionel Sambuc	!!
1236*84d9c625SLionel Sambuc	!! We need to load 3 doubles by hand.
1237*84d9c625SLionel Sambuc	!!
/*
 * L105: copy loop for a source at doubleword offset 5 in its block.
 *
 * The doubleword already held in %f0 is moved to %f8, and three ldd
 * instructions fill %f10-%f14 while advancing %o0 by 8 each.  After that
 * %o0 is the block-load address, %o1 the store address, %o2 the
 * remaining byte count and %o5 the limit checked before each block load.
 *
 * Each of the three unrolled stages runs three faligndata on the current
 * bank, issues the guarded block load (ldda in an annulled delay slot,
 * executed only when %o0 <= %o5; otherwise membar #Sync runs), and then
 * produces the last five output doublewords from registers that straddle
 * into the next bank, the final one in the brlez delay slot.
 * %f32-%f46 are stored at %o1 unless %o2 has dropped to zero or below.
 * The banks rotate %f0-%f14 -> %f16-%f30 -> %f48-%f62.
 */
1238*84d9c625SLionel SambucL105:
1239*84d9c625SLionel Sambuc#ifdef RETURN_NAME
1240*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
1241*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
1242*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
1243*84d9c625SLionel Sambuc1:
1244*84d9c625SLionel Sambuc	.asciz	"L105"
1245*84d9c625SLionel Sambuc	.align	8
1246*84d9c625SLionel Sambuc2:
1247*84d9c625SLionel Sambuc#endif
1248*84d9c625SLionel Sambuc	fmovd	%f0, %f8
1249*84d9c625SLionel Sambuc	ldd	[%o0], %f10
1250*84d9c625SLionel Sambuc	inc	8, %o0
1251*84d9c625SLionel Sambuc	ldd	[%o0], %f12
1252*84d9c625SLionel Sambuc	inc	8, %o0
1253*84d9c625SLionel Sambuc	ldd	[%o0], %f14
1254*84d9c625SLionel Sambuc	inc	8, %o0
1255*84d9c625SLionel Sambuc
1256*84d9c625SLionel Sambuc	cmp	%o0, %o5
1257*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f	/* within the limit: block-load %f16-%f30 */
1258*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1259*84d9c625SLionel Sambuc	membar #Sync
1260*84d9c625SLionel Sambuc2:
1261*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1262*84d9c625SLionel Sambuc3:
1263*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f32
1264*84d9c625SLionel Sambuc	cmp	%o0, %o5
1265*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f34
1266*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f36
1267*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1268*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
1269*84d9c625SLionel Sambuc	membar	#Sync
1270*84d9c625SLionel Sambuc2:
1271*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f38
1272*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1273*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f40
1274*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1275*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f42
1276*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f44
1277*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
1278*84d9c625SLionel Sambuc	 faligndata	%f22, %f24, %f46
1279*84d9c625SLionel Sambuc
1280*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1281*84d9c625SLionel Sambuc
1282*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f32
1283*84d9c625SLionel Sambuc	cmp	%o0, %o5
1284*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f34
1285*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1286*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f36
1287*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1288*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
1289*84d9c625SLionel Sambuc	membar	#Sync
1290*84d9c625SLionel Sambuc2:
1291*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f38
1292*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1293*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f40
1294*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1295*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f42
1296*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f44
1297*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1298*84d9c625SLionel Sambuc	 faligndata	%f54, %f56, %f46
1299*84d9c625SLionel Sambuc
1300*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1301*84d9c625SLionel Sambuc
1302*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f32
1303*84d9c625SLionel Sambuc	cmp	%o0, %o5
1304*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f34
1305*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1306*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f36
1307*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1308*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1309*84d9c625SLionel Sambuc	membar	#Sync
1310*84d9c625SLionel Sambuc2:
1311*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f38
1312*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1313*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f40
1314*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1315*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f42
1316*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f44
1317*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1318*84d9c625SLionel Sambuc	 faligndata	%f6, %f8, %f46
1319*84d9c625SLionel Sambuc
1320*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1321*84d9c625SLionel Sambuc	ba	3b
1322*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
1323*84d9c625SLionel Sambuc
1324*84d9c625SLionel Sambuc
1325*84d9c625SLionel Sambuc	!!
1326*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+48
1327*84d9c625SLionel Sambuc	!!
1328*84d9c625SLionel Sambuc	!! We need to load 2 doubles by hand.
1329*84d9c625SLionel Sambuc	!!
/*
 * L106: copy loop for a source at doubleword offset 6 in its block.
 *
 * The doubleword already held in %f0 is moved to %f10, and two ldd
 * instructions fill %f12-%f14 while advancing %o0 by 8 each.  After that
 * %o0 is the block-load address, %o1 the store address, %o2 the
 * remaining byte count and %o5 the limit checked before each block load.
 *
 * Each of the three unrolled stages runs two faligndata on the current
 * bank, issues the guarded block load (ldda in an annulled delay slot,
 * executed only when %o0 <= %o5; otherwise membar #Sync runs), and then
 * produces the last six output doublewords from registers that straddle
 * into the next bank, the final one in the brlez delay slot.
 * %f32-%f46 are stored at %o1 unless %o2 has dropped to zero or below.
 * The banks rotate %f0-%f14 -> %f16-%f30 -> %f48-%f62.
 */
1330*84d9c625SLionel SambucL106:
1331*84d9c625SLionel Sambuc#ifdef RETURN_NAME
1332*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1	/* %g1 = address of the name string below */
1333*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
1334*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
1335*84d9c625SLionel Sambuc1:
1336*84d9c625SLionel Sambuc	.asciz	"L106"
1337*84d9c625SLionel Sambuc	.align	8
1338*84d9c625SLionel Sambuc2:
1339*84d9c625SLionel Sambuc#endif
1340*84d9c625SLionel Sambuc	fmovd	%f0, %f10
1341*84d9c625SLionel Sambuc	ldd	[%o0], %f12
1342*84d9c625SLionel Sambuc	inc	8, %o0
1343*84d9c625SLionel Sambuc	ldd	[%o0], %f14
1344*84d9c625SLionel Sambuc	inc	8, %o0
1345*84d9c625SLionel Sambuc
1346*84d9c625SLionel Sambuc	cmp	%o0, %o5
1347*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f	/* within the limit: block-load %f16-%f30 */
1348*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1349*84d9c625SLionel Sambuc	membar #Sync
1350*84d9c625SLionel Sambuc2:
1351*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1352*84d9c625SLionel Sambuc3:
1353*84d9c625SLionel Sambuc	faligndata	%f10, %f12, %f32
1354*84d9c625SLionel Sambuc	cmp	%o0, %o5
1355*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f34
1356*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1357*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
1358*84d9c625SLionel Sambuc	membar	#Sync
1359*84d9c625SLionel Sambuc2:
1360*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f36
1361*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1362*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f38
1363*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1364*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f40
1365*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f42
1366*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f44
1367*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone	/* count exhausted: leave with %f32-%f46 assembled */
1368*84d9c625SLionel Sambuc	 faligndata	%f24, %f26, %f46
1369*84d9c625SLionel Sambuc
1370*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1371*84d9c625SLionel Sambuc
1372*84d9c625SLionel Sambuc	faligndata	%f26, %f28, %f32
1373*84d9c625SLionel Sambuc	cmp	%o0, %o5
1374*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f34
1375*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1376*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
1377*84d9c625SLionel Sambuc	membar	#Sync
1378*84d9c625SLionel Sambuc2:
1379*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f36
1380*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1381*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f38
1382*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1383*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f40
1384*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f42
1385*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1386*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f44
1387*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1388*84d9c625SLionel Sambuc	 faligndata	%f56, %f58, %f46
1389*84d9c625SLionel Sambuc
1390*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1391*84d9c625SLionel Sambuc
1392*84d9c625SLionel Sambuc	faligndata	%f58, %f60, %f32
1393*84d9c625SLionel Sambuc	cmp	%o0, %o5
1394*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f34
1395*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1396*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1397*84d9c625SLionel Sambuc	membar	#Sync
1398*84d9c625SLionel Sambuc2:
1399*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f36
1400*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1401*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f38
1402*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1403*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f40
1404*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f42
1405*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1406*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f44
1407*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1408*84d9c625SLionel Sambuc	 faligndata	%f8, %f10, %f46
1409*84d9c625SLionel Sambuc
1410*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1411*84d9c625SLionel Sambuc	ba	3b
1412*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
1413*84d9c625SLionel Sambuc
1414*84d9c625SLionel Sambuc
1415*84d9c625SLionel Sambuc	!!
1416*84d9c625SLionel Sambuc	!! Source at BLOCK_ALIGN+56
1417*84d9c625SLionel Sambuc	!!
1418*84d9c625SLionel Sambuc	!! We need to load 1 double by hand.
1419*84d9c625SLionel Sambuc	!!
1420*84d9c625SLionel SambucL107:
/*
 * Block-copy loop for the case named in the comment above this label:
 * the source pointer is 56 bytes past a block boundary.  One doubleword
 * is loaded by hand, after which %o0 is advanced in whole blocks.  Each
 * stage then realigns eight doublewords with faligndata into %f32-%f46
 * and writes them with a single block store.  The byte offset used by
 * faligndata is whatever was programmed before this point (outside this
 * excerpt).
 *
 * Registers as used by the code in this section:
 *   %o0  source pointer (advanced by 8, then by BLOCK_SIZE per stage)
 *   %o1  destination pointer (advanced by BLOCK_SIZE per block stored)
 *   %o2  remaining byte count (reduced by BLOCK_SIZE per stage)
 *   %o5  limit compared against %o0 before every block load; presumably
 *        the last address a block load may start at -- it is set up
 *        outside this excerpt, TODO confirm
 *
 * Each cmp %o0, %o5 / bleu,a pair issues the block load from its annulled
 * delay slot only when %o0 <= %o5 (unsigned); otherwise the load is
 * skipped and the membar #Sync on the fall-through path runs instead.
 * NOTE(review): the branch tests %icc, so only the low 32 bits of the two
 * addresses take part in the comparison -- confirm this is intended.
 *
 * The loop at 3: is unrolled three times so that the inputs rotate:
 *   stage 1 reads %f12-%f28          while %f48 onward may be loading
 *   stage 2 reads %f28-%f30,%f48-%f60 while %f0  onward may be loading
 *   stage 3 reads %f60-%f62,%f0-%f12  while %f16 onward may be loading
 * Every stage leaves through Lmemcpy_blockdone once %o2 has dropped to
 * zero or below.  At that point the block just realigned is still held
 * in %f32-%f46 (not yet stored) and %o1 already addresses its slot.
 */
1421*84d9c625SLionel Sambuc#ifdef RETURN_NAME
	/* Debug variant: put the address of the name string below in %g1 */
	/* and branch around the inline string data. */
1422*84d9c625SLionel Sambuc	sethi	%hi(1f), %g1
1423*84d9c625SLionel Sambuc	ba,pt	%icc, 2f
1424*84d9c625SLionel Sambuc	 or	%g1, %lo(1f), %g1
1425*84d9c625SLionel Sambuc1:
1426*84d9c625SLionel Sambuc	.asciz	"L107"
1427*84d9c625SLionel Sambuc	.align	8
1428*84d9c625SLionel Sambuc2:
1429*84d9c625SLionel Sambuc#endif
	/* Prime the pipeline: %f0 (loaded before this excerpt) becomes the */
	/* older half of the first pair, and one doubleword is fetched by hand. */
1430*84d9c625SLionel Sambuc	fmovd	%f0, %f12
1431*84d9c625SLionel Sambuc	ldd	[%o0], %f14
1432*84d9c625SLionel Sambuc	inc	8, %o0
1433*84d9c625SLionel Sambuc
	/* First block load (into %f16), issued only if %o0 <= %o5. */
1434*84d9c625SLionel Sambuc	cmp	%o0, %o5
1435*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1436*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1437*84d9c625SLionel Sambuc	membar #Sync
1438*84d9c625SLionel Sambuc2:
1439*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1440*84d9c625SLionel Sambuc3:
	/* Stage 1: realign %f12-%f28 into %f32-%f46; next load goes to %f48. */
1441*84d9c625SLionel Sambuc	faligndata	%f12, %f14, %f32
1442*84d9c625SLionel Sambuc	cmp	%o0, %o5
1443*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1444*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f48
1445*84d9c625SLionel Sambuc	membar	#Sync
1446*84d9c625SLionel Sambuc2:
1447*84d9c625SLionel Sambuc	faligndata	%f14, %f16, %f34
1448*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1449*84d9c625SLionel Sambuc	faligndata	%f16, %f18, %f36
1450*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1451*84d9c625SLionel Sambuc	faligndata	%f18, %f20, %f38
1452*84d9c625SLionel Sambuc	faligndata	%f20, %f22, %f40
1453*84d9c625SLionel Sambuc	faligndata	%f22, %f24, %f42
1454*84d9c625SLionel Sambuc	faligndata	%f24, %f26, %f44
	/* Leave if the count is used up; the delay slot still completes %f46. */
1455*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1456*84d9c625SLionel Sambuc	 faligndata	%f26, %f28, %f46
1457*84d9c625SLionel Sambuc
	/* %o1 was already advanced past the previous block (or is still the */
	/* initial destination on the first pass), so store without bumping it. */
1458*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1459*84d9c625SLionel Sambuc
	/* Stage 2: realign %f28-%f30,%f48-%f60; next load goes to %f0. */
1460*84d9c625SLionel Sambuc	faligndata	%f28, %f30, %f32
1461*84d9c625SLionel Sambuc	cmp	%o0, %o5
1462*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1463*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f0
1464*84d9c625SLionel Sambuc	membar	#Sync
1465*84d9c625SLionel Sambuc2:
1466*84d9c625SLionel Sambuc	faligndata	%f30, %f48, %f34
1467*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1468*84d9c625SLionel Sambuc	faligndata	%f48, %f50, %f36
	/* Step %o1 past the block stored by stage 1. */
1469*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1470*84d9c625SLionel Sambuc	faligndata	%f50, %f52, %f38
1471*84d9c625SLionel Sambuc	faligndata	%f52, %f54, %f40
1472*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1473*84d9c625SLionel Sambuc	faligndata	%f54, %f56, %f42
1474*84d9c625SLionel Sambuc	faligndata	%f56, %f58, %f44
1475*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1476*84d9c625SLionel Sambuc	 faligndata	%f58, %f60, %f46
1477*84d9c625SLionel Sambuc
1478*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1479*84d9c625SLionel Sambuc
	/* Stage 3: realign %f60-%f62,%f0-%f12; next load goes to %f16. */
1480*84d9c625SLionel Sambuc	faligndata	%f60, %f62, %f32
1481*84d9c625SLionel Sambuc	cmp	%o0, %o5
1482*84d9c625SLionel Sambuc	bleu,a,pn	%icc, 2f
1483*84d9c625SLionel Sambuc	 ldda	[%o0] ASI_BLK_P, %f16
1484*84d9c625SLionel Sambuc	membar	#Sync
1485*84d9c625SLionel Sambuc2:
1486*84d9c625SLionel Sambuc	faligndata	%f62, %f0, %f34
1487*84d9c625SLionel Sambuc	dec	BLOCK_SIZE, %o2
1488*84d9c625SLionel Sambuc	faligndata	%f0, %f2, %f36
	/* Step %o1 past the block stored by stage 2. */
1489*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o1
1490*84d9c625SLionel Sambuc	faligndata	%f2, %f4, %f38
1491*84d9c625SLionel Sambuc	faligndata	%f4, %f6, %f40
1492*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o0
1493*84d9c625SLionel Sambuc	faligndata	%f6, %f8, %f42
1494*84d9c625SLionel Sambuc	faligndata	%f8, %f10, %f44
1495*84d9c625SLionel Sambuc
1496*84d9c625SLionel Sambuc	brlez,pn	%o2, Lmemcpy_blockdone
1497*84d9c625SLionel Sambuc	 faligndata	%f10, %f12, %f46
1498*84d9c625SLionel Sambuc
	/* Store, then loop; the delay slot steps %o1 past this block so that */
	/* stage 1 can store without adjusting it.  %f12/%f14 now hold the last */
	/* two doublewords of the block loaded at %f0, as stage 1 expects. */
1499*84d9c625SLionel Sambuc	stda	%f32, [%o1] ASI_STORE
1500*84d9c625SLionel Sambuc	ba	3b
1501*84d9c625SLionel Sambuc	 inc	BLOCK_SIZE, %o1
1502*84d9c625SLionel Sambuc
1503*84d9c625SLionel SambucLmemcpy_blockdone:
/*
 * Exit point of the block-copy loop visible above (reached via brlez on
 * %o2).  The loop leaves here after subtracting BLOCK_SIZE from %o2 for a
 * block it has realigned into %f32-%f46 but not stored, so the count is
 * first corrected and the pending data is then written one doubleword at
 * a time for as long as at least 8 bytes remain.
 */
1504*84d9c625SLionel Sambuc	inc	BLOCK_SIZE, %o2				! Fixup our overcommit
1505*84d9c625SLionel Sambuc	membar	#Sync					! Finish any pending loads
/*
 * FINISH_REG(f): subtract 8 from %o2, setting the condition codes.
 *  - If fewer than 8 bytes were left (signed less-than), branch to
 *    Lmemcpy_blockfinish.  The branch is annulled, so its delay slot,
 *    which copies f into %f48, runs only when the branch is taken; f is
 *    thereby handed over as the doubleword holding the final bytes.
 *  - Otherwise store f at [%o1] and advance %o1 by 8.
 * Clobbers %o2, %o1, the condition codes and, on the taken path, %f48.
 */
1506*84d9c625SLionel Sambuc#define	FINISH_REG(f)				\
1507*84d9c625SLionel Sambuc	deccc	8, %o2;				\
1508*84d9c625SLionel Sambuc	bl,a	Lmemcpy_blockfinish;		\
1509*84d9c625SLionel Sambuc	 fmovd	f, %f48;			\
1510*84d9c625SLionel Sambuc	std	f, [%o1];			\
1511*84d9c625SLionel Sambuc	inc	8, %o1
1512*84d9c625SLionel Sambuc
	/* Walk the eight pending doublewords in destination order. */
1513*84d9c625SLionel Sambuc	FINISH_REG(%f32)
1514*84d9c625SLionel Sambuc	FINISH_REG(%f34)
1515*84d9c625SLionel Sambuc	FINISH_REG(%f36)
1516*84d9c625SLionel Sambuc	FINISH_REG(%f38)
1517*84d9c625SLionel Sambuc	FINISH_REG(%f40)
1518*84d9c625SLionel Sambuc	FINISH_REG(%f42)
1519*84d9c625SLionel Sambuc	FINISH_REG(%f44)
1520*84d9c625SLionel Sambuc	FINISH_REG(%f46)
	/* Ninth step uses %f48 itself.  If no more than 64 bytes were left */
	/* after the fixup this one always takes the branch and stores */
	/* nothing -- presumably always the case, TODO confirm against the */
	/* value of BLOCK_SIZE, which is defined outside this excerpt. */
1521*84d9c625SLionel Sambuc	FINISH_REG(%f48)
1522*84d9c625SLionel Sambuc#undef FINISH_REG
1523*84d9c625SLionel Sambuc	!!
1524*84d9c625SLionel Sambuc	!! The low 3 bits of %o2 still hold the number of leftover bytes
1525*84d9c625SLionel Sambuc	!! (less than one doubleword) to be stored [because (x-8)&0x7 == x].
1526*84d9c625SLionel Sambuc	!!
1527*84d9c625SLionel SambucLmemcpy_blockfinish:
/*
 * Store whatever is left of the last doubleword, then return.
 *
 * When entered from a taken FINISH_REG branch, %f48 holds the doubleword
 * whose leading bytes are still owed to the destination, %o1 is where
 * they go, and %o2 is (bytes left - 8), so only bits 2, 1 and 0 of %o2
 * are meaningful.  They select an optional 4-byte, 2-byte and 1-byte
 * store, in that order.  A non-negative %o2 can only arrive by falling
 * through every FINISH_REG above; the first two tests handle that.
 *
 * The 2-byte and 1-byte stores take the low bits of an FP register, so
 * the wanted bytes of %f4 are first shifted down with alignaddr plus
 * faligndata: alignaddr sets the alignment offset to the low three bits
 * of (%o1 - 6) or (%o1 - 7), and faligndata on the pair %f0:%f4 then
 * leaves the bytes of %f4 at offset (%o1 & 7) in the low end of %f8.
 * This relies on %o1 having been doubleword aligned on entry, which the
 * block stores suggest -- TODO confirm.
 *
 * The function result is taken from %g1 (see the epilogue).
 */
	/* Delay slot is not annulled: %f4 = %f48 on both paths. */
1528*84d9c625SLionel Sambuc	brz,pn	%o2, 2f					! 100% complete?
1529*84d9c625SLionel Sambuc	 fmovd	%f48, %f4
1530*84d9c625SLionel Sambuc	cmp	%o2, 8					! Exactly 8 bytes?
	/* Annulled: the full doubleword store runs only if the branch is taken. */
	/* CCCR is a macro defined outside this excerpt. */
1531*84d9c625SLionel Sambuc	bz,a,pn	CCCR, 2f
1532*84d9c625SLionel Sambuc	 std	%f4, [%o1]
1533*84d9c625SLionel Sambuc
1534*84d9c625SLionel Sambuc	btst	4, %o2					! Word store?
1535*84d9c625SLionel Sambuc	bz	CCCR, 1f
1536*84d9c625SLionel Sambuc	 nop
	/* st writes the single-precision register %f4, i.e. the first four */
	/* bytes of the doubleword. */
1537*84d9c625SLionel Sambuc	st	%f4, [%o1]
1538*84d9c625SLionel Sambuc	inc	4, %o1
1539*84d9c625SLionel Sambuc1:
	/* Two bytes left?  %f0 is zeroed to serve as the upper half of the */
	/* faligndata pair; none of its bytes reach memory. */
1540*84d9c625SLionel Sambuc	btst	2, %o2
1541*84d9c625SLionel Sambuc	fzero	%f0
1542*84d9c625SLionel Sambuc	bz	1f
1543*84d9c625SLionel Sambuc
	/* Delay slot of the bz above (not annulled, so it always runs). */
1544*84d9c625SLionel Sambuc	 mov	-6, %o4
	/* Only the alignment offset is wanted; the address result goes to %g0. */
1545*84d9c625SLionel Sambuc	alignaddr %o1, %o4, %g0
1546*84d9c625SLionel Sambuc
1547*84d9c625SLionel Sambuc	faligndata %f0, %f4, %f8
1548*84d9c625SLionel Sambuc
1549*84d9c625SLionel Sambuc	stda	%f8, [%o1] ASI_FL16_P			! Store short
1550*84d9c625SLionel Sambuc	inc	2, %o1
1551*84d9c625SLionel Sambuc1:
	/* Bit 0 of %o2 set means one last byte remains to be stored. */
1552*84d9c625SLionel Sambuc	btst	1, %o2					! Byte aligned?
1553*84d9c625SLionel Sambuc	bz	2f
1554*84d9c625SLionel Sambuc
	/* Delay slot of the bz above; %o0 is free to use as scratch by now. */
1555*84d9c625SLionel Sambuc	 mov	-7, %o0					! Calculate dest - 7
1556*84d9c625SLionel Sambuc	alignaddr %o1, %o0, %g0				! Calculate shift mask and dest.
1557*84d9c625SLionel Sambuc
1558*84d9c625SLionel Sambuc	faligndata %f0, %f4, %f8			! Move 1st byte to low part of f8
1559*84d9c625SLionel Sambuc
1560*84d9c625SLionel Sambuc	stda	%f8, [%o1] ASI_FL8_P			! Store 1st byte
1561*84d9c625SLionel Sambuc	inc	1, %o1					! Update address
1562*84d9c625SLionel Sambuc2:
	/* Wait for the stores above to complete before returning. */
1563*84d9c625SLionel Sambuc	membar	#Sync
	/* Disabled self-check: compares the two buffers byte by byte (from */
	/* %i0, %i1, length %i2) and, on a mismatch, writes a nonzero value */
	/* to block_disable, prints via prom_printf and traps. */
1564*84d9c625SLionel Sambuc#if 0
1565*84d9c625SLionel Sambuc	!!
1566*84d9c625SLionel Sambuc	!! verify copy success.
1567*84d9c625SLionel Sambuc	!!
1568*84d9c625SLionel Sambuc
1569*84d9c625SLionel Sambuc	mov	%i0, %o2
1570*84d9c625SLionel Sambuc	mov	%i1, %o4
1571*84d9c625SLionel Sambuc	mov	%i2, %l4
1572*84d9c625SLionel Sambuc0:
1573*84d9c625SLionel Sambuc	ldub	[%o2], %o1
1574*84d9c625SLionel Sambuc	inc	%o2
1575*84d9c625SLionel Sambuc	ldub	[%o4], %o3
1576*84d9c625SLionel Sambuc	inc	%o4
1577*84d9c625SLionel Sambuc	cmp	%o3, %o1
1578*84d9c625SLionel Sambuc	bnz	1f
1579*84d9c625SLionel Sambuc	 dec	%l4
1580*84d9c625SLionel Sambuc	brnz	%l4, 0b
1581*84d9c625SLionel Sambuc	 nop
1582*84d9c625SLionel Sambuc	ba	2f
1583*84d9c625SLionel Sambuc	 nop
1584*84d9c625SLionel Sambuc
1585*84d9c625SLionel Sambuc1:
1586*84d9c625SLionel Sambuc	set	block_disable, %o0
1587*84d9c625SLionel Sambuc	stx	%o0, [%o0]
1588*84d9c625SLionel Sambuc
1589*84d9c625SLionel Sambuc	set	0f, %o0
1590*84d9c625SLionel Sambuc	call	prom_printf
1591*84d9c625SLionel Sambuc	 sub	%i2, %l4, %o5
1592*84d9c625SLionel Sambuc	set	1f, %o0
1593*84d9c625SLionel Sambuc	mov	%i0, %o2
1594*84d9c625SLionel Sambuc	mov	%i1, %o1
1595*84d9c625SLionel Sambuc	call	prom_printf
1596*84d9c625SLionel Sambuc	 mov	%i2, %o3
1597*84d9c625SLionel Sambuc	ta	1
1598*84d9c625SLionel Sambuc	.data
1599*84d9c625SLionel Sambuc	_ALIGN
1600*84d9c625SLionel Sambuc0:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1601*84d9c625SLionel Sambuc1:	.asciz	"memcpy(%p, %p, %lx)\r\n"
1602*84d9c625SLionel Sambuc	_ALIGN
1603*84d9c625SLionel Sambuc	.text
1604*84d9c625SLionel Sambuc2:
1605*84d9c625SLionel Sambuc#endif
1606*84d9c625SLionel Sambuc#if defined(_KERNEL) && !defined(_RUMPKERNEL)
1607*84d9c625SLionel Sambuc
1608*84d9c625SLionel Sambuc/*
1609*84d9c625SLionel Sambuc * We have saved our possible fpstate, now disable the fpu
1610*84d9c625SLionel Sambuc * and continue with life.
1611*84d9c625SLionel Sambuc */
	/* Kernel build: RESTORE_FPU is a macro defined outside this excerpt. */
	/* ret/restore returns through a register window (the matching save is */
	/* not visible here) and hands %g1 back in the caller's %o0. */
1612*84d9c625SLionel Sambuc	RESTORE_FPU
1613*84d9c625SLionel Sambuc	ret
1614*84d9c625SLionel Sambuc	 restore	%g1, 0, %o0			! Return DEST for memcpy
1615*84d9c625SLionel Sambuc#endif
	/* Leaf return with %g1 as the result.  In the kernel build the ret */
	/* above has already returned, so these two instructions are not reached. */
1616*84d9c625SLionel Sambuc 	retl
1617*84d9c625SLionel Sambuc	 mov	%g1, %o0
1618*84d9c625SLionel Sambuc/*
1619*84d9c625SLionel Sambuc * Use block_disable to turn off block insns for
1620*84d9c625SLionel Sambuc * memcpy/memset
1621*84d9c625SLionel Sambuc */
1622*84d9c625SLionel Sambuc	.data
1623*84d9c625SLionel Sambuc	.align	8
	/* block_disable: exported, 8-byte aligned 64-bit word in .data with an */
	/* initial value of 1.  The code that reads it is outside this excerpt; */
	/* presumably a nonzero value turns the block load/store path off -- */
	/* TODO confirm against the readers. */
1624*84d9c625SLionel Sambuc	.globl	block_disable
1625*84d9c625SLionel Sambucblock_disable:	.xword	1
	/* Switch back to the code section for whatever follows. */
1626*84d9c625SLionel Sambuc	.text
1627*84d9c625SLionel Sambuc#endif	/* USE_BLOCK_STORE_LOAD */
1628