xref: /onnv-gate/usr/src/lib/libc/sparcv9/crt/__align_cpy_4.s (revision 7298:b69e27387f74)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Source file name handed to the assembler. */
	.file	"__align_cpy_4.s"

/* __align_cpy_4(s1, s2, n)
 *
 * Copy 4-byte aligned source to 4-byte aligned target in multiples of 4 bytes.
 *
 * Input:
 *	o0	address of target
 *	o1	address of source
 *	o2	number of bytes to copy (must be a multiple of 4)
 * Output:
 *	o0	address of target
 * Caller's registers that have been changed by this function:
 *	o1-o5, g1, g5
 *
 * Note:
 *	This helper routine will not be used by any 32-bit compilations.
 *	To do so would break binary compatibility with previous versions of
 *	Solaris.
 *
 * Assumptions:
 *	Source and target addresses are 4-byte aligned.
 *	Bytes to be copied are non-overlapping or _exactly_ overlapping.
 *	The number of bytes to be copied is a multiple of 4.
 *	Call will usually be made with a byte count of more than 4*4 and
 *	less than a few hundred bytes.  Legal values are 0 to MAX_SIZE_T.
 *
 * Optimization attempt:
 *	Reasonable speed for a generic v9.
 */

#include <sys/asm_linkage.h>

! Working registers (arguments and the caller-visible clobber list are in the
! block comment ahead of this routine):
!	%g1	running target pointer (%o0 is left untouched as the return value)
!	%o1	running source pointer
!	%o2	bytes left to copy; reused as a data register in .noton8
!	%o3-%o5	scratch and data being moved
!	%g5	.noton8 only: address just past the end of the source
!
! The instruction after each branch is its delay slot.  On a conditional
! branch with the ,a (annul) suffix, that instruction runs only when the
! branch is taken.

	ENTRY(__align_cpy_4)
	brz,pn	%o2, .done		! Skip out if no bytes to copy.
	cmp	%o0, %o1		! (delay slot) Compare target and source.
	be,pn	%xcc, .done		! Addresses are identical--done.
	and	%o0, 7, %o3		! (delay slot) Is target 8-byte aligned?
	and	%o1, 7, %o4		! Is source 8-byte aligned?
	cmp	%o3, %o4
	bne,pt	%icc, .noton8		! Exactly one of source and target is
	mov	%o0, %g1		!     8-byte aligned.  (This delay slot
					!     runs on both paths: %g1 = target.)
	brz,pt	%o3, .both8		! Both are 8-byte aligned.
	nop

	ld	[%o1], %o3		! Neither is aligned, so do 4 bytes;
	subcc	%o2, 4, %o2		! then both will be aligned.
	st	%o3, [%g1]
	bz,pn	%xcc, .done		! Those 4 bytes were the whole request.
	add	%g1, 4, %g1		! (delay slot) Advance target.
	b	.both8
	add	%o1, 4, %o1		! (delay slot) Advance source.

! Section of code dealing with case where source and target are both 8-byte
! aligned.  Get and store 16 bytes at a time using ldx and stx.
!
! The byte count is tested with signed conditions (bl, bg) throughout this
! section, so it is treated as a signed 64-bit value here.

	.align	32
.both8:					! Both source and target are aligned.
	cmp	%o2, 16
	bl,a,pn	%xcc, .chkwd		! Fewer than 16 bytes: skip the loop.
	cmp	%o2, 8			! (annulled slot) Set flags for .chkwd.

	sub	%o2, 12, %o2		! Bias the count by 12: with a multiple
					! of 4, a positive result after taking
					! off 16 means 16 or more remain.
.loop16a:				! Load and store 16 bytes at a time.
	ldx	[%o1], %o3
	ldx	[%o1+8], %o4
	subcc	%o2, 16, %o2
	stx	%o3, [%g1]
	stx	%o4, [%g1+8]
	add	%o1, 16, %o1
	bg,pt	%xcc, .loop16a		! Have at least 16 bytes left.
	add	%g1, 16, %g1		! (delay slot) Advance target.

	addcc	%o2, 12, %o2		! Remove the bias: 0, 4, 8 or 12 left.
	bg,a,pt	%xcc, .chkwd		! Have some remaining bytes.
	cmp	%o2, 8			! (annulled slot) Set flags for .chkwd.
	retl
	nop

.chkwd:					! Entered with flags from cmp %o2, 8.
	bl,a,pn	%xcc, .wrword		! Only 4 bytes left.
	ld	[%o1], %o3		! (annulled slot) Fetch the last word.

	ldx	[%o1], %o3		! Have 8 or 12, so do 8.
	stx	%o3, [%g1]
	add	%o1, 8, %o1
	add	%g1, 8, %g1
	subcc	%o2, 8, %o2
	bg,a,pn	%xcc, .wrword		! Still have four to do.
	ld	[%o1], %o3		! (annulled slot) Fetch the last word.

	retl
	nop

.wrword:				! Copy final word; %o3 was loaded in
	st	%o3, [%g1]		! the delay slot of the branch here.
					! Falls through to .done.
.done:
	retl
	nop

! Section of code where either source or target, but not both, are 8-byte
! aligned.  So, use ld and st instructions rather than trying to copy stuff
! around in registers.
!
! The leftover (count mod 16) bytes are copied first, then whole 16-byte
! groups until the source pointer reaches the end address in %g5.  The
! pointer tests use the unsigned condition blu because they compare addresses.

	.align	32			! Ultra cache line boundary.
.noton8:
	add	%o1, %o2, %g5	! Ending address of source.
	andcc	%o2, 15, %o3	! Mod 16 of number of bytes to copy.
	bz,pn	%xcc, .loop16	! Copy odd amounts first, then multiples of 16.
	cmp	%o3, 4		! (delay slot)
	bz,pn	%xcc, .mod4
	cmp	%o3, 8		! (delay slot)
	bz,pn	%xcc, .mod8
	cmp	%o3, 12		! (delay slot)
	bz,pt	%xcc, .mod12
	nop
	illtrap	0		! Size not valid.

.mod4:				! Do first 4 bytes, then do multiples of 16.
	lduw	[%o1], %o2
	add	%o1, 4, %o1
	st	%o2, [%g1]
	cmp	%o1, %g5
	blu,a,pt %xcc, .loop16	! More source remains.
	add	%g1, 4, %g1	! (annulled slot) Advance target.
	retl
	nop
.mod8:				! Do first 8 bytes, then do multiples of 16.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	add	%o1, 8, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5
	blu,a,pt %xcc, .loop16	! More source remains.
	add	%g1, 8, %g1	! (annulled slot) Advance target.
	retl
	nop
.mod12:				! Do first 12 bytes, then do multiples of 16.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	add	%o1, 12, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	st	%o4, [%g1+8]
	cmp	%o1, %g5
	blu,a,pt %xcc, .loop16	! More source remains.
	add	%g1, 12, %g1	! (annulled slot) Advance target.
	retl
	nop
	.align	32			! Ultra cache line boundary.
.loop16:				! Do multiples of 16 bytes.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	lduw	[%o1+12], %o5
	add	%o1, 16, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5		! Reached the end of the source?
	st	%o4, [%g1+8]
	st	%o5, [%g1+12]
	blu,a,pt %xcc, .loop16		! Not yet: go round again.
	add	%g1, 16, %g1		! (annulled slot) Advance target.
	retl			! Target address is already in o0.
	nop

	SET_SIZE(__align_cpy_4)
197