/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

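/
/ Strategy: memmove falls through to the memcpy code whenever a forward
/ copy is safe, and only takes the backward (.CopyLeft) path when the
/ destination overlaps the source.  memcpy copies with rep smovl for
/ anything under 64 bytes, switches to 64-byte SSE loops for larger
/ copies, and uses non-temporal stores for copies of 64K and larger.
/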
	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

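	/
	/ Overlap check: a forward copy is safe unless the destination
	/ starts inside the source buffer (src < dst <= src + size - 1).
	/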
	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note:	cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

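	/
	/ Word-at-a-time copy for short moves: rep smovl moves the 4-byte
	/ words, then the unrolled byte copies below handle the 0-3 byte tail.
	/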
.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words


	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)	/ prefetch destination as well
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

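	/
	/ Copies of 64K and larger use non-temporal (movntps) stores so the
	/ data streams to memory without displacing the caches; the fence
	/ after the loop orders those weakly-ordered stores before returning.
	/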
	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ %eax = -dst & 15, the number of bytes
	andl	$15, %eax	/ needed to align dst on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

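	/
	/ Destination is 16 byte aligned but the source is not: use
	/ unaligned loads, again switching to non-temporal stores for
	/ copies of 64K and larger.
	/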
	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align	16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
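/ On entry %edx already holds src + size - 1, computed at the memmove
/ entry point above.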

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size <= 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/    do the byte copy
	cld				/    reset direction flag to LtoR
	popl	%edi			/  }
	popl	%esi			/  restore registers
	movl	4(%esp),%eax		/  set up return value
	ret				/  return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that the remaining
	subl	%ecx,%edx		/ copy is done on an aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)