/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma	ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */
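
/*
 * hwblkclr() zeroes BLOCKSIZE-aligned regions with non-temporal
 * movntdq stores through %xmm0 (falling back to bzero() for unaligned
 * or odd-sized requests), and hwblkpagecopy() copies a full page the
 * same way through %xmm0-%xmm7, avoiding cache pollution.  Because
 * both touch FPU/XMM state and temporarily clear CR0.TS, callers must
 * have kernel preemption disabled; DEBUG kernels enforce this with
 * ASSERT_KPREEMPT_DISABLED().  The *_no_xmm() variants at the bottom
 * of the file use integer movnti stores and need no FPU state
 * handling.
 */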
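/*
 * ASSERT_KPREEMPT_DISABLED(t, r32, msg) panics with 'msg' if
 * curthread->t_preempt is zero, i.e. if kernel preemption has not
 * been disabled.  It clobbers both scratch registers ('t' and 'r32')
 * and expands to nothing on non-DEBUG kernels.
 */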
#if defined(DEBUG)
#if defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* __i386 */
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */

#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else	/* __lint */

#if defined(__amd64)
#define	ADD	addq
#define	SUB	subq
#else
#define	ADD	addl
#define	SUB	subl
#endif

#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

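/*
 * SAVE_XMM0/RSTOR_XMM0 spill and reload %xmm0 through a save area
 * established by SAVE_XMM_PROLOG/RSTOR_XMM_EPILOG; they are only used
 * when CR0.TS was clear, i.e. when there is live FPU state to
 * preserve.  The zero loop clears %xmm0 with pxor, each
 * ZERO_LOOP_BODY_XMM pass then writes one 64-byte block with four
 * 16-byte movntdq non-temporal stores, and the final mfence orders
 * those weakly ordered stores before the routine returns.
 */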
#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)

#if defined(__amd64)

	/*
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
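	/*
	 * Save %cr0 and clear CR0.TS so that the %xmm0 references below
	 * cannot raise a device-not-available fault.  If TS was already
	 * set there is no live FPU context to preserve, so %xmm0 is
	 * saved and restored only when TS was clear; the original %cr0
	 * is reloaded on the way out either way.
	 */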
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	1f

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0
	leave
	ret
.dobzero:
	leave
	jmp	bzero
	SET_SIZE(hwblkclr)

#elif defined(__i386)

	/*
	 * %eax		dst
	 * %ecx		size in bytes, loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testl	$BLOCKMASK, %ecx	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrl	$BLOCKSHIFT, %ecx
	movl	0xc(%esp), %edx
	pushl	%ebx

	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	1f

	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%eax)

	testl	$CR0_TS, %ebx
	jnz	2f
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0
	popl	%ebx
	ret
.dobzero:
	jmp	bzero
	SET_SIZE(hwblkclr)

#endif	/* __i386 */
#endif	/* __lint */


#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else	/* __lint */

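/*
 * PREFETCH_START() warms the first two 64-byte lines of the source
 * with prefetchnta, which hints the hardware to fetch them with
 * minimal cache pollution before the copy loop begins.
 */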
#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

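/*
 * The copy loop is software pipelined: COPY_LOOP_INIT_XMM preloads the
 * first 128 bytes of the source into %xmm0-%xmm7, each
 * COPY_LOOP_BODY_XMM pass streams those registers to the destination
 * with movntdq while prefetching ahead and loading the next 128 bytes,
 * and COPY_LOOP_FINI_XMM drains the last set of registers.  The
 * callers issue mfence before returning to order the non-temporal
 * stores.
 */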
#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)

#if defined(__amd64)

	/*
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	3f
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

	/*
	 * %eax		src
	 * %edx		dst
	 * %ecx		loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 * %esi		#if DEBUG temporary thread pointer
	 */
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax
	movl	8(%esp), %edx
	PREFETCH_START(%eax)
	pushl	%ebx
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	3f
	pushl	%edi
	SAVE_XMMS(%edi)
3:	COPY_LOOP_INIT_XMM(%eax)
4:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jnz	5f
	RSTOR_XMMS(%edi)
	popl	%edi
5:	movl	%ebx, %cr0
	popl	%ebx
	mfence
	ret
	SET_SIZE(hwblkpagecopy)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}

#else	/* __lint */

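/*
 * The destination pointer is advanced to the end of the buffer and
 * the length register is biased to -len, so the loop indexes upward
 * toward zero and a single jnz both continues and terminates it.
 * Each pass writes 32 bytes (16 on i386) with four movnti stores;
 * movnti moves data from a general purpose register, so no FPU/XMM
 * state or CR0.TS handling is needed.
 */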
#if defined(__amd64)

	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	xorl	%eax, %eax
	addq	%rsi, %rdi
	negq	%rsi
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)

	ENTRY(block_zero_no_xmm)
	pushl	%ebp
	movl	%esp, %ebp
	xorl	%eax, %eax
	movl	8(%ebp), %edx
	movl	12(%ebp), %ecx
	addl	%ecx, %edx
	negl	%ecx
1:
	movnti	%eax, (%edx, %ecx)
	movnti	%eax, 4(%edx, %ecx)
	movnti	%eax, 8(%edx, %ecx)
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */


#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV	This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}

#else	/* __lint */

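/*
 * Same negative-index loop structure as block_zero_no_xmm above: each
 * quadword (longword on i386) is loaded into %rax/%eax and stored to
 * the destination non-temporally with movnti, with a trailing mfence.
 */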
#if defined(__amd64)

	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi
	addq	%rcx, %rsi
	negq	%rcx
1:
	movq	(%rsi, %rcx), %rax
	movnti	%rax, (%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jnz	1b
	mfence
	ret
	SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)

	ENTRY(page_copy_no_xmm)
	pushl	%esi
	movl	$MMU_STD_PAGESIZE, %ecx
	movl	8(%esp), %edx
	movl	12(%esp), %esi
	addl	%ecx, %edx
	addl	%ecx, %esi
	negl	%ecx
1:
	movl	(%esi, %ecx), %eax
	movnti	%eax, (%edx, %ecx)
	movl	4(%esi, %ecx), %eax
	movnti	%eax, 4(%edx, %ecx)
	movl	8(%esi, %ecx), %eax
	movnti	%eax, 8(%edx, %ecx)
	movl	12(%esi, %ecx), %eax
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	popl	%esi
	ret
	SET_SIZE(page_copy_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(DEBUG) && !defined(__lint)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif