xref: /onnv-gate/usr/src/common/bignum/i386/bignum_i386_asm.s (revision 12826:fca99d9e3f2f)
10Sstevel@tonic-gate/*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5*12826Skuriakose.kuruvilla@oracle.com * Common Development and Distribution License (the "License").
6*12826Skuriakose.kuruvilla@oracle.com * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate/*
22*12826Skuriakose.kuruvilla@oracle.com * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
230Sstevel@tonic-gate */
240Sstevel@tonic-gate
250Sstevel@tonic-gate#include <sys/asm_linkage.h>
260Sstevel@tonic-gate#include <sys/x86_archext.h>
270Sstevel@tonic-gate#include <sys/controlregs.h>
280Sstevel@tonic-gate
290Sstevel@tonic-gate#if defined(__lint)
300Sstevel@tonic-gate
310Sstevel@tonic-gate#include <sys/types.h>
320Sstevel@tonic-gate
/*
 * Lint-only stand-in for the assembly routine of the same name, which is
 * built when __lint is not defined.  The stub always reports 0.
 */
uint32_t
bignum_use_sse2()
{
	return (0);
}
360Sstevel@tonic-gate
/*
 * Lint-only stand-in.  The real routine is assembly and takes its operands
 * in registers, so it is not to be called by C code; the stub takes no
 * parameters and always returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{
	return (0);
}
420Sstevel@tonic-gate
/*
 * Lint-only stand-in.  The real routine is assembly and takes its operands
 * in registers, so it is not to be called by C code; the stub takes no
 * parameters and always returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{
	return (0);
}
480Sstevel@tonic-gate
/*
 * Lint-only stand-in for the SSE2 assembly routine (r = a * digit over len
 * digits, returning the carry digit).  The stub ignores its arguments and
 * returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
530Sstevel@tonic-gate
/*
 * Lint-only stand-in for the SSE2 assembly routine (r = r + a * digit over
 * len digits, returning the carry digit).  The stub ignores its arguments
 * and returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
580Sstevel@tonic-gate
/*
 * Lint-only stand-in for the assembly routine of the same name.  The stub
 * ignores its arguments and does nothing.
 */
/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{
}
630Sstevel@tonic-gate
/*
 * Lint-only stand-in for the assembly routine of the same name.  The stub
 * ignores its arguments and does nothing.
 */
/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{
}
680Sstevel@tonic-gate
690Sstevel@tonic-gate#if defined(MMX_MANAGE)
700Sstevel@tonic-gate
/*
 * Lint-only stand-in for the MMX_MANAGE variant that does no state saving
 * (r = a * digit, returning the carry digit).  The stub ignores its
 * arguments and returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
750Sstevel@tonic-gate
/*
 * Lint-only stand-in for the MMX_MANAGE variant that does no state saving
 * (r = r + a * digit, returning the carry digit).  The stub ignores its
 * arguments and returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
800Sstevel@tonic-gate
/*
 * Lint-only stand-in for an assembly routine that is not to be called by
 * C code.  The stub ignores its arguments and does nothing.
 */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{
}
860Sstevel@tonic-gate
870Sstevel@tonic-gate#endif	/* MMX_MANAGE */
880Sstevel@tonic-gate
890Sstevel@tonic-gate/*
900Sstevel@tonic-gate * UMUL
910Sstevel@tonic-gate *
920Sstevel@tonic-gate */
930Sstevel@tonic-gate
/*
 * Lint-only stand-in for the MUL-based (non-SSE2) assembly routine.  The
 * stub ignores its arguments and returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
980Sstevel@tonic-gate
/*
 * Lint-only stand-in for the MUL-based (non-SSE2) assembly routine.  The
 * stub ignores its arguments and returns 0.
 */
/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{
	return (0);
}
1030Sstevel@tonic-gate
1040Sstevel@tonic-gate#else	/* __lint */
1050Sstevel@tonic-gate
1060Sstevel@tonic-gate#if defined(MMX_MANAGE)
1070Sstevel@tonic-gate
#if defined(_KERNEL)

/*
 * Kernel build: bracket MMX use with calls to kpr_disable/kpr_enable.
 */
#define	KPREEMPT_DISABLE	call kpr_disable
#define	KPREEMPT_ENABLE		call kpr_enable

/*
 * TEST_TS(reg): copy %cr0 into reg, clear the TS bit with clts, then test
 * the saved copy.  Afterwards reg holds the original %cr0 and ZF is clear
 * exactly when TS was set on entry.
 */
#define	TEST_TS(reg)			\
	movl	%cr0, reg;		\
	clts;				\
	testl	$CR0_TS, reg

#else	/* _KERNEL */

/*
 * Non-kernel build: the preemption hooks expand to nothing.
 */
#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

/*
 * TEST_TS(reg): reg is zeroed, so the test always leaves ZF set
 * (TS is reported as clear).
 */
#define	TEST_TS(reg)			\
	xorl	reg, reg;		\
	testl	$CR0_TS, reg

#endif	/* _KERNEL */
1270Sstevel@tonic-gate
#define	MMX_SIZE	8	/* bytes stored per MMX register */
#define	MMX_ALIGN	8	/* alignment of the save area, in bytes */

/*
 * SAVE_MMX_PROLOG(sreg, nreg): lower %esp to make room for a save area and
 * leave sreg pointing at an MMX_ALIGN-aligned address above the new %esp.
 * The amount reserved is whatever _MUL yields for these arguments; _MUL
 * comes from a header that is not part of this file.
 */
#define	SAVE_MMX_PROLOG(sreg, nreg)				\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	leal	MMX_ALIGN(%esp), sreg;				\
	andl	$-1![MMX_ALIGN-1], sreg;

/*
 * RSTOR_MMX_EPILOG(nreg): give back exactly what SAVE_MMX_PROLOG reserved
 * for the same nreg.
 */
#define	RSTOR_MMX_EPILOG(nreg)					\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

/*
 * SAVE_MMX_0TO4(sreg): reserve a save area and store %mm0-%mm4 into it at
 * consecutive 8-byte slots starting at sreg.  sreg must be preserved until
 * the matching RSTOR_MMX_0TO4.
 */
#define	SAVE_MMX_0TO4(sreg)		\
	SAVE_MMX_PROLOG(sreg, 5);	\
	movq	%mm0, 0(sreg);		\
	movq	%mm1, 8(sreg);		\
	movq	%mm2, 16(sreg);		\
	movq	%mm3, 24(sreg);		\
	movq	%mm4, 32(sreg)

/*
 * RSTOR_MMX_0TO4(sreg): reload %mm0-%mm4 from the slots written by
 * SAVE_MMX_0TO4 and release the save area.
 */
#define	RSTOR_MMX_0TO4(sreg)		\
	movq	0(sreg), %mm0;		\
	movq	8(sreg), %mm1;		\
	movq	16(sreg), %mm2;		\
	movq	24(sreg), %mm3;		\
	movq	32(sreg), %mm4;		\
	RSTOR_MMX_EPILOG(5)
1550Sstevel@tonic-gate
1560Sstevel@tonic-gate#endif	/* MMX_MANAGE */
1570Sstevel@tonic-gate
1580Sstevel@tonic-gate/ Note: this file contains implementations for
1590Sstevel@tonic-gate/	big_mul_set_vec()
1600Sstevel@tonic-gate/	big_mul_add_vec()
1610Sstevel@tonic-gate/	big_mul_vec()
1620Sstevel@tonic-gate/	big_sqr_vec()
1630Sstevel@tonic-gate/ One set of implementations is for SSE2-capable models.
1640Sstevel@tonic-gate/ The other uses no MMX, SSE, or SSE2 instructions, only
1650Sstevel@tonic-gate/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
1660Sstevel@tonic-gate/
1670Sstevel@tonic-gate/ The code for the implementations is grouped by SSE2 vs UMUL,
1680Sstevel@tonic-gate/ rather than grouping pairs of implementations for each function.
1690Sstevel@tonic-gate/ This is because the bignum implementation gets "imprinted"
1700Sstevel@tonic-gate/ on the correct implementation, at the time of first use,
1710Sstevel@tonic-gate/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
1730Sstevel@tonic-gate/ the "footprint" of executed code.
1740Sstevel@tonic-gate
1750Sstevel@tonic-gate/ Can we use SSE2 instructions?  Return value is non-zero
1760Sstevel@tonic-gate/ if we can.
1770Sstevel@tonic-gate/
1780Sstevel@tonic-gate/ Note:
1790Sstevel@tonic-gate/   Using the cpuid instruction directly would work equally
1800Sstevel@tonic-gate/   well in userland and in the kernel, but we do not use the
181*12826Skuriakose.kuruvilla@oracle.com/   cpuid instruction in the kernel, we use x86_featureset,
182*12826Skuriakose.kuruvilla@oracle.com/   instead.  This means we honor any decisions the kernel
183*12826Skuriakose.kuruvilla@oracle.com/   startup code may have made in setting this variable,
184*12826Skuriakose.kuruvilla@oracle.com/   including disabling SSE2.  It might even be a good idea
185*12826Skuriakose.kuruvilla@oracle.com/   to honor this kind of setting in userland, as well, but
186*12826Skuriakose.kuruvilla@oracle.com/   the variable, x86_featureset is not readily available to
187*12826Skuriakose.kuruvilla@oracle.com/   userland processes.
1880Sstevel@tonic-gate/
1890Sstevel@tonic-gate/ uint32_t
1900Sstevel@tonic-gate/ bignum_use_sse2()
1910Sstevel@tonic-gate
1920Sstevel@tonic-gate	ENTRY(bignum_use_sse2)
1930Sstevel@tonic-gate#if defined(_KERNEL)
194*12826Skuriakose.kuruvilla@oracle.com	xor	%eax, %eax
195*12826Skuriakose.kuruvilla@oracle.com	bt	$X86FSET_SSE2, x86_featureset
196*12826Skuriakose.kuruvilla@oracle.com	adc     %eax, %eax
1970Sstevel@tonic-gate#else	/* _KERNEL */
1980Sstevel@tonic-gate	pushl	%ebx
1990Sstevel@tonic-gate	movl	$1, %eax		/ Get feature information
2000Sstevel@tonic-gate	cpuid
2010Sstevel@tonic-gate	movl	%edx, %eax		/ set return value
2020Sstevel@tonic-gate	popl	%ebx
2030Sstevel@tonic-gate	andl	$CPUID_INTC_EDX_SSE2, %eax
2040Sstevel@tonic-gate#endif	/* _KERNEL */
2050Sstevel@tonic-gate	ret
2060Sstevel@tonic-gate	SET_SIZE(bignum_use_sse2)
2070Sstevel@tonic-gate
2080Sstevel@tonic-gate
2090Sstevel@tonic-gate/ ------------------------------------------------------------------------
2100Sstevel@tonic-gate/		SSE2 Implementations
2110Sstevel@tonic-gate/ ------------------------------------------------------------------------
2120Sstevel@tonic-gate
2130Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
2140Sstevel@tonic-gate/ returns the carry digit
2150Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
2160Sstevel@tonic-gate/
2170Sstevel@tonic-gate/ uint32_t
2180Sstevel@tonic-gate/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
2190Sstevel@tonic-gate/
2200Sstevel@tonic-gate/ r	%edx
2210Sstevel@tonic-gate/ a	%ebx
2220Sstevel@tonic-gate/ len	%ecx
2230Sstevel@tonic-gate/ digit	%mm3
2240Sstevel@tonic-gate/
2250Sstevel@tonic-gate/ Does not touch the following registers: %esi, %edi, %mm4
2260Sstevel@tonic-gate/
2270Sstevel@tonic-gate/ N.B.:
2280Sstevel@tonic-gate/   This is strictly for internal use.
2290Sstevel@tonic-gate/   The interface is very light-weight.
2300Sstevel@tonic-gate/   All parameters are passed in registers.
2310Sstevel@tonic-gate/   It does not conform to the SYSV x86 ABI.
2320Sstevel@tonic-gate/   So, don't even think about calling this function directly from C code.
2330Sstevel@tonic-gate/
2340Sstevel@tonic-gate/ The basic multiply digit loop is unrolled 8 times.
2350Sstevel@tonic-gate/ Each comment is preceded by an instance number.
2360Sstevel@tonic-gate/ Instructions that have been moved retain their original, "natural"
2370Sstevel@tonic-gate/ instance number.  It should be easier this way to follow
2380Sstevel@tonic-gate/ the step-wise refinement process that went into constructing
2390Sstevel@tonic-gate/ the final code.
2400Sstevel@tonic-gate
#define	UNROLL		8	/* digits handled per pass of the unrolled loop */
#define	UNROLL32	32	/* the same stride in bytes (4 bytes per digit) */

/ big_mul_set_vec_sse2_r: r = a * digit, carry digit returned in %eax.
/ Register-only interface, not callable from C:
/	r %edx, a %ebx, len %ecx, digit %mm3
/ %mm0 holds cy between digits and %mm1 is scratch.  %ebx, %ecx and %edx
/ are modified.  %esi, %edi and %mm4 are not referenced.
/
/ cy is cleared before the length test because the shared exit at .L17
/ returns the low half of %mm0.  Clearing it only on the non-empty path
/ would make a zero-length call return whatever %mm0 last held.

	ENTRY(big_mul_set_vec_sse2_r)
	pxor	%mm0, %mm0	/ cy = 0
	testl	%ecx, %ecx	/ if (len == 0) return (0);
	jz	.L17

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16		/ short tail: go digit by digit
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jnz	.L15			/ more digits: test again for a full pass
	jmp	.L17

	/ Tail: between 1 and UNROLL - 1 digits remain.  The seventh digit
	/ needs no count check because it is necessarily the last one.
.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)
3680Sstevel@tonic-gate
3690Sstevel@tonic-gate
3700Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
3710Sstevel@tonic-gate/ returns the carry digit
3720Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
3730Sstevel@tonic-gate/
3740Sstevel@tonic-gate/ r		 8(%ebp)	%edx
3750Sstevel@tonic-gate/ a		12(%ebp)	%ebx
3760Sstevel@tonic-gate/ len		16(%ebp)	%ecx
3770Sstevel@tonic-gate/ digit		20(%ebp)	%mm3
3780Sstevel@tonic-gate/
3790Sstevel@tonic-gate/ In userland, there is just the one function, big_mul_set_vec_sse2().
3800Sstevel@tonic-gate/ But in the kernel, there are two variations:
3810Sstevel@tonic-gate/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
3830Sstevel@tonic-gate/       disabled.
3840Sstevel@tonic-gate/    2. big_mul_set_vec_sse2_nsv() which just does the work;
3850Sstevel@tonic-gate/       it is the caller's responsibility to ensure that MMX state
3860Sstevel@tonic-gate/       does not need to be saved and restored and that preemption
3870Sstevel@tonic-gate/       is already disabled.
3880Sstevel@tonic-gate
3890Sstevel@tonic-gate#if defined(MMX_MANAGE)
3900Sstevel@tonic-gate	ENTRY(big_mul_set_vec_sse2)
3910Sstevel@tonic-gate	pushl	%ebp
3920Sstevel@tonic-gate	movl	%esp, %ebp
3930Sstevel@tonic-gate	pushl	%ebx
3940Sstevel@tonic-gate	pushl	%esi
3950Sstevel@tonic-gate	KPREEMPT_DISABLE
3960Sstevel@tonic-gate	TEST_TS(%ebx)
3970Sstevel@tonic-gate	pushl	%ebx
3980Sstevel@tonic-gate	jnz	.setvec_no_save
3990Sstevel@tonic-gate	pushl	%edi
4000Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
4010Sstevel@tonic-gate	movl	8(%ebp), %edx
4020Sstevel@tonic-gate	movl	12(%ebp), %ebx
4030Sstevel@tonic-gate	movl	16(%ebp), %ecx
4040Sstevel@tonic-gate	movd	20(%ebp), %mm3
4050Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4060Sstevel@tonic-gate	movl	%eax, %esi
4070Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
4080Sstevel@tonic-gate	popl	%edi
4090Sstevel@tonic-gate	jmp	.setvec_rtn
4100Sstevel@tonic-gate
4110Sstevel@tonic-gate.setvec_no_save:
4120Sstevel@tonic-gate	movl	8(%ebp), %edx
4130Sstevel@tonic-gate	movl	12(%ebp), %ebx
4140Sstevel@tonic-gate	movl	16(%ebp), %ecx
4150Sstevel@tonic-gate	movd	20(%ebp), %mm3
4160Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4170Sstevel@tonic-gate	movl	%eax, %esi
4180Sstevel@tonic-gate
4190Sstevel@tonic-gate.setvec_rtn:
4200Sstevel@tonic-gate	emms
4210Sstevel@tonic-gate	popl	%ebx
4220Sstevel@tonic-gate	movl	%ebx, %cr0
4230Sstevel@tonic-gate	KPREEMPT_ENABLE
4240Sstevel@tonic-gate	movl	%esi, %eax
4250Sstevel@tonic-gate	popl	%esi
4260Sstevel@tonic-gate	popl	%ebx
4270Sstevel@tonic-gate	leave
4280Sstevel@tonic-gate	ret
4290Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_sse2)
4300Sstevel@tonic-gate
4310Sstevel@tonic-gate	ENTRY(big_mul_set_vec_sse2_nsv)
4320Sstevel@tonic-gate	pushl	%ebp
4330Sstevel@tonic-gate	movl	%esp, %ebp
4340Sstevel@tonic-gate	pushl	%ebx
4350Sstevel@tonic-gate	movl	8(%ebp), %edx
4360Sstevel@tonic-gate	movl	12(%ebp), %ebx
4370Sstevel@tonic-gate	movl	16(%ebp), %ecx
4380Sstevel@tonic-gate	movd	20(%ebp), %mm3
4390Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4400Sstevel@tonic-gate	popl	%ebx
4410Sstevel@tonic-gate	leave
4420Sstevel@tonic-gate	ret
4430Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_sse2_nsv)
4440Sstevel@tonic-gate
4450Sstevel@tonic-gate#else	/* !defined(MMX_MANAGE) */
4460Sstevel@tonic-gate
4470Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len
4480Sstevel@tonic-gate/ returns the carry digit
4490Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
4500Sstevel@tonic-gate/
4510Sstevel@tonic-gate/ r		 8(%ebp)	%edx
4520Sstevel@tonic-gate/ a		12(%ebp)	%ebx
4530Sstevel@tonic-gate/ len		16(%ebp)	%ecx
4540Sstevel@tonic-gate/ digit		20(%ebp)	%mm3
4550Sstevel@tonic-gate
4560Sstevel@tonic-gate	ENTRY(big_mul_set_vec_sse2)
4570Sstevel@tonic-gate	pushl	%ebp
4580Sstevel@tonic-gate	movl	%esp, %ebp
4590Sstevel@tonic-gate	pushl	%ebx
4600Sstevel@tonic-gate	movl	8(%ebp), %edx
4610Sstevel@tonic-gate	movl	12(%ebp), %ebx
4620Sstevel@tonic-gate	movl	16(%ebp), %ecx
4630Sstevel@tonic-gate	movd	20(%ebp), %mm3
4640Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r
4650Sstevel@tonic-gate	popl	%ebx
4660Sstevel@tonic-gate	emms
4670Sstevel@tonic-gate	leave
4680Sstevel@tonic-gate	ret
4690Sstevel@tonic-gate	SET_SIZE(big_mul_set_vec_sse2)
4700Sstevel@tonic-gate
4710Sstevel@tonic-gate#endif	/* MMX_MANAGE */
4720Sstevel@tonic-gate
4730Sstevel@tonic-gate
4740Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len
4750Sstevel@tonic-gate/ returns the carry digit
4760Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
4770Sstevel@tonic-gate/
4780Sstevel@tonic-gate/ uint32_t
4790Sstevel@tonic-gate/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
4800Sstevel@tonic-gate/
4810Sstevel@tonic-gate/ r	%edx
4820Sstevel@tonic-gate/ a	%ebx
4830Sstevel@tonic-gate/ len	%ecx
4840Sstevel@tonic-gate/ digit	%mm3
4850Sstevel@tonic-gate/
4860Sstevel@tonic-gate/ N.B.:
4870Sstevel@tonic-gate/   This is strictly for internal use.
4880Sstevel@tonic-gate/   The interface is very light-weight.
4890Sstevel@tonic-gate/   All parameters are passed in registers.
4900Sstevel@tonic-gate/   It does not conform to the SYSV x86 ABI.
4910Sstevel@tonic-gate/   So, don't even think about calling this function directly from C code.
4920Sstevel@tonic-gate/
4930Sstevel@tonic-gate/ The basic multiply digit loop is unrolled 8 times.
4940Sstevel@tonic-gate/ Each comment is preceded by an instance number.
4950Sstevel@tonic-gate/ Instructions that have been moved retain their original, "natural"
4960Sstevel@tonic-gate/ instance number.  It should be easier this way to follow
4970Sstevel@tonic-gate/ the step-wise refinement process that went into constructing
4980Sstevel@tonic-gate/ the final code.
4990Sstevel@tonic-gate
5000Sstevel@tonic-gate	ENTRY(big_mul_add_vec_sse2_r)
5010Sstevel@tonic-gate	xorl	%eax, %eax
5020Sstevel@tonic-gate	testl	%ecx, %ecx
5030Sstevel@tonic-gate	jz	.L27
5040Sstevel@tonic-gate
5050Sstevel@tonic-gate	pxor	%mm0, %mm0	/ cy = 0
5060Sstevel@tonic-gate
5070Sstevel@tonic-gate.L25:
5080Sstevel@tonic-gate	cmpl	$UNROLL, %ecx
5090Sstevel@tonic-gate	jl	.L26
5100Sstevel@tonic-gate	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
5110Sstevel@tonic-gate	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
5120Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
5130Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
5140Sstevel@tonic-gate	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
5150Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
5160Sstevel@tonic-gate	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
5170Sstevel@tonic-gate	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
5180Sstevel@tonic-gate	psrlq	$32, %mm0	/ 1: cy = product[63..32]
5190Sstevel@tonic-gate
5200Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
5210Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
5220Sstevel@tonic-gate	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
5230Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
5240Sstevel@tonic-gate	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
5250Sstevel@tonic-gate	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
5260Sstevel@tonic-gate	psrlq	$32, %mm0	/ 2: cy = product[63..32]
5270Sstevel@tonic-gate
5280Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
5290Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
5300Sstevel@tonic-gate	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
5310Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
5320Sstevel@tonic-gate	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
5330Sstevel@tonic-gate	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
5340Sstevel@tonic-gate	psrlq	$32, %mm0	/ 3: cy = product[63..32]
5350Sstevel@tonic-gate
5360Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
5370Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
5380Sstevel@tonic-gate	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
5390Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
5400Sstevel@tonic-gate	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
5410Sstevel@tonic-gate	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
5420Sstevel@tonic-gate	psrlq	$32, %mm0	/ 4: cy = product[63..32]
5430Sstevel@tonic-gate
5440Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
5450Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
5460Sstevel@tonic-gate	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
5470Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
5480Sstevel@tonic-gate	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
5490Sstevel@tonic-gate	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
5500Sstevel@tonic-gate	psrlq	$32, %mm0	/ 5: cy = product[63..32]
5510Sstevel@tonic-gate
5520Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
5530Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
5540Sstevel@tonic-gate	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
5550Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
5560Sstevel@tonic-gate	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
5570Sstevel@tonic-gate	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
5580Sstevel@tonic-gate	psrlq	$32, %mm0	/ 6: cy = product[63..32]
5590Sstevel@tonic-gate
5600Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
5610Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
5620Sstevel@tonic-gate	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
5630Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
5640Sstevel@tonic-gate	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
5650Sstevel@tonic-gate	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
5660Sstevel@tonic-gate	psrlq	$32, %mm0	/ 7: cy = product[63..32]
5670Sstevel@tonic-gate
5680Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
5690Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
5700Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
5710Sstevel@tonic-gate	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
5720Sstevel@tonic-gate	psrlq	$32, %mm0	/ 8: cy = product[63..32]
5730Sstevel@tonic-gate
5740Sstevel@tonic-gate	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
5750Sstevel@tonic-gate	leal	UNROLL32(%edx), %edx	/ r += UNROLL
5760Sstevel@tonic-gate	subl	$UNROLL, %ecx		/ len -= UNROLL
5770Sstevel@tonic-gate	jz	.L27
5780Sstevel@tonic-gate	jmp	.L25
5790Sstevel@tonic-gate
5800Sstevel@tonic-gate.L26:
5810Sstevel@tonic-gate	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
5820Sstevel@tonic-gate	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
5830Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
5840Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
5850Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
5860Sstevel@tonic-gate	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
5870Sstevel@tonic-gate	psrlq	$32, %mm0	/ 1: cy = product[63..32]
5880Sstevel@tonic-gate	subl	$1, %ecx
5890Sstevel@tonic-gate	jz	.L27
5900Sstevel@tonic-gate
5910Sstevel@tonic-gate	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
5920Sstevel@tonic-gate	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
5930Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
5940Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
5950Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
5960Sstevel@tonic-gate	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
5970Sstevel@tonic-gate	psrlq	$32, %mm0	/ 2: cy = product[63..32]
5980Sstevel@tonic-gate	subl	$1, %ecx
5990Sstevel@tonic-gate	jz	.L27
6000Sstevel@tonic-gate
6010Sstevel@tonic-gate	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
6020Sstevel@tonic-gate	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
6030Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
6040Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
6050Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
6060Sstevel@tonic-gate	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
6070Sstevel@tonic-gate	psrlq	$32, %mm0	/ 3: cy = product[63..32]
6080Sstevel@tonic-gate	subl	$1, %ecx
6090Sstevel@tonic-gate	jz	.L27
6100Sstevel@tonic-gate
6110Sstevel@tonic-gate	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
6120Sstevel@tonic-gate	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
6130Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
6140Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
6150Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
6160Sstevel@tonic-gate	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
6170Sstevel@tonic-gate	psrlq	$32, %mm0	/ 4: cy = product[63..32]
6180Sstevel@tonic-gate	subl	$1, %ecx
6190Sstevel@tonic-gate	jz	.L27
6200Sstevel@tonic-gate
6210Sstevel@tonic-gate	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
6220Sstevel@tonic-gate	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
6230Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
6240Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
6250Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
6260Sstevel@tonic-gate	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
6270Sstevel@tonic-gate	psrlq	$32, %mm0	/ 5: cy = product[63..32]
6280Sstevel@tonic-gate	subl	$1, %ecx
6290Sstevel@tonic-gate	jz	.L27
6300Sstevel@tonic-gate
6310Sstevel@tonic-gate	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
6320Sstevel@tonic-gate	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
6330Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
6340Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
6350Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
6360Sstevel@tonic-gate	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
6370Sstevel@tonic-gate	psrlq	$32, %mm0	/ 6: cy = product[63..32]
6380Sstevel@tonic-gate	subl	$1, %ecx
6390Sstevel@tonic-gate	jz	.L27
6400Sstevel@tonic-gate
6410Sstevel@tonic-gate	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
6420Sstevel@tonic-gate	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
6430Sstevel@tonic-gate	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
6440Sstevel@tonic-gate	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
6450Sstevel@tonic-gate	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
6460Sstevel@tonic-gate	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
6470Sstevel@tonic-gate	psrlq	$32, %mm0	/ 7: cy = product[63..32]
6480Sstevel@tonic-gate
6490Sstevel@tonic-gate.L27:
6500Sstevel@tonic-gate	movd	%mm0, %eax
6510Sstevel@tonic-gate	/ no emms.  caller is responsible for emms
6520Sstevel@tonic-gate	ret
6530Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_sse2_r)
6540Sstevel@tonic-gate
6550Sstevel@tonic-gate
6560Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len
6570Sstevel@tonic-gate/ returns the carry digit
6580Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
6590Sstevel@tonic-gate/
6600Sstevel@tonic-gate/ r		 8(%ebp)	%edx
6610Sstevel@tonic-gate/ a		12(%ebp)	%ebx
6620Sstevel@tonic-gate/ len		16(%ebp)	%ecx
6630Sstevel@tonic-gate/ digit		20(%ebp)	%mm3
6640Sstevel@tonic-gate/
6650Sstevel@tonic-gate/ In userland, there is just the one function, big_mul_add_vec_sse2().
6660Sstevel@tonic-gate/ But in the kernel, there are two variations:
6670Sstevel@tonic-gate/    1. big_mul_add_vec_sse2() which does what is necessary to save and
6680Sstevel@tonic-gate/       restore state, if required, and to ensure that preemption is
6690Sstevel@tonic-gate/       disabled.
6700Sstevel@tonic-gate/    2. big_mul_add_vec_sse2_nsv() which just does the work;
6710Sstevel@tonic-gate/       it is the caller's responsibility to ensure that MMX state
6720Sstevel@tonic-gate/       does not need to be saved and restored and that preemption
6730Sstevel@tonic-gate/       is already disabled.
6740Sstevel@tonic-gate
6750Sstevel@tonic-gate
6760Sstevel@tonic-gate#if defined(MMX_MANAGE)
6770Sstevel@tonic-gate
/*
 * Kernel (MMX_MANAGE) entry point for r += a * digit.
 *
 * Loads r, a, len and digit from the C argument slots 8/12/16/20(%ebp)
 * into %edx, %ebx, %ecx and %mm3, calls big_mul_add_vec_sse2_r, and
 * returns that routine's %eax result.
 *
 * Around the call it runs KPREEMPT_DISABLE / KPREEMPT_ENABLE, keeps the
 * register filled in by TEST_TS on the stack and writes it back to %cr0
 * on exit, and brackets the work with SAVE_MMX_0TO4 / RSTOR_MMX_0TO4 on
 * the path where the jnz after TEST_TS is not taken.
 *
 * NOTE(review): the four macros are defined outside this section.  Going
 * by their names, TEST_TS presumably captures %cr0 and tests CR0_TS, and
 * the SAVE/RSTOR pair presumably spills mm0-mm4 to stack space addressed
 * through %edi - confirm against the macro definitions.
 */
6780Sstevel@tonic-gate	ENTRY(big_mul_add_vec_sse2)
6790Sstevel@tonic-gate	pushl	%ebp
6800Sstevel@tonic-gate	movl	%esp, %ebp
	/* %ebx and %esi are saved here and restored just before leave */
6810Sstevel@tonic-gate	pushl	%ebx
6820Sstevel@tonic-gate	pushl	%esi
6830Sstevel@tonic-gate	KPREEMPT_DISABLE
6840Sstevel@tonic-gate	TEST_TS(%ebx)
	/*
	 * Keep the TEST_TS value for the epilogue.  pushl does not change
	 * the flags, so the jnz below still acts on the TEST_TS result.
	 */
6850Sstevel@tonic-gate	pushl	%ebx
6860Sstevel@tonic-gate	jnz	.addvec_no_save
	/* Path 1: MMX registers are saved before use and restored after */
6870Sstevel@tonic-gate	pushl	%edi
6880Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
6890Sstevel@tonic-gate	movl	8(%ebp), %edx
6900Sstevel@tonic-gate	movl	12(%ebp), %ebx
6910Sstevel@tonic-gate	movl	16(%ebp), %ecx
6920Sstevel@tonic-gate	movd	20(%ebp), %mm3
6930Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
	/* park the return value in %esi until the epilogue copies it back */
6940Sstevel@tonic-gate	movl	%eax, %esi
6950Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
6960Sstevel@tonic-gate	popl	%edi
6970Sstevel@tonic-gate	jmp	.addvec_rtn
6980Sstevel@tonic-gate
	/* Path 2: same work without the MMX save/restore bracket */
6990Sstevel@tonic-gate.addvec_no_save:
7000Sstevel@tonic-gate	movl	8(%ebp), %edx
7010Sstevel@tonic-gate	movl	12(%ebp), %ebx
7020Sstevel@tonic-gate	movl	16(%ebp), %ecx
7030Sstevel@tonic-gate	movd	20(%ebp), %mm3
7040Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
7050Sstevel@tonic-gate	movl	%eax, %esi
7060Sstevel@tonic-gate
7070Sstevel@tonic-gate.addvec_rtn:
	/* big_mul_add_vec_sse2_r leaves emms to its caller; issue it here */
7080Sstevel@tonic-gate	emms
	/* pop the value pushed right after TEST_TS and write it to %cr0 */
7090Sstevel@tonic-gate	popl	%ebx
7100Sstevel@tonic-gate	movl	%ebx, %cr0
7110Sstevel@tonic-gate	KPREEMPT_ENABLE
	/* return value back into %eax, then restore the saved registers */
7120Sstevel@tonic-gate	movl	%esi, %eax
7130Sstevel@tonic-gate	popl	%esi
7140Sstevel@tonic-gate	popl	%ebx
7150Sstevel@tonic-gate	leave
7160Sstevel@tonic-gate	ret
7170Sstevel@tonic-gate	SET_SIZE(big_mul_add_vec_sse2)
7180Sstevel@tonic-gate
/ big_mul_add_vec_sse2_nsv(r, a, len, digit)
/ "No save" variant: moves the C stack arguments into the registers
/ expected by big_mul_add_vec_sse2_r and returns that routine's result
/ in %eax.  No MMX state handling and no emms are done here.
	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ saved at -4(%ebp); ebx is reused for a
	movd	20(%ebp), %mm3		/ mm3 = digit
	movl	16(%ebp), %ecx		/ ecx = len
	movl	12(%ebp), %ebx		/ ebx = a
	movl	8(%ebp), %edx		/ edx = r
	call	big_mul_add_vec_sse2_r	/ result comes back in eax
	movl	-4(%ebp), %ebx		/ reload the saved ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)
7320Sstevel@tonic-gate
7330Sstevel@tonic-gate
7340Sstevel@tonic-gate#else	/* !defined(MMX_MANAGE) */
7350Sstevel@tonic-gate
/ big_mul_add_vec_sse2(r, a, len, digit), build without MMX_MANAGE.
/ Moves the C stack arguments into the registers expected by
/ big_mul_add_vec_sse2_r, calls it, executes emms, and returns the
/ result of the call in %eax.
	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ saved at -4(%ebp); ebx is reused for a
	movd	20(%ebp), %mm3		/ mm3 = digit
	movl	16(%ebp), %ecx		/ ecx = len
	movl	12(%ebp), %ebx		/ ebx = a
	movl	8(%ebp), %edx		/ edx = r
	call	big_mul_add_vec_sse2_r	/ result comes back in eax
	emms				/ the _r routine leaves emms to us
	movl	-4(%ebp), %ebx		/ reload the saved ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)
7500Sstevel@tonic-gate
7510Sstevel@tonic-gate#endif	/* MMX_MANAGE */
7520Sstevel@tonic-gate
7530Sstevel@tonic-gate
7540Sstevel@tonic-gate/ void
7550Sstevel@tonic-gate/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
7560Sstevel@tonic-gate/ {
7570Sstevel@tonic-gate/ 	int i;
7580Sstevel@tonic-gate/
7590Sstevel@tonic-gate/ 	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
7600Sstevel@tonic-gate/ 	for (i = 1; i < blen; ++i)
7610Sstevel@tonic-gate/ 		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
7620Sstevel@tonic-gate/ }
7630Sstevel@tonic-gate
7640Sstevel@tonic-gate
/ big_mul_vec_sse2(r, a, alen, b, blen)   [big_mul_vec_sse2_fc if MMX_MANAGE]
/
/ r[alen] = set_vec(r, a, alen, b[0]);
/ for (i = 1; i < blen; ++i)
/	r[alen + i] = add_vec(r + i, a, alen, b[i]);
/
/ With MMX_MANAGE the _nsv helpers are called and a single emms is issued
/ at the end; otherwise the plain big_mul_{set,add}_vec_sse2 are called.
/
/ Frame:   8(%ebp) r   12(%ebp) a   16(%ebp) alen   20(%ebp) b   24(%ebp) blen
/        -16(%ebp) i   (only present once the loop has been entered)
/ Registers held across the calls: ebx = r, esi = a, edi = alen
#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	8(%ebp), %ebx		/ ebx = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %edi		/ edi = alen

	/ first row: r[alen] = set_vec(r, a, alen, b[0])
	movl	20(%ebp), %eax		/ eax = b
	pushl	(%eax)			/ arg 4: b[0]
	pushl	%edi			/ arg 3: alen
	pushl	%esi			/ arg 2: a
	pushl	%ebx			/ arg 1: r
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp
	movl	%eax, (%ebx,%edi,4)	/ r[alen] = carry

	cmpl	$0x1, 24(%ebp)		/ nothing more to do if blen <= 1
	jle	.mulvec_done
	pushl	$0x1			/ i = 1, lives at -16(%ebp)

	.align 16
.mulvec_next:
	movl	-16(%ebp), %ecx		/ ecx = i
	movl	20(%ebp), %eax		/ eax = b
	pushl	(%eax,%ecx,4)		/ arg 4: b[i]
	pushl	%edi			/ arg 3: alen
	pushl	%esi			/ arg 2: a
	leal	(%ebx,%ecx,4), %eax
	pushl	%eax			/ arg 1: &r[i]
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp
	movl	-16(%ebp), %ecx		/ reload i after the call
	leal	(%ecx,%edi), %edx	/ edx = alen + i
	movl	%eax, (%ebx,%edx,4)	/ r[alen + i] = carry
	incl	%ecx			/ ++i
	movl	%ecx, -16(%ebp)
	cmpl	24(%ebp), %ecx		/ signed compare, as i < blen
	jl	.mulvec_next
.mulvec_done:
#if defined(MMX_MANAGE)
	emms
#endif
	leal	-12(%ebp), %esp		/ discard the i slot if it was pushed
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif
8310Sstevel@tonic-gate
8320Sstevel@tonic-gate#if defined(MMX_MANAGE)
8330Sstevel@tonic-gate
/*
 * Kernel (MMX_MANAGE) entry point for big_mul_vec_sse2(r, a, alen, b, blen).
 *
 * Runs KPREEMPT_DISABLE, records the register filled in by TEST_TS,
 * optionally runs SAVE_MMX_0TO4, re-pushes the five C arguments and calls
 * big_mul_vec_sse2_fc, then undoes the above in reverse order and writes
 * the recorded value back to %cr0.
 *
 * Locals:  -8(%ebp)  value left in %eax by TEST_TS
 *          -4(%ebp)  %edi as left by SAVE_MMX_0TO4 (save path only)
 *
 * NOTE(review): the macros are defined outside this section.  Going by
 * their names, TEST_TS presumably captures %cr0 and tests CR0_TS - confirm
 * against the macro definitions.
 */
8340Sstevel@tonic-gate	ENTRY(big_mul_vec_sse2)
8350Sstevel@tonic-gate	pushl	%ebp
8360Sstevel@tonic-gate	movl	%esp, %ebp
	/* two local words, see the table above */
8370Sstevel@tonic-gate	subl	$8, %esp
8380Sstevel@tonic-gate	pushl	%edi
8390Sstevel@tonic-gate	KPREEMPT_DISABLE
8400Sstevel@tonic-gate	TEST_TS(%eax)
	/* movl does not change the flags; jnz still acts on TEST_TS */
8410Sstevel@tonic-gate	movl	%eax, -8(%ebp)
8420Sstevel@tonic-gate	jnz	.mulvec_no_save
8430Sstevel@tonic-gate	SAVE_MMX_0TO4(%edi)
	/* remember %edi for the matching RSTOR_MMX_0TO4 below */
8440Sstevel@tonic-gate	movl	%edi, -4(%ebp)
8450Sstevel@tonic-gate.mulvec_no_save:
	/* copy the five arguments, last one first, for the inner call */
8460Sstevel@tonic-gate	movl	24(%ebp), %eax		/ blen
8470Sstevel@tonic-gate	pushl	%eax
8480Sstevel@tonic-gate	movl	20(%ebp), %eax		/ b
8490Sstevel@tonic-gate	pushl	%eax
8500Sstevel@tonic-gate	movl	16(%ebp), %eax		/ alen
8510Sstevel@tonic-gate	pushl	%eax
8520Sstevel@tonic-gate	movl	12(%ebp), %eax		/ a
8530Sstevel@tonic-gate	pushl	%eax
8540Sstevel@tonic-gate	movl	8(%ebp), %eax		/ r
8550Sstevel@tonic-gate	pushl	%eax
8560Sstevel@tonic-gate	call	big_mul_vec_sse2_fc
	/* drop the five pushed argument words */
8570Sstevel@tonic-gate	addl	$20, %esp
	/* repeat the CR0_TS test on the recorded value to pick the path */
8580Sstevel@tonic-gate	movl	-8(%ebp), %eax
8590Sstevel@tonic-gate	testl	$CR0_TS, %eax
8600Sstevel@tonic-gate	jnz	.mulvec_no_rstr
8610Sstevel@tonic-gate	movl	-4(%ebp), %edi
8620Sstevel@tonic-gate	RSTOR_MMX_0TO4(%edi)
8630Sstevel@tonic-gate.mulvec_no_rstr:
	/*
	 * NOTE(review): %eax must still hold the recorded value here, so
	 * RSTOR_MMX_0TO4 is assumed not to modify %eax - confirm.
	 */
8640Sstevel@tonic-gate	movl	%eax, %cr0
8650Sstevel@tonic-gate	KPREEMPT_ENABLE
8660Sstevel@tonic-gate	popl	%edi
8670Sstevel@tonic-gate	leave
8680Sstevel@tonic-gate	ret
8690Sstevel@tonic-gate	SET_SIZE(big_mul_vec_sse2)
8700Sstevel@tonic-gate
8710Sstevel@tonic-gate#endif	/* MMX_MANAGE */
8720Sstevel@tonic-gate
8730Sstevel@tonic-gate
8740Sstevel@tonic-gate
8750Sstevel@tonic-gate#undef UNROLL
8760Sstevel@tonic-gate#undef UNROLL32
8770Sstevel@tonic-gate
8780Sstevel@tonic-gate
8790Sstevel@tonic-gate/ r = a * a, r and a are vectors of length len
8800Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions
8810Sstevel@tonic-gate/
8820Sstevel@tonic-gate/ This function is not suitable for a truly general-purpose multiprecision
8830Sstevel@tonic-gate/ arithmetic library, because it does not work for "small" numbers, that is
8840Sstevel@tonic-gate/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
8850Sstevel@tonic-gate/ for any small numbers.
8860Sstevel@tonic-gate
/*
 * Body of the SSE2 squaring routine, r = a * a.
 *
 * Under MMX_MANAGE this is entered as big_sqr_vec_sse2_fc and has no
 * prologue of its own: r, a and alen are read through 8/12/16(%ebp), i.e.
 * through the frame its caller big_sqr_vec_sse2 has already built.
 * Otherwise it is big_sqr_vec_sse2 itself and builds the frame here.
 *
 * Saves and restores %ebx, %edi and %esi, and executes emms before
 * returning.  %mm4 is used as a scratch slot for the loop count.
 *
 * Phases:
 *   1. off-diagonal products a[i] * a[j], i < j, accumulated from r[1] up
 *   2. low-order corner: r[0] and the doubled r[1]
 *   3. doubling of the accumulated words merged with the squares
 *   4. high-order corner
 *
 * NOTE(review): the code assumes big_mul_set_vec_sse2_r and
 * big_mul_add_vec_sse2_r leave %mm4, %edi and %esi untouched; confirm
 * against their bodies.
 */
8870Sstevel@tonic-gate#if defined(MMX_MANAGE)
8880Sstevel@tonic-gate	ENTRY(big_sqr_vec_sse2_fc)
8890Sstevel@tonic-gate#else
8900Sstevel@tonic-gate	ENTRY(big_sqr_vec_sse2)
8910Sstevel@tonic-gate	pushl	%ebp
8920Sstevel@tonic-gate	movl	%esp, %ebp
8930Sstevel@tonic-gate#endif
8940Sstevel@tonic-gate
8950Sstevel@tonic-gate	pushl	%ebx
8960Sstevel@tonic-gate	pushl	%edi
8970Sstevel@tonic-gate	pushl	%esi
8980Sstevel@tonic-gate
	/*
	 * Phase 1, first row.  The call below is made with dst = &r[1],
	 * src = &a[1] and cnt = alen - 1, and its return value is then
	 * stored in r[alen].
	 */
8990Sstevel@tonic-gate	/ r[1..alen] = a[0] * a[1..alen-1]
9000Sstevel@tonic-gate
9010Sstevel@tonic-gate	movl	8(%ebp), %edi		/ r = arg(r)
9020Sstevel@tonic-gate	movl	12(%ebp), %esi		/ a = arg(a)
9030Sstevel@tonic-gate	movl	16(%ebp), %ecx		/ cnt = arg(alen)
9040Sstevel@tonic-gate	movd	%ecx, %mm4		/ save_cnt = arg(alen)
9050Sstevel@tonic-gate	leal	4(%edi), %edx		/ dst = &r[1]
9060Sstevel@tonic-gate	movl	%esi, %ebx		/ src = a
9070Sstevel@tonic-gate	movd	0(%ebx), %mm3		/ mm3 = a[0]
9080Sstevel@tonic-gate	leal	4(%ebx), %ebx		/ src = &a[1]
9090Sstevel@tonic-gate	subl	$1, %ecx		/ --cnt
9100Sstevel@tonic-gate	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
9110Sstevel@tonic-gate	movl	%edi, %edx		/ dst = r
9120Sstevel@tonic-gate	movl	%esi, %ebx		/ src = a
9130Sstevel@tonic-gate	movd	%mm4, %ecx		/ cnt = save_cnt
9140Sstevel@tonic-gate	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy
9150Sstevel@tonic-gate
	/*
	 * Phase 1, remaining rows.  The pseudocode that follows starts its
	 * count at alen - 3; the instructions start it at alen - 2 and
	 * handle the final one-product row (cnt == 1) inline at .L32
	 * instead of through a call.
	 */
9160Sstevel@tonic-gate/	/* High-level vector C pseudocode */
9170Sstevel@tonic-gate/	for (i = 1; i < alen-1; ++i)
9180Sstevel@tonic-gate/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
9190Sstevel@tonic-gate/
9200Sstevel@tonic-gate/	/* Same thing, but slightly lower level C-like pseudocode */
9210Sstevel@tonic-gate/	i = 1;
9220Sstevel@tonic-gate/	r = &arg_r[2*i + 1];
9230Sstevel@tonic-gate/	a = &arg_a[i + 1];
9240Sstevel@tonic-gate/	digit = arg_a[i];
9250Sstevel@tonic-gate/	cnt = alen - 3;
9260Sstevel@tonic-gate/	while (cnt != 0) {
9270Sstevel@tonic-gate/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
9280Sstevel@tonic-gate/		r += 2;
9290Sstevel@tonic-gate/		++a;
9300Sstevel@tonic-gate/		--cnt;
9310Sstevel@tonic-gate/	}
9320Sstevel@tonic-gate/
9330Sstevel@tonic-gate/	/* Same thing, but even lower level
9340Sstevel@tonic-gate/	 * For example, pointers are raw pointers,
9350Sstevel@tonic-gate/	 * with no scaling by object size.
9360Sstevel@tonic-gate/	 */
9370Sstevel@tonic-gate/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3;  4*3 == 12; */
9380Sstevel@tonic-gate/	a = arg_a + 8;
9390Sstevel@tonic-gate/	digit = *(arg_a + 4);
9400Sstevel@tonic-gate/	cnt = alen - 3;
9410Sstevel@tonic-gate/	while (cnt != 0) {
9420Sstevel@tonic-gate/		cy = big_mul_add_vec_sse2_r();
9430Sstevel@tonic-gate/		*(r + 4 * cnt) = cy;
9440Sstevel@tonic-gate/		r += 8;
9450Sstevel@tonic-gate/		a += 4;
9460Sstevel@tonic-gate/		--cnt;
9470Sstevel@tonic-gate/	}
9480Sstevel@tonic-gate
9490Sstevel@tonic-gate	leal	4(%edi), %edi		/ r += 4; r = &r[1]
9500Sstevel@tonic-gate	leal	4(%esi), %esi		/ a += 4; a = &a[1]
9510Sstevel@tonic-gate	movd	%mm4, %ecx		/ cnt = save
9520Sstevel@tonic-gate	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
9530Sstevel@tonic-gate	movd	%ecx, %mm4		/ save_cnt
	/*
	 * cnt is zero here only when alen == 2, a size the header comment
	 * above this routine lists as unsupported.
	 */
9540Sstevel@tonic-gate	jecxz	.L32			/ while (cnt != 0) {
9550Sstevel@tonic-gate.L31:
9560Sstevel@tonic-gate	movd	0(%esi), %mm3		/ digit = a[i]
9570Sstevel@tonic-gate	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
9580Sstevel@tonic-gate	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
9590Sstevel@tonic-gate	movl	%edi, %edx		/ edx = r
9600Sstevel@tonic-gate	movl	%esi, %ebx		/ ebx = a
9610Sstevel@tonic-gate	cmp	$1, %ecx		/ The last triangle term is special
9620Sstevel@tonic-gate	jz	.L32
9630Sstevel@tonic-gate	call	big_mul_add_vec_sse2_r
9640Sstevel@tonic-gate	movd	%mm4, %ecx		/ cnt = save_cnt
9650Sstevel@tonic-gate	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
9660Sstevel@tonic-gate	subl	$1, %ecx		/ --cnt
9670Sstevel@tonic-gate	movd	%ecx, %mm4		/ save_cnt = cnt
9680Sstevel@tonic-gate	jmp	.L31			/ }
9690Sstevel@tonic-gate
	/*
	 * Last row, done inline: a single product digit * a[i + 1] is added
	 * into the word at edx, the carry is stored in the next word and
	 * the word after that is set to zero.  edx, ebx and mm3 are the
	 * cursors and digit set up at the top of .L31.
	 */
9700Sstevel@tonic-gate.L32:
9710Sstevel@tonic-gate	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
9720Sstevel@tonic-gate	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
9730Sstevel@tonic-gate	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
9740Sstevel@tonic-gate	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
9750Sstevel@tonic-gate	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
9760Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy
9770Sstevel@tonic-gate	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
9780Sstevel@tonic-gate	pxor	%mm2, %mm2
9790Sstevel@tonic-gate	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0
9800Sstevel@tonic-gate
	/* Phase 2: restart from the original arguments */
9810Sstevel@tonic-gate	movl	8(%ebp), %edx		/ r = arg(r)
9820Sstevel@tonic-gate	movl	12(%ebp), %ebx		/ a = arg(a)
9830Sstevel@tonic-gate	movl	16(%ebp), %ecx		/ cnt = arg(alen)
9840Sstevel@tonic-gate
9850Sstevel@tonic-gate	/ compute low-order corner
9860Sstevel@tonic-gate	/ p = a[0]**2
9870Sstevel@tonic-gate	/ r[0] = lo32(p)
9880Sstevel@tonic-gate	/ cy   = hi32(p)
9890Sstevel@tonic-gate	movd	0(%ebx), %mm2		/ mm2 = a[0]
9900Sstevel@tonic-gate	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
9910Sstevel@tonic-gate	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
9920Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)
9930Sstevel@tonic-gate
9940Sstevel@tonic-gate	/ p = 2 * r[1]
9950Sstevel@tonic-gate	/ t = p + cy
9960Sstevel@tonic-gate	/ r[1] = lo32(t)
9970Sstevel@tonic-gate	/ cy   = hi32(t)
9980Sstevel@tonic-gate	movd	4(%edx), %mm1		/ mm1 = r[1]
9990Sstevel@tonic-gate	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
10000Sstevel@tonic-gate	paddq	%mm1, %mm2		/ mm2 = t = p + cy
10010Sstevel@tonic-gate	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
10020Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)
10030Sstevel@tonic-gate
	/*
	 * Phase 3: alen - 2 iterations.  Each one squares the digit at
	 * 4(%ebx), doubles the two words at 8(%edx) and 12(%edx), and then
	 * advances edx by two words and ebx by one digit.  mm2 carries
	 * between iterations.
	 */
10040Sstevel@tonic-gate	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
10050Sstevel@tonic-gate	subl	$2, %ecx		/ cnt = alen - 2
10060Sstevel@tonic-gate.L34:
10070Sstevel@tonic-gate	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
10080Sstevel@tonic-gate	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
10090Sstevel@tonic-gate	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	/* the round trip through %eax keeps only the low 32 bits of t */
10100Sstevel@tonic-gate	movd	%mm2, %eax
10110Sstevel@tonic-gate	movd	%eax, %mm1		/ mm1 = lo32(t)
10120Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = hi32(t)
10130Sstevel@tonic-gate
10140Sstevel@tonic-gate	movd	8(%edx), %mm3		/ mm3 = r[2*i]
10150Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
10160Sstevel@tonic-gate	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
10170Sstevel@tonic-gate	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	/* fold the carry out of the low word into hi32(t) */
10180Sstevel@tonic-gate	psrlq	$32, %mm1
10190Sstevel@tonic-gate	paddq	%mm1, %mm2
10200Sstevel@tonic-gate
10210Sstevel@tonic-gate	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
10220Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
10230Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
10240Sstevel@tonic-gate	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
10250Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy
10260Sstevel@tonic-gate	leal	8(%edx), %edx		/ r += 2
10270Sstevel@tonic-gate	leal	4(%ebx), %ebx		/ ++a
10280Sstevel@tonic-gate	subl	$1, %ecx		/ --cnt
10290Sstevel@tonic-gate	jnz	.L34
10300Sstevel@tonic-gate
	/*
	 * Final doubling step, not paired with a squaring of a diagonal
	 * element.  The two words addressed here, 8(%edx) and 12(%edx), are
	 * the same two that the corner code further down labels r[$-2] and
	 * r[$-1]; edx does not change in between.  The operation performed
	 * on each word is word = 2 * word + cy.
	 */
10310Sstevel@tonic-gate	/ Carry from last triangle term must participate in doubling,
10320Sstevel@tonic-gate	/ but this step isn't paired up with a squaring the elements
10330Sstevel@tonic-gate	/ of the inner diagonal.
10340Sstevel@tonic-gate	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
10350Sstevel@tonic-gate	movd	8(%edx), %mm3		/ mm3 = r[2*i]
10360Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
10370Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
10380Sstevel@tonic-gate	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
10390Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)
10400Sstevel@tonic-gate
10410Sstevel@tonic-gate	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
10420Sstevel@tonic-gate	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
10430Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
10440Sstevel@tonic-gate	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
10450Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy
10460Sstevel@tonic-gate
	/* Phase 4 */
10470Sstevel@tonic-gate	/ compute high-order corner and add it in
10480Sstevel@tonic-gate	/ p = a[alen - 1]**2
10490Sstevel@tonic-gate	/ t = p + cy
10500Sstevel@tonic-gate	/ r[alen + alen - 2] += lo32(t)
10510Sstevel@tonic-gate	/ cy = hi32(t)
10520Sstevel@tonic-gate	/ r[alen + alen - 1] = cy
10530Sstevel@tonic-gate	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
10540Sstevel@tonic-gate	movd	8(%edx), %mm3		/ mm3 = r[$-2]
10550Sstevel@tonic-gate	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
10560Sstevel@tonic-gate	paddq	%mm0, %mm2		/ mm2 = t = p + cy
10570Sstevel@tonic-gate	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
10580Sstevel@tonic-gate	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
10590Sstevel@tonic-gate	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
10600Sstevel@tonic-gate	movd	12(%edx), %mm3
10610Sstevel@tonic-gate	paddq	%mm3, %mm2
10620Sstevel@tonic-gate	movd	%mm2, 12(%edx)		/ r[$-1] += cy
10630Sstevel@tonic-gate
	/* no branch within this routine targets .L35; it is fall-through only */
10640Sstevel@tonic-gate.L35:
10650Sstevel@tonic-gate	emms
10660Sstevel@tonic-gate	popl	%esi
10670Sstevel@tonic-gate	popl	%edi
10680Sstevel@tonic-gate	popl	%ebx
10690Sstevel@tonic-gate
	/* the _fc form has no frame of its own, so it returns without leave */
10700Sstevel@tonic-gate#if defined(MMX_MANAGE)
10710Sstevel@tonic-gate	ret
10720Sstevel@tonic-gate	SET_SIZE(big_sqr_vec_sse2_fc)
10730Sstevel@tonic-gate#else
10740Sstevel@tonic-gate	leave
10750Sstevel@tonic-gate	ret
10760Sstevel@tonic-gate	SET_SIZE(big_sqr_vec_sse2)
10770Sstevel@tonic-gate#endif
10780Sstevel@tonic-gate
10790Sstevel@tonic-gate
10800Sstevel@tonic-gate#if defined(MMX_MANAGE)
/ big_sqr_vec_sse2(r, a, alen), kernel (MMX_MANAGE) entry point.
/
/ Builds the %ebp frame that big_sqr_vec_sse2_fc reads its arguments
/ through (8/12/16(%ebp)), runs KPREEMPT_DISABLE, keeps the register
/ filled in by TEST_TS on the stack so it can be written back to %cr0 on
/ exit, and brackets the call with SAVE_MMX_0TO4 / RSTOR_MMX_0TO4 on the
/ path where the jnz after TEST_TS is not taken.
/
/ %ebx is callee-saved in the i386 calling convention but is used below
/ as the scratch register for the %cr0 value, so it is saved on entry and
/ restored before returning, the same way big_mul_add_vec_sse2 does it.
/
/ NOTE(review): the macros are defined outside this section; going by
/ its name, TEST_TS presumably captures %cr0 and tests CR0_TS.
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ preserve the caller value of ebx
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ keep for the write to cr0; flags intact
	jnz	.sqr_no_save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc	/ uses our ebp frame for its arguments
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx			/ value saved after TEST_TS
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	popl	%ebx			/ restore the caller value of ebx
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
11050Sstevel@tonic-gate
11060Sstevel@tonic-gate#endif	/* MMX_MANAGE */
11070Sstevel@tonic-gate
11080Sstevel@tonic-gate/ ------------------------------------------------------------------------
11090Sstevel@tonic-gate/		UMUL Implementations
11100Sstevel@tonic-gate/ ------------------------------------------------------------------------
11110Sstevel@tonic-gate
11120Sstevel@tonic-gate
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r[0..len-1] = a[0..len-1] * digit; the return value is the carry out of
/ the most significant word (0 when len is 0).
/ Uses only the integer MUL instruction (32 x 32 -> 64), no MMX, SSE or
/ SSE2 instructions.  Fall-back for x86 models without PMULUDQ.
/
/ r		 8(%ebp)	%edi	(points one past the end of r)
/ a		12(%ebp)	%esi	(points one past the end of a)
/ len		16(%ebp)	%ecx	(negated; counts up to zero)
/ digit		20(%ebp)	multiplied straight from the stack slot
/ cy				%ebx

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi		/ edi = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ cy = 0
	testl	%ecx, %ecx
	je	.umul_set_done		/ len == 0: return 0

	/ Index both vectors from their ends with a negative offset so the
	/ single register ecx serves as both index and loop counter.
	leal	(%esi,%ecx,4), %esi	/ esi = &a[len]
	leal	(%edi,%ecx,4), %edi	/ edi = &r[len]
	negl	%ecx			/ ecx = -len

.umul_set_loop:
	movl	(%esi,%ecx,4), %eax	/ eax = a[i]
	mull	20(%ebp)		/ edx:eax = a[i] * digit
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax += cy
	movl	%eax, (%edi,%ecx,4)	/ r[i] = low word
	movl	%edx, %ebx		/ cy = high word
	incl	%ecx			/ next i
	jnz	.umul_set_loop		/ until the offset reaches zero
.umul_set_done:
	movl	%ebx, %eax		/ return cy
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)
11600Sstevel@tonic-gate
11610Sstevel@tonic-gate
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r[0..len-1] += a[0..len-1] * digit; the return value is the carry out
/ of the most significant word (0 when len is 0).
/ Uses only the integer MUL instruction (32 x 32 -> 64), no MMX, SSE or
/ SSE2 instructions.  Fall-back for x86 models without PMULUDQ.
/
/ r		 8(%ebp)	%edi	(points one past the end of r)
/ a		12(%ebp)	%esi	(points one past the end of a)
/ len		16(%ebp)	%ecx	(negated; counts up to zero)
/ digit		20(%ebp)	multiplied straight from the stack slot
/ cy				%ebx

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi		/ edi = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ cy = 0
	testl	%ecx, %ecx
	je	.umul_add_done		/ len == 0: return 0

	/ Index both vectors from their ends with a negative offset so the
	/ single register ecx serves as both index and loop counter.
	leal	(%esi,%ecx,4), %esi	/ esi = &a[len]
	leal	(%edi,%ecx,4), %edi	/ edi = &r[len]
	negl	%ecx			/ ecx = -len
	.align 4
.umul_add_loop:
	movl	(%esi,%ecx,4), %eax	/ eax = a[i]
	mull	20(%ebp)		/ edx:eax = a[i] * digit
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax += cy
	addl	(%edi,%ecx,4), %eax
	adcl	$0, %edx		/ edx:eax += r[i]
	movl	%eax, (%edi,%ecx,4)	/ r[i] = low word
	movl	%edx, %ebx		/ cy = high word
	incl	%ecx			/ next i
	jnz	.umul_add_loop		/ until the offset reaches zero
.umul_add_done:
	movl	%ebx, %eax		/ return cy
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_add_vec_umul)
12110Sstevel@tonic-gate
12120Sstevel@tonic-gate#endif	/* __lint */
1213