10Sstevel@tonic-gate/* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*12826Skuriakose.kuruvilla@oracle.com * Common Development and Distribution License (the "License"). 6*12826Skuriakose.kuruvilla@oracle.com * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate/* 22*12826Skuriakose.kuruvilla@oracle.com * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
230Sstevel@tonic-gate */ 240Sstevel@tonic-gate 250Sstevel@tonic-gate#include <sys/asm_linkage.h> 260Sstevel@tonic-gate#include <sys/x86_archext.h> 270Sstevel@tonic-gate#include <sys/controlregs.h> 280Sstevel@tonic-gate 290Sstevel@tonic-gate#if defined(__lint) 300Sstevel@tonic-gate 310Sstevel@tonic-gate#include <sys/types.h> 320Sstevel@tonic-gate 330Sstevel@tonic-gateuint32_t 340Sstevel@tonic-gatebignum_use_sse2() 350Sstevel@tonic-gate{ return (0); } 360Sstevel@tonic-gate 370Sstevel@tonic-gate/* Not to be called by C code */ 380Sstevel@tonic-gate/* ARGSUSED */ 390Sstevel@tonic-gateuint32_t 400Sstevel@tonic-gatebig_mul_set_vec_sse2_r() 410Sstevel@tonic-gate{ return (0); } 420Sstevel@tonic-gate 430Sstevel@tonic-gate/* Not to be called by C code */ 440Sstevel@tonic-gate/* ARGSUSED */ 450Sstevel@tonic-gateuint32_t 460Sstevel@tonic-gatebig_mul_add_vec_sse2_r() 470Sstevel@tonic-gate{ return (0); } 480Sstevel@tonic-gate 490Sstevel@tonic-gate/* ARGSUSED */ 500Sstevel@tonic-gateuint32_t 510Sstevel@tonic-gatebig_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit) 520Sstevel@tonic-gate{ return (0); } 530Sstevel@tonic-gate 540Sstevel@tonic-gate/* ARGSUSED */ 550Sstevel@tonic-gateuint32_t 560Sstevel@tonic-gatebig_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit) 570Sstevel@tonic-gate{ return (0); } 580Sstevel@tonic-gate 590Sstevel@tonic-gate/* ARGSUSED */ 600Sstevel@tonic-gatevoid 610Sstevel@tonic-gatebig_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen) 620Sstevel@tonic-gate{} 630Sstevel@tonic-gate 640Sstevel@tonic-gate/* ARGSUSED */ 650Sstevel@tonic-gatevoid 660Sstevel@tonic-gatebig_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len) 670Sstevel@tonic-gate{} 680Sstevel@tonic-gate 690Sstevel@tonic-gate#if defined(MMX_MANAGE) 700Sstevel@tonic-gate 710Sstevel@tonic-gate/* ARGSUSED */ 720Sstevel@tonic-gateuint32_t 730Sstevel@tonic-gatebig_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit) 
740Sstevel@tonic-gate{ return (0); } 750Sstevel@tonic-gate 760Sstevel@tonic-gate/* ARGSUSED */ 770Sstevel@tonic-gateuint32_t 780Sstevel@tonic-gatebig_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit) 790Sstevel@tonic-gate{ return (0); } 800Sstevel@tonic-gate 810Sstevel@tonic-gate/* Not to be called by C code */ 820Sstevel@tonic-gate/* ARGSUSED */ 830Sstevel@tonic-gatevoid 840Sstevel@tonic-gatebig_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len) 850Sstevel@tonic-gate{} 860Sstevel@tonic-gate 870Sstevel@tonic-gate#endif /* MMX_MANAGE */ 880Sstevel@tonic-gate 890Sstevel@tonic-gate/* 900Sstevel@tonic-gate * UMUL 910Sstevel@tonic-gate * 920Sstevel@tonic-gate */ 930Sstevel@tonic-gate 940Sstevel@tonic-gate/* ARGSUSED */ 950Sstevel@tonic-gateuint32_t 960Sstevel@tonic-gatebig_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit) 970Sstevel@tonic-gate{ return (0); } 980Sstevel@tonic-gate 990Sstevel@tonic-gate/* ARGSUSED */ 1000Sstevel@tonic-gateuint32_t 1010Sstevel@tonic-gatebig_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit) 1020Sstevel@tonic-gate{ return (0); } 1030Sstevel@tonic-gate 1040Sstevel@tonic-gate#else /* __lint */ 1050Sstevel@tonic-gate 1060Sstevel@tonic-gate#if defined(MMX_MANAGE) 1070Sstevel@tonic-gate 1080Sstevel@tonic-gate#if defined(_KERNEL) 1090Sstevel@tonic-gate 1100Sstevel@tonic-gate#define KPREEMPT_DISABLE call kpr_disable 1110Sstevel@tonic-gate#define KPREEMPT_ENABLE call kpr_enable 1120Sstevel@tonic-gate#define TEST_TS(reg) \ 1130Sstevel@tonic-gate movl %cr0, reg; \ 1140Sstevel@tonic-gate clts; \ 1150Sstevel@tonic-gate testl $CR0_TS, reg 1160Sstevel@tonic-gate 1170Sstevel@tonic-gate#else /* _KERNEL */ 1180Sstevel@tonic-gate 1190Sstevel@tonic-gate#define KPREEMPT_DISABLE 1200Sstevel@tonic-gate#define KPREEMPT_ENABLE 1210Sstevel@tonic-gate 1220Sstevel@tonic-gate#define TEST_TS(reg) \ 1230Sstevel@tonic-gate movl $0, reg; \ 1240Sstevel@tonic-gate testl $CR0_TS, reg 1250Sstevel@tonic-gate 
#endif	/* _KERNEL */

/*
 * Stack save area for MMX registers.
 *
 * MMX_SIZE is the number of bytes stored per register by movq;
 * MMX_ALIGN is the alignment applied to the save-area pointer.
 */
#define	MMX_SIZE 8
#define	MMX_ALIGN 8

/*
 * SAVE_MMX_PROLOG(sreg, nreg): lower %esp by the save-area size and
 * leave in sreg the address (%esp + MMX_ALIGN) with its low bits masked.
 *
 * NOTE(review): the size expression relies on how _MUL() from
 * <sys/asm_linkage.h> expands; if it is an unparenthesized product the
 * result is MMX_SIZE * nreg + MMX_ALIGN -- confirm.  The mask
 * "-1![MMX_ALIGN-1]" is presumably the assembler spelling of
 * ~(MMX_ALIGN - 1) -- confirm against the assembler manual.
 */
#define	SAVE_MMX_PROLOG(sreg, nreg)				\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	movl	%esp, sreg;					\
	addl	$MMX_ALIGN, sreg;				\
	andl	$-1![MMX_ALIGN-1], sreg;

/*
 * RSTOR_MMX_EPILOG(nreg): add back to %esp exactly the amount that
 * SAVE_MMX_PROLOG subtracted for the same nreg.
 */
#define	RSTOR_MMX_EPILOG(nreg)					\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

/*
 * SAVE_MMX_0TO4(sreg): reserve a five-register save area, point sreg at
 * it, and store %mm0..%mm4 at offsets 0, 8, 16, 24 and 32.
 */
#define	SAVE_MMX_0TO4(sreg)			\
	SAVE_MMX_PROLOG(sreg, 5);		\
	movq	%mm0, 0(sreg);			\
	movq	%mm1, 8(sreg);			\
	movq	%mm2, 16(sreg);			\
	movq	%mm3, 24(sreg);			\
	movq	%mm4, 32(sreg)

/*
 * RSTOR_MMX_0TO4(sreg): reload %mm0..%mm4 from the area addressed by
 * sreg (same offsets as SAVE_MMX_0TO4) and release the stack space.
 * sreg must still hold the value produced by SAVE_MMX_0TO4.
 */
#define	RSTOR_MMX_0TO4(sreg)			\
	movq	0(sreg), %mm0;			\
	movq	8(sreg), %mm1;			\
	movq	16(sreg), %mm2;			\
	movq	24(sreg), %mm3;			\
	movq	32(sreg), %mm4;			\
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */

/ Note: this file contains implementations for
/	big_mul_set_vec()
/	big_mul_add_vec()
/	big_mul_vec()
/	big_sqr_vec()
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.

/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Note:
/   Using the cpuid instruction directly would work equally
/   well in userland and in the kernel, but we do not use the
/   cpuid instruction in the kernel; we use x86_featureset
/   instead.  This means we honor any decisions the kernel
/   startup code may have made in setting this variable,
/   including disabling SSE2.  It might even be a good idea
/   to honor this kind of setting in userland, as well, but
/   the variable x86_featureset is not readily available to
/   userland processes.
/
/ uint32_t
/ bignum_use_sse2()
/
/ Takes no arguments.  The result is returned in %eax:
/   kernel:   0 or 1, the value of bit X86FSET_SSE2 of x86_featureset
/   userland: cpuid(1) %edx masked with CPUID_INTC_EDX_SSE2
/ Either way the result is non-zero exactly when the tested bit is set.

	ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
	/ NOTE(review): with an immediate bit offset, bt only reaches the
	/ first word of x86_featureset; confirm X86FSET_SSE2 is small
	/ enough for that.
	xor	%eax, %eax		/ result = 0 (also clears CF)
	bt	$X86FSET_SSE2, x86_featureset	/ CF = feature bit
	adc	%eax, %eax		/ result = 0 + 0 + CF
#else	/* _KERNEL */
	pushl	%ebx			/ cpuid overwrites %ebx
	movl	$1, %eax		/ Get feature information
	cpuid
	movl	%edx, %eax		/ set return value
	popl	%ebx
	andl	$CPUID_INTC_EDX_SSE2, %eax	/ keep only the SSE2 bit
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)


/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ N.B.:
/	This is strictly for internal use.
/	The interface is very light-weight.
/	All parameters are passed in registers.
/	It does not conform to the SYSV x86 ABI.
/	So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
/
/ Register contract, as implemented below:
/	in:	%edx = r, %ebx = a, %ecx = len, %mm3 = digit
/	out:	%eax = carry digit (0 when len == 0)
/	modified: %eax, %ebx, %ecx, %edx, %mm0, %mm1
/ The 64-bit accumulator in %mm0 cannot overflow: digit * a[i] + cy is
/ at most (2^32 - 1)^2 + (2^32 - 1), which is less than 2^64.

#define	UNROLL		8
#define	UNROLL32	32

	ENTRY(big_mul_set_vec_sse2_r)
	/ cy is cleared before the length test so that the len == 0
	/ early exit through .L17 returns 0 instead of whatever value
	/ was left in %mm0 by earlier code.
	pxor	%mm0, %mm0			/ cy = 0
	testl	%ecx, %ecx			/ if (len == 0) return (0);
	jz	.L17

.L15:
	cmpl	$UNROLL, %ecx			/ fewer than UNROLL digits left?
	jl	.L16				/ yes: finish in the tail code
	movd	0(%ebx), %mm1			/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1			/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)			/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1			/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)			/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1			/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)			/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1			/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)			/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1			/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)			/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1			/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)			/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1			/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)			/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1			/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)			/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx		/ a += UNROLL
	leal	UNROLL32(%edx), %edx		/ r += UNROLL
	subl	$UNROLL, %ecx			/ len -= UNROLL
	jz	.L17
	jmp	.L15

	/ Tail: between 1 and UNROLL - 1 digits remain.  len is counted
	/ down after each digit; the seventh digit needs no check.
.L16:
	movd	0(%ebx), %mm1			/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)			/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1			/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)			/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1			/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)			/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1			/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)			/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1			/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)			/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1			/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)			/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1			/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1			/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0			/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)			/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0			/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax			/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

	/ C-callable wrapper that manages MMX state itself.
	/ Stack below the frame pointer while the worker runs:
	/   saved %ebx, saved %esi, TEST_TS result, and (save path only)
	/   saved %edi followed by the MMX save area.
	/ %esi carries the carry digit across the restore code.
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)
	pushl	%ebx			/ keep the TEST_TS value for later
	jnz	.setvec_no_save		/ TS was set: skip the MMX save
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ hold the carry digit
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_rtn

.setvec_no_save:
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ hold the carry digit

.setvec_rtn:
	emms
	popl	%ebx			/ value pushed after TEST_TS
	movl	%ebx, %cr0		/ write it back to %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return the carry digit
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

	/ Variant that only loads the arguments and calls the worker:
	/ no preemption control, no MMX save or restore, no emms.
	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3

	/ Plain wrapper: load the stack arguments into the registers the
	/ worker expects, call it, then execute emms before returning.
	/ The carry digit comes back from the worker in %eax untouched.
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	8(%ebp), %edx		/ r
	movl	12(%ebp), %ebx		/ a
	movl	16(%ebp), %ecx		/ len
	movd	20(%ebp), %mm3		/ digit
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */


/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r	%edx
/ a	%ebx
/ len	%ecx
/ digit	%mm3
/
/ N.B.:
/	This is strictly for
internal use. 4880Sstevel@tonic-gate/ The interface is very light-weight. 4890Sstevel@tonic-gate/ All parameters are passed in registers. 4900Sstevel@tonic-gate/ It does not conform to the SYSV x86 ABI. 4910Sstevel@tonic-gate/ So, don't even think about calling this function directly from C code. 4920Sstevel@tonic-gate/ 4930Sstevel@tonic-gate/ The basic multiply digit loop is unrolled 8 times. 4940Sstevel@tonic-gate/ Each comment is preceded by an instance number. 4950Sstevel@tonic-gate/ Instructions that have been moved retain their original, "natural" 4960Sstevel@tonic-gate/ instance number. It should be easier this way to follow 4970Sstevel@tonic-gate/ the step-wise refinement process that went into constructing 4980Sstevel@tonic-gate/ the final code. 4990Sstevel@tonic-gate 5000Sstevel@tonic-gate ENTRY(big_mul_add_vec_sse2_r) 5010Sstevel@tonic-gate xorl %eax, %eax 5020Sstevel@tonic-gate testl %ecx, %ecx 5030Sstevel@tonic-gate jz .L27 5040Sstevel@tonic-gate 5050Sstevel@tonic-gate pxor %mm0, %mm0 / cy = 0 5060Sstevel@tonic-gate 5070Sstevel@tonic-gate.L25: 5080Sstevel@tonic-gate cmpl $UNROLL, %ecx 5090Sstevel@tonic-gate jl .L26 5100Sstevel@tonic-gate movd 0(%ebx), %mm1 / 1: mm1 = a[i] 5110Sstevel@tonic-gate movd 0(%edx), %mm2 / 1: mm2 = r[i] 5120Sstevel@tonic-gate pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i] 5130Sstevel@tonic-gate paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i] 5140Sstevel@tonic-gate movd 4(%ebx), %mm1 / 2: mm1 = a[i] 5150Sstevel@tonic-gate paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy; 5160Sstevel@tonic-gate movd %mm0, 0(%edx) / 1: r[i] = product[31..0] 5170Sstevel@tonic-gate movd 4(%edx), %mm2 / 2: mm2 = r[i] 5180Sstevel@tonic-gate psrlq $32, %mm0 / 1: cy = product[63..32] 5190Sstevel@tonic-gate 5200Sstevel@tonic-gate pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i] 5210Sstevel@tonic-gate paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i] 5220Sstevel@tonic-gate movd 8(%ebx), %mm1 / 3: mm1 = a[i] 5230Sstevel@tonic-gate paddq %mm2, %mm0 / 2: mm0 = 
digit * a[i] + r[i] + cy; 5240Sstevel@tonic-gate movd %mm0, 4(%edx) / 2: r[i] = product[31..0] 5250Sstevel@tonic-gate movd 8(%edx), %mm2 / 3: mm2 = r[i] 5260Sstevel@tonic-gate psrlq $32, %mm0 / 2: cy = product[63..32] 5270Sstevel@tonic-gate 5280Sstevel@tonic-gate pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i] 5290Sstevel@tonic-gate paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i] 5300Sstevel@tonic-gate movd 12(%ebx), %mm1 / 4: mm1 = a[i] 5310Sstevel@tonic-gate paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy; 5320Sstevel@tonic-gate movd %mm0, 8(%edx) / 3: r[i] = product[31..0] 5330Sstevel@tonic-gate movd 12(%edx), %mm2 / 4: mm2 = r[i] 5340Sstevel@tonic-gate psrlq $32, %mm0 / 3: cy = product[63..32] 5350Sstevel@tonic-gate 5360Sstevel@tonic-gate pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i] 5370Sstevel@tonic-gate paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i] 5380Sstevel@tonic-gate movd 16(%ebx), %mm1 / 5: mm1 = a[i] 5390Sstevel@tonic-gate paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy; 5400Sstevel@tonic-gate movd %mm0, 12(%edx) / 4: r[i] = product[31..0] 5410Sstevel@tonic-gate movd 16(%edx), %mm2 / 5: mm2 = r[i] 5420Sstevel@tonic-gate psrlq $32, %mm0 / 4: cy = product[63..32] 5430Sstevel@tonic-gate 5440Sstevel@tonic-gate pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i] 5450Sstevel@tonic-gate paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i] 5460Sstevel@tonic-gate movd 20(%ebx), %mm1 / 6: mm1 = a[i] 5470Sstevel@tonic-gate paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy; 5480Sstevel@tonic-gate movd %mm0, 16(%edx) / 5: r[i] = product[31..0] 5490Sstevel@tonic-gate movd 20(%edx), %mm2 / 6: mm2 = r[i] 5500Sstevel@tonic-gate psrlq $32, %mm0 / 5: cy = product[63..32] 5510Sstevel@tonic-gate 5520Sstevel@tonic-gate pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i] 5530Sstevel@tonic-gate paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i] 5540Sstevel@tonic-gate movd 24(%ebx), %mm1 / 7: mm1 = a[i] 5550Sstevel@tonic-gate paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy; 
5560Sstevel@tonic-gate movd %mm0, 20(%edx) / 6: r[i] = product[31..0] 5570Sstevel@tonic-gate movd 24(%edx), %mm2 / 7: mm2 = r[i] 5580Sstevel@tonic-gate psrlq $32, %mm0 / 6: cy = product[63..32] 5590Sstevel@tonic-gate 5600Sstevel@tonic-gate pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i] 5610Sstevel@tonic-gate paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i] 5620Sstevel@tonic-gate movd 28(%ebx), %mm1 / 8: mm1 = a[i] 5630Sstevel@tonic-gate paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy; 5640Sstevel@tonic-gate movd %mm0, 24(%edx) / 7: r[i] = product[31..0] 5650Sstevel@tonic-gate movd 28(%edx), %mm2 / 8: mm2 = r[i] 5660Sstevel@tonic-gate psrlq $32, %mm0 / 7: cy = product[63..32] 5670Sstevel@tonic-gate 5680Sstevel@tonic-gate pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i] 5690Sstevel@tonic-gate paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i] 5700Sstevel@tonic-gate paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy; 5710Sstevel@tonic-gate movd %mm0, 28(%edx) / 8: r[i] = product[31..0] 5720Sstevel@tonic-gate psrlq $32, %mm0 / 8: cy = product[63..32] 5730Sstevel@tonic-gate 5740Sstevel@tonic-gate leal UNROLL32(%ebx), %ebx / a += UNROLL 5750Sstevel@tonic-gate leal UNROLL32(%edx), %edx / r += UNROLL 5760Sstevel@tonic-gate subl $UNROLL, %ecx / len -= UNROLL 5770Sstevel@tonic-gate jz .L27 5780Sstevel@tonic-gate jmp .L25 5790Sstevel@tonic-gate 5800Sstevel@tonic-gate.L26: 5810Sstevel@tonic-gate movd 0(%ebx), %mm1 / 1: mm1 = a[i] 5820Sstevel@tonic-gate movd 0(%edx), %mm2 / 1: mm2 = r[i] 5830Sstevel@tonic-gate pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i] 5840Sstevel@tonic-gate paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i] 5850Sstevel@tonic-gate paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy; 5860Sstevel@tonic-gate movd %mm0, 0(%edx) / 1: r[i] = product[31..0] 5870Sstevel@tonic-gate psrlq $32, %mm0 / 1: cy = product[63..32] 5880Sstevel@tonic-gate subl $1, %ecx 5890Sstevel@tonic-gate jz .L27 5900Sstevel@tonic-gate 5910Sstevel@tonic-gate movd 4(%ebx), %mm1 / 2: mm1 = a[i] 
5920Sstevel@tonic-gate movd 4(%edx), %mm2 / 2: mm2 = r[i] 5930Sstevel@tonic-gate pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i] 5940Sstevel@tonic-gate paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i] 5950Sstevel@tonic-gate paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy; 5960Sstevel@tonic-gate movd %mm0, 4(%edx) / 2: r[i] = product[31..0] 5970Sstevel@tonic-gate psrlq $32, %mm0 / 2: cy = product[63..32] 5980Sstevel@tonic-gate subl $1, %ecx 5990Sstevel@tonic-gate jz .L27 6000Sstevel@tonic-gate 6010Sstevel@tonic-gate movd 8(%ebx), %mm1 / 3: mm1 = a[i] 6020Sstevel@tonic-gate movd 8(%edx), %mm2 / 3: mm2 = r[i] 6030Sstevel@tonic-gate pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i] 6040Sstevel@tonic-gate paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i] 6050Sstevel@tonic-gate paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy; 6060Sstevel@tonic-gate movd %mm0, 8(%edx) / 3: r[i] = product[31..0] 6070Sstevel@tonic-gate psrlq $32, %mm0 / 3: cy = product[63..32] 6080Sstevel@tonic-gate subl $1, %ecx 6090Sstevel@tonic-gate jz .L27 6100Sstevel@tonic-gate 6110Sstevel@tonic-gate movd 12(%ebx), %mm1 / 4: mm1 = a[i] 6120Sstevel@tonic-gate movd 12(%edx), %mm2 / 4: mm2 = r[i] 6130Sstevel@tonic-gate pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i] 6140Sstevel@tonic-gate paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i] 6150Sstevel@tonic-gate paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy; 6160Sstevel@tonic-gate movd %mm0, 12(%edx) / 4: r[i] = product[31..0] 6170Sstevel@tonic-gate psrlq $32, %mm0 / 4: cy = product[63..32] 6180Sstevel@tonic-gate subl $1, %ecx 6190Sstevel@tonic-gate jz .L27 6200Sstevel@tonic-gate 6210Sstevel@tonic-gate movd 16(%ebx), %mm1 / 5: mm1 = a[i] 6220Sstevel@tonic-gate movd 16(%edx), %mm2 / 5: mm2 = r[i] 6230Sstevel@tonic-gate pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i] 6240Sstevel@tonic-gate paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i] 6250Sstevel@tonic-gate paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy; 6260Sstevel@tonic-gate movd %mm0, 16(%edx) / 5: r[i] = 
product[31..0] 6270Sstevel@tonic-gate psrlq $32, %mm0 / 5: cy = product[63..32] 6280Sstevel@tonic-gate subl $1, %ecx 6290Sstevel@tonic-gate jz .L27 6300Sstevel@tonic-gate 6310Sstevel@tonic-gate movd 20(%ebx), %mm1 / 6: mm1 = a[i] 6320Sstevel@tonic-gate movd 20(%edx), %mm2 / 6: mm2 = r[i] 6330Sstevel@tonic-gate pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i] 6340Sstevel@tonic-gate paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i] 6350Sstevel@tonic-gate paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy; 6360Sstevel@tonic-gate movd %mm0, 20(%edx) / 6: r[i] = product[31..0] 6370Sstevel@tonic-gate psrlq $32, %mm0 / 6: cy = product[63..32] 6380Sstevel@tonic-gate subl $1, %ecx 6390Sstevel@tonic-gate jz .L27 6400Sstevel@tonic-gate 6410Sstevel@tonic-gate movd 24(%ebx), %mm1 / 7: mm1 = a[i] 6420Sstevel@tonic-gate movd 24(%edx), %mm2 / 7: mm2 = r[i] 6430Sstevel@tonic-gate pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i] 6440Sstevel@tonic-gate paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i] 6450Sstevel@tonic-gate paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy; 6460Sstevel@tonic-gate movd %mm0, 24(%edx) / 7: r[i] = product[31..0] 6470Sstevel@tonic-gate psrlq $32, %mm0 / 7: cy = product[63..32] 6480Sstevel@tonic-gate 6490Sstevel@tonic-gate.L27: 6500Sstevel@tonic-gate movd %mm0, %eax 6510Sstevel@tonic-gate / no emms. 
caller is responsible for emms 6520Sstevel@tonic-gate ret 6530Sstevel@tonic-gate SET_SIZE(big_mul_add_vec_sse2_r) 6540Sstevel@tonic-gate 6550Sstevel@tonic-gate 6560Sstevel@tonic-gate/ r = r + a * digit, r and a are vectors of length len 6570Sstevel@tonic-gate/ returns the carry digit 6580Sstevel@tonic-gate/ Suitable only for x86 models that support SSE2 instruction set extensions 6590Sstevel@tonic-gate/ 6600Sstevel@tonic-gate/ r 8(%ebp) %edx 6610Sstevel@tonic-gate/ a 12(%ebp) %ebx 6620Sstevel@tonic-gate/ len 16(%ebp) %ecx 6630Sstevel@tonic-gate/ digit 20(%ebp) %mm3 6640Sstevel@tonic-gate/ 6650Sstevel@tonic-gate/ In userland, there is just the one function, big_mul_add_vec_sse2(). 6660Sstevel@tonic-gate/ But in the kernel, there are two variations: 6670Sstevel@tonic-gate/ 1. big_mul_add_vec_sse2() which does what is necessary to save and 6680Sstevel@tonic-gate/ restore state, if necessary, and to ensure that preemption is 6690Sstevel@tonic-gate/ disabled. 6700Sstevel@tonic-gate/ 2. big_mul_add_vec_sse2_nsv() which just does the work; 6710Sstevel@tonic-gate/ it is the caller's responsibility to ensure that MMX state 6720Sstevel@tonic-gate/ does not need to be saved and restored and that preemption 6730Sstevel@tonic-gate/ is already disabled.
#if defined(MMX_MANAGE)

/ Kernel build of big_mul_add_vec_sse2(r, a, len, digit).
/ Computes r = r + a * digit over len digits and returns the carry digit
/ in %eax.  The arithmetic is done by big_mul_add_vec_sse2_r, which takes
/ its arguments in %edx (r), %ebx (a), %ecx (len) and %mm3 (digit).
/ NOTE(review): the KPREEMPT_* macros, TEST_TS and the *_MMX_0TO4 macros
/ are defined outside this part of the file; the comments below describe
/ only how their results are consumed here.
	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; reused as scratch below
	pushl	%esi			/ callee-saved; carries the result later
	KPREEMPT_DISABLE
	TEST_TS(%ebx)			/ %ebx = value written back to %cr0 on exit
	pushl	%ebx			/ park it on the stack (push leaves flags alone)
	jnz	.addvec_no_save		/ presumably the flags set by TEST_TS
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ hold the carry while state is restored
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_rtn

.addvec_no_save:
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ hold the carry across the epilogue

.addvec_rtn:
	emms
	popl	%ebx			/ value pushed right after TEST_TS
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return value = carry digit
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

/ Kernel build, "no save" flavour: loads the register arguments and calls
/ the worker.  No preemption control, no MMX save/restore and no emms
/ here; the carry digit is returned in %eax as left by the worker.
	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; carries a into the worker
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)


#else	/* !defined(MMX_MANAGE) */

/ Userland build: same as the _nsv flavour above, plus an emms before
/ returning.
	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ callee-saved; carries a into the worker
	movl	8(%ebp), %edx		/ edx = r
	movl	12(%ebp), %ebx		/ ebx = a
	movl	16(%ebp), %ecx		/ ecx = len
	movd	20(%ebp), %mm3		/ mm3 = digit
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */


/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/	int i;
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }
/
/ This routine has no %ebp frame; everything is addressed off %esp and
/ %ebp is the loop index i.  After the prologue (8 bytes of locals plus
/ four pushes) the layout is:
/	16(%esp)  local: blen		20(%esp)  local: b
/	24(%esp)  return address
/	28(%esp)  r	32(%esp)  a	36(%esp)  alen
/	40(%esp)  b	44(%esp)  blen
/ While the first call is being set up every pushl moves these offsets
/ up by 4, which is why four different arguments are all fetched from
/ 40(%esp).  %ebx (r), %esi (a), %edi (alen) and %ebp (i) are live across
/ the calls, so the callees must preserve them.


#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$0x8, %esp		/ two local slots
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax		/ eax = b
	movl	%eax, 20(%esp)		/ local copy of b
	pushl	(%eax)			/ arg 4: b[0]
	movl	40(%esp), %edi		/ edi = alen
	pushl	%edi			/ arg 3: alen
	movl	40(%esp), %esi		/ esi = a
	pushl	%esi			/ arg 2: a
	movl	40(%esp), %ebx		/ ebx = r
	pushl	%ebx			/ arg 1: r
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp		/ drop the four arguments
	movl	%eax, (%ebx,%edi,4)	/ r[alen] = carry
	movl	44(%esp), %eax		/ eax = blen
	movl	%eax, 16(%esp)		/ local copy of blen
	cmpl	$0x1, %eax
	jle	.mulvec_rtn		/ blen <= 1: nothing left to add
	movl	$0x1, %ebp		/ i = 1

	.align	16
.mulvec_add:
	movl	20(%esp), %eax		/ eax = b
	pushl	(%eax,%ebp,4)		/ arg 4: b[i]
	pushl	%edi			/ arg 3: alen
	pushl	%esi			/ arg 2: a
	leal	(%ebx,%ebp,4), %eax	/ eax = &r[i]
	pushl	%eax			/ arg 1: r + i
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp		/ drop the four arguments
	leal	(%ebp,%edi), %ecx	/ ecx = i + alen
	movl	%eax, (%ebx,%ecx,4)	/ r[alen + i] = carry
	incl	%ebp			/ ++i
	cmpl	16(%esp), %ebp		/ i < blen ?
	jl	.mulvec_add
.mulvec_rtn:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$0x8, %esp		/ release the local slots
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif

#if defined(MMX_MANAGE)

/ Kernel build of big_mul_vec_sse2(r, a, alen, b, blen): brackets
/ big_mul_vec_sse2_fc with the KPREEMPT_* macros and, when the TS test
/ says so, with the MMX save/restore macros.
/ Locals:  -8(%ebp) = value produced by TEST_TS, written back to %cr0
/		      on exit
/	   -4(%ebp) = %edi as left by SAVE_MMX_0TO4 (only on the save path)
	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp		/ two local slots
	pushl	%edi
	KPREEMPT_DISABLE
	TEST_TS(%eax)
	movl	%eax, -8(%ebp)		/ remember it (movl leaves flags alone)
	jnz	.mulvec_no_save		/ presumably the flags set by TEST_TS
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)		/ remember %edi for the restore below
.mulvec_no_save:
	movl	24(%ebp), %eax		/ blen
	pushl	%eax
	movl	20(%ebp), %eax		/ b
	pushl	%eax
	movl	16(%ebp), %eax		/ alen
	pushl	%eax
	movl	12(%ebp), %eax		/ a
	pushl	%eax
	movl	8(%ebp), %eax		/ r
	pushl	%eax
	call	big_mul_vec_sse2_fc
	addl	$20, %esp		/ drop the five arguments
	movl	-8(%ebp), %eax		/ value saved after TEST_TS
	testl	$CR0_TS, %eax
	jnz	.mulvec_no_rstr		/ TS bit was set: nothing was saved
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_no_rstr:
	movl	%eax, %cr0
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */



#undef UNROLL
#undef UNROLL32


/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2
/ instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
/
/ Arguments are read from 8(%ebp) = r, 12(%ebp) = a, 16(%ebp) = len.
/ In the userland build this routine sets up that frame itself.  In the
/ kernel build (big_sqr_vec_sse2_fc) it sets up no frame and relies on
/ %ebp still pointing at the frame of the big_sqr_vec_sse2 wrapper that
/ called it.
/
/ The worker routines big_mul_set_vec_sse2_r and big_mul_add_vec_sse2_r
/ take r in %edx, a in %ebx, len in %ecx and the digit in %mm3, and
/ return the carry digit in %eax.  %mm4 holds the digit count across
/ those calls.

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/	/* High-level vector C pseudocode */
/	for (i = 1; i < alen-1; ++i)
/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/	/* Same thing, but slightly lower level C-like pseudocode */
/	i = 1;
/	r = &arg_r[2*i + 1];
/	a = &arg_a[i + 1];
/	digit = arg_a[i];
/	cnt = alen - 3;
/	while (cnt != 0) {
/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/		r += 2;
/		++a;
/		--cnt;
/	}
/
/	/* Same thing, but even lower level
/	 * For example, pointers are raw pointers,
/	 * with no scaling by object size.
/	 */
/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3;  4*3 == 12; */
/	a = arg_a + 8;
/	digit = *(arg_a + 4);
/	cnt = alen - 3;
/	while (cnt != 0) {
/		cy = big_mul_add_vec_sse2_r();
/		*(r + 4 * cnt) = cy;
/		r += 8;
/		a += 4;
/		--cnt;
/	}
/
/	NOTE: the pseudocode is schematic.  The code below starts cnt at
/	alen - 2 and does not call the worker for the final term
/	(cnt == 1); that single product is computed inline at .L32.

	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {
.L31:
	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special
	jz	.L32
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt
	jmp	.L31			/ }

.L32:
	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

	/ compute low-order corner
	/ p = a[0]**2
	/ r[0] = lo32(p)
	/ cy   = hi32(p)
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	/ p = 2 * r[1]
	/ t = p + cy
	/ r[1] = lo32(t)
	/ cy   = hi32(t)
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2
.L34:
	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%mm2, %eax
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	psrlq	$32, %mm1		/ mm1 = carry out of that sum
	paddq	%mm1, %mm2		/ fold it into hi32(t)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.L34

	/ Carry from last triangle term must participate in doubling,
	/ but this step is not paired up with a squaring of the elements
	/ of the inner diagonal.
	/ r[$-3..$-2] += 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

	/ compute high-order corner and add it in
	/ p = a[alen - 1]**2
	/ t = p + cy
	/ r[alen + alen - 2] += lo32(t)
	/ cy = hi32(t)
	/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
	movd	12(%edx), %mm3
	paddq	%mm3, %mm2
	movd	%mm2, 12(%edx)		/ r[$-1] += cy

.L35:
	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif


#if defined(MMX_MANAGE)
/ Kernel build of big_sqr_vec_sse2(r, a, len): brackets
/ big_sqr_vec_sse2_fc with the KPREEMPT_* macros and, when the TS test
/ says so, with the MMX save/restore macros.  No arguments are pushed
/ for the call; big_sqr_vec_sse2_fc reads them through this frame.
/
/ %ebx is used as scratch for the value that is written back to %cr0,
/ so the incoming %ebx (a callee-saved register) is pushed first and
/ popped last, the same way big_mul_add_vec_sse2 does it.  The extra
/ push sits below %ebp and does not move the 8/12/16(%ebp) offsets.
/ NOTE(review): the KPREEMPT_* macros, TEST_TS and the *_MMX_0TO4 macros
/ are defined outside this part of the file; they are assumed to leave
/ the stack balanced here exactly as in big_mul_add_vec_sse2.
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx			/ preserve the incoming %ebx
	KPREEMPT_DISABLE
	TEST_TS(%ebx)			/ %ebx = value written back to %cr0 on exit
	pushl	%ebx			/ park it on the stack (push leaves flags alone)
	jnz	.sqr_no_save		/ presumably the flags set by TEST_TS
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%ebx			/ value pushed right after TEST_TS
	movl	%ebx, %cr0
	KPREEMPT_ENABLE
	popl	%ebx			/ hand back the incoming %ebx
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */

/ ------------------------------------------------------------------------
/		UMUL Implementations
/ ------------------------------------------------------------------------


/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Stores the low 32 bits of a[i] * digit + carry into r[i] for each of
/ the len digits and returns the final carry.  With len == 0 nothing is
/ stored and 0 is returned.
/
/	r	8(%ebp)		%edi	stepped one digit per pass
/	a	12(%ebp)	%esi	stepped one digit per pass
/	len	16(%ebp)	%ecx	counted down to zero
/	digit	20(%ebp)		memory operand of MUL
/	carry			%ebx	also the return value

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi		/ edi = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ carry = 0
	testl	%ecx, %ecx
	je	.L57			/ empty vector: return carry == 0

.L55:
	movl	(%esi), %eax		/ eax = *a
	mull	20(%ebp)		/ edx:eax = *a * digit
	leal	4(%esi), %esi		/ a advances one digit
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax += carry
	movl	%eax, (%edi)		/ *r = low half
	leal	4(%edi), %edi		/ r advances one digit
	movl	%edx, %ebx		/ carry = high half
	decl	%ecx
	jnz	.L55			/ until len digits are done
.L57:
	movl	%ebx, %eax		/ return the carry
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)


/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Multiply-accumulate counterpart of big_mul_set_vec_umul: for each of
/ the len digits, r[i] receives the low 32 bits of
/ a[i] * digit + r[i] + carry, and the final carry is returned.  With
/ len == 0 nothing is stored and 0 is returned.
/ Plain integer code only (MUL, ADD, ADC); no MMX, SSE or SSE2
/ instructions are used.
/
/	r	8(%ebp)		%edi	stepped one digit per pass
/	a	12(%ebp)	%esi	stepped one digit per pass
/	len	16(%ebp)	%ecx	counted down to zero
/	digit	20(%ebp)		memory operand of MUL
/	carry			%ebx	also the return value

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi		/ edi = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ carry = 0
	testl	%ecx, %ecx
	je	.L67			/ empty vector: return carry == 0
	.align 4
.L65:
	movl	(%esi), %eax		/ eax = *a
	mull	20(%ebp)		/ edx:eax = *a * digit
	leal	4(%esi), %esi		/ a advances one digit
	/ The largest possible total, (2^32-1)^2 + 2*(2^32-1), is 2^64-1,
	/ so the two 64-bit additions below cannot overflow edx:eax.
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax += carry
	addl	(%edi), %eax
	adcl	$0, %edx		/ edx:eax += *r
	movl	%eax, (%edi)		/ *r = low half
	leal	4(%edi), %edi		/ r advances one digit
	movl	%edx, %ebx		/ carry = high half
	decl	%ecx
	jnz	.L65			/ until len digits are done
.L67:
	movl	%ebx, %eax		/ return the carry
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_add_vec_umul)

#endif	/* __lint */