xref: /onnv-gate/usr/src/common/bignum/amd64/bignum_amd64_asm.s (revision 8933:16480dbef03d)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
250Sstevel@tonic-gate
260Sstevel@tonic-gate#include <sys/asm_linkage.h>
270Sstevel@tonic-gate
280Sstevel@tonic-gate#if defined(lint) || defined(__lint)
290Sstevel@tonic-gate
300Sstevel@tonic-gate#include <sys/types.h>
310Sstevel@tonic-gate
/*
 * Lint-only stand-in for big_mul_set_vec(); the working routine is the
 * assembly version in the non-lint half of this file.  Always yields 0.
 */
/* ARGSUSED */
uint64_t
big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	return (0);
}
360Sstevel@tonic-gate
/*
 * Lint-only stand-in for big_mul_add_vec(); the working routine is the
 * assembly version in the non-lint half of this file.  Always yields 0.
 */
/* ARGSUSED */
uint64_t
big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	return (0);
}
410Sstevel@tonic-gate
/*
 * Lint-only stand-in for big_sqr_vec(); the working routine is the
 * assembly version in the non-lint half of this file.  Does nothing.
 */
/* ARGSUSED */
void
big_sqr_vec(uint64_t *r, uint64_t *a, int len)
{
}
460Sstevel@tonic-gate
470Sstevel@tonic-gate#else	/* lint */
480Sstevel@tonic-gate
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit (0 if len <= 0, in which case r is untouched)
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage (all caller-saved, nothing is spilled to the stack):
/	%rdi	r; advanced by 8 digits after every unrolled group
/	%rsi	a; advanced by 8 digits after every unrolled group
/	%edx	len on entry; %rdx is then overwritten by every mulq
/	%rcx	digit
/	%r8	digits still to process (sign-extended copy of len)
/	%r9	cy, the running carry digit
/	%r11	next digit of a, loaded one step ahead of its use
/	%rax	low half of each product; carry digit on return
/
	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0; must precede the early exit
					/ because .L17 returns %r9
	movslq	%edx, %r8		/ r8 = (int64_t)len.  len is an int, so
					/ only %edx is defined on entry
	testq	%r8, %r8
	jle	.L17			/ if (len <= 0) return (0)

.L15:
	cmpq	$8, %r8			/ len - 8
	jb	.L16			/ fewer than 8 digits left: tail code
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jnz	.L15			/ digits remain: next group or the tail
	jmp	.L17			/ len was a multiple of 8: done

/ Tail: between 1 and 7 digits remain.  Each step handles one digit and
/ leaves as soon as the count reaches zero.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	/ Seventh and last possible tail digit: the count is now
	/ exhausted, so fall straight through to the return.
	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)
217*8933Sopensolaris@drydog.com
2180Sstevel@tonic-gate
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit (0 if len <= 0, in which case r is untouched)
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage (all caller-saved, nothing is spilled to the stack):
/	%rdi	r; advanced by 8 digits after every unrolled group
/	%rsi	a; advanced by 8 digits after every unrolled group
/	%edx	len on entry; %rdx is then overwritten by every mulq
/	%rcx	digit
/	%r8	digits still to process (sign-extended copy of len)
/	%r9	cy, the running carry digit
/	%r10	current digit of r, loaded one step ahead in the unrolled loop
/	%r11	next digit of a, loaded one step ahead of its use
/	%rax	low half of each product; carry digit on return
/
/ Each step forms p = a[i] * digit + r[i] + cy.  That sum is at most
/ (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1, so the two adcq $0 into
/ %rdx can never carry out of the 128-bit value.
/
	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0; must precede the early exit
					/ because .L27 returns %r9
	movslq	%edx, %r8		/ r8 = (int64_t)len.  len is an int, so
					/ only %edx is defined on entry
	testq	%r8, %r8
	jle	.L27			/ if (len <= 0) return (0)

.L25:
	cmpq	$8, %r8			/ len - 8
	jb	.L26			/ fewer than 8 digits left: tail code
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jnz	.L25			/ digits remain: next group or the tail
	jmp	.L27			/ len was a multiple of 8: done

/ Tail: between 1 and 7 digits remain.  Each step handles one digit and
/ leaves as soon as the count reaches zero.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	/ Seventh and last possible tail digit: the count is now
	/ exhausted, so fall straight through to the return.
	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)
4320Sstevel@tonic-gate
4330Sstevel@tonic-gate
/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ r = a * a, where a is a vector of len digits and r receives 2 * len
/ digits (r[0] through r[2 * len - 1]).  Nothing is written if len <= 0.
/
/ Method for len >= 2:
/   1. Accumulate every product a[i] * a[j] with i < j into r[1] through
/      r[2 * len - 2], one row at a time, using big_mul_set_vec for the
/      first row and big_mul_add_vec for the rest.
/   2. Walk r from the low end, doubling each digit written in step 1
/      and adding the squares a[row]**2 at digit position 2 * row.
/
/ Register usage:
/	step 1:	%r13 = tr, %r14 = ta, %r15 = tlen (callee-saved, so they
/		survive the calls)
/	step 2:	%rdi = r, %rsi = a, %r8 = len, %r9 = cy (low digit),
/		%rcx = cy (high digit), %r11 = row, %r12 = col,
/		%r15 = len - 1, %rax/%rdx/%rbx/%rbp/%r10 = temporaries
/
/ Nine quadwords are pushed before the calls, so %rsp is 16-byte aligned
/ at each call site.

	ENTRY(big_sqr_vec)
	movslq	%edx, %rdx		/ len = (int64_t)len.  len is an int, so
					/ only %edx is defined on entry
	cmpq	$1, %rdx
	jl	.L36			/ if (len <= 0) return
	je	.L35			/ len == 1: r[1]:r[0] = a[0]**2

	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1, at least 1 here
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len (sign-extended)

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1

/ Invariant at .L33: 1 <= row <= len - 1 and col == 2 * row.  The loop
/ always leaves through the row == len - 1 test below, so no separate
/ row < len check is needed at the top.
.L33:
	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = %rcx:%r9 = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy), the top digit

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

/ len == 1: the square is a single 64x64->128 bit product.  Nothing was
/ pushed on this path, so return directly.
.L35:
	movq	0(%rsi), %rax		/ %rax = a[0]
	mulq	%rax			/ %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64
	movq	%rdx, 8(%rdi)		/ r[1] = hi64
.L36:
	ret

	SET_SIZE(big_sqr_vec)
5480Sstevel@tonic-gate
5490Sstevel@tonic-gate#endif	/* lint */
550