/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate#include <sys/asm_linkage.h> 270Sstevel@tonic-gate 280Sstevel@tonic-gate#if defined(lint) || defined(__lint) 290Sstevel@tonic-gate 300Sstevel@tonic-gate#include <sys/types.h> 310Sstevel@tonic-gate 320Sstevel@tonic-gate/* ARGSUSED */ 330Sstevel@tonic-gateuint64_t 34*8933Sopensolaris@drydog.combig_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit) 350Sstevel@tonic-gate{ return (0); } 360Sstevel@tonic-gate 370Sstevel@tonic-gate/* ARGSUSED */ 380Sstevel@tonic-gateuint64_t 39*8933Sopensolaris@drydog.combig_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit) 400Sstevel@tonic-gate{ return (0); } 410Sstevel@tonic-gate 420Sstevel@tonic-gate/* ARGSUSED */ 430Sstevel@tonic-gatevoid 44*8933Sopensolaris@drydog.combig_sqr_vec(uint64_t *r, uint64_t *a, int len) 450Sstevel@tonic-gate{} 460Sstevel@tonic-gate 470Sstevel@tonic-gate#else /* lint */ 480Sstevel@tonic-gate 490Sstevel@tonic-gate/ ------------------------------------------------------------------------ 500Sstevel@tonic-gate/ 510Sstevel@tonic-gate/ Implementation of big_mul_set_vec which exploits 520Sstevel@tonic-gate/ the 64X64->128 bit unsigned multiply instruction. 530Sstevel@tonic-gate/ 540Sstevel@tonic-gate/ As defined in Sun's bignum library for pkcs11, bignums are 55*8933Sopensolaris@drydog.com/ composed of an array of 64-bit "digits" or "chunks" along with 56*8933Sopensolaris@drydog.com/ descriptive information. 570Sstevel@tonic-gate/ 580Sstevel@tonic-gate/ ------------------------------------------------------------------------ 590Sstevel@tonic-gate 600Sstevel@tonic-gate/ r = a * digit, r and a are vectors of length len 610Sstevel@tonic-gate/ returns the carry digit 620Sstevel@tonic-gate/ r and a are 64 bit aligned. 
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage:
/	%rdi	r, advanced by 8 digits after each unrolled pass
/	%rsi	a, advanced by 8 digits after each unrolled pass
/	%edx	len on entry; %rdx is overwritten by every mulq
/	%rcx	digit
/	%r8	number of digits still to process
/	%r9	cy, the running carry digit
/	%r11	next digit of a, loaded ahead of the multiply
/	%rax	return value (the final carry)
/
/ The carry is cleared before the length test so that the common exit
/ at .L17 returns 0 when len == 0.  len is an int, so only %edx is read;
/ the 32-bit move zero-extends it into %r8.
/
	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0
	movl	%edx, %r8d		/ Use r8 for len; %rdx is used by mul
	testq	%r8, %r8
	jz	.L17			/ if (len == 0) return (0)

.L15:
	cmpq	$8, %r8			/ len - 8
	jb	.L16			/ fewer than 8 digits left
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17			/ nothing left
	jmp	.L15

/ Tail: between 1 and 7 digits remain, handled one at a time.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	/ Seventh and last possible tail digit: fall through to the exit.
	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)


/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage:
/	%rdi	r, advanced by 8 digits after each unrolled pass
/	%rsi	a, advanced by 8 digits after each unrolled pass
/	%edx	len on entry; %rdx is overwritten by every mulq
/	%rcx	digit
/	%r8	number of digits still to process
/	%r9	cy, the running carry digit
/	%r10	current digit of r, loaded ahead of the add
/	%r11	next digit of a, loaded ahead of the multiply
/	%rax	return value (the final carry)
/
/ The carry is cleared before the length test so that the common exit
/ at .L27 returns 0 when len == 0.  len is an int, so only %edx is read;
/ the 32-bit move zero-extends it into %r8.
/
	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0
	movl	%edx, %r8d		/ Use r8 for len; %rdx is used by mul
	testq	%r8, %r8
	jz	.L27			/ if (len == 0) return (0)

.L25:
	cmpq	$8, %r8			/ len - 8
	jb	.L26			/ fewer than 8 digits left
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27			/ nothing left
	jmp	.L25

/ Tail: between 1 and 7 digits remain, handled one at a time.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	/ Seventh and last possible tail digit: fall through to the exit.
	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)


/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ r = a * a.  a holds len digits; for len >= 1 digits r[0] through
/ r[2 * len - 1] are written.  For len <= 0 nothing is written.
/
/ Phase 1 (function calls): sum the off-diagonal products a[i] * a[j],
/ i < j, into r[1] and up, one row of a at a time, using
/ big_mul_set_vec for the first row and big_mul_add_vec for the rest.
/
/ Phase 2 (no calls): walk r once, doubling what phase 1 produced and
/ adding the diagonal squares a[row] ** 2, propagating the carry.
/
/ The general path needs len >= 2: its row loop exits only when the
/ count reaches exactly zero, and its last store goes to r[col + 1].
/ len == 1 and len <= 0 are therefore handled up front.  len is an
/ int, so only its low 32 bits are read.
/
	ENTRY(big_sqr_vec)
	cmpl	$1, %edx
	jg	.L30			/ len >= 2: general case
	jl	.L35			/ len <= 0: nothing to do
	movq	0(%rsi), %rax		/ len == 1:
	mulq	%rax			/ %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64
	movq	%rdx, 8(%rdi)		/ r[1] = hi64
.L35:
	ret

.L30:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movl	%edx, %r15d		/ tlen = len (zero-extended)
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movl	16(%rsp), %r8d		/ %r8  == arg3 == len (low 32 bits)

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ row - len
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec)

#endif	/* lint */