// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Multiply z := x * y
// Inputs x[8], y[8]; output z[16]
//
//    extern void bignum_mul_8_16_alt
//      (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
//
// Each 64-bit result word z[k] is produced in turn: all products x[i] * y[j]
// with i + j = k are accumulated into a rotating three-register window, and
// the lowest word of that window is then stored to z[k].
//
// Input words are re-read after earlier output words have been stored, so
// the z buffer must not overlap x or y.
//
// Registers written: rax, rcx, rdx, r8, r9, r10 and the flags. When
// WINDOWS_ABI is set, rdi and rsi are also overwritten, but they are pushed
// on entry and popped again before returning.
//
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
        .text

// These are actually right

#define z rdi
#define x rsi

// This is moved from rdx to free it for muls
// (the one-operand mul writes its 128-bit result to rdx:rax)

#define y rcx

// Other variables used as a rotating 3-word window to add terms to

#define t0 r8
#define t1 r9
#define t2 r10

// Macro for the key "multiply and add to (c,h,l)" step:
// (c,h,l) := (c,h,l) + numa * numb, with l the least significant word.
// Overwrites rax, rdx and the flags.

#define combadd(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// A minutely shorter form for when c = 0 initially. The final "adc c, c"
// then computes 0 + 0 + carry, so c just receives the carry out of h.
// The caller must have zeroed c beforehand.

#define combadz(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, c

// A short form where we don't expect a top carry:
// (h,l) := (h,l) + numa * numb, with any carry out of h discarded.

#define combads(h,l,numa,numb)                  \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx

S2N_BN_SYMBOL(bignum_mul_8_16_alt):
        _CET_ENDBR

// On Windows, preserve rdi/rsi and move the arguments into the registers
// that the standard ABI would have used, so the body below is shared.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
#endif

// Copy y into a safe register to start with

        mov     y, rdx

// Result term 0

        mov     rax, [x]
        mul     QWORD PTR [y]

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1
// (t2 is zeroed up front so that it is ready for the combadz step)

        xor     t2, t2
        combads(t1,t0,[x],[y+8])
        combadz(t2,t1,t0,[x+8],[y])
        mov     [z+8], t0

// Result term 2

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+16])
        combadd(t0,t2,t1,[x+8],[y+8])
        combadd(t0,t2,t1,[x+16],[y])
        mov     [z+16], t1

// Result term 3

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+24])
        combadd(t1,t0,t2,[x+8],[y+16])
        combadd(t1,t0,t2,[x+16],[y+8])
        combadd(t1,t0,t2,[x+24],[y])
        mov     [z+24], t2

// Result term 4

        xor     t2, t2
        combadz(t2,t1,t0,[x],[y+32])
        combadd(t2,t1,t0,[x+8],[y+24])
        combadd(t2,t1,t0,[x+16],[y+16])
        combadd(t2,t1,t0,[x+24],[y+8])
        combadd(t2,t1,t0,[x+32],[y])
        mov     [z+32], t0

// Result term 5

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+40])
        combadd(t0,t2,t1,[x+8],[y+32])
        combadd(t0,t2,t1,[x+16],[y+24])
        combadd(t0,t2,t1,[x+24],[y+16])
        combadd(t0,t2,t1,[x+32],[y+8])
        combadd(t0,t2,t1,[x+40],[y])
        mov     [z+40], t1

// Result term 6

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+48])
        combadd(t1,t0,t2,[x+8],[y+40])
        combadd(t1,t0,t2,[x+16],[y+32])
        combadd(t1,t0,t2,[x+24],[y+24])
        combadd(t1,t0,t2,[x+32],[y+16])
        combadd(t1,t0,t2,[x+40],[y+8])
        combadd(t1,t0,t2,[x+48],[y])
        mov     [z+48], t2

// Result term 7

        xor     t2, t2
        combadz(t2,t1,t0,[x],[y+56])
        combadd(t2,t1,t0,[x+8],[y+48])
        combadd(t2,t1,t0,[x+16],[y+40])
        combadd(t2,t1,t0,[x+24],[y+32])
        combadd(t2,t1,t0,[x+32],[y+24])
        combadd(t2,t1,t0,[x+40],[y+16])
        combadd(t2,t1,t0,[x+48],[y+8])
        combadd(t2,t1,t0,[x+56],[y])
        mov     [z+56], t0

// Result term 8

        xor     t0, t0
        combadz(t0,t2,t1,[x+8],[y+56])
        combadd(t0,t2,t1,[x+16],[y+48])
        combadd(t0,t2,t1,[x+24],[y+40])
        combadd(t0,t2,t1,[x+32],[y+32])
        combadd(t0,t2,t1,[x+40],[y+24])
        combadd(t0,t2,t1,[x+48],[y+16])
        combadd(t0,t2,t1,[x+56],[y+8])
        mov     [z+64], t1

// Result term 9

        xor     t1, t1
        combadz(t1,t0,t2,[x+16],[y+56])
        combadd(t1,t0,t2,[x+24],[y+48])
        combadd(t1,t0,t2,[x+32],[y+40])
        combadd(t1,t0,t2,[x+40],[y+32])
        combadd(t1,t0,t2,[x+48],[y+24])
        combadd(t1,t0,t2,[x+56],[y+16])
        mov     [z+72], t2

// Result term 10

        xor     t2, t2
        combadz(t2,t1,t0,[x+24],[y+56])
        combadd(t2,t1,t0,[x+32],[y+48])
        combadd(t2,t1,t0,[x+40],[y+40])
        combadd(t2,t1,t0,[x+48],[y+32])
        combadd(t2,t1,t0,[x+56],[y+24])
        mov     [z+80], t0

// Result term 11

        xor     t0, t0
        combadz(t0,t2,t1,[x+32],[y+56])
        combadd(t0,t2,t1,[x+40],[y+48])
        combadd(t0,t2,t1,[x+48],[y+40])
        combadd(t0,t2,t1,[x+56],[y+32])
        mov     [z+88], t1

// Result term 12

        xor     t1, t1
        combadz(t1,t0,t2,[x+40],[y+56])
        combadd(t1,t0,t2,[x+48],[y+48])
        combadd(t1,t0,t2,[x+56],[y+40])
        mov     [z+96], t2

// Result term 13

        xor     t2, t2
        combadz(t2,t1,t0,[x+48],[y+56])
        combadd(t2,t1,t0,[x+56],[y+48])
        mov     [z+104], t0

// Result term 14
// (only the single top product remains, added with the carry-free form)

        combads(t2,t1,[x+56],[y+56])
        mov     [z+112], t1

// Result term 15

        mov     [z+120], t2

// Return

#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif