// xref: /openbsd-src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S (revision 22787c513b4b59ee1fb13a32326a50f73cd342c1)
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Multiply z := x * y
// Inputs x[8], y[8]; output z[16]
//
//    extern void bignum_mul_8_16_alt
//     (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
//
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
// ----------------------------------------------------------------------------

// Project header; the S2N_BN_* macros, _CET_ENDBR and WINDOWS_ABI used in
// this file are not defined here and presumably come from it.

#include "s2n_bignum_internal.h"

// GNU as Intel syntax: "op dst, src" operand order, registers written
// without a % prefix. The two S2N_BN_SYM_* directives presumably control
// the symbol's export and visibility -- see the header for their expansion.

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
        .text

// Register roles.
//
// z and x live in the registers the standard x86-64 ABI passes them in
// (rdi, rsi); under WINDOWS_ABI the function entry moves them there first.

#define z rdi
#define x rsi

// y arrives in rdx, but every one-operand mul overwrites rdx with the high
// half of the product, so the function copies y into rcx on entry.

#define y rcx

// Rotating 3-word accumulator window (low, high, carry). After each result
// word is stored, the roles rotate: the old "high" becomes the new "low",
// the old "carry" becomes the new "high", and the register just stored is
// zeroed and reused as the new "carry".

#define t0 r8
#define t1 r9
#define t2 r10

// combadd(c,h,l,numa,numb):  (c,h,l) := (c,h,l) + numa * numb
//
// The key "multiply and add to (c,h,l)" step. numa is loaded into rax and
// multiplied by the 64-bit memory operand numb, leaving the unsigned
// 128-bit product in rdx:rax. The product is added into h:l and the carry
// out of h is added into c.
//
// c, h and l must be registers distinct from rax and rdx.
// Clobbers: rax, rdx, flags.

#define combadd(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// combadz(c,h,l,numa,numb):  (c,h,l) := (0,h,l) + numa * numb
//
// A minutely shorter form of combadd for when c = 0 initially. The final
// step is "adc c, c" instead of "adc c, 0"; with c == 0 on entry both leave
// c equal to the carry out of h. If c is not zero on entry the result is
// wrong (c would be doubled), so callers zero c first.
//
// c, h and l must be registers distinct from rax and rdx.
// Clobbers: rax, rdx, flags.

#define combadz(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, c

// combads(h,l,numa,numb):  (h,l) := (h,l) + numa * numb
//
// A short two-word form for when we don't expect a top carry: the carry
// out of h is simply dropped, so the caller must know that the sum fits in
// the two words h:l.
//
// h and l must be registers distinct from rax and rdx.
// Clobbers: rax, rdx, flags.

#define combads(h,l,numa,numb)                  \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx

// ----------------------------------------------------------------------------
// bignum_mul_8_16_alt: z[0..15] := x[0..7] * y[0..7]  (64-bit words, least
// significant word first).
//
// Column-wise (comba) multiplication: result word k accumulates every
// product x[i] * y[j] with i + j = k into a three-word window, stores the
// low word of the window to z[k], then rotates the window up by one word.
//
// In:    rdi = z, rsi = x, rdx = y   (after the WINDOWS_ABI shim, if any)
// Out:   16 words written at z; nothing is returned in a register
// Clobb: rax, rcx, rdx, r8, r9, r10, flags
// Stack: none, except the rdi/rsi save under WINDOWS_ABI
//
// The code is straight-line: there are no branches, and every load and
// store is at a fixed offset from z, x or y.
//
// NOTE(review): each z[k] is stored as soon as it is complete, while x and
// y are still being re-read for later terms, so z presumably must not
// overlap x or y -- confirm against the callers' contract.
// ----------------------------------------------------------------------------

S2N_BN_SYMBOL(bignum_mul_8_16_alt):
        _CET_ENDBR

// Under the Microsoft x64 ABI the arguments arrive in rcx, rdx, r8, and
// rdi/rsi are callee-saved: preserve them and shuffle the arguments into
// the standard-ABI registers so the body below is shared.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
#endif

// Copy y into a safe register to start with (mul overwrites rdx)

        mov     y, rdx

// Result term 0: x[0]*y[0]. The low word is final; the high word seeds
// the window as (t1,t0) = (0, high).

        mov     rax, [x]
        mul     QWORD PTR [y]

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1: window (t2,t1,t0). The first product uses the two-word
// form, since t1 is zero and adding a product's high word plus one carry
// to it cannot overflow.

        xor     t2, t2
        combads(t1,t0,[x],[y+8])
        combadz(t2,t1,t0,[x+8],[y])
        mov     [z+8], t0

// Result term 2: window (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+16])
        combadd(t0,t2,t1,[x+8],[y+8])
        combadd(t0,t2,t1,[x+16],[y])
        mov     [z+16], t1

// Result term 3: window (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+24])
        combadd(t1,t0,t2,[x+8],[y+16])
        combadd(t1,t0,t2,[x+16],[y+8])
        combadd(t1,t0,t2,[x+24],[y])
        mov     [z+24], t2

// Result term 4: window (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x],[y+32])
        combadd(t2,t1,t0,[x+8],[y+24])
        combadd(t2,t1,t0,[x+16],[y+16])
        combadd(t2,t1,t0,[x+24],[y+8])
        combadd(t2,t1,t0,[x+32],[y])
        mov     [z+32], t0

// Result term 5: window (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+40])
        combadd(t0,t2,t1,[x+8],[y+32])
        combadd(t0,t2,t1,[x+16],[y+24])
        combadd(t0,t2,t1,[x+24],[y+16])
        combadd(t0,t2,t1,[x+32],[y+8])
        combadd(t0,t2,t1,[x+40],[y])
        mov     [z+40], t1

// Result term 6: window (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+48])
        combadd(t1,t0,t2,[x+8],[y+40])
        combadd(t1,t0,t2,[x+16],[y+32])
        combadd(t1,t0,t2,[x+24],[y+24])
        combadd(t1,t0,t2,[x+32],[y+16])
        combadd(t1,t0,t2,[x+40],[y+8])
        combadd(t1,t0,t2,[x+48],[y])
        mov     [z+48], t2

// Result term 7: window (t2,t1,t0). This is the longest column, with
// eight products.

        xor     t2, t2
        combadz(t2,t1,t0,[x],[y+56])
        combadd(t2,t1,t0,[x+8],[y+48])
        combadd(t2,t1,t0,[x+16],[y+40])
        combadd(t2,t1,t0,[x+24],[y+32])
        combadd(t2,t1,t0,[x+32],[y+24])
        combadd(t2,t1,t0,[x+40],[y+16])
        combadd(t2,t1,t0,[x+48],[y+8])
        combadd(t2,t1,t0,[x+56],[y])
        mov     [z+56], t0

// Result term 8: window (t0,t2,t1). From here on the columns shrink by
// one product per term, starting at x[1]*y[7].

        xor     t0, t0
        combadz(t0,t2,t1,[x+8],[y+56])
        combadd(t0,t2,t1,[x+16],[y+48])
        combadd(t0,t2,t1,[x+24],[y+40])
        combadd(t0,t2,t1,[x+32],[y+32])
        combadd(t0,t2,t1,[x+40],[y+24])
        combadd(t0,t2,t1,[x+48],[y+16])
        combadd(t0,t2,t1,[x+56],[y+8])
        mov     [z+64], t1

// Result term 9: window (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x+16],[y+56])
        combadd(t1,t0,t2,[x+24],[y+48])
        combadd(t1,t0,t2,[x+32],[y+40])
        combadd(t1,t0,t2,[x+40],[y+32])
        combadd(t1,t0,t2,[x+48],[y+24])
        combadd(t1,t0,t2,[x+56],[y+16])
        mov     [z+72], t2

// Result term 10: window (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x+24],[y+56])
        combadd(t2,t1,t0,[x+32],[y+48])
        combadd(t2,t1,t0,[x+40],[y+40])
        combadd(t2,t1,t0,[x+48],[y+32])
        combadd(t2,t1,t0,[x+56],[y+24])
        mov     [z+80], t0

// Result term 11: window (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x+32],[y+56])
        combadd(t0,t2,t1,[x+40],[y+48])
        combadd(t0,t2,t1,[x+48],[y+40])
        combadd(t0,t2,t1,[x+56],[y+32])
        mov     [z+88], t1

// Result term 12: window (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x+40],[y+56])
        combadd(t1,t0,t2,[x+48],[y+48])
        combadd(t1,t0,t2,[x+56],[y+40])
        mov     [z+96], t2

// Result term 13: window (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x+48],[y+56])
        combadd(t2,t1,t0,[x+56],[y+48])
        mov     [z+104], t0

// Result term 14: only x[7]*y[7] remains. The two-word form is enough
// because the full product fits in 16 words, so (t2,t1) cannot carry out.

        combads(t2,t1,[x+56],[y+56])
        mov     [z+112], t1

// Result term 15: the top word is whatever is left in the window

        mov     [z+120], t2

// Return, restoring the callee-saved registers pushed on entry in
// reverse order under WINDOWS_ABI

#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

// On Linux/ELF, emit an empty .note.GNU-stack section (no "x" flag) so the
// linker does not have to assume this object needs an executable stack.

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
245