xref: /openbsd-src/lib/libcrypto/bn/arch/amd64/bignum_mul.S (revision 22787c513b4b59ee1fb13a32326a50f73cd342c1)
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2//
3// Permission to use, copy, modify, and/or distribute this software for any
4// purpose with or without fee is hereby granted, provided that the above
5// copyright notice and this permission notice appear in all copies.
6//
7// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14
15// ----------------------------------------------------------------------------
16// Multiply z := x * y
17// Inputs x[m], y[n]; output z[k]
18//
19//    extern void bignum_mul
20//     (uint64_t k, uint64_t *z,
21//      uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
22//
23// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
24// Truncates the result in general unless k >= m + n
25//
26// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y
27// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y
28// ----------------------------------------------------------------------------
29
30#include "s2n_bignum_internal.h"
31
32        .intel_syntax noprefix
33        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)
34        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)
35        .text
36
37// These are actually right
38
39#define p rdi
40#define z rsi
41#define n r8
42
43// These are not
44
45#define c r15
46#define h r14
47#define l r13
48#define x r12
49#define y r11
50#define i rbx
51#define k r10
52#define m rbp
53
54// These are always local scratch since multiplier result is in these
55
56#define a rax
57#define d rdx
58
59
60
61S2N_BN_SYMBOL(bignum_mul):
62	_CET_ENDBR
63
64#if WINDOWS_ABI
65        push    rdi
66        push    rsi
67        mov     rdi, rcx
68        mov     rsi, rdx
69        mov     rdx, r8
70        mov     rcx, r9
71        mov     r8, [rsp+56]
72        mov     r9, [rsp+64]
73#endif
74
75// We use too many registers, and also we need rax:rdx for multiplications
76
77        push    rbx
78        push    rbp
79        push    r12
80        push    r13
81        push    r14
82        push    r15
83        mov     m, rdx
84
85// If the result size is zero, do nothing
86// Note that even if either or both inputs has size zero, we can't
87// just give up because we at least need to zero the output array
88// If we did a multiply-add variant, however, then we could
89
90        test    p, p
91        jz      end
92
93// Set initial 2-part sum to zero (we zero c inside the body)
94
95        xor     h,h
96        xor     l,l
97
98// Otherwise do outer loop k = 0 ... k = p - 1
99
100        xor     k, k
101
102outerloop:
103
104// Zero our carry term first; we eventually want it and a zero is useful now
105// Set a =  max 0 (k + 1 - n), i = min (k + 1) m
106// This defines the range a <= j < i for the inner summation
107// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow
108// And since we want to increment it anyway, we might as well do it now
109
110        xor     c, c            // c = 0
111        inc     k               // k = k + 1
112
113        mov     a, k            // a = k + 1
114        sub     a, n            // a = k + 1 - n
115        cmovc   a, c            // a = max 0 (k + 1 - n)
116
117        mov     i, m            // i = m
118        cmp     k, m            // CF <=> k + 1 < m
119        cmovc   i, k            // i = min (k + 1) m
120
121// Turn i into a loop count, and skip things if it's <= 0
122// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a]
123// and then launch into the main inner loop, postdecrementing i
124
125        mov     d, k
126        sub     d, i
127        sub     i, a
128        jbe     innerend
129        lea     x,[rcx+8*a]
130        lea     y,[r9+8*d-8]
131
132innerloop:
133        mov     rax, [y+8*i]
134        mul     QWORD PTR  [x]
135        add     x, 8
136        add     l, rax
137        adc     h, rdx
138        adc     c, 0
139        dec     i
140        jnz     innerloop
141
142innerend:
143
144        mov     [z], l
145        mov     l, h
146        mov     h, c
147        add     z, 8
148
149        cmp     k, p
150        jc      outerloop
151
152end:
153        pop     r15
154        pop     r14
155        pop     r13
156        pop     r12
157        pop     rbp
158        pop     rbx
159#if WINDOWS_ABI
160        pop    rsi
161        pop    rdi
162#endif
163        ret
164
165#if defined(__linux__) && defined(__ELF__)
166.section .note.GNU-stack,"",%progbits
167#endif
168