// xref: /openbsd-src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S (revision 22787c513b4b59ee1fb13a32326a50f73cd342c1)
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Square, z := x^2
// Input x[4]; output z[8]
//
//    extern void bignum_sqr_4_8_alt
//      (uint64_t z[static 8], uint64_t x[static 4]);
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

// Intel operand order (destination first) with unprefixed register names.
// The two S2N_BN_SYM_* macros are not defined in this file (presumably they
// come from s2n_bignum_internal.h and emit the symbol visibility directives
// for the entry point -- confirm against that header).

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt)
        .text

// Input arguments
//
// These are the System V argument registers. Under WINDOWS_ABI the entry
// code copies RCX/RDX into them before either name is used.

#define z rdi
#define x rsi

// Other variables used as a rotating 3-word window to add terms to.
// rcx, r8 and r9 are volatile (caller-saved) registers, so the function
// does not save or restore them.

#define t0 rcx
#define t1 r8
#define t2 r9

// Macro for the key "multiply and add to (c,h,l)" step, for square term:
//
//      (c,h,l) := (c,h,l) + numa^2
//
// c, h, l are registers forming a 3-word accumulator, l least significant;
// numa is a 64-bit source operand. Clobbers rax, rdx and the flags.
// Keep "//" comments out of the body: after line splicing they would
// swallow the remaining continuation lines.

#define combadd1(c,h,l,numa)                    \
        mov     rax, numa;                      \
        mul     rax;                            \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// A short form where we don't expect a top carry:
//
//      (h,l) := (h,l) + numa^2
//
// Any carry out of h is left in CF and not propagated further.
// Clobbers rax, rdx and the flags.

#define combads(h,l,numa)                       \
        mov     rax, numa;                      \
        mul     rax;                            \
        add     l, rax;                         \
        adc     h, rdx

// A version doubling before adding, for non-square terms:
//
//      (c,h,l) := (c,h,l) + 2 * numa * numb
//
// The 128-bit product in rdx:rax is doubled in place; the bit shifted out
// of rdx is added to c first, then the doubled product is added to (h,l)
// with its carry also going into c. numb must be a memory operand (it is
// used with QWORD PTR). Clobbers rax, rdx and the flags.

#define combadd2(c,h,l,numa,numb)               \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     rax, rax;                       \
        adc     rdx, rdx;                       \
        adc     c, 0;                           \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// Entry point: z[0..7] := x[0..3]^2.
//
// Output word k is the low word of the running sum of all products
// x[i]*x[j] with i + j = k (off-diagonal products counted twice, via
// combadd2). The sum is kept in a 3-word window that rotates through
// t0, t1, t2: after each column the lowest word is stored and its register
// is cleared to become the new top word. Uses rax, rdx, t0-t2 and the flags.

S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
        _CET_ENDBR

// Microsoft ABI: arguments arrive in rcx/rdx and rdi/rsi are callee-saved,
// so save them and move the arguments into place. rcx is only reused as t0
// after z has been copied out of it.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
#endif

// Result term 0: x[0]^2. The low word is final; the high word seeds t0.

        mov     rax, [x]
        mul     rax

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1: 2*x[0]*x[1], window (t2,t1,t0)

        xor     t2, t2
        combadd2(t2,t1,t0,[x],[x+8])
        mov     [z+8], t0

// Result term 2: x[1]^2 + 2*x[0]*x[2], window (t0,t2,t1)

        xor     t0, t0
        combadd1(t0,t2,t1,[x+8])
        combadd2(t0,t2,t1,[x],[x+16])
        mov     [z+16], t1

// Result term 3: 2*x[0]*x[3] + 2*x[1]*x[2], window (t1,t0,t2)

        xor     t1, t1
        combadd2(t1,t0,t2,[x],[x+24])
        combadd2(t1,t0,t2,[x+8],[x+16])
        mov     [z+24], t2

// Result term 4: 2*x[1]*x[3] + x[2]^2, window (t2,t1,t0)

        xor     t2, t2
        combadd2(t2,t1,t0,[x+8],[x+24])
        combadd1(t2,t1,t0,[x+16])
        mov     [z+32], t0

// Result term 5: 2*x[2]*x[3], window (t0,t2,t1)

        xor     t0, t0
        combadd2(t0,t2,t1,[x+16],[x+24])
        mov     [z+40], t1

// Result term 6: x[3]^2, added into the two remaining words (t0,t2).
// t1 is cleared to follow the rotation pattern but is not read again;
// the instruction is kept so the instruction sequence stays unchanged.

        xor     t1, t1
        combads(t0,t2,[x+24])
        mov     [z+48], t2

// Result term 7: the remaining top word

        mov     [z+56], t0

// Return, restoring the registers saved on entry under the Microsoft ABI

#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

// On Linux ELF targets emit an empty .note.GNU-stack section (no flags),
// the conventional marker that this object needs no executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
