1*05c892eeSjsing /* $OpenBSD: bn_internal.h,v 1.15 2023/06/25 11:42:26 jsing Exp $ */
2b97356a2Sjsing /*
3b97356a2Sjsing * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4b97356a2Sjsing *
5b97356a2Sjsing * Permission to use, copy, modify, and distribute this software for any
6b97356a2Sjsing * purpose with or without fee is hereby granted, provided that the above
7b97356a2Sjsing * copyright notice and this permission notice appear in all copies.
8b97356a2Sjsing *
9b97356a2Sjsing * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10b97356a2Sjsing * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11b97356a2Sjsing * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12b97356a2Sjsing * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13b97356a2Sjsing * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14b97356a2Sjsing * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15b97356a2Sjsing * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16b97356a2Sjsing */
17b97356a2Sjsing
18b97356a2Sjsing #include <openssl/bn.h>
19b97356a2Sjsing
20b97356a2Sjsing #include "bn_arch.h"
21b97356a2Sjsing
22b97356a2Sjsing #ifndef HEADER_BN_INTERNAL_H
23b97356a2Sjsing #define HEADER_BN_INTERNAL_H
24b97356a2Sjsing
25b5aa971dSjsing int bn_word_clz(BN_ULONG w);
26b5aa971dSjsing
27b5aa971dSjsing int bn_bitsize(const BIGNUM *bn);
28b5aa971dSjsing
29a936a3bfSjsing #ifndef HAVE_BN_CT_NE_ZERO
30a936a3bfSjsing static inline int
bn_ct_ne_zero(BN_ULONG w)31a936a3bfSjsing bn_ct_ne_zero(BN_ULONG w)
32a936a3bfSjsing {
33a936a3bfSjsing return (w | ~(w - 1)) >> (BN_BITS2 - 1);
34a936a3bfSjsing }
35a936a3bfSjsing #endif
36a936a3bfSjsing
37a936a3bfSjsing #ifndef HAVE_BN_CT_NE_ZERO_MASK
38a936a3bfSjsing static inline BN_ULONG
bn_ct_ne_zero_mask(BN_ULONG w)39a936a3bfSjsing bn_ct_ne_zero_mask(BN_ULONG w)
40a936a3bfSjsing {
41a936a3bfSjsing return 0 - bn_ct_ne_zero(w);
42a936a3bfSjsing }
43a936a3bfSjsing #endif
44a936a3bfSjsing
45a936a3bfSjsing #ifndef HAVE_BN_CT_EQ_ZERO
46a936a3bfSjsing static inline int
bn_ct_eq_zero(BN_ULONG w)47a936a3bfSjsing bn_ct_eq_zero(BN_ULONG w)
48a936a3bfSjsing {
49a936a3bfSjsing return 1 - bn_ct_ne_zero(w);
50a936a3bfSjsing }
51a936a3bfSjsing #endif
52a936a3bfSjsing
53a936a3bfSjsing #ifndef HAVE_BN_CT_EQ_ZERO_MASK
54a936a3bfSjsing static inline BN_ULONG
bn_ct_eq_zero_mask(BN_ULONG w)55a936a3bfSjsing bn_ct_eq_zero_mask(BN_ULONG w)
56a936a3bfSjsing {
57a936a3bfSjsing return 0 - bn_ct_eq_zero(w);
58a936a3bfSjsing }
59a936a3bfSjsing #endif
60a936a3bfSjsing
6199bf4692Sjsing #ifndef HAVE_BN_CLZW
6299bf4692Sjsing static inline int
bn_clzw(BN_ULONG w)6399bf4692Sjsing bn_clzw(BN_ULONG w)
6499bf4692Sjsing {
6599bf4692Sjsing return bn_word_clz(w);
6699bf4692Sjsing }
6799bf4692Sjsing #endif
6899bf4692Sjsing
/*
 * Big number primitives are named as the operation followed by a suffix
 * that indicates the number of words that it operates on, where 'w' means
 * single word, 'dw' means double word, 'tw' means triple word and 'qw' means
 * quadruple word. Unless otherwise noted, the size of the output is implied
 * based on its inputs, for example bn_mulw() takes two single word inputs
 * and produces a double word result.
 *
 * Where a function implements multiple operations, these are listed in order.
 * For example, a function that computes (r1:r0) = a * b + c is named
 * bn_mulw_addw(), producing a double word result.
 */
8127142ad2Sjsing
/*
 * Default implementations for BN_ULLONG architectures.
 *
 * On these platforms the C compiler is generally better at optimising without
 * the use of inline assembly primitives. However, it can be difficult for the
 * compiler to see through primitives in order to combine operations, due to
 * type changes/narrowing. For this reason compound primitives are usually
 * explicitly provided.
 */
91*05c892eeSjsing #ifdef BN_ULLONG
92*05c892eeSjsing
9327142ad2Sjsing #ifndef HAVE_BN_ADDW
94*05c892eeSjsing #define HAVE_BN_ADDW
9527142ad2Sjsing static inline void
bn_addw(BN_ULONG a,BN_ULONG b,BN_ULONG * out_r1,BN_ULONG * out_r0)9627142ad2Sjsing bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
9727142ad2Sjsing {
9827142ad2Sjsing BN_ULLONG r;
9927142ad2Sjsing
10027142ad2Sjsing r = (BN_ULLONG)a + (BN_ULLONG)b;
10127142ad2Sjsing
10227142ad2Sjsing *out_r1 = r >> BN_BITS2;
10327142ad2Sjsing *out_r0 = r & BN_MASK2;
10427142ad2Sjsing }
105*05c892eeSjsing #endif
10627142ad2Sjsing
107*05c892eeSjsing #ifndef HAVE_BN_ADDW_ADDW
108*05c892eeSjsing #define HAVE_BN_ADDW_ADDW
109*05c892eeSjsing static inline void
bn_addw_addw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG * out_r1,BN_ULONG * out_r0)110*05c892eeSjsing bn_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
111*05c892eeSjsing BN_ULONG *out_r0)
112*05c892eeSjsing {
113*05c892eeSjsing BN_ULLONG r;
114*05c892eeSjsing
115*05c892eeSjsing r = (BN_ULLONG)a + (BN_ULLONG)b + (BN_ULLONG)c;
116*05c892eeSjsing
117*05c892eeSjsing *out_r1 = r >> BN_BITS2;
118*05c892eeSjsing *out_r0 = r & BN_MASK2;
119*05c892eeSjsing }
120*05c892eeSjsing #endif
121*05c892eeSjsing
122*05c892eeSjsing #ifndef HAVE_BN_MULW
123*05c892eeSjsing #define HAVE_BN_MULW
124*05c892eeSjsing static inline void
bn_mulw(BN_ULONG a,BN_ULONG b,BN_ULONG * out_r1,BN_ULONG * out_r0)125*05c892eeSjsing bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
126*05c892eeSjsing {
127*05c892eeSjsing BN_ULLONG r;
128*05c892eeSjsing
129*05c892eeSjsing r = (BN_ULLONG)a * (BN_ULLONG)b;
130*05c892eeSjsing
131*05c892eeSjsing *out_r1 = r >> BN_BITS2;
132*05c892eeSjsing *out_r0 = r & BN_MASK2;
133*05c892eeSjsing }
134*05c892eeSjsing #endif
135*05c892eeSjsing
136*05c892eeSjsing #ifndef HAVE_BN_MULW_ADDW
137*05c892eeSjsing #define HAVE_BN_MULW_ADDW
138*05c892eeSjsing static inline void
bn_mulw_addw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG * out_r1,BN_ULONG * out_r0)139*05c892eeSjsing bn_mulw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
140*05c892eeSjsing BN_ULONG *out_r0)
141*05c892eeSjsing {
142*05c892eeSjsing BN_ULLONG r;
143*05c892eeSjsing
144*05c892eeSjsing r = (BN_ULLONG)a * (BN_ULLONG)b + (BN_ULLONG)c;
145*05c892eeSjsing
146*05c892eeSjsing *out_r1 = r >> BN_BITS2;
147*05c892eeSjsing *out_r0 = r & BN_MASK2;
148*05c892eeSjsing }
149*05c892eeSjsing #endif
150*05c892eeSjsing
151*05c892eeSjsing #ifndef HAVE_BN_MULW_ADDW_ADDW
152*05c892eeSjsing #define HAVE_BN_MULW_ADDW_ADDW
153*05c892eeSjsing static inline void
bn_mulw_addw_addw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG d,BN_ULONG * out_r1,BN_ULONG * out_r0)154*05c892eeSjsing bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
155*05c892eeSjsing BN_ULONG *out_r1, BN_ULONG *out_r0)
156*05c892eeSjsing {
157*05c892eeSjsing BN_ULLONG r;
158*05c892eeSjsing
159*05c892eeSjsing r = (BN_ULLONG)a * (BN_ULLONG)b + (BN_ULLONG)c + (BN_ULLONG)d;
160*05c892eeSjsing
161*05c892eeSjsing *out_r1 = r >> BN_BITS2;
162*05c892eeSjsing *out_r0 = r & BN_MASK2;
163*05c892eeSjsing }
164*05c892eeSjsing #endif
165*05c892eeSjsing
#endif /* BN_ULLONG */
167*05c892eeSjsing
168*05c892eeSjsing /*
169*05c892eeSjsing * bn_addw() computes (r1:r0) = a + b, where both inputs are single words,
170*05c892eeSjsing * producing a double word result. The value of r1 is the carry from the
171*05c892eeSjsing * addition.
172*05c892eeSjsing */
173*05c892eeSjsing #ifndef HAVE_BN_ADDW
17427142ad2Sjsing static inline void
bn_addw(BN_ULONG a,BN_ULONG b,BN_ULONG * out_r1,BN_ULONG * out_r0)17527142ad2Sjsing bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
17627142ad2Sjsing {
17727142ad2Sjsing BN_ULONG r1, r0, c1, c2;
17827142ad2Sjsing
17927142ad2Sjsing c1 = a | b;
18027142ad2Sjsing c2 = a & b;
18127142ad2Sjsing r0 = a + b;
18227142ad2Sjsing r1 = ((c1 & ~r0) | c2) >> (BN_BITS2 - 1); /* carry */
18327142ad2Sjsing
18427142ad2Sjsing *out_r1 = r1;
18527142ad2Sjsing *out_r0 = r0;
18627142ad2Sjsing }
18727142ad2Sjsing #endif
18827142ad2Sjsing
189ee749820Sjsing /*
190ee749820Sjsing * bn_addw_addw() computes (r1:r0) = a + b + c, where all inputs are single
191ee749820Sjsing * words, producing a double word result.
192ee749820Sjsing */
193ee749820Sjsing #ifndef HAVE_BN_ADDW_ADDW
194ee749820Sjsing static inline void
bn_addw_addw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG * out_r1,BN_ULONG * out_r0)195ee749820Sjsing bn_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
196ee749820Sjsing BN_ULONG *out_r0)
197ee749820Sjsing {
198ee749820Sjsing BN_ULONG carry, r1, r0;
199ee749820Sjsing
200ee749820Sjsing bn_addw(a, b, &r1, &r0);
201ee749820Sjsing bn_addw(r0, c, &carry, &r0);
202ee749820Sjsing r1 += carry;
203ee749820Sjsing
204ee749820Sjsing *out_r1 = r1;
205ee749820Sjsing *out_r0 = r0;
206ee749820Sjsing }
207ee749820Sjsing #endif
208ee749820Sjsing
209ee749820Sjsing /*
21073e8eea7Sjsing * bn_qwaddqw() computes
21173e8eea7Sjsing * (r4:r3:r2:r1:r0) = (a3:a2:a1:a0) + (b3:b2:b1:b0) + carry, where a is a quad word,
21273e8eea7Sjsing * b is a quad word, and carry is a single word with value 0 or 1, producing a four
21373e8eea7Sjsing * word result and carry.
21473e8eea7Sjsing */
21573e8eea7Sjsing #ifndef HAVE_BN_QWADDQW
21673e8eea7Sjsing static inline void
bn_qwaddqw(BN_ULONG a3,BN_ULONG a2,BN_ULONG a1,BN_ULONG a0,BN_ULONG b3,BN_ULONG b2,BN_ULONG b1,BN_ULONG b0,BN_ULONG carry,BN_ULONG * out_carry,BN_ULONG * out_r3,BN_ULONG * out_r2,BN_ULONG * out_r1,BN_ULONG * out_r0)21773e8eea7Sjsing bn_qwaddqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3,
21873e8eea7Sjsing BN_ULONG b2, BN_ULONG b1, BN_ULONG b0, BN_ULONG carry, BN_ULONG *out_carry,
21973e8eea7Sjsing BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
22073e8eea7Sjsing {
22173e8eea7Sjsing BN_ULONG r3, r2, r1, r0;
22273e8eea7Sjsing
22373e8eea7Sjsing bn_addw_addw(a0, b0, carry, &carry, &r0);
22473e8eea7Sjsing bn_addw_addw(a1, b1, carry, &carry, &r1);
22573e8eea7Sjsing bn_addw_addw(a2, b2, carry, &carry, &r2);
22673e8eea7Sjsing bn_addw_addw(a3, b3, carry, &carry, &r3);
22773e8eea7Sjsing
22873e8eea7Sjsing *out_carry = carry;
22973e8eea7Sjsing *out_r3 = r3;
23073e8eea7Sjsing *out_r2 = r2;
23173e8eea7Sjsing *out_r1 = r1;
23273e8eea7Sjsing *out_r0 = r0;
23373e8eea7Sjsing }
23473e8eea7Sjsing #endif
23573e8eea7Sjsing
23673e8eea7Sjsing /*
237ee749820Sjsing * bn_subw() computes r0 = a - b, where both inputs are single words,
238ee749820Sjsing * producing a single word result and borrow.
239ee749820Sjsing */
240ee749820Sjsing #ifndef HAVE_BN_SUBW
241ee749820Sjsing static inline void
bn_subw(BN_ULONG a,BN_ULONG b,BN_ULONG * out_borrow,BN_ULONG * out_r0)242ee749820Sjsing bn_subw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_borrow, BN_ULONG *out_r0)
243ee749820Sjsing {
244ee749820Sjsing BN_ULONG borrow, r0;
245ee749820Sjsing
246ee749820Sjsing r0 = a - b;
247ee749820Sjsing borrow = ((r0 | (b & ~a)) & (b | ~a)) >> (BN_BITS2 - 1);
248ee749820Sjsing
249ee749820Sjsing *out_borrow = borrow;
250ee749820Sjsing *out_r0 = r0;
251ee749820Sjsing }
252ee749820Sjsing #endif
253ee749820Sjsing
254ee749820Sjsing /*
255ee749820Sjsing * bn_subw_subw() computes r0 = a - b - c, where all inputs are single words,
256ee749820Sjsing * producing a single word result and borrow.
257ee749820Sjsing */
258ee749820Sjsing #ifndef HAVE_BN_SUBW_SUBW
259ee749820Sjsing static inline void
bn_subw_subw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG * out_borrow,BN_ULONG * out_r0)260ee749820Sjsing bn_subw_subw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_borrow,
261ee749820Sjsing BN_ULONG *out_r0)
262ee749820Sjsing {
263ee749820Sjsing BN_ULONG b1, b2, r0;
264ee749820Sjsing
265ee749820Sjsing bn_subw(a, b, &b1, &r0);
266ee749820Sjsing bn_subw(r0, c, &b2, &r0);
267ee749820Sjsing
268ee749820Sjsing *out_borrow = b1 + b2;
269ee749820Sjsing *out_r0 = r0;
270ee749820Sjsing }
271ee749820Sjsing #endif
272ee749820Sjsing
27382c46216Sjsing /*
27473e8eea7Sjsing * bn_qwsubqw() computes
27573e8eea7Sjsing * (r3:r2:r1:r0) = (a3:a2:a1:a0) - (b3:b2:b1:b0) - borrow, where a is a quad word,
27673e8eea7Sjsing * b is a quad word, and borrow is a single word with value 0 or 1, producing a
27773e8eea7Sjsing * four word result and borrow.
27873e8eea7Sjsing */
27973e8eea7Sjsing #ifndef HAVE_BN_QWSUBQW
28073e8eea7Sjsing static inline void
bn_qwsubqw(BN_ULONG a3,BN_ULONG a2,BN_ULONG a1,BN_ULONG a0,BN_ULONG b3,BN_ULONG b2,BN_ULONG b1,BN_ULONG b0,BN_ULONG borrow,BN_ULONG * out_borrow,BN_ULONG * out_r3,BN_ULONG * out_r2,BN_ULONG * out_r1,BN_ULONG * out_r0)28173e8eea7Sjsing bn_qwsubqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3,
28273e8eea7Sjsing BN_ULONG b2, BN_ULONG b1, BN_ULONG b0, BN_ULONG borrow, BN_ULONG *out_borrow,
28373e8eea7Sjsing BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
28473e8eea7Sjsing {
28573e8eea7Sjsing BN_ULONG r3, r2, r1, r0;
28673e8eea7Sjsing
28773e8eea7Sjsing bn_subw_subw(a0, b0, borrow, &borrow, &r0);
28873e8eea7Sjsing bn_subw_subw(a1, b1, borrow, &borrow, &r1);
28973e8eea7Sjsing bn_subw_subw(a2, b2, borrow, &borrow, &r2);
29073e8eea7Sjsing bn_subw_subw(a3, b3, borrow, &borrow, &r3);
29173e8eea7Sjsing
29273e8eea7Sjsing *out_borrow = borrow;
29373e8eea7Sjsing *out_r3 = r3;
29473e8eea7Sjsing *out_r2 = r2;
29573e8eea7Sjsing *out_r1 = r1;
29673e8eea7Sjsing *out_r0 = r0;
29773e8eea7Sjsing }
29873e8eea7Sjsing #endif
29973e8eea7Sjsing
30073e8eea7Sjsing /*
30182c46216Sjsing * bn_mulw() computes (r1:r0) = a * b, where both inputs are single words,
30282c46216Sjsing * producing a double word result.
30382c46216Sjsing */
30482c46216Sjsing #ifndef HAVE_BN_MULW
305b97356a2Sjsing /*
306b97356a2Sjsing * Multiply two words (a * b) producing a double word result (h:l).
307b97356a2Sjsing *
308b97356a2Sjsing * This can be rewritten as:
309b97356a2Sjsing *
310b97356a2Sjsing * a * b = (hi32(a) * 2^32 + lo32(a)) * (hi32(b) * 2^32 + lo32(b))
311b97356a2Sjsing * = hi32(a) * hi32(b) * 2^64 +
312b97356a2Sjsing * hi32(a) * lo32(b) * 2^32 +
313b97356a2Sjsing * hi32(b) * lo32(a) * 2^32 +
314b97356a2Sjsing * lo32(a) * lo32(b)
315b97356a2Sjsing *
316b97356a2Sjsing * The multiplication for each part of a and b can be calculated for each of
317b97356a2Sjsing * these four terms without overflowing a BN_ULONG, as the maximum value of a
318b97356a2Sjsing * 32 bit x 32 bit multiplication is 32 + 32 = 64 bits. Once these
319b97356a2Sjsing * multiplications have been performed the result can be partitioned and summed
320b97356a2Sjsing * into a double word (h:l). The same applies on a 32 bit system, substituting
321b97356a2Sjsing * 16 for 32 and 32 for 64.
322b97356a2Sjsing */
323b97356a2Sjsing #if 1
324b97356a2Sjsing static inline void
bn_mulw(BN_ULONG a,BN_ULONG b,BN_ULONG * out_r1,BN_ULONG * out_r0)32582c46216Sjsing bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
326b97356a2Sjsing {
32793321b80Sjsing BN_ULONG a1, a0, b1, b0, r1, r0;
32893321b80Sjsing BN_ULONG carry, x;
329b97356a2Sjsing
33082c46216Sjsing a1 = a >> BN_BITS4;
33182c46216Sjsing a0 = a & BN_MASK2l;
33282c46216Sjsing b1 = b >> BN_BITS4;
33382c46216Sjsing b0 = b & BN_MASK2l;
334b97356a2Sjsing
33582c46216Sjsing r1 = a1 * b1;
33682c46216Sjsing r0 = a0 * b0;
337b97356a2Sjsing
33882c46216Sjsing /* (a1 * b0) << BN_BITS4, partition the result across r1:r0 with carry. */
33982c46216Sjsing x = a1 * b0;
34082c46216Sjsing r1 += x >> BN_BITS4;
34193321b80Sjsing bn_addw(r0, x << BN_BITS4, &carry, &r0);
34293321b80Sjsing r1 += carry;
343b97356a2Sjsing
34482c46216Sjsing /* (b1 * a0) << BN_BITS4, partition the result across r1:r0 with carry. */
34582c46216Sjsing x = b1 * a0;
34682c46216Sjsing r1 += x >> BN_BITS4;
34793321b80Sjsing bn_addw(r0, x << BN_BITS4, &carry, &r0);
34893321b80Sjsing r1 += carry;
349b97356a2Sjsing
35082c46216Sjsing *out_r1 = r1;
35182c46216Sjsing *out_r0 = r0;
352b97356a2Sjsing }
353b97356a2Sjsing #else
354b97356a2Sjsing
355b97356a2Sjsing /*
356b97356a2Sjsing * XXX - this accumulator based version uses fewer instructions, however
357b97356a2Sjsing * requires more variables/registers. It seems to be slower on at least amd64
358b97356a2Sjsing * and i386, however may be faster on other architectures that have more
359b97356a2Sjsing * registers available. Further testing is required and one of the two
360b97356a2Sjsing * implementations should eventually be removed.
361b97356a2Sjsing */
362b97356a2Sjsing static inline void
bn_mulw(BN_ULONG a,BN_ULONG b,BN_ULONG * out_r1,BN_ULONG * out_r0)36382c46216Sjsing bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
364b97356a2Sjsing {
36582c46216Sjsing BN_ULONG a1, a0, b1, b0, r1, r0, x;
366b97356a2Sjsing BN_ULONG acc0, acc1, acc2, acc3;
367b97356a2Sjsing
36882c46216Sjsing a1 = a >> BN_BITS4;
36982c46216Sjsing b1 = b >> BN_BITS4;
37082c46216Sjsing a0 = a & BN_MASK2l;
37182c46216Sjsing b0 = b & BN_MASK2l;
372b97356a2Sjsing
37382c46216Sjsing r1 = a1 * b1;
37482c46216Sjsing r0 = a0 * b0;
375b97356a2Sjsing
37682c46216Sjsing acc0 = r0 & BN_MASK2l;
37782c46216Sjsing acc1 = r0 >> BN_BITS4;
37882c46216Sjsing acc2 = r1 & BN_MASK2l;
37982c46216Sjsing acc3 = r1 >> BN_BITS4;
380b97356a2Sjsing
38182c46216Sjsing /* (a1 * b0) << BN_BITS4, partition the result across r1:r0. */
38282c46216Sjsing x = a1 * b0;
383b97356a2Sjsing acc1 += x & BN_MASK2l;
384b97356a2Sjsing acc2 += (acc1 >> BN_BITS4) + (x >> BN_BITS4);
385f7d534bcSjsing acc1 &= BN_MASK2l;
386b97356a2Sjsing acc3 += acc2 >> BN_BITS4;
387f7d534bcSjsing acc2 &= BN_MASK2l;
388b97356a2Sjsing
38982c46216Sjsing /* (b1 * a0) << BN_BITS4, partition the result across r1:r0. */
39082c46216Sjsing x = b1 * a0;
391b97356a2Sjsing acc1 += x & BN_MASK2l;
392b97356a2Sjsing acc2 += (acc1 >> BN_BITS4) + (x >> BN_BITS4);
393f7d534bcSjsing acc1 &= BN_MASK2l;
394b97356a2Sjsing acc3 += acc2 >> BN_BITS4;
395f7d534bcSjsing acc2 &= BN_MASK2l;
396b97356a2Sjsing
39782c46216Sjsing *out_r1 = (acc3 << BN_BITS4) | acc2;
39882c46216Sjsing *out_r0 = (acc1 << BN_BITS4) | acc0;
399b97356a2Sjsing }
400b97356a2Sjsing #endif
401b97356a2Sjsing #endif
402b97356a2Sjsing
40382c46216Sjsing #ifndef HAVE_BN_MULW_LO
404b97356a2Sjsing static inline BN_ULONG
bn_mulw_lo(BN_ULONG a,BN_ULONG b)40582c46216Sjsing bn_mulw_lo(BN_ULONG a, BN_ULONG b)
406b97356a2Sjsing {
407b97356a2Sjsing return a * b;
408b97356a2Sjsing }
409b97356a2Sjsing #endif
410b97356a2Sjsing
41182c46216Sjsing #ifndef HAVE_BN_MULW_HI
412b97356a2Sjsing static inline BN_ULONG
bn_mulw_hi(BN_ULONG a,BN_ULONG b)41382c46216Sjsing bn_mulw_hi(BN_ULONG a, BN_ULONG b)
414b97356a2Sjsing {
415b97356a2Sjsing BN_ULONG h, l;
416b97356a2Sjsing
41782c46216Sjsing bn_mulw(a, b, &h, &l);
418b97356a2Sjsing
419b97356a2Sjsing return h;
420b97356a2Sjsing }
421b97356a2Sjsing #endif
422b97356a2Sjsing
42327142ad2Sjsing /*
42427142ad2Sjsing * bn_mulw_addw() computes (r1:r0) = a * b + c with all inputs being single
42527142ad2Sjsing * words, producing a double word result.
42627142ad2Sjsing */
42727142ad2Sjsing #ifndef HAVE_BN_MULW_ADDW
42827142ad2Sjsing static inline void
bn_mulw_addw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG * out_r1,BN_ULONG * out_r0)42927142ad2Sjsing bn_mulw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
43027142ad2Sjsing BN_ULONG *out_r0)
43127142ad2Sjsing {
43227142ad2Sjsing BN_ULONG carry, r1, r0;
43327142ad2Sjsing
43482c46216Sjsing bn_mulw(a, b, &r1, &r0);
43527142ad2Sjsing bn_addw(r0, c, &carry, &r0);
43627142ad2Sjsing r1 += carry;
43727142ad2Sjsing
43827142ad2Sjsing *out_r1 = r1;
43927142ad2Sjsing *out_r0 = r0;
44027142ad2Sjsing }
44127142ad2Sjsing #endif
44227142ad2Sjsing
44327142ad2Sjsing /*
44427142ad2Sjsing * bn_mulw_addw_addw() computes (r1:r0) = a * b + c + d with all inputs being
44527142ad2Sjsing * single words, producing a double word result.
44627142ad2Sjsing */
44727142ad2Sjsing #ifndef HAVE_BN_MULW_ADDW_ADDW
44827142ad2Sjsing static inline void
bn_mulw_addw_addw(BN_ULONG a,BN_ULONG b,BN_ULONG c,BN_ULONG d,BN_ULONG * out_r1,BN_ULONG * out_r0)44927142ad2Sjsing bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
45027142ad2Sjsing BN_ULONG *out_r1, BN_ULONG *out_r0)
45127142ad2Sjsing {
45227142ad2Sjsing BN_ULONG carry, r1, r0;
45327142ad2Sjsing
45427142ad2Sjsing bn_mulw_addw(a, b, c, &r1, &r0);
45527142ad2Sjsing bn_addw(r0, d, &carry, &r0);
45627142ad2Sjsing r1 += carry;
45727142ad2Sjsing
45827142ad2Sjsing *out_r1 = r1;
45927142ad2Sjsing *out_r0 = r0;
46027142ad2Sjsing }
46127142ad2Sjsing #endif
46227142ad2Sjsing
46327142ad2Sjsing /*
46427142ad2Sjsing * bn_mulw_addtw() computes (r2:r1:r0) = a * b + (c2:c1:c0), where a and b are
46527142ad2Sjsing * single words and (c2:c1:c0) is a triple word, producing a triple word result.
46627142ad2Sjsing * The caller must ensure that the inputs provided do not result in c2
46727142ad2Sjsing * overflowing.
46827142ad2Sjsing */
46927142ad2Sjsing #ifndef HAVE_BN_MULW_ADDTW
47027142ad2Sjsing static inline void
bn_mulw_addtw(BN_ULONG a,BN_ULONG b,BN_ULONG c2,BN_ULONG c1,BN_ULONG c0,BN_ULONG * out_r2,BN_ULONG * out_r1,BN_ULONG * out_r0)47127142ad2Sjsing bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
47227142ad2Sjsing BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
47327142ad2Sjsing {
4740d14184dSjsing BN_ULONG carry, r2, r1, r0, x1;
47527142ad2Sjsing
4760d14184dSjsing bn_mulw_addw(a, b, c0, &x1, &r0);
47727142ad2Sjsing bn_addw(c1, x1, &carry, &r1);
47827142ad2Sjsing r2 = c2 + carry;
47927142ad2Sjsing
48027142ad2Sjsing *out_r2 = r2;
48127142ad2Sjsing *out_r1 = r1;
48227142ad2Sjsing *out_r0 = r0;
48327142ad2Sjsing }
48427142ad2Sjsing #endif
48527142ad2Sjsing
486a8b2fef0Sjsing /*
487b47e8ee9Sjsing * bn_mul2_mulw_addtw() computes (r2:r1:r0) = 2 * a * b + (c2:c1:c0), where a
488b47e8ee9Sjsing * and b are single words and (c2:c1:c0) is a triple word, producing a triple
489b47e8ee9Sjsing * word result. The caller must ensure that the inputs provided do not result
490b47e8ee9Sjsing * in c2 overflowing.
491a8b2fef0Sjsing */
492a8b2fef0Sjsing #ifndef HAVE_BN_MUL2_MULW_ADDTW
493a8b2fef0Sjsing static inline void
bn_mul2_mulw_addtw(BN_ULONG a,BN_ULONG b,BN_ULONG c2,BN_ULONG c1,BN_ULONG c0,BN_ULONG * out_r2,BN_ULONG * out_r1,BN_ULONG * out_r0)494a8b2fef0Sjsing bn_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
495a8b2fef0Sjsing BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
496a8b2fef0Sjsing {
497a8b2fef0Sjsing BN_ULONG r2, r1, r0, x1, x0;
498a8b2fef0Sjsing BN_ULONG carry;
499a8b2fef0Sjsing
500a8b2fef0Sjsing bn_mulw(a, b, &x1, &x0);
501a8b2fef0Sjsing bn_addw(c0, x0, &carry, &r0);
502a8b2fef0Sjsing bn_addw(c1, x1 + carry, &r2, &r1);
503a8b2fef0Sjsing bn_addw(c2, r2, &carry, &r2);
504a8b2fef0Sjsing bn_addw(r0, x0, &carry, &r0);
505a8b2fef0Sjsing bn_addw(r1, x1 + carry, &carry, &r1);
506a8b2fef0Sjsing r2 += carry;
507a8b2fef0Sjsing
508a8b2fef0Sjsing *out_r2 = r2;
509a8b2fef0Sjsing *out_r1 = r1;
510a8b2fef0Sjsing *out_r0 = r0;
511a8b2fef0Sjsing }
512a8b2fef0Sjsing #endif
513a8b2fef0Sjsing
51473e8eea7Sjsing /*
51573e8eea7Sjsing * bn_qwmulw_addw() computes (r4:r3:r2:r1:r0) = (a3:a2:a1:a0) * b + c, where a
51673e8eea7Sjsing * is a quad word, b is a single word and c is a single word, producing a five
51773e8eea7Sjsing * word result.
51873e8eea7Sjsing */
51973e8eea7Sjsing #ifndef HAVE_BN_QWMULW_ADDW
52073e8eea7Sjsing static inline void
bn_qwmulw_addw(BN_ULONG a3,BN_ULONG a2,BN_ULONG a1,BN_ULONG a0,BN_ULONG b,BN_ULONG c,BN_ULONG * out_r4,BN_ULONG * out_r3,BN_ULONG * out_r2,BN_ULONG * out_r1,BN_ULONG * out_r0)52173e8eea7Sjsing bn_qwmulw_addw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b,
52273e8eea7Sjsing BN_ULONG c, BN_ULONG *out_r4, BN_ULONG *out_r3, BN_ULONG *out_r2,
52373e8eea7Sjsing BN_ULONG *out_r1, BN_ULONG *out_r0)
52473e8eea7Sjsing {
52573e8eea7Sjsing BN_ULONG r3, r2, r1, r0;
52673e8eea7Sjsing
52773e8eea7Sjsing bn_mulw_addw(a0, b, c, &c, &r0);
52873e8eea7Sjsing bn_mulw_addw(a1, b, c, &c, &r1);
52973e8eea7Sjsing bn_mulw_addw(a2, b, c, &c, &r2);
53073e8eea7Sjsing bn_mulw_addw(a3, b, c, &c, &r3);
53173e8eea7Sjsing
53273e8eea7Sjsing *out_r4 = c;
53373e8eea7Sjsing *out_r3 = r3;
53473e8eea7Sjsing *out_r2 = r2;
53573e8eea7Sjsing *out_r1 = r1;
53673e8eea7Sjsing *out_r0 = r0;
53773e8eea7Sjsing }
53873e8eea7Sjsing #endif
53973e8eea7Sjsing
54073e8eea7Sjsing /*
54173e8eea7Sjsing * bn_qwmulw_addqw_addw() computes
54273e8eea7Sjsing * (r4:r3:r2:r1:r0) = (a3:a2:a1:a0) * b + (c3:c2:c1:c0) + d, where a
54373e8eea7Sjsing * is a quad word, b is a single word, c is a quad word, and d is a single word,
54473e8eea7Sjsing * producing a five word result.
54573e8eea7Sjsing */
54673e8eea7Sjsing #ifndef HAVE_BN_QWMULW_ADDQW_ADDW
54773e8eea7Sjsing static inline void
bn_qwmulw_addqw_addw(BN_ULONG a3,BN_ULONG a2,BN_ULONG a1,BN_ULONG a0,BN_ULONG b,BN_ULONG c3,BN_ULONG c2,BN_ULONG c1,BN_ULONG c0,BN_ULONG d,BN_ULONG * out_r4,BN_ULONG * out_r3,BN_ULONG * out_r2,BN_ULONG * out_r1,BN_ULONG * out_r0)54873e8eea7Sjsing bn_qwmulw_addqw_addw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0,
54973e8eea7Sjsing BN_ULONG b, BN_ULONG c3, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, BN_ULONG d,
55073e8eea7Sjsing BN_ULONG *out_r4, BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1,
55173e8eea7Sjsing BN_ULONG *out_r0)
55273e8eea7Sjsing {
55373e8eea7Sjsing BN_ULONG r3, r2, r1, r0;
55473e8eea7Sjsing
55573e8eea7Sjsing bn_mulw_addw_addw(a0, b, c0, d, &d, &r0);
55673e8eea7Sjsing bn_mulw_addw_addw(a1, b, c1, d, &d, &r1);
55773e8eea7Sjsing bn_mulw_addw_addw(a2, b, c2, d, &d, &r2);
55873e8eea7Sjsing bn_mulw_addw_addw(a3, b, c3, d, &d, &r3);
55973e8eea7Sjsing
56073e8eea7Sjsing *out_r4 = d;
56173e8eea7Sjsing *out_r3 = r3;
56273e8eea7Sjsing *out_r2 = r2;
56373e8eea7Sjsing *out_r1 = r1;
56473e8eea7Sjsing *out_r0 = r0;
56573e8eea7Sjsing }
56673e8eea7Sjsing #endif
56773e8eea7Sjsing
568b97356a2Sjsing #endif
569