/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy; he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs very
 *    much like 64-bit code compiled with no-asm on the same machine.
 */

#define BN_ULONG unsigned long

/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value of zero;
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)
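
/*
 * For reference only (not in the original module): a portable sketch
 * of what mul_add() computes, assuming GCC's unsigned __int128 type is
 * available, as it is wherever the inline assembler in this file
 * compiles.  Kept under #if 0 so it never enters the build.
 */
#if 0
#define mul_add_ref(r,a,word,carry) do {			\
	unsigned __int128 t = (unsigned __int128)(a)*(word)	\
				+ (r) + (carry);		\
	(r) = (BN_ULONG)t;		/* low 64 bits  */	\
	(carry) = (BN_ULONG)(t>>64);	/* high 64 bits */	\
	} while (0)
#endif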

#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)
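
/*
 * mul() stores the low 64 bits of a*word+carry in r and leaves the
 * high 64 bits in carry; unlike mul_add() it does not add in the old
 * contents of r.
 */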

#define sqr(r0,r1,a)			\
	asm ("mulq %2"			\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");
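
/* sqr() computes the full 128-bit square of a: r0 receives the low word, r1 the high. */

/*
 * bn_mul_add_words() below computes rp[i] += ap[i]*w for i in [0,num)
 * and returns the final carry word; the main loop is unrolled four ways.
 */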

BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul_add(rp[0],ap[0],w,c1);
		mul_add(rp[1],ap[1],w,c1);
		mul_add(rp[2],ap[2],w,c1);
		mul_add(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
		mul_add(rp[2],ap[2],w,c1); return c1;
		}

	return(c1);
	}
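
/*
 * bn_mul_words() below writes rp[i] = low word of ap[i]*w plus the
 * propagated carry (it does not add in rp[i]'s old contents, unlike
 * bn_mul_add_words() above) and returns the final carry.
 */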

BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul(rp[0],ap[0],w,c1);
		mul(rp[1],ap[1],w,c1);
		mul(rp[2],ap[2],w,c1);
		mul(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
		mul(rp[2],ap[2],w,c1);
		}
	return(c1);
	}
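
/*
 * bn_sqr_words() below stores each 128-bit square a[i]^2 into the word
 * pair r[2*i] (low) and r[2*i+1] (high); the squares are independent,
 * so no carries propagate between them.
 */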

void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
	{
	if (n <= 0) return;

	while (n&~3)
		{
		sqr(r[0],r[1],a[0]);
		sqr(r[2],r[3],a[1]);
		sqr(r[4],r[5],a[2]);
		sqr(r[6],r[7],a[3]);
		a+=4; r+=8; n-=4;
		}
	if (n)
		{
		sqr(r[0],r[1],a[0]); if (--n == 0) return;
		sqr(r[2],r[3],a[1]); if (--n == 0) return;
		sqr(r[4],r[5],a[2]);
		}
	}
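
/*
 * bn_div_words() computes the 64-bit quotient of the 128-bit value
 * (h<<64)|l divided by d, using divq.  Note that divq raises a divide
 * fault (#DE) if the quotient does not fit in 64 bits, so callers must
 * ensure h < d.
 */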

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{	BN_ULONG ret,waste;

	asm ("divq	%4"
		: "=a"(ret),"=d"(waste)
		: "a"(l),"d"(h),"g"(d)
		: "cc");

	return ret;
}

BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
{ BN_ULONG ret=0,i=0;

	if (n <= 0) return 0;

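	/*
	 * The loop below keeps the running carry in CF across iterations:
	 * "subq %2,%2" zeroes the index register and clears CF; "loop"
	 * decrements %rcx without touching the flags, so CF survives from
	 * one adcq to the next; the final "sbbq %0,%0" turns the carry
	 * into 0 or ~0, which "ret&1" reduces to 0 or 1.  bn_sub_words()
	 * below uses the same structure with sbbq for the borrow.
	 */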
	asm (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	adcq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "=&a"(ret),"+c"(n),"=&r"(i)
		: "r"(rp),"r"(ap),"r"(bp)
		: "cc"
	);

	return ret&1;
}

#ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
{ BN_ULONG ret=0,i=0;

	if (n <= 0) return 0;

	asm (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	sbbq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "=&a"(ret),"+c"(n),"=&r"(i)
		: "r"(rp),"r"(ap),"r"(bp)
		: "cc"
	);

	return ret&1;
}
#else
/* Simics 1.4<7 has a buggy sbbq :-( */
#define BN_MASK2 0xffffffffffffffffL
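
/*
 * Portable fallback.  The borrow update relies on two facts: when
 * t1 == t2, t1-t2-c underflows exactly when c was already set, so the
 * old borrow is kept; otherwise the new borrow is simply (t1 < t2).
 */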
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
	{
	BN_ULONG t1,t2;
	int c=0;

	if (n <= 0) return((BN_ULONG)0);

	for (;;)
		{
		t1=a[0]; t2=b[0];
		r[0]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[1]; t2=b[1];
		r[1]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[2]; t2=b[2];
		r[2]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[3]; t2=b[3];
		r[3]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		a+=4;
		b+=4;
		r+=4;
		}
	return(c);
	}
#endif

/* mul_add_c(a,b,c0,c1,c2)   -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)  -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)   -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
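
/*
 * Note that both the reference and the asm variants of these macros
 * expect BN_ULONG t1,t2 to be declared in the enclosing scope; the
 * comba routines below do exactly that.
 */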

#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
#else
#define mul_add_c(a,b,c0,c1,c2)	do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

#define sqr_add_c(a,i,c0,c1,c2)	do {	\
	asm ("mulq %2"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

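/*
 * The comba routines below compute a full product one output column at
 * a time: for column k every partial product a[i]*b[j] with i+j == k
 * is accumulated into a three-word accumulator, the low word is stored
 * to r[k], and rotating the roles of c1, c2 and c3 (zeroing the one
 * just stored) shifts the accumulator down for the next column.
 */
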
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}

void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}

void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
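
/*
 * Minimal self-test sketch, not part of the original module: compile
 * this file with -DBN_GCC_SELFTEST to cross-check bn_mul_words() and
 * bn_add_words() against plain unsigned __int128 arithmetic.  The
 * guard macro name BN_GCC_SELFTEST and the test vectors are ours,
 * chosen for illustration only.
 */
#ifdef BN_GCC_SELFTEST
#include <stdio.h>

int main(void)
	{
	BN_ULONG a[4]={~0UL,~0UL,1UL,0x1234567890abcdefUL};
	BN_ULONG b[4]={~0UL,1UL,~0UL,0xfedcba0987654321UL};
	BN_ULONG r[4],w=0xdeadbeefcafebabeUL,c;
	int i;

	/* rp = ap*w: compare word-by-word against a 128-bit reference */
	c=bn_mul_words(r,a,4,w);
	{
	unsigned __int128 t; BN_ULONG carry=0;
	for (i=0;i<4;i++)
		{
		t=(unsigned __int128)a[i]*w+carry;
		if (r[i]!=(BN_ULONG)t) { printf("mul mismatch at %d\n",i); return 1; }
		carry=(BN_ULONG)(t>>64);
		}
	if (c!=carry) { printf("mul carry mismatch\n"); return 1; }
	}

	/* rp = ap+bp: compare against word-wise add with carry */
	c=bn_add_words(r,a,b,4);
	{
	BN_ULONG carry=0;
	for (i=0;i<4;i++)
		{
		unsigned __int128 t=(unsigned __int128)a[i]+b[i]+carry;
		if (r[i]!=(BN_ULONG)t) { printf("add mismatch at %d\n",i); return 1; }
		carry=(BN_ULONG)(t>>64);
		}
	if (c!=carry) { printf("add carry mismatch\n"); return 1; }
	}

	printf("selftest passed\n");
	return 0;
	}
#endif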
594