/* Origin: netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/mips/mips64.S (revision 1b3d6f93806f8821fe459e13ad13e605b37c6d43) */
#include "mips_arch.h"

/*
 * Division helper macros.
 *
 * On MIPS R6 builds the ddivu()/divu() macro expands to nothing and
 * mfqt()/mfrm() expand to the three-operand divide/modulo instructions,
 * which write the quotient/remainder straight into rd.
 *
 * On every other build ddivu() issues the divide with $0 as destination
 * and mfqt()/mfrm() fetch the quotient/remainder with mflo/mfhi.
 */
#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt)	divu	rd,rs,rt
# define mfrm(rd,rs,rt)	modu	rd,rs,rt
#else
# define ddivu(rs,rt)	ddivu	$0,rs,rt
# define mfqt(rd,rs,rt)	mflo	rd
# define mfrm(rd,rs,rt)	mfhi	rd
#endif

/* Version and authorship strings, placed in the read-only data section. */
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/* Code follows; $1 is used explicitly as a scratch register throughout. */
.text
.set	noat

/*
 * bn_mul_add_words - public entry point.
 *
 * In:   $4 = pointer to the words that are read, accumulated into and
 *            written back
 *       $5 = pointer to the source words
 *       $6 = word count
 *       $7 = multiplier word
 * Out:  $2 = carry word (also copied to $4)
 *
 * A count that is not greater than zero returns 0 at once; otherwise
 * control goes to bn_mul_add_words_internal with $2 already cleared.
 */
.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$6,bn_mul_add_words_internal
	move	$2,$0			# delay slot: carry = 0 on both paths
	jr	$31
	move	$4,$2			# delay slot: mirror the result into $4
.end	bn_mul_add_words

/*
 * bn_mul_add_words_internal - body of bn_mul_add_words.
 *
 * Entered with $2 = 0 (running carry) and $6 > 0.  For every word:
 *
 *	sum    = word at ($4) + carry		carry  = (sum overflowed)
 *	sum   += low  half of (word at ($5)) * $7
 *	carry += high half of the product + (second addition overflowed)
 *	word at ($4) = sum
 *
 * The main loop handles four words per iteration while ($6 & -4) is
 * positive; the tail handles the remaining one to three words.
 * Scratch registers: $1, $3, $8-$15.  Returns the carry in $2 and $4.
 */
.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
	.set	reorder
	li	$3,-4
	and	$8,$6,$3
	beqz	$8,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	ld	$12,0($5)
	dmultu	($12,$7)
	ld	$13,0($4)
	ld	$14,8($5)
	ld	$15,8($4)
	ld	$8,2*8($5)
	ld	$9,2*8($4)
	daddu	$13,$2
	sltu	$2,$13,$2	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$13,$1
	daddu	$2,$12
	 dmultu	($14,$7)
	sltu	$1,$13,$1
	sd	$13,0($4)
	daddu	$2,$1

	ld	$10,3*8($5)
	ld	$11,3*8($4)
	daddu	$15,$2
	sltu	$2,$15,$2
	mflo	($1,$14,$7)
	mfhi	($14,$14,$7)
	daddu	$15,$1
	daddu	$2,$14
	 dmultu	($8,$7)
	sltu	$1,$15,$1
	sd	$15,8($4)
	daddu	$2,$1

	/* Pointers advance here, so the last two stores use negative offsets. */
	subu	$6,4
	daddu $4,4*8
	daddu $5,4*8
	daddu	$9,$2
	sltu	$2,$9,$2
	mflo	($1,$8,$7)
	mfhi	($8,$8,$7)
	daddu	$9,$1
	daddu	$2,$8
	 dmultu	($10,$7)
	sltu	$1,$9,$1
	sd	$9,-2*8($4)
	daddu	$2,$1


	and	$8,$6,$3
	daddu	$11,$2
	sltu	$2,$11,$2
	mflo	($1,$10,$7)
	mfhi	($10,$10,$7)
	daddu	$11,$1
	daddu	$2,$10
	sltu	$1,$11,$1
	sd	$11,-8($4)
	.set	noreorder
	bgtz	$8,.L_bn_mul_add_words_loop
	daddu	$2,$1

	beqz	$6,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	ld	$12,0($5)
	dmultu	($12,$7)
	ld	$13,0($4)
	subu	$6,1
	daddu	$13,$2
	sltu	$2,$13,$2
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$13,$1
	daddu	$2,$12
	sltu	$1,$13,$1
	sd	$13,0($4)
	daddu	$2,$1
	beqz	$6,.L_bn_mul_add_words_return

	ld	$12,8($5)
	dmultu	($12,$7)
	ld	$13,8($4)
	subu	$6,1
	daddu	$13,$2
	sltu	$2,$13,$2
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$13,$1
	daddu	$2,$12
	sltu	$1,$13,$1
	sd	$13,8($4)
	daddu	$2,$1
	beqz	$6,.L_bn_mul_add_words_return

	ld	$12,2*8($5)
	dmultu	($12,$7)
	ld	$13,2*8($4)
	daddu	$13,$2
	sltu	$2,$13,$2
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$13,$1
	daddu	$2,$12
	sltu	$1,$13,$1
	sd	$13,2*8($4)
	daddu	$2,$1

.L_bn_mul_add_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2
.end	bn_mul_add_words_internal

/*
 * bn_mul_words - public entry point.
 *
 * In:   $4 = pointer to the destination words
 *       $5 = pointer to the source words
 *       $6 = word count
 *       $7 = multiplier word
 * Out:  $2 = carry word (also copied to $4)
 *
 * A count that is not greater than zero returns 0 at once; otherwise
 * control goes to bn_mul_words_internal with $2 already cleared.
 */
.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$6,bn_mul_words_internal
	move	$2,$0			# delay slot: carry = 0 on both paths
	jr	$31
	move	$4,$2			# delay slot: mirror the result into $4
.end	bn_mul_words

/*
 * bn_mul_words_internal - body of bn_mul_words.
 *
 * Entered with $2 = 0 (running carry) and $6 > 0.  For every word:
 *
 *	word at ($4) = carry + low half of (word at ($5)) * $7
 *	carry        = high half of the product + (the addition overflowed)
 *
 * The main loop handles four words per iteration while ($6 & -4) is
 * positive; the tail handles the remaining one to three words.
 * Scratch registers: $1, $3, $8-$15.  Returns the carry in $2 and $4.
 */
.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
	.set	reorder
	li	$3,-4
	and	$8,$6,$3
	beqz	$8,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	ld	$12,0($5)
	dmultu	($12,$7)
	ld	$14,8($5)
	ld	$8,2*8($5)
	ld	$10,3*8($5)
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$2,$1
	sltu	$13,$2,$1
	 dmultu	($14,$7)
	sd	$2,0($4)
	daddu	$2,$13,$12

	/* Pointers advance here, so the later stores use negative offsets. */
	subu	$6,4
	daddu $4,4*8
	daddu $5,4*8
	mflo	($1,$14,$7)
	mfhi	($14,$14,$7)
	daddu	$2,$1
	sltu	$15,$2,$1
	 dmultu	($8,$7)
	sd	$2,-3*8($4)
	daddu	$2,$15,$14

	mflo	($1,$8,$7)
	mfhi	($8,$8,$7)
	daddu	$2,$1
	sltu	$9,$2,$1
	 dmultu	($10,$7)
	sd	$2,-2*8($4)
	daddu	$2,$9,$8

	and	$8,$6,$3
	mflo	($1,$10,$7)
	mfhi	($10,$10,$7)
	daddu	$2,$1
	sltu	$11,$2,$1
	sd	$2,-8($4)
	.set	noreorder
	bgtz	$8,.L_bn_mul_words_loop
	daddu	$2,$11,$10

	beqz	$6,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	ld	$12,0($5)
	dmultu	($12,$7)
	subu	$6,1
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$2,$1
	sltu	$13,$2,$1
	sd	$2,0($4)
	daddu	$2,$13,$12
	beqz	$6,.L_bn_mul_words_return

	ld	$12,8($5)
	dmultu	($12,$7)
	subu	$6,1
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$2,$1
	sltu	$13,$2,$1
	sd	$2,8($4)
	daddu	$2,$13,$12
	beqz	$6,.L_bn_mul_words_return

	ld	$12,2*8($5)
	dmultu	($12,$7)
	mflo	($1,$12,$7)
	mfhi	($12,$12,$7)
	daddu	$2,$1
	sltu	$13,$2,$1
	sd	$2,2*8($4)
	daddu	$2,$13,$12

.L_bn_mul_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2
.end	bn_mul_words_internal

/*
 * bn_sqr_words - public entry point.
 *
 * In:   $4 = pointer to the destination (two words are written for
 *            every source word)
 *       $5 = pointer to the source words
 *       $6 = word count
 * Out:  $2 = 0 (cleared here and never written by the body; also
 *            copied to $4)
 *
 * A count that is not greater than zero returns at once; otherwise
 * control goes to bn_sqr_words_internal.
 */
.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$6,bn_sqr_words_internal
	move	$2,$0			# delay slot: executed on both paths
	jr	$31
	move	$4,$2			# delay slot: mirror the result into $4
.end	bn_sqr_words

/*
 * bn_sqr_words_internal - body of bn_sqr_words.
 *
 * For every source word at ($5) the 128-bit square is computed and
 * stored as two consecutive words at ($4): low half first, then the
 * high half.  The main loop squares four words per iteration (source
 * advances by 4 words, destination by 8); the tail handles the
 * remaining one to three words.
 * Scratch registers: $3, $8-$15.  $2 is not modified.
 */
.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
	.set	reorder
	li	$3,-4
	and	$8,$6,$3
	beqz	$8,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	ld	$12,0($5)
	dmultu	($12,$12)
	ld	$14,8($5)
	ld	$8,2*8($5)
	ld	$10,3*8($5)
	mflo	($13,$12,$12)
	mfhi	($12,$12,$12)
	sd	$13,0($4)
	sd	$12,8($4)

	/* Pointers advance here, so the later stores use negative offsets. */
	dmultu	($14,$14)
	subu	$6,4
	daddu $4,8*8
	daddu $5,4*8
	mflo	($15,$14,$14)
	mfhi	($14,$14,$14)
	sd	$15,-6*8($4)
	sd	$14,-5*8($4)

	dmultu	($8,$8)
	mflo	($9,$8,$8)
	mfhi	($8,$8,$8)
	sd	$9,-4*8($4)
	sd	$8,-3*8($4)


	dmultu	($10,$10)
	and	$8,$6,$3
	mflo	($11,$10,$10)
	mfhi	($10,$10,$10)
	sd	$11,-2*8($4)

	.set	noreorder
	bgtz	$8,.L_bn_sqr_words_loop
	sd	$10,-8($4)

	beqz	$6,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	ld	$12,0($5)
	dmultu	($12,$12)
	subu	$6,1
	mflo	($13,$12,$12)
	mfhi	($12,$12,$12)
	sd	$13,0($4)
	sd	$12,8($4)
	beqz	$6,.L_bn_sqr_words_return

	ld	$12,8($5)
	dmultu	($12,$12)
	subu	$6,1
	mflo	($13,$12,$12)
	mfhi	($12,$12,$12)
	sd	$13,2*8($4)
	sd	$12,3*8($4)
	beqz	$6,.L_bn_sqr_words_return

	ld	$12,2*8($5)
	dmultu	($12,$12)
	mflo	($13,$12,$12)
	mfhi	($12,$12,$12)
	sd	$13,4*8($4)
	sd	$12,5*8($4)

.L_bn_sqr_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2

.end	bn_sqr_words_internal

/*
 * bn_add_words - public entry point.
 *
 * In:   $4 = pointer to the destination words
 *       $5 = pointer to the first operand
 *       $6 = pointer to the second operand
 *       $7 = word count
 * Out:  $2 = carry out of the last word (also copied to $4)
 *
 * A count that is not greater than zero returns 0 at once; otherwise
 * control goes to bn_add_words_internal with $2 already cleared.
 */
.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$7,bn_add_words_internal
	move	$2,$0			# delay slot: carry = 0 on both paths
	jr	$31
	move	$4,$2			# delay slot: mirror the result into $4
.end	bn_add_words

/*
 * bn_add_words_internal - body of bn_add_words.
 *
 * Entered with $2 = 0 (running carry) and $7 > 0.  For every word:
 *
 *	t            = word at ($6) + word at ($5)	c1 = (t overflowed)
 *	word at ($4) = t + carry			c2 = (that overflowed)
 *	carry        = c1 + c2
 *
 * The main loop handles four words per iteration while ($7 & -4) is
 * positive; the tail handles the remaining one to three words.
 * Scratch registers: $1, $3, $8-$15, $24, $25.
 * Returns the carry in $2 and $4.
 */
.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
	.set	reorder
	li	$3,-4
	and	$1,$7,$3
	beqz	$1,.L_bn_add_words_tail

.L_bn_add_words_loop:
	/* Loads are interleaved with the pointer/count updates. */
	ld	$12,0($5)
	ld	$8,0($6)
	subu	$7,4
	ld	$13,8($5)
	and	$1,$7,$3
	ld	$14,2*8($5)
	daddu $6,4*8
	ld	$15,3*8($5)
	daddu $4,4*8
	ld	$9,-3*8($6)
	daddu $5,4*8
	ld	$10,-2*8($6)
	ld	$11,-8($6)
	daddu	$8,$12
	sltu	$24,$8,$12
	daddu	$12,$8,$2
	sltu	$2,$12,$8
	sd	$12,-4*8($4)
	daddu	$2,$24

	daddu	$9,$13
	sltu	$25,$9,$13
	daddu	$13,$9,$2
	sltu	$2,$13,$9
	sd	$13,-3*8($4)
	daddu	$2,$25

	daddu	$10,$14
	sltu	$24,$10,$14
	daddu	$14,$10,$2
	sltu	$2,$14,$10
	sd	$14,-2*8($4)
	daddu	$2,$24

	daddu	$11,$15
	sltu	$25,$11,$15
	daddu	$15,$11,$2
	sltu	$2,$15,$11
	sd	$15,-8($4)

	.set	noreorder
	bgtz	$1,.L_bn_add_words_loop
	daddu	$2,$25

	beqz	$7,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	ld	$12,0($5)
	ld	$8,0($6)
	daddu	$8,$12
	subu	$7,1
	sltu	$24,$8,$12
	daddu	$12,$8,$2
	sltu	$2,$12,$8
	sd	$12,0($4)
	daddu	$2,$24
	beqz	$7,.L_bn_add_words_return

	ld	$13,8($5)
	ld	$9,8($6)
	daddu	$9,$13
	subu	$7,1
	sltu	$25,$9,$13
	daddu	$13,$9,$2
	sltu	$2,$13,$9
	sd	$13,8($4)
	daddu	$2,$25
	beqz	$7,.L_bn_add_words_return

	ld	$14,2*8($5)
	ld	$10,2*8($6)
	daddu	$10,$14
	sltu	$24,$10,$14
	daddu	$14,$10,$2
	sltu	$2,$14,$10
	sd	$14,2*8($4)
	daddu	$2,$24

.L_bn_add_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2

.end	bn_add_words_internal

/*
 * bn_sub_words - public entry point.
 *
 * In:   $4 = pointer to the destination words
 *       $5 = pointer to the minuend words
 *       $6 = pointer to the subtrahend words
 *       $7 = word count
 * Out:  $2 = borrow out of the last word (also placed in $4)
 *
 * A count that is not greater than zero returns 0 at once; otherwise
 * control goes to bn_sub_words_internal with $2 already cleared.
 */
.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$7,bn_sub_words_internal
	move	$2,$0			# delay slot: borrow = 0 on both paths
	jr	$31
	move	$4,$0			# delay slot: $4 = 0, same value as $2
.end	bn_sub_words

/*
 * bn_sub_words_internal - body of bn_sub_words.
 *
 * Entered with $2 = 0 (running borrow) and $7 > 0.  For every word:
 *
 *	b1           = (word at ($5) < word at ($6))
 *	t            = word at ($5) - word at ($6)
 *	word at ($4) = t - borrow			b2 = (result > t)
 *	borrow       = b1 + b2
 *
 * The main loop handles four words per iteration while ($7 & -4) is
 * positive; the tail handles the remaining one to three words.
 * Scratch registers: $1, $3, $8-$15, $24, $25.
 * Returns the borrow in $2 and $4.
 */
.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
	.set	reorder
	li	$3,-4
	and	$1,$7,$3
	beqz	$1,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	/* Loads are interleaved with the pointer/count updates. */
	ld	$12,0($5)
	ld	$8,0($6)
	subu	$7,4
	ld	$13,8($5)
	and	$1,$7,$3
	ld	$14,2*8($5)
	daddu $6,4*8
	ld	$15,3*8($5)
	daddu $4,4*8
	ld	$9,-3*8($6)
	daddu $5,4*8
	ld	$10,-2*8($6)
	ld	$11,-8($6)
	sltu	$24,$12,$8
	dsubu	$8,$12,$8
	dsubu	$12,$8,$2
	sgtu	$2,$12,$8
	sd	$12,-4*8($4)
	daddu	$2,$24

	sltu	$25,$13,$9
	dsubu	$9,$13,$9
	dsubu	$13,$9,$2
	sgtu	$2,$13,$9
	sd	$13,-3*8($4)
	daddu	$2,$25


	sltu	$24,$14,$10
	dsubu	$10,$14,$10
	dsubu	$14,$10,$2
	sgtu	$2,$14,$10
	sd	$14,-2*8($4)
	daddu	$2,$24

	sltu	$25,$15,$11
	dsubu	$11,$15,$11
	dsubu	$15,$11,$2
	sgtu	$2,$15,$11
	sd	$15,-8($4)

	.set	noreorder
	bgtz	$1,.L_bn_sub_words_loop
	daddu	$2,$25

	beqz	$7,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	ld	$12,0($5)
	ld	$8,0($6)
	subu	$7,1
	sltu	$24,$12,$8
	dsubu	$8,$12,$8
	dsubu	$12,$8,$2
	sgtu	$2,$12,$8
	sd	$12,0($4)
	daddu	$2,$24
	beqz	$7,.L_bn_sub_words_return

	ld	$13,8($5)
	subu	$7,1
	ld	$9,8($6)
	sltu	$25,$13,$9
	dsubu	$9,$13,$9
	dsubu	$13,$9,$2
	sgtu	$2,$13,$9
	sd	$13,8($4)
	daddu	$2,$25
	beqz	$7,.L_bn_sub_words_return

	ld	$14,2*8($5)
	ld	$10,2*8($6)
	sltu	$24,$14,$10
	dsubu	$10,$14,$10
	dsubu	$14,$10,$2
	sgtu	$2,$14,$10
	sd	$14,2*8($4)
	daddu	$2,$24

.L_bn_sub_words_return:
	.set	noreorder
	jr	$31
	move	$4,$2
.end	bn_sub_words_internal

#if 0
/*
 * The bn_div_3_words entry point is re-used for constant-time interface.
 * Implementation is retained as historical reference.
 *
 * Everything up to the matching #endif is excluded from the build.
 */
.align 5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$7,$4		# we know that bn_div_words does not
				# touch $7, $10, $11 and preserves $6
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	ld	$4,($7)
	move	$10,$5
	bne	$4,$6,bn_div_3_words_internal
	ld	$5,-8($7)
	li	$2,-1
	jr	$31
	move	$4,$2
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
	.set	reorder
	move	$11,$31
	bal	bn_div_words_internal
	move	$31,$11
	dmultu	($10,$2)
	ld	$14,-2*8($7)
	move	$8,$0
	mfhi	($13,$10,$2)
	mflo	($12,$10,$2)
	sltu	$24,$13,$5
.L_bn_div_3_words_inner_loop:
	bnez	$24,.L_bn_div_3_words_inner_loop_done
	sgeu	$1,$14,$12
	seq	$25,$13,$5
	and	$1,$25
	sltu	$15,$12,$10
	daddu	$5,$6
	dsubu	$13,$15
	dsubu	$12,$10
	sltu	$24,$13,$5
	sltu	$8,$5,$6
	or	$24,$8
	.set	noreorder
	beqz	$1,.L_bn_div_3_words_inner_loop
	dsubu	$2,1
	daddu	$2,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
	jr	$31
	move	$4,$2
.end	bn_div_3_words_internal
#endif

/*
 * bn_div_words - public entry point.
 *
 * In:   $4 = high word of the dividend
 *       $5 = low word of the dividend
 *       $6 = divisor
 * Out:  $2 = quotient word (also copied to $4)
 *
 * A zero divisor returns -1 (all ones); any other divisor is handled by
 * bn_div_words_internal, which is entered with $2 = -1 from the delay
 * slot and with .set noreorder still in effect.
 */
.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$6,bn_div_words_internal
	li	$2,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$31
	move	$4,$2
.end	bn_div_words

/*
 * bn_div_words_internal - body of bn_div_words.
 *
 * In:   $4:$5 = dividend (high:low), $6 = non-zero divisor
 * Out:  $2 = quotient (also in $4), $3 = remainder (also in $5);
 *       $6 is shifted back to its original value before returning
 *
 * Steps:
 *   1. Normalise: shift $6 left until its top bit is set, counting the
 *      shifts in $25, and shift $4:$5 left by the same amount.  If any
 *      dividend bit would be shifted out of $4, "break 6" is executed.
 *   2. If $4 >= $6, subtract $6 from $4 once.
 *   3. Produce the quotient as two 32-bit digits.  Each digit is
 *      estimated by dividing the partial remainder by the upper half of
 *      the divisor ($3), or taken as 0xffffffff when the upper halves
 *      are equal, and is then decremented while divisor * digit exceeds
 *      the current dividend window.
 *
 * The first instructions rely on .set noreorder being in effect from
 * the code assembled immediately before this function.
 * Scratch registers: $1, $8, $9, $12-$15, $24, $25.
 */
.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
	move	$3,$0
	bltz	$6,.L_bn_div_words_body
	move	$25,$3			# delay slot: shift count = 0
	dsll	$6,1
	bgtz	$6,.-4			# back to the dsll until the top bit is set
	addu	$25,1			# delay slot: count every shift

	.set	reorder
	negu	$13,$25
	li	$14,-1
	dsll	$14,$13
	and	$14,$4
	dsrl	$1,$5,$13
	.set	noreorder
	beqz	$14,.+12		# skip the nop and the break
	nop
	break	6		# signal overflow
	.set	reorder
	dsll	$4,$25
	dsll	$5,$25
	or	$4,$1
.L_bn_div_words_body:
	dsrl	$3,$6,4*8	# bits
	sgeu	$1,$4,$6
	.set	noreorder
	beqz	$1,.+12			# skip the nop and the subtraction
	nop
	dsubu	$4,$6
	.set	reorder

	/* First (upper) quotient digit. */
	li	$8,-1
	dsrl	$9,$4,4*8	# bits
	dsrl	$8,4*8	# q=0xffffffff
	beq	$3,$9,.L_bn_div_words_skip_div1
	ddivu	($4,$3)
	mfqt	($8,$4,$3)
.L_bn_div_words_skip_div1:
	dmultu	($6,$8)
	dsll	$15,$4,4*8	# bits
	dsrl	$1,$5,4*8	# bits
	or	$15,$1
	mflo	($12,$6,$8)
	mfhi	($13,$6,$8)
.L_bn_div_words_inner_loop1:
	/* Continue while $13:$12 (divisor * q) is greater than $9:$15. */
	sltu	$14,$15,$12
	seq	$24,$9,$13
	sltu	$1,$9,$13
	and	$14,$24
	sltu	$2,$12,$6
	or	$1,$14
	.set	noreorder
	beqz	$1,.L_bn_div_words_inner_loop1_done
	dsubu	$13,$2
	dsubu	$12,$6
	b	.L_bn_div_words_inner_loop1
	dsubu	$8,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	dsll	$5,4*8	# bits
	dsubu	$4,$15,$12
	dsll	$2,$8,4*8	# bits

	/* Second (lower) quotient digit. */
	li	$8,-1
	dsrl	$9,$4,4*8	# bits
	dsrl	$8,4*8	# q=0xffffffff
	beq	$3,$9,.L_bn_div_words_skip_div2
	ddivu	($4,$3)
	mfqt	($8,$4,$3)
.L_bn_div_words_skip_div2:
	dmultu	($6,$8)
	dsll	$15,$4,4*8	# bits
	dsrl	$1,$5,4*8	# bits
	or	$15,$1
	mflo	($12,$6,$8)
	mfhi	($13,$6,$8)
.L_bn_div_words_inner_loop2:
	sltu	$14,$15,$12
	seq	$24,$9,$13
	sltu	$1,$9,$13
	and	$14,$24
	sltu	$3,$12,$6
	or	$1,$14
	.set	noreorder
	beqz	$1,.L_bn_div_words_inner_loop2_done
	dsubu	$13,$3
	dsubu	$12,$6
	b	.L_bn_div_words_inner_loop2
	dsubu	$8,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	dsubu	$4,$15,$12
	or	$2,$8
	dsrl	$3,$4,$25	# $3 contains remainder if anybody wants it
	dsrl	$6,$25		# restore $6

	.set	noreorder
	move	$5,$3
	jr	$31
	move	$4,$2
.end	bn_div_words_internal

/*
 * bn_mul_comba8 - 8-word by 8-word multiplication, 16-word result.
 *
 * In:   $4 = pointer to r[0..15] (result)
 *       $5 = pointer to a[0..7]
 *       $6 = pointer to b[0..7]
 *
 * The product is built one result column at a time.  $2, $3 and $7
 * rotate through the roles c1/c2/c3 named in the mul_add_c() comments;
 * $24/$25 receive the low/high halves of each partial product and $1
 * holds the carry of each addition.
 *
 * Operand registers: a[0..3] = $12-$15, a[4] = $16, a[5] = $18,
 * a[6] = $20, a[7] = $5; b[0..3] = $8-$11, b[4] = $17, b[5] = $19,
 * b[6] = $21, b[7] = $6.  Both argument pointers are overwritten once
 * their last element has been loaded.
 *
 * $16-$21 are saved in a 6*8-byte stack frame and restored on exit.
 */
.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
	.frame	$29,6*8,$31
	.mask	0x003f0000,-8
	dsubu $29,6*8
	sd	$21,5*8($29)
	sd	$20,4*8($29)
	sd	$19,3*8($29)
	sd	$18,2*8($29)
	sd	$17,1*8($29)
	sd	$16,0*8($29)

	.set	reorder
	ld	$12,0($5)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				#		<appro@fy.chalmers.se>

	ld	$8,0($6)
	ld	$13,8($5)
	ld	$14,2*8($5)
	dmultu	($12,$8)		# mul_add_c(a[0],b[0],c1,c2,c3);
	ld	$15,3*8($5)
	ld	$9,8($6)
	ld	$10,2*8($6)
	ld	$11,3*8($6)
	mflo	($2,$12,$8)
	mfhi	($3,$12,$8)

	ld	$16,4*8($5)
	ld	$18,5*8($5)
	dmultu	($12,$9)		# mul_add_c(a[0],b[1],c2,c3,c1);
	ld	$20,6*8($5)
	ld	$5,7*8($5)
	ld	$17,4*8($6)
	ld	$19,5*8($6)
	mflo	($24,$12,$9)
	mfhi	($25,$12,$9)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($13,$8)		# mul_add_c(a[1],b[0],c2,c3,c1);
	daddu	$7,$25,$1
	ld	$21,6*8($6)
	ld	$6,7*8($6)
	sd	$2,0($4)	# r[0]=c1;
	mflo	($24,$13,$8)
	mfhi	($25,$13,$8)
	daddu	$3,$24
	sltu	$1,$3,$24
	 dmultu	($14,$8)		# mul_add_c(a[2],b[0],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	sd	$3,8($4)	# r[1]=c2;

	mflo	($24,$14,$8)
	mfhi	($25,$14,$8)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($13,$9)		# mul_add_c(a[1],b[1],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	mflo	($24,$13,$9)
	mfhi	($25,$13,$9)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($12,$10)		# mul_add_c(a[0],b[2],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$3,$2,$25
	mflo	($24,$12,$10)
	mfhi	($25,$12,$10)
	daddu	$7,$24
	sltu	$1,$7,$24
	 dmultu	($12,$11)		# mul_add_c(a[0],b[3],c1,c2,c3);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,2*8($4)	# r[2]=c3;

	mflo	($24,$12,$11)
	mfhi	($25,$12,$11)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($13,$10)		# mul_add_c(a[1],b[2],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$7,$3,$25
	mflo	($24,$13,$10)
	mfhi	($25,$13,$10)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($14,$9)		# mul_add_c(a[2],b[1],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$14,$9)
	mfhi	($25,$14,$9)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($15,$8)		# mul_add_c(a[3],b[0],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$15,$8)
	mfhi	($25,$15,$8)
	daddu	$2,$24
	sltu	$1,$2,$24
	 dmultu	($16,$8)		# mul_add_c(a[4],b[0],c2,c3,c1);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,3*8($4)	# r[3]=c1;

	mflo	($24,$16,$8)
	mfhi	($25,$16,$8)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($15,$9)		# mul_add_c(a[3],b[1],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	mflo	($24,$15,$9)
	mfhi	($25,$15,$9)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($14,$10)		# mul_add_c(a[2],b[2],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$14,$10)
	mfhi	($25,$14,$10)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($13,$11)		# mul_add_c(a[1],b[3],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$13,$11)
	mfhi	($25,$13,$11)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($12,$17)		# mul_add_c(a[0],b[4],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$12,$17)
	mfhi	($25,$12,$17)
	daddu	$3,$24
	sltu	$1,$3,$24
	 dmultu	($12,$19)		# mul_add_c(a[0],b[5],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,4*8($4)	# r[4]=c2;

	mflo	($24,$12,$19)
	mfhi	($25,$12,$19)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($13,$17)		# mul_add_c(a[1],b[4],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$3,$2,$25
	mflo	($24,$13,$17)
	mfhi	($25,$13,$17)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($14,$11)		# mul_add_c(a[2],b[3],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$14,$11)
	mfhi	($25,$14,$11)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($15,$10)		# mul_add_c(a[3],b[2],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$15,$10)
	mfhi	($25,$15,$10)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($16,$9)		# mul_add_c(a[4],b[1],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$16,$9)
	mfhi	($25,$16,$9)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($18,$8)		# mul_add_c(a[5],b[0],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$18,$8)
	mfhi	($25,$18,$8)
	daddu	$7,$24
	sltu	$1,$7,$24
	 dmultu	($20,$8)		# mul_add_c(a[6],b[0],c1,c2,c3);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,5*8($4)	# r[5]=c3;

	mflo	($24,$20,$8)
	mfhi	($25,$20,$8)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($18,$9)		# mul_add_c(a[5],b[1],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$7,$3,$25
	mflo	($24,$18,$9)
	mfhi	($25,$18,$9)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($16,$10)		# mul_add_c(a[4],b[2],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$16,$10)
	mfhi	($25,$16,$10)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($15,$11)		# mul_add_c(a[3],b[3],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$15,$11)
	mfhi	($25,$15,$11)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($14,$17)		# mul_add_c(a[2],b[4],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$14,$17)
	mfhi	($25,$14,$17)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($13,$19)		# mul_add_c(a[1],b[5],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$13,$19)
	mfhi	($25,$13,$19)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($12,$21)		# mul_add_c(a[0],b[6],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$12,$21)
	mfhi	($25,$12,$21)
	daddu	$2,$24
	sltu	$1,$2,$24
	 dmultu	($12,$6)		# mul_add_c(a[0],b[7],c2,c3,c1);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,6*8($4)	# r[6]=c1;

	mflo	($24,$12,$6)
	mfhi	($25,$12,$6)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($13,$21)		# mul_add_c(a[1],b[6],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	mflo	($24,$13,$21)
	mfhi	($25,$13,$21)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($14,$19)		# mul_add_c(a[2],b[5],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$14,$19)
	mfhi	($25,$14,$19)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($15,$17)		# mul_add_c(a[3],b[4],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$15,$17)
	mfhi	($25,$15,$17)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($16,$11)		# mul_add_c(a[4],b[3],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$16,$11)
	mfhi	($25,$16,$11)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($18,$10)		# mul_add_c(a[5],b[2],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$18,$10)
	mfhi	($25,$18,$10)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($20,$9)		# mul_add_c(a[6],b[1],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$20,$9)
	mfhi	($25,$20,$9)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($5,$8)		# mul_add_c(a[7],b[0],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$5,$8)
	mfhi	($25,$5,$8)
	daddu	$3,$24
	sltu	$1,$3,$24
	 dmultu	($5,$9)		# mul_add_c(a[7],b[1],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,7*8($4)	# r[7]=c2;

	mflo	($24,$5,$9)
	mfhi	($25,$5,$9)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($20,$10)		# mul_add_c(a[6],b[2],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$3,$2,$25
	mflo	($24,$20,$10)
	mfhi	($25,$20,$10)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($18,$11)		# mul_add_c(a[5],b[3],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$18,$11)
	mfhi	($25,$18,$11)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($16,$17)		# mul_add_c(a[4],b[4],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$16,$17)
	mfhi	($25,$16,$17)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($15,$19)		# mul_add_c(a[3],b[5],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$15,$19)
	mfhi	($25,$15,$19)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($14,$21)		# mul_add_c(a[2],b[6],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$14,$21)
	mfhi	($25,$14,$21)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($13,$6)		# mul_add_c(a[1],b[7],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$13,$6)
	mfhi	($25,$13,$6)
	daddu	$7,$24
	sltu	$1,$7,$24
	 dmultu	($14,$6)		# mul_add_c(a[2],b[7],c1,c2,c3);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,8*8($4)	# r[8]=c3;

	mflo	($24,$14,$6)
	mfhi	($25,$14,$6)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($15,$21)		# mul_add_c(a[3],b[6],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$7,$3,$25
	mflo	($24,$15,$21)
	mfhi	($25,$15,$21)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($16,$19)		# mul_add_c(a[4],b[5],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$16,$19)
	mfhi	($25,$16,$19)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($18,$17)		# mul_add_c(a[5],b[4],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$18,$17)
	mfhi	($25,$18,$17)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($20,$11)		# mul_add_c(a[6],b[3],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$20,$11)
	mfhi	($25,$20,$11)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($5,$10)		# mul_add_c(a[7],b[2],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$5,$10)
	mfhi	($25,$5,$10)
	daddu	$2,$24
	sltu	$1,$2,$24
	 dmultu	($5,$11)		# mul_add_c(a[7],b[3],c2,c3,c1);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,9*8($4)	# r[9]=c1;

	mflo	($24,$5,$11)
	mfhi	($25,$5,$11)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($20,$17)		# mul_add_c(a[6],b[4],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	mflo	($24,$20,$17)
	mfhi	($25,$20,$17)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($18,$19)		# mul_add_c(a[5],b[5],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$18,$19)
	mfhi	($25,$18,$19)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($16,$21)		# mul_add_c(a[4],b[6],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$16,$21)
	mfhi	($25,$16,$21)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($15,$6)		# mul_add_c(a[3],b[7],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$15,$6)
	mfhi	($25,$15,$6)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($16,$6)		# mul_add_c(a[4],b[7],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,10*8($4)	# r[10]=c2;

	mflo	($24,$16,$6)
	mfhi	($25,$16,$6)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($18,$21)		# mul_add_c(a[5],b[6],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$3,$2,$25
	mflo	($24,$18,$21)
	mfhi	($25,$18,$21)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($20,$19)		# mul_add_c(a[6],b[5],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$20,$19)
	mfhi	($25,$20,$19)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($5,$17)		# mul_add_c(a[7],b[4],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	mflo	($24,$5,$17)
	mfhi	($25,$5,$17)
	daddu	$7,$24
	sltu	$1,$7,$24
	 dmultu	($5,$19)		# mul_add_c(a[7],b[5],c1,c2,c3);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,11*8($4)	# r[11]=c3;

	mflo	($24,$5,$19)
	mfhi	($25,$5,$19)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($20,$21)		# mul_add_c(a[6],b[6],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$7,$3,$25
	mflo	($24,$20,$21)
	mfhi	($25,$20,$21)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($18,$6)		# mul_add_c(a[5],b[7],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$18,$6)
	mfhi	($25,$18,$6)
	daddu	$2,$24
	sltu	$1,$2,$24
	 dmultu	($20,$6)		# mul_add_c(a[6],b[7],c2,c3,c1);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,12*8($4)	# r[12]=c1;

	mflo	($24,$20,$6)
	mfhi	($25,$20,$6)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($5,$21)		# mul_add_c(a[7],b[6],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	mflo	($24,$5,$21)
	mfhi	($25,$5,$21)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($5,$6)		# mul_add_c(a[7],b[7],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,13*8($4)	# r[13]=c2;

	mflo	($24,$5,$6)
	mfhi	($25,$5,$6)
	daddu	$7,$24
	sltu	$1,$7,$24
	daddu	$25,$1
	daddu	$2,$25
	sd	$7,14*8($4)	# r[14]=c3;
	sd	$2,15*8($4)	# r[15]=c1;

	.set	noreorder
	ld	$21,5*8($29)
	ld	$20,4*8($29)
	ld	$19,3*8($29)
	ld	$18,2*8($29)
	ld	$17,1*8($29)
	ld	$16,0*8($29)
	jr	$31
	daddu $29,6*8
.end	bn_mul_comba8

/*
 * bn_mul_comba4 - 4-word by 4-word multiplication, 8-word result.
 *
 * In:   $4 = pointer to the 8 result words
 *       $5 = pointer to a[0..3]
 *       $6 = pointer to b[0..3]
 *
 * The product is built one result column at a time.  $2, $3 and $7
 * rotate through the roles c1/c2/c3 named in the mul_add_c() comments;
 * $24/$25 receive the low/high halves of each partial product and $1
 * holds the carry of each addition.
 *
 * Operand registers: a[0..3] = $12-$15, b[0..3] = $8-$11.
 * No stack frame is used.
 */
.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
	.set	reorder
	ld	$12,0($5)
	ld	$8,0($6)
	ld	$13,8($5)
	ld	$14,2*8($5)
	dmultu	($12,$8)		# mul_add_c(a[0],b[0],c1,c2,c3);
	ld	$15,3*8($5)
	ld	$9,8($6)
	ld	$10,2*8($6)
	ld	$11,3*8($6)
	mflo	($2,$12,$8)
	mfhi	($3,$12,$8)
	sd	$2,0($4)

	dmultu	($12,$9)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($24,$12,$9)
	mfhi	($25,$12,$9)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($13,$8)		# mul_add_c(a[1],b[0],c2,c3,c1);
	daddu	$7,$25,$1
	mflo	($24,$13,$8)
	mfhi	($25,$13,$8)
	daddu	$3,$24
	sltu	$1,$3,$24
	 dmultu	($14,$8)		# mul_add_c(a[2],b[0],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	sd	$3,8($4)

	mflo	($24,$14,$8)
	mfhi	($25,$14,$8)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($13,$9)		# mul_add_c(a[1],b[1],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	mflo	($24,$13,$9)
	mfhi	($25,$13,$9)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($12,$10)		# mul_add_c(a[0],b[2],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$3,$2,$25
	mflo	($24,$12,$10)
	mfhi	($25,$12,$10)
	daddu	$7,$24
	sltu	$1,$7,$24
	 dmultu	($12,$11)		# mul_add_c(a[0],b[3],c1,c2,c3);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,2*8($4)

	mflo	($24,$12,$11)
	mfhi	($25,$12,$11)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($13,$10)		# mul_add_c(a[1],b[2],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$7,$3,$25
	mflo	($24,$13,$10)
	mfhi	($25,$13,$10)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($14,$9)		# mul_add_c(a[2],b[1],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$14,$9)
	mfhi	($25,$14,$9)
	daddu	$2,$24
	sltu	$1,$2,$24
	dmultu	($15,$8)		# mul_add_c(a[3],b[0],c1,c2,c3);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	mflo	($24,$15,$8)
	mfhi	($25,$15,$8)
	daddu	$2,$24
	sltu	$1,$2,$24
	 dmultu	($15,$9)		# mul_add_c(a[3],b[1],c2,c3,c1);
	daddu	$25,$1
	daddu	$3,$25
	sltu	$1,$3,$25
	daddu	$7,$1
	sd	$2,3*8($4)

	mflo	($24,$15,$9)
	mfhi	($25,$15,$9)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($14,$10)		# mul_add_c(a[2],b[2],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$2,$7,$25
	mflo	($24,$14,$10)
	mfhi	($25,$14,$10)
	daddu	$3,$24
	sltu	$1,$3,$24
	dmultu	($13,$11)		# mul_add_c(a[1],b[3],c2,c3,c1);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	mflo	($24,$13,$11)
	mfhi	($25,$13,$11)
	daddu	$3,$24
	sltu	$1,$3,$24
	 dmultu	($14,$11)		# mul_add_c(a[2],b[3],c3,c1,c2);
	daddu	$25,$1
	daddu	$7,$25
	sltu	$1,$7,$25
	daddu	$2,$1
	sd	$3,4*8($4)

	mflo	($24,$14,$11)
	mfhi	($25,$14,$11)
	daddu	$7,$24
	sltu	$1,$7,$24
	dmultu	($15,$10)		# mul_add_c(a[3],b[2],c3,c1,c2);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$3,$2,$25
	mflo	($24,$15,$10)
	mfhi	($25,$15,$10)
	daddu	$7,$24
	sltu	$1,$7,$24
	 dmultu	($15,$11)		# mul_add_c(a[3],b[3],c1,c2,c3);
	daddu	$25,$1
	daddu	$2,$25
	sltu	$1,$2,$25
	daddu	$3,$1
	sd	$7,5*8($4)

	mflo	($24,$15,$11)
	mfhi	($25,$15,$11)
	daddu	$2,$24
	sltu	$1,$2,$24
	daddu	$25,$1
	daddu	$3,$25
	sd	$2,6*8($4)
	sd	$3,7*8($4)

	.set	noreorder
	jr	$31
	nop
.end	bn_mul_comba4

# bn_sqr_comba8
#
# Squares the eight 64-bit words read from 0($5)..7*8($5) and stores the
# sixteen-word result to 0($4)..15*8($4). Result word k is produced by
# "column k", which collects every product a[i]*a[j] with i+j == k.
#
# Register use, as visible in the body:
#   $4        result pointer, only used as a store base
#   $5        input pointer, only used as a load base
#   $12-$15   a[0]..a[3]
#   $8-$11    a[4]..a[7]
#   $2,$3,$7  three-word column accumulator; the roles rotate by one
#             register after each result word is stored
#   $24,$25   low and high halves of the most recently fetched product
#   $1        carry scratch (the file runs under .set noat)
#   $6        scratch, used once while doubling a[0]*a[1]
#   $31       return address
#
# Accumulation patterns:
#   - a[i]*a[i] terms are added once: low half into the first
#     accumulator word, carry folded into the high half, high half into
#     the second word, carry into the third.
#   - a[i]*a[j] terms with i != j are added twice, which supplies the
#     factor of two. The first such term of a column sets the third
#     accumulator word with sltu; later terms add into it.
#   - a[0]*a[1] alone is doubled by a one-bit left shift instead.
#
# Lines tagged "forward multiplication" start the multiply whose result
# is fetched by the next mflo and mfhi pair; presumably this overlaps
# the multiply with the carry arithmetic of the current term.
#
# NOTE(review): b[] in the existing per-line comments appears to denote
# the same input array as a[]; only $5 is ever loaded from.
# NOTE(review): the multiply and result-fetch mnemonics are written with
# parenthesised operand lists, which suggests they are preprocessor
# macros like the ddivu wrappers at the top of this file - confirm in
# mips_arch.h.
1541.align	5
1542.globl	bn_sqr_comba8
1543.ent	bn_sqr_comba8
1544bn_sqr_comba8:
1545	.set	reorder
	# Load a[0]..a[3]; a[4]..a[7] are loaded while a[0]*a[0] is in flight.
1546	ld	$12,0($5)
1547	ld	$13,8($5)
1548	ld	$14,2*8($5)
1549	ld	$15,3*8($5)
1550
	# column 0: a[0]*a[0]; low half is r[0], high half seeds $3.
1551	dmultu	($12,$12)		# mul_add_c(a[0],b[0],c1,c2,c3);
1552	ld	$8,4*8($5)
1553	ld	$9,5*8($5)
1554	ld	$10,6*8($5)
1555	ld	$11,7*8($5)
1556	mflo	($2,$12,$12)
1557	mfhi	($3,$12,$12)
1558	sd	$2,0($4)
1559
	# column 1: 2*a[0]*a[1], doubled by shifting the 128-bit product left
	# by one bit. The slt against $0 captures the top bit of each half
	# before the shift: the top bit of the high half becomes $2, the top
	# bit of the low half is carried into the high half through $6.
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[1].
1560	dmultu	($12,$13)		# mul_add_c2(a[0],b[1],c2,c3,c1);
1561	mflo	($24,$12,$13)
1562	mfhi	($25,$12,$13)
1563	slt	$2,$25,$0
1564	dsll	$25,1
1565	 dmultu	($14,$12)		# mul_add_c2(a[2],b[0],c3,c1,c2);
1566	slt	$6,$24,$0
1567	daddu	$25,$6
1568	dsll	$24,1
1569	daddu	$3,$24
1570	sltu	$1,$3,$24
1571	daddu	$7,$25,$1
1572	sd	$3,8($4)
1573	sltu	$1,$7,$25
1574	daddu	$2,$1
	# column 2: 2*a[2]*a[0] + a[1]*a[1].
	# Accumulator words low to high: $7,$2,$3; $7 is stored as r[2].
1575	mflo	($24,$14,$12)
1576	mfhi	($25,$14,$12)
1577	daddu	$7,$24
1578	sltu	$1,$7,$24
1579	 dmultu	($13,$13)		# forward multiplication
1580	daddu	$7,$24
1581	daddu	$1,$25
1582	sltu	$24,$7,$24
1583	daddu	$2,$1
1584	daddu	$25,$24
1585	sltu	$3,$2,$1
1586	daddu	$2,$25
1587	sltu	$25,$2,$25
1588	daddu	$3,$25
1589	mflo	($24,$13,$13)
1590	mfhi	($25,$13,$13)
1591	daddu	$7,$24
1592	sltu	$1,$7,$24
1593	 dmultu	($12,$15)		# mul_add_c2(a[0],b[3],c1,c2,c3);
1594	daddu	$25,$1
1595	daddu	$2,$25
1596	sltu	$1,$2,$25
1597	daddu	$3,$1
1598	sd	$7,2*8($4)
	# column 3: 2*a[0]*a[3] + 2*a[1]*a[2].
	# Accumulator words low to high: $2,$3,$7; $2 is stored as r[3].
1599	mflo	($24,$12,$15)
1600	mfhi	($25,$12,$15)
1601	daddu	$2,$24
1602	sltu	$1,$2,$24
1603	 dmultu	($13,$14)		# forward multiplication
1604	daddu	$2,$24
1605	daddu	$1,$25
1606	sltu	$24,$2,$24
1607	daddu	$3,$1
1608	daddu	$25,$24
1609	sltu	$7,$3,$1
1610	daddu	$3,$25
1611	sltu	$25,$3,$25
1612	daddu	$7,$25
1613	mflo	($24,$13,$14)
1614	mfhi	($25,$13,$14)
1615	daddu	$2,$24
1616	sltu	$1,$2,$24
1617	 dmultu	($8,$12)		# forward multiplication
1618	daddu	$2,$24
1619	daddu	$1,$25
1620	sltu	$24,$2,$24
1621	daddu	$3,$1
1622	daddu	$25,$24
1623	sltu	$1,$3,$1
1624	daddu	$3,$25
1625	daddu	$7,$1
1626	sltu	$25,$3,$25
1627	daddu	$7,$25
	# column 4: 2*a[4]*a[0] + 2*a[3]*a[1] + a[2]*a[2].
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[4].
1628	mflo	($24,$8,$12)
1629	mfhi	($25,$8,$12)
1630	sd	$2,3*8($4)
1631	daddu	$3,$24
1632	sltu	$1,$3,$24
1633	 dmultu	($15,$13)		# forward multiplication
1634	daddu	$3,$24
1635	daddu	$1,$25
1636	sltu	$24,$3,$24
1637	daddu	$7,$1
1638	daddu	$25,$24
1639	sltu	$2,$7,$1
1640	daddu	$7,$25
1641	sltu	$25,$7,$25
1642	daddu	$2,$25
1643	mflo	($24,$15,$13)
1644	mfhi	($25,$15,$13)
1645	daddu	$3,$24
1646	sltu	$1,$3,$24
1647	 dmultu	($14,$14)		# forward multiplication
1648	daddu	$3,$24
1649	daddu	$1,$25
1650	sltu	$24,$3,$24
1651	daddu	$7,$1
1652	daddu	$25,$24
1653	sltu	$1,$7,$1
1654	daddu	$7,$25
1655	daddu	$2,$1
1656	sltu	$25,$7,$25
1657	daddu	$2,$25
1658	mflo	($24,$14,$14)
1659	mfhi	($25,$14,$14)
1660	daddu	$3,$24
1661	sltu	$1,$3,$24
1662	 dmultu	($12,$9)		# mul_add_c2(a[0],b[5],c3,c1,c2);
1663	daddu	$25,$1
1664	daddu	$7,$25
1665	sltu	$1,$7,$25
1666	daddu	$2,$1
1667	sd	$3,4*8($4)
	# column 5: 2*a[0]*a[5] + 2*a[1]*a[4] + 2*a[2]*a[3].
	# Accumulator words low to high: $7,$2,$3; $7 is stored as r[5].
1668	mflo	($24,$12,$9)
1669	mfhi	($25,$12,$9)
1670	daddu	$7,$24
1671	sltu	$1,$7,$24
1672	 dmultu	($13,$8)		# forward multiplication
1673	daddu	$7,$24
1674	daddu	$1,$25
1675	sltu	$24,$7,$24
1676	daddu	$2,$1
1677	daddu	$25,$24
1678	sltu	$3,$2,$1
1679	daddu	$2,$25
1680	sltu	$25,$2,$25
1681	daddu	$3,$25
1682	mflo	($24,$13,$8)
1683	mfhi	($25,$13,$8)
1684	daddu	$7,$24
1685	sltu	$1,$7,$24
1686	 dmultu	($14,$15)		# forward multiplication
1687	daddu	$7,$24
1688	daddu	$1,$25
1689	sltu	$24,$7,$24
1690	daddu	$2,$1
1691	daddu	$25,$24
1692	sltu	$1,$2,$1
1693	daddu	$2,$25
1694	daddu	$3,$1
1695	sltu	$25,$2,$25
1696	daddu	$3,$25
1697	mflo	($24,$14,$15)
1698	mfhi	($25,$14,$15)
1699	daddu	$7,$24
1700	sltu	$1,$7,$24
1701	 dmultu	($10,$12)		# forward multiplication
1702	daddu	$7,$24
1703	daddu	$1,$25
1704	sltu	$24,$7,$24
1705	daddu	$2,$1
1706	daddu	$25,$24
1707	sltu	$1,$2,$1
1708	daddu	$2,$25
1709	daddu	$3,$1
1710	sltu	$25,$2,$25
1711	daddu	$3,$25
	# column 6: 2*a[6]*a[0] + 2*a[5]*a[1] + 2*a[4]*a[2] + a[3]*a[3].
	# Accumulator words low to high: $2,$3,$7; $2 is stored as r[6].
1712	mflo	($24,$10,$12)
1713	mfhi	($25,$10,$12)
1714	sd	$7,5*8($4)
1715	daddu	$2,$24
1716	sltu	$1,$2,$24
1717	 dmultu	($9,$13)		# forward multiplication
1718	daddu	$2,$24
1719	daddu	$1,$25
1720	sltu	$24,$2,$24
1721	daddu	$3,$1
1722	daddu	$25,$24
1723	sltu	$7,$3,$1
1724	daddu	$3,$25
1725	sltu	$25,$3,$25
1726	daddu	$7,$25
1727	mflo	($24,$9,$13)
1728	mfhi	($25,$9,$13)
1729	daddu	$2,$24
1730	sltu	$1,$2,$24
1731	 dmultu	($8,$14)		# forward multiplication
1732	daddu	$2,$24
1733	daddu	$1,$25
1734	sltu	$24,$2,$24
1735	daddu	$3,$1
1736	daddu	$25,$24
1737	sltu	$1,$3,$1
1738	daddu	$3,$25
1739	daddu	$7,$1
1740	sltu	$25,$3,$25
1741	daddu	$7,$25
1742	mflo	($24,$8,$14)
1743	mfhi	($25,$8,$14)
1744	daddu	$2,$24
1745	sltu	$1,$2,$24
1746	 dmultu	($15,$15)		# forward multiplication
1747	daddu	$2,$24
1748	daddu	$1,$25
1749	sltu	$24,$2,$24
1750	daddu	$3,$1
1751	daddu	$25,$24
1752	sltu	$1,$3,$1
1753	daddu	$3,$25
1754	daddu	$7,$1
1755	sltu	$25,$3,$25
1756	daddu	$7,$25
1757	mflo	($24,$15,$15)
1758	mfhi	($25,$15,$15)
1759	daddu	$2,$24
1760	sltu	$1,$2,$24
1761	 dmultu	($12,$11)		# mul_add_c2(a[0],b[7],c2,c3,c1);
1762	daddu	$25,$1
1763	daddu	$3,$25
1764	sltu	$1,$3,$25
1765	daddu	$7,$1
1766	sd	$2,6*8($4)
	# column 7: 2*a[0]*a[7] + 2*a[1]*a[6] + 2*a[2]*a[5] + 2*a[3]*a[4].
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[7].
1767	mflo	($24,$12,$11)
1768	mfhi	($25,$12,$11)
1769	daddu	$3,$24
1770	sltu	$1,$3,$24
1771	 dmultu	($13,$10)		# forward multiplication
1772	daddu	$3,$24
1773	daddu	$1,$25
1774	sltu	$24,$3,$24
1775	daddu	$7,$1
1776	daddu	$25,$24
1777	sltu	$2,$7,$1
1778	daddu	$7,$25
1779	sltu	$25,$7,$25
1780	daddu	$2,$25
1781	mflo	($24,$13,$10)
1782	mfhi	($25,$13,$10)
1783	daddu	$3,$24
1784	sltu	$1,$3,$24
1785	 dmultu	($14,$9)		# forward multiplication
1786	daddu	$3,$24
1787	daddu	$1,$25
1788	sltu	$24,$3,$24
1789	daddu	$7,$1
1790	daddu	$25,$24
1791	sltu	$1,$7,$1
1792	daddu	$7,$25
1793	daddu	$2,$1
1794	sltu	$25,$7,$25
1795	daddu	$2,$25
1796	mflo	($24,$14,$9)
1797	mfhi	($25,$14,$9)
1798	daddu	$3,$24
1799	sltu	$1,$3,$24
1800	 dmultu	($15,$8)		# forward multiplication
1801	daddu	$3,$24
1802	daddu	$1,$25
1803	sltu	$24,$3,$24
1804	daddu	$7,$1
1805	daddu	$25,$24
1806	sltu	$1,$7,$1
1807	daddu	$7,$25
1808	daddu	$2,$1
1809	sltu	$25,$7,$25
1810	daddu	$2,$25
1811	mflo	($24,$15,$8)
1812	mfhi	($25,$15,$8)
1813	daddu	$3,$24
1814	sltu	$1,$3,$24
1815	 dmultu	($11,$13)		# forward multiplication
1816	daddu	$3,$24
1817	daddu	$1,$25
1818	sltu	$24,$3,$24
1819	daddu	$7,$1
1820	daddu	$25,$24
1821	sltu	$1,$7,$1
1822	daddu	$7,$25
1823	daddu	$2,$1
1824	sltu	$25,$7,$25
1825	daddu	$2,$25
	# column 8: 2*a[7]*a[1] + 2*a[6]*a[2] + 2*a[5]*a[3] + a[4]*a[4].
	# Accumulator words low to high: $7,$2,$3; $7 is stored as r[8].
1826	mflo	($24,$11,$13)
1827	mfhi	($25,$11,$13)
1828	sd	$3,7*8($4)
1829	daddu	$7,$24
1830	sltu	$1,$7,$24
1831	 dmultu	($10,$14)		# forward multiplication
1832	daddu	$7,$24
1833	daddu	$1,$25
1834	sltu	$24,$7,$24
1835	daddu	$2,$1
1836	daddu	$25,$24
1837	sltu	$3,$2,$1
1838	daddu	$2,$25
1839	sltu	$25,$2,$25
1840	daddu	$3,$25
1841	mflo	($24,$10,$14)
1842	mfhi	($25,$10,$14)
1843	daddu	$7,$24
1844	sltu	$1,$7,$24
1845	 dmultu	($9,$15)		# forward multiplication
1846	daddu	$7,$24
1847	daddu	$1,$25
1848	sltu	$24,$7,$24
1849	daddu	$2,$1
1850	daddu	$25,$24
1851	sltu	$1,$2,$1
1852	daddu	$2,$25
1853	daddu	$3,$1
1854	sltu	$25,$2,$25
1855	daddu	$3,$25
1856	mflo	($24,$9,$15)
1857	mfhi	($25,$9,$15)
1858	daddu	$7,$24
1859	sltu	$1,$7,$24
1860	 dmultu	($8,$8)		# forward multiplication
1861	daddu	$7,$24
1862	daddu	$1,$25
1863	sltu	$24,$7,$24
1864	daddu	$2,$1
1865	daddu	$25,$24
1866	sltu	$1,$2,$1
1867	daddu	$2,$25
1868	daddu	$3,$1
1869	sltu	$25,$2,$25
1870	daddu	$3,$25
1871	mflo	($24,$8,$8)
1872	mfhi	($25,$8,$8)
1873	daddu	$7,$24
1874	sltu	$1,$7,$24
1875	 dmultu	($14,$11)		# mul_add_c2(a[2],b[7],c1,c2,c3);
1876	daddu	$25,$1
1877	daddu	$2,$25
1878	sltu	$1,$2,$25
1879	daddu	$3,$1
1880	sd	$7,8*8($4)
	# column 9: 2*a[2]*a[7] + 2*a[3]*a[6] + 2*a[4]*a[5].
	# Accumulator words low to high: $2,$3,$7; $2 is stored as r[9].
1881	mflo	($24,$14,$11)
1882	mfhi	($25,$14,$11)
1883	daddu	$2,$24
1884	sltu	$1,$2,$24
1885	 dmultu	($15,$10)		# forward multiplication
1886	daddu	$2,$24
1887	daddu	$1,$25
1888	sltu	$24,$2,$24
1889	daddu	$3,$1
1890	daddu	$25,$24
1891	sltu	$7,$3,$1
1892	daddu	$3,$25
1893	sltu	$25,$3,$25
1894	daddu	$7,$25
1895	mflo	($24,$15,$10)
1896	mfhi	($25,$15,$10)
1897	daddu	$2,$24
1898	sltu	$1,$2,$24
1899	 dmultu	($8,$9)		# forward multiplication
1900	daddu	$2,$24
1901	daddu	$1,$25
1902	sltu	$24,$2,$24
1903	daddu	$3,$1
1904	daddu	$25,$24
1905	sltu	$1,$3,$1
1906	daddu	$3,$25
1907	daddu	$7,$1
1908	sltu	$25,$3,$25
1909	daddu	$7,$25
1910	mflo	($24,$8,$9)
1911	mfhi	($25,$8,$9)
1912	daddu	$2,$24
1913	sltu	$1,$2,$24
1914	 dmultu	($11,$15)		# forward multiplication
1915	daddu	$2,$24
1916	daddu	$1,$25
1917	sltu	$24,$2,$24
1918	daddu	$3,$1
1919	daddu	$25,$24
1920	sltu	$1,$3,$1
1921	daddu	$3,$25
1922	daddu	$7,$1
1923	sltu	$25,$3,$25
1924	daddu	$7,$25
	# column 10: 2*a[7]*a[3] + 2*a[6]*a[4] + a[5]*a[5].
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[10].
1925	mflo	($24,$11,$15)
1926	mfhi	($25,$11,$15)
1927	sd	$2,9*8($4)
1928	daddu	$3,$24
1929	sltu	$1,$3,$24
1930	 dmultu	($10,$8)		# forward multiplication
1931	daddu	$3,$24
1932	daddu	$1,$25
1933	sltu	$24,$3,$24
1934	daddu	$7,$1
1935	daddu	$25,$24
1936	sltu	$2,$7,$1
1937	daddu	$7,$25
1938	sltu	$25,$7,$25
1939	daddu	$2,$25
1940	mflo	($24,$10,$8)
1941	mfhi	($25,$10,$8)
1942	daddu	$3,$24
1943	sltu	$1,$3,$24
1944	 dmultu	($9,$9)		# forward multiplication
1945	daddu	$3,$24
1946	daddu	$1,$25
1947	sltu	$24,$3,$24
1948	daddu	$7,$1
1949	daddu	$25,$24
1950	sltu	$1,$7,$1
1951	daddu	$7,$25
1952	daddu	$2,$1
1953	sltu	$25,$7,$25
1954	daddu	$2,$25
1955	mflo	($24,$9,$9)
1956	mfhi	($25,$9,$9)
1957	daddu	$3,$24
1958	sltu	$1,$3,$24
1959	 dmultu	($8,$11)		# mul_add_c2(a[4],b[7],c3,c1,c2);
1960	daddu	$25,$1
1961	daddu	$7,$25
1962	sltu	$1,$7,$25
1963	daddu	$2,$1
1964	sd	$3,10*8($4)
	# column 11: 2*a[4]*a[7] + 2*a[5]*a[6].
	# Accumulator words low to high: $7,$2,$3; $7 is stored as r[11].
1965	mflo	($24,$8,$11)
1966	mfhi	($25,$8,$11)
1967	daddu	$7,$24
1968	sltu	$1,$7,$24
1969	 dmultu	($9,$10)		# forward multiplication
1970	daddu	$7,$24
1971	daddu	$1,$25
1972	sltu	$24,$7,$24
1973	daddu	$2,$1
1974	daddu	$25,$24
1975	sltu	$3,$2,$1
1976	daddu	$2,$25
1977	sltu	$25,$2,$25
1978	daddu	$3,$25
1979	mflo	($24,$9,$10)
1980	mfhi	($25,$9,$10)
1981	daddu	$7,$24
1982	sltu	$1,$7,$24
1983	 dmultu	($11,$9)		# forward multiplication
1984	daddu	$7,$24
1985	daddu	$1,$25
1986	sltu	$24,$7,$24
1987	daddu	$2,$1
1988	daddu	$25,$24
1989	sltu	$1,$2,$1
1990	daddu	$2,$25
1991	daddu	$3,$1
1992	sltu	$25,$2,$25
1993	daddu	$3,$25
	# column 12: 2*a[7]*a[5] + a[6]*a[6].
	# Accumulator words low to high: $2,$3,$7; $2 is stored as r[12].
1994	mflo	($24,$11,$9)
1995	mfhi	($25,$11,$9)
1996	sd	$7,11*8($4)
1997	daddu	$2,$24
1998	sltu	$1,$2,$24
1999	 dmultu	($10,$10)		# forward multiplication
2000	daddu	$2,$24
2001	daddu	$1,$25
2002	sltu	$24,$2,$24
2003	daddu	$3,$1
2004	daddu	$25,$24
2005	sltu	$7,$3,$1
2006	daddu	$3,$25
2007	sltu	$25,$3,$25
2008	daddu	$7,$25
2009	mflo	($24,$10,$10)
2010	mfhi	($25,$10,$10)
2011	daddu	$2,$24
2012	sltu	$1,$2,$24
2013	 dmultu	($10,$11)		# mul_add_c2(a[6],b[7],c2,c3,c1);
2014	daddu	$25,$1
2015	daddu	$3,$25
2016	sltu	$1,$3,$25
2017	daddu	$7,$1
2018	sd	$2,12*8($4)
	# column 13: 2*a[6]*a[7].
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[13].
2019	mflo	($24,$10,$11)
2020	mfhi	($25,$10,$11)
2021	daddu	$3,$24
2022	sltu	$1,$3,$24
2023	 dmultu	($11,$11)		# forward multiplication
2024	daddu	$3,$24
2025	daddu	$1,$25
2026	sltu	$24,$3,$24
2027	daddu	$7,$1
2028	daddu	$25,$24
2029	sltu	$2,$7,$1
2030	daddu	$7,$25
2031	sltu	$25,$7,$25
2032	daddu	$2,$25
	# column 14: a[7]*a[7]. Only two accumulator words are written out:
	# $7 becomes r[14] and $2 becomes r[15]; no further carry is computed.
2033	mflo	($24,$11,$11)
2034	mfhi	($25,$11,$11)
2035	sd	$3,13*8($4)
2036
2037	daddu	$7,$24
2038	sltu	$1,$7,$24
2039	daddu	$25,$1
2040	daddu	$2,$25
2041	sd	$7,14*8($4)
2042	sd	$2,15*8($4)
2043
	# Return to the caller; under noreorder the nop is placed explicitly
	# after the jump.
2044	.set	noreorder
2045	jr	$31
2046	nop
2047.end	bn_sqr_comba8
2048
# bn_sqr_comba4
#
# Squares the four 64-bit words read from 0($5)..3*8($5) and stores the
# eight-word result to 0($4)..7*8($4). Result word k is produced by
# "column k", which collects every product a[i]*a[j] with i+j == k.
#
# Register use, as visible in the body:
#   $4        result pointer, only used as a store base
#   $5        input pointer, only used as a load base
#   $12-$15   a[0]..a[3]
#   $2,$3,$7  three-word column accumulator; the roles rotate by one
#             register after each result word is stored
#   $24,$25   low and high halves of the most recently fetched product
#   $1        carry scratch (the file runs under .set noat)
#   $6        scratch, used once while doubling a[0]*a[1]
#   $31       return address
#
# Accumulation patterns:
#   - a[i]*a[i] terms are added once: low half into the first
#     accumulator word, carry folded into the high half, high half into
#     the second word, carry into the third.
#   - a[i]*a[j] terms with i != j are added twice, which supplies the
#     factor of two. The first such term of a column sets the third
#     accumulator word with sltu; later terms add into it.
#   - a[0]*a[1] alone is doubled by a one-bit left shift instead.
#
# Lines tagged "forward multiplication" start the multiply whose result
# is fetched by the next mflo and mfhi pair; presumably this overlaps
# the multiply with the carry arithmetic of the current term.
#
# NOTE(review): b[] in the existing per-line comments appears to denote
# the same input array as a[]; only $5 is ever loaded from.
2049.align	5
2050.globl	bn_sqr_comba4
2051.ent	bn_sqr_comba4
2052bn_sqr_comba4:
2053	.set	reorder
	# column 0: a[0]*a[0]; low half is r[0], high half seeds $3.
	# a[2] and a[3] are loaded while the multiply is in flight.
2054	ld	$12,0($5)
2055	ld	$13,8($5)
2056	dmultu	($12,$12)		# mul_add_c(a[0],b[0],c1,c2,c3);
2057	ld	$14,2*8($5)
2058	ld	$15,3*8($5)
2059	mflo	($2,$12,$12)
2060	mfhi	($3,$12,$12)
2061	sd	$2,0($4)
2062
	# column 1: 2*a[0]*a[1], doubled by shifting the 128-bit product left
	# by one bit. The slt against $0 captures the top bit of each half
	# before the shift: the top bit of the high half becomes $2, the top
	# bit of the low half is carried into the high half through $6.
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[1].
2063	dmultu	($12,$13)		# mul_add_c2(a[0],b[1],c2,c3,c1);
2064	mflo	($24,$12,$13)
2065	mfhi	($25,$12,$13)
2066	slt	$2,$25,$0
2067	dsll	$25,1
2068	 dmultu	($14,$12)		# mul_add_c2(a[2],b[0],c3,c1,c2);
2069	slt	$6,$24,$0
2070	daddu	$25,$6
2071	dsll	$24,1
2072	daddu	$3,$24
2073	sltu	$1,$3,$24
2074	daddu	$7,$25,$1
2075	sd	$3,8($4)
2076	sltu	$1,$7,$25
2077	daddu	$2,$1
	# column 2: 2*a[2]*a[0] + a[1]*a[1].
	# Accumulator words low to high: $7,$2,$3; $7 is stored as r[2].
2078	mflo	($24,$14,$12)
2079	mfhi	($25,$14,$12)
2080	daddu	$7,$24
2081	sltu	$1,$7,$24
2082	 dmultu	($13,$13)		# forward multiplication
2083	daddu	$7,$24
2084	daddu	$1,$25
2085	sltu	$24,$7,$24
2086	daddu	$2,$1
2087	daddu	$25,$24
2088	sltu	$3,$2,$1
2089	daddu	$2,$25
2090	sltu	$25,$2,$25
2091	daddu	$3,$25
2092	mflo	($24,$13,$13)
2093	mfhi	($25,$13,$13)
2094	daddu	$7,$24
2095	sltu	$1,$7,$24
2096	 dmultu	($12,$15)		# mul_add_c2(a[0],b[3],c1,c2,c3);
2097	daddu	$25,$1
2098	daddu	$2,$25
2099	sltu	$1,$2,$25
2100	daddu	$3,$1
2101	sd	$7,2*8($4)
	# column 3: 2*a[0]*a[3] + 2*a[1]*a[2].
	# Accumulator words low to high: $2,$3,$7; $2 is stored as r[3].
2102	mflo	($24,$12,$15)
2103	mfhi	($25,$12,$15)
2104	daddu	$2,$24
2105	sltu	$1,$2,$24
2106	 dmultu	($13,$14)		# forward multiplication
2107	daddu	$2,$24
2108	daddu	$1,$25
2109	sltu	$24,$2,$24
2110	daddu	$3,$1
2111	daddu	$25,$24
2112	sltu	$7,$3,$1
2113	daddu	$3,$25
2114	sltu	$25,$3,$25
2115	daddu	$7,$25
2116	mflo	($24,$13,$14)
2117	mfhi	($25,$13,$14)
2118	daddu	$2,$24
2119	sltu	$1,$2,$24
2120	 dmultu	($15,$13)		# forward multiplication
2121	daddu	$2,$24
2122	daddu	$1,$25
2123	sltu	$24,$2,$24
2124	daddu	$3,$1
2125	daddu	$25,$24
2126	sltu	$1,$3,$1
2127	daddu	$3,$25
2128	daddu	$7,$1
2129	sltu	$25,$3,$25
2130	daddu	$7,$25
	# column 4: 2*a[3]*a[1] + a[2]*a[2].
	# Accumulator words low to high: $3,$7,$2; $3 is stored as r[4].
2131	mflo	($24,$15,$13)
2132	mfhi	($25,$15,$13)
2133	sd	$2,3*8($4)
2134	daddu	$3,$24
2135	sltu	$1,$3,$24
2136	 dmultu	($14,$14)		# forward multiplication
2137	daddu	$3,$24
2138	daddu	$1,$25
2139	sltu	$24,$3,$24
2140	daddu	$7,$1
2141	daddu	$25,$24
2142	sltu	$2,$7,$1
2143	daddu	$7,$25
2144	sltu	$25,$7,$25
2145	daddu	$2,$25
2146	mflo	($24,$14,$14)
2147	mfhi	($25,$14,$14)
2148	daddu	$3,$24
2149	sltu	$1,$3,$24
2150	 dmultu	($14,$15)		# mul_add_c2(a[2],b[3],c3,c1,c2);
2151	daddu	$25,$1
2152	daddu	$7,$25
2153	sltu	$1,$7,$25
2154	daddu	$2,$1
2155	sd	$3,4*8($4)
	# column 5: 2*a[2]*a[3].
	# Accumulator words low to high: $7,$2,$3; $7 is stored as r[5].
2156	mflo	($24,$14,$15)
2157	mfhi	($25,$14,$15)
2158	daddu	$7,$24
2159	sltu	$1,$7,$24
2160	 dmultu	($15,$15)		# forward multiplication
2161	daddu	$7,$24
2162	daddu	$1,$25
2163	sltu	$24,$7,$24
2164	daddu	$2,$1
2165	daddu	$25,$24
2166	sltu	$3,$2,$1
2167	daddu	$2,$25
2168	sltu	$25,$2,$25
2169	daddu	$3,$25
	# column 6: a[3]*a[3]. Only two accumulator words are written out:
	# $2 becomes r[6] and $3 becomes r[7]; no further carry is computed.
2170	mflo	($24,$15,$15)
2171	mfhi	($25,$15,$15)
2172	sd	$7,5*8($4)
2173
2174	daddu	$2,$24
2175	sltu	$1,$2,$24
2176	daddu	$25,$1
2177	daddu	$3,$25
2178	sd	$2,6*8($4)
2179	sd	$3,7*8($4)
2180
	# Return to the caller; under noreorder the nop is placed explicitly
	# after the jump.
2181	.set	noreorder
2182	jr	$31
2183	nop
2184.end	bn_sqr_comba4
2185