xref: /openbsd-src/lib/libcrypto/bn/asm/mips.pl (revision 71743258a3166741cf8441ad610c50453f062da3)
1ec07fdf1Sdjm#!/usr/bin/env perl
2ec07fdf1Sdjm#
3ec07fdf1Sdjm# ====================================================================
4ec07fdf1Sdjm# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5ec07fdf1Sdjm# project.
6ec07fdf1Sdjm#
7ec07fdf1Sdjm# Rights for redistribution and usage in source and binary forms are
8ec07fdf1Sdjm# granted according to the OpenSSL license. Warranty of any kind is
9ec07fdf1Sdjm# disclaimed.
10ec07fdf1Sdjm# ====================================================================
11ec07fdf1Sdjm
12ec07fdf1Sdjm
13ec07fdf1Sdjm# July 1999
14ec07fdf1Sdjm#
15ec07fdf1Sdjm# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
16ec07fdf1Sdjm#
17ec07fdf1Sdjm# The module is designed to work with either of the "new" MIPS ABI(5),
18*71743258Sjmc# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19ec07fdf1Sdjm# IRIX 5.x not only because it doesn't support new ABIs but also
20ec07fdf1Sdjm# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21ec07fdf1Sdjm# 64-bit instructions (daddu, dmultu, etc.) found below will only
22ec07fdf1Sdjm# cause an illegal instruction exception :-(
23ec07fdf1Sdjm#
24ec07fdf1Sdjm# In addition the code depends on preprocessor flags set up by MIPSpro
25ec07fdf1Sdjm# compiler driver (either as or cc) and therefore (probably?) can't be
26ec07fdf1Sdjm# compiled by the GNU assembler. GNU C driver manages fine though...
27ec07fdf1Sdjm# I mean as long as -mmips-as is specified or is the default option,
28ec07fdf1Sdjm# because then it simply invokes /usr/bin/as which in turn takes
29ec07fdf1Sdjm# perfect care of the preprocessor definitions. Another neat feature
30ec07fdf1Sdjm# offered by the MIPSpro assembler is an optimization pass. This gave
31ec07fdf1Sdjm# me the opportunity to have the code looking more regular as all those
32ec07fdf1Sdjm# architecture dependent instruction rescheduling details were left to
33ec07fdf1Sdjm# the assembler. Cool, huh?
34ec07fdf1Sdjm#
35ec07fdf1Sdjm# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36ec07fdf1Sdjm# goes way over 3 times faster!
37ec07fdf1Sdjm#
38ec07fdf1Sdjm#					<appro@fy.chalmers.se>
39ec07fdf1Sdjm
40ec07fdf1Sdjm# October 2010
41ec07fdf1Sdjm#
42ec07fdf1Sdjm# Adapt the module even for 32-bit ABIs and other OSes. The former was
43ec07fdf1Sdjm# achieved by mechanical replacement of 64-bit arithmetic instructions
44ec07fdf1Sdjm# such as dmultu, daddu, etc. with their 32-bit counterparts and
45ec07fdf1Sdjm# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46ec07fdf1Sdjm# >3x performance improvement naturally does not apply to 32-bit code
47ec07fdf1Sdjm# [because there is no instruction a 32-bit compiler can't use], one
48ec07fdf1Sdjm# has to be content with a 40-85% improvement depending on benchmark and
49ec07fdf1Sdjm# key length, more for longer keys.
50ec07fdf1Sdjm
# Command line: the first argument is the ABI flavour.  The output file is
# the first later argument that looks like "name.ext"; any arguments in
# front of it are skipped.
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Send the generated assembly to the output file.  The three-argument form
# of open keeps the file name from ever being parsed for mode characters,
# and a failed open is reported instead of being ignored.  When no output
# argument was found STDOUT is left untouched.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
54ec07fdf1Sdjm
# Pick the instruction mnemonics and operand sizes for the requested ABI.
# Flavours matching /64|n32/i get the 64-bit ("d"-prefixed) forms with
# 8-byte words and registers; every other flavour gets the 32-bit forms
# with 4-byte sizes, and additionally starts $code with ".set mips2".
{
	my $wide = ($flavour =~ /64|n32/i);

	# BN_ULONG load/store, multiply/divide, add/subtract and shifts.
	($LD,      $ST)      = $wide ? ("ld",     "sd")    : ("lw",    "sw");
	($MULTU,   $DIVU)    = $wide ? ("dmultu", "ddivu") : ("multu", "divu");
	($ADDU,    $SUBU)    = $wide ? ("daddu",  "dsubu") : ("addu",  "subu");
	($SRL,     $SLL)     = $wide ? ("dsrl",   "dsll")  : ("srl",   "sll");

	# Pointer arithmetic and register save/restore.
	($PTR_ADD, $PTR_SUB) = $wide ? ("daddu",  "dsubu") : ("addu",  "subu");
	($REG_S,   $REG_L)   = $wide ? ("sd",     "ld")    : ("sw",    "lw");

	# Size in bytes of one BN_ULONG and of one saved register.
	$BNSZ  = $wide ? 8 : 4;
	$SZREG = $wide ? 8 : 4;

	# Only the 32-bit flavours begin the output with an ISA directive.
	$code = ".set	mips2\n" unless $wide;
}
87ec07fdf1Sdjm
# Symbolic register names, following the N32/64 layout of the original
# module.  Each variable holds the numeric "$<n>" operand that is
# interpolated into the assembly templates.
#
($zero,$at,$v0,$v1)                     = map { sprintf('$%d', $_) } 0 .. 3;
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)       = map { sprintf('$%d', $_) } 4 .. 11;
($t0,$t1,$t2,$t3)                       = map { sprintf('$%d', $_) } 12 .. 15;
($t8,$t9)                               = map { sprintf('$%d', $_) } 24, 25;
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)       = map { sprintf('$%d', $_) } 16 .. 23;
($gp,$sp,$fp,$ra)                       = map { sprintf('$%d', $_) } 28 .. 31;
# $ta0..$ta3 are second names for $a4..$a7.
($ta0,$ta1,$ta2,$ta3)                   = ($a4,$a5,$a6,$a7);
#
# O32 needs no special adaptation.  For NUBI the templates save and
# restore ($v1,$t0..$t3); $gp is pointed at $v1 so that the "save $gp"
# slot in those prologues covers $v1.

if ($flavour =~ /nubi/i) {
	$gp = $v1;
}

# Register loaded with the constant -4 by the generated loops.
$minus4 = $v1;
103ec07fdf1Sdjm
# Emit the module header: two identification strings in .rdata, then the
# .text section with ".set noat" (the templates below use $at explicitly).
# Next comes the public bn_mul_add_words entry point, which branches to
# bn_mul_add_words_internal only when the word count in $a2 is positive
# and otherwise returns with $v0 (and $a0) set to zero.
104ec07fdf1Sdjm$code.=<<___;
105ec07fdf1Sdjm.rdata
106ec07fdf1Sdjm.asciiz	"mips3.s, Version 1.2"
107ec07fdf1Sdjm.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
108ec07fdf1Sdjm
109ec07fdf1Sdjm.text
110ec07fdf1Sdjm.set	noat
111ec07fdf1Sdjm
112ec07fdf1Sdjm.align	5
113ec07fdf1Sdjm.globl	bn_mul_add_words
114ec07fdf1Sdjm.ent	bn_mul_add_words
115ec07fdf1Sdjmbn_mul_add_words:
116ec07fdf1Sdjm	.set	noreorder
117ec07fdf1Sdjm	bgtz	$a2,bn_mul_add_words_internal
118ec07fdf1Sdjm	move	$v0,$zero
119ec07fdf1Sdjm	jr	$ra
120ec07fdf1Sdjm	move	$a0,$v0
121ec07fdf1Sdjm.end	bn_mul_add_words
122ec07fdf1Sdjm
123ec07fdf1Sdjm.align	5
124ec07fdf1Sdjm.ent	bn_mul_add_words_internal
125ec07fdf1Sdjmbn_mul_add_words_internal:
126ec07fdf1Sdjm___
# NUBI flavours only: open a stack frame of six register slots and save
# $ra, $t3..$t0 and $gp ($gp is set to $v1 for NUBI) before the body
# overwrites the temporaries.
127ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
128ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
129ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
130ec07fdf1Sdjm	.set	noreorder
131ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
132ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
133ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
134ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
135ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
136ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
137ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
138ec07fdf1Sdjm___
# Main body.  Each word loaded through $a1 is multiplied by $a3, added to
# the word loaded through $a0 together with the running carry in $v0, and
# stored back through $a0.  The loop handles four words per iteration
# ($minus4 masks the count in $a2 down to a multiple of four); the tail
# then handles up to three remaining words one at a time.  The loop-closing
# branch uses its delay slot for the final carry addition.
139ec07fdf1Sdjm$code.=<<___;
140ec07fdf1Sdjm	.set	reorder
141ec07fdf1Sdjm	li	$minus4,-4
142ec07fdf1Sdjm	and	$ta0,$a2,$minus4
143ec07fdf1Sdjm	beqz	$ta0,.L_bn_mul_add_words_tail
144ec07fdf1Sdjm
145ec07fdf1Sdjm.L_bn_mul_add_words_loop:
1469eac5592Smiod	$LD	$t0,0($a1)
147ec07fdf1Sdjm	$MULTU	$t0,$a3
148ec07fdf1Sdjm	$LD	$t1,0($a0)
149ec07fdf1Sdjm	$LD	$t2,$BNSZ($a1)
150ec07fdf1Sdjm	$LD	$t3,$BNSZ($a0)
151ec07fdf1Sdjm	$LD	$ta0,2*$BNSZ($a1)
152ec07fdf1Sdjm	$LD	$ta1,2*$BNSZ($a0)
153ec07fdf1Sdjm	$ADDU	$t1,$v0
154ec07fdf1Sdjm	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
155ec07fdf1Sdjm				# values", but it seems to work fine
156ec07fdf1Sdjm				# even on 64-bit registers.
157ec07fdf1Sdjm	mflo	$at
158ec07fdf1Sdjm	mfhi	$t0
159ec07fdf1Sdjm	$ADDU	$t1,$at
160ec07fdf1Sdjm	$ADDU	$v0,$t0
161ec07fdf1Sdjm	 $MULTU	$t2,$a3
162ec07fdf1Sdjm	sltu	$at,$t1,$at
163ec07fdf1Sdjm	$ST	$t1,0($a0)
164ec07fdf1Sdjm	$ADDU	$v0,$at
165ec07fdf1Sdjm
166ec07fdf1Sdjm	$LD	$ta2,3*$BNSZ($a1)
167ec07fdf1Sdjm	$LD	$ta3,3*$BNSZ($a0)
168ec07fdf1Sdjm	$ADDU	$t3,$v0
169ec07fdf1Sdjm	sltu	$v0,$t3,$v0
170ec07fdf1Sdjm	mflo	$at
171ec07fdf1Sdjm	mfhi	$t2
172ec07fdf1Sdjm	$ADDU	$t3,$at
173ec07fdf1Sdjm	$ADDU	$v0,$t2
174ec07fdf1Sdjm	 $MULTU	$ta0,$a3
175ec07fdf1Sdjm	sltu	$at,$t3,$at
176ec07fdf1Sdjm	$ST	$t3,$BNSZ($a0)
177ec07fdf1Sdjm	$ADDU	$v0,$at
178ec07fdf1Sdjm
179ec07fdf1Sdjm	subu	$a2,4
180ec07fdf1Sdjm	$PTR_ADD $a0,4*$BNSZ
181ec07fdf1Sdjm	$PTR_ADD $a1,4*$BNSZ
182ec07fdf1Sdjm	$ADDU	$ta1,$v0
183ec07fdf1Sdjm	sltu	$v0,$ta1,$v0
184ec07fdf1Sdjm	mflo	$at
185ec07fdf1Sdjm	mfhi	$ta0
186ec07fdf1Sdjm	$ADDU	$ta1,$at
187ec07fdf1Sdjm	$ADDU	$v0,$ta0
188ec07fdf1Sdjm	 $MULTU	$ta2,$a3
189ec07fdf1Sdjm	sltu	$at,$ta1,$at
190ec07fdf1Sdjm	$ST	$ta1,-2*$BNSZ($a0)
191ec07fdf1Sdjm	$ADDU	$v0,$at
192ec07fdf1Sdjm
193ec07fdf1Sdjm
194ec07fdf1Sdjm	and	$ta0,$a2,$minus4
195ec07fdf1Sdjm	$ADDU	$ta3,$v0
196ec07fdf1Sdjm	sltu	$v0,$ta3,$v0
197ec07fdf1Sdjm	mflo	$at
198ec07fdf1Sdjm	mfhi	$ta2
199ec07fdf1Sdjm	$ADDU	$ta3,$at
200ec07fdf1Sdjm	$ADDU	$v0,$ta2
201ec07fdf1Sdjm	sltu	$at,$ta3,$at
202ec07fdf1Sdjm	$ST	$ta3,-$BNSZ($a0)
203ec07fdf1Sdjm	.set	noreorder
2049eac5592Smiod	bgtz	$ta0,.L_bn_mul_add_words_loop
2059eac5592Smiod	$ADDU	$v0,$at
206ec07fdf1Sdjm
207ec07fdf1Sdjm	beqz	$a2,.L_bn_mul_add_words_return
208ec07fdf1Sdjm	nop
209ec07fdf1Sdjm
210ec07fdf1Sdjm.L_bn_mul_add_words_tail:
211ec07fdf1Sdjm	.set	reorder
212ec07fdf1Sdjm	$LD	$t0,0($a1)
213ec07fdf1Sdjm	$MULTU	$t0,$a3
214ec07fdf1Sdjm	$LD	$t1,0($a0)
215ec07fdf1Sdjm	subu	$a2,1
216ec07fdf1Sdjm	$ADDU	$t1,$v0
217ec07fdf1Sdjm	sltu	$v0,$t1,$v0
218ec07fdf1Sdjm	mflo	$at
219ec07fdf1Sdjm	mfhi	$t0
220ec07fdf1Sdjm	$ADDU	$t1,$at
221ec07fdf1Sdjm	$ADDU	$v0,$t0
222ec07fdf1Sdjm	sltu	$at,$t1,$at
223ec07fdf1Sdjm	$ST	$t1,0($a0)
224ec07fdf1Sdjm	$ADDU	$v0,$at
225ec07fdf1Sdjm	beqz	$a2,.L_bn_mul_add_words_return
226ec07fdf1Sdjm
227ec07fdf1Sdjm	$LD	$t0,$BNSZ($a1)
228ec07fdf1Sdjm	$MULTU	$t0,$a3
229ec07fdf1Sdjm	$LD	$t1,$BNSZ($a0)
230ec07fdf1Sdjm	subu	$a2,1
231ec07fdf1Sdjm	$ADDU	$t1,$v0
232ec07fdf1Sdjm	sltu	$v0,$t1,$v0
233ec07fdf1Sdjm	mflo	$at
234ec07fdf1Sdjm	mfhi	$t0
235ec07fdf1Sdjm	$ADDU	$t1,$at
236ec07fdf1Sdjm	$ADDU	$v0,$t0
237ec07fdf1Sdjm	sltu	$at,$t1,$at
238ec07fdf1Sdjm	$ST	$t1,$BNSZ($a0)
239ec07fdf1Sdjm	$ADDU	$v0,$at
240ec07fdf1Sdjm	beqz	$a2,.L_bn_mul_add_words_return
241ec07fdf1Sdjm
242ec07fdf1Sdjm	$LD	$t0,2*$BNSZ($a1)
243ec07fdf1Sdjm	$MULTU	$t0,$a3
244ec07fdf1Sdjm	$LD	$t1,2*$BNSZ($a0)
245ec07fdf1Sdjm	$ADDU	$t1,$v0
246ec07fdf1Sdjm	sltu	$v0,$t1,$v0
247ec07fdf1Sdjm	mflo	$at
248ec07fdf1Sdjm	mfhi	$t0
249ec07fdf1Sdjm	$ADDU	$t1,$at
250ec07fdf1Sdjm	$ADDU	$v0,$t0
251ec07fdf1Sdjm	sltu	$at,$t1,$at
252ec07fdf1Sdjm	$ST	$t1,2*$BNSZ($a0)
253ec07fdf1Sdjm	$ADDU	$v0,$at
254ec07fdf1Sdjm
255ec07fdf1Sdjm.L_bn_mul_add_words_return:
256ec07fdf1Sdjm	.set	noreorder
257ec07fdf1Sdjm___
# NUBI flavours only: reload the saved temporaries and release the frame.
258ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
259ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
260ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
261ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
262ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
263ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
264ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
265ec07fdf1Sdjm___
# Finish bn_mul_add_words_internal: return with the carry in $v0, which the
# jr delay slot also copies to $a0.  Then emit the public bn_mul_words
# entry point, which branches to bn_mul_words_internal only when the word
# count in $a2 is positive and otherwise returns zero.
266ec07fdf1Sdjm$code.=<<___;
267ec07fdf1Sdjm	jr	$ra
268ec07fdf1Sdjm	move	$a0,$v0
269ec07fdf1Sdjm.end	bn_mul_add_words_internal
270ec07fdf1Sdjm
271ec07fdf1Sdjm.align	5
272ec07fdf1Sdjm.globl	bn_mul_words
273ec07fdf1Sdjm.ent	bn_mul_words
274ec07fdf1Sdjmbn_mul_words:
275ec07fdf1Sdjm	.set	noreorder
276ec07fdf1Sdjm	bgtz	$a2,bn_mul_words_internal
277ec07fdf1Sdjm	move	$v0,$zero
278ec07fdf1Sdjm	jr	$ra
279ec07fdf1Sdjm	move	$a0,$v0
280ec07fdf1Sdjm.end	bn_mul_words
281ec07fdf1Sdjm
282ec07fdf1Sdjm.align	5
283ec07fdf1Sdjm.ent	bn_mul_words_internal
284ec07fdf1Sdjmbn_mul_words_internal:
285ec07fdf1Sdjm___
# NUBI flavours only: save $ra, $t3..$t0 and $gp in a six-slot frame.
286ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
287ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
288ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
289ec07fdf1Sdjm	.set	noreorder
290ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
291ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
292ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
293ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
294ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
295ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
296ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
297ec07fdf1Sdjm___
# Main body.  Each word loaded through $a1 is multiplied by $a3; the low
# half of the product plus the running carry in $v0 is stored through $a0,
# and the high half plus the carry-out of that addition becomes the next
# carry.  Four words are handled per loop iteration, then up to three
# remaining words in the tail.
298ec07fdf1Sdjm$code.=<<___;
299ec07fdf1Sdjm	.set	reorder
300ec07fdf1Sdjm	li	$minus4,-4
301ec07fdf1Sdjm	and	$ta0,$a2,$minus4
302ec07fdf1Sdjm	beqz	$ta0,.L_bn_mul_words_tail
303ec07fdf1Sdjm
304ec07fdf1Sdjm.L_bn_mul_words_loop:
3059eac5592Smiod	$LD	$t0,0($a1)
306ec07fdf1Sdjm	$MULTU	$t0,$a3
307ec07fdf1Sdjm	$LD	$t2,$BNSZ($a1)
308ec07fdf1Sdjm	$LD	$ta0,2*$BNSZ($a1)
309ec07fdf1Sdjm	$LD	$ta2,3*$BNSZ($a1)
310ec07fdf1Sdjm	mflo	$at
311ec07fdf1Sdjm	mfhi	$t0
312ec07fdf1Sdjm	$ADDU	$v0,$at
313ec07fdf1Sdjm	sltu	$t1,$v0,$at
314ec07fdf1Sdjm	 $MULTU	$t2,$a3
315ec07fdf1Sdjm	$ST	$v0,0($a0)
316ec07fdf1Sdjm	$ADDU	$v0,$t1,$t0
317ec07fdf1Sdjm
318ec07fdf1Sdjm	subu	$a2,4
319ec07fdf1Sdjm	$PTR_ADD $a0,4*$BNSZ
320ec07fdf1Sdjm	$PTR_ADD $a1,4*$BNSZ
321ec07fdf1Sdjm	mflo	$at
322ec07fdf1Sdjm	mfhi	$t2
323ec07fdf1Sdjm	$ADDU	$v0,$at
324ec07fdf1Sdjm	sltu	$t3,$v0,$at
325ec07fdf1Sdjm	 $MULTU	$ta0,$a3
326ec07fdf1Sdjm	$ST	$v0,-3*$BNSZ($a0)
327ec07fdf1Sdjm	$ADDU	$v0,$t3,$t2
328ec07fdf1Sdjm
329ec07fdf1Sdjm	mflo	$at
330ec07fdf1Sdjm	mfhi	$ta0
331ec07fdf1Sdjm	$ADDU	$v0,$at
332ec07fdf1Sdjm	sltu	$ta1,$v0,$at
333ec07fdf1Sdjm	 $MULTU	$ta2,$a3
334ec07fdf1Sdjm	$ST	$v0,-2*$BNSZ($a0)
335ec07fdf1Sdjm	$ADDU	$v0,$ta1,$ta0
336ec07fdf1Sdjm
337ec07fdf1Sdjm	and	$ta0,$a2,$minus4
338ec07fdf1Sdjm	mflo	$at
339ec07fdf1Sdjm	mfhi	$ta2
340ec07fdf1Sdjm	$ADDU	$v0,$at
341ec07fdf1Sdjm	sltu	$ta3,$v0,$at
342ec07fdf1Sdjm	$ST	$v0,-$BNSZ($a0)
343ec07fdf1Sdjm	.set	noreorder
3449eac5592Smiod	bgtz	$ta0,.L_bn_mul_words_loop
3459eac5592Smiod	$ADDU	$v0,$ta3,$ta2
346ec07fdf1Sdjm
347ec07fdf1Sdjm	beqz	$a2,.L_bn_mul_words_return
348ec07fdf1Sdjm	nop
349ec07fdf1Sdjm
350ec07fdf1Sdjm.L_bn_mul_words_tail:
351ec07fdf1Sdjm	.set	reorder
352ec07fdf1Sdjm	$LD	$t0,0($a1)
353ec07fdf1Sdjm	$MULTU	$t0,$a3
354ec07fdf1Sdjm	subu	$a2,1
355ec07fdf1Sdjm	mflo	$at
356ec07fdf1Sdjm	mfhi	$t0
357ec07fdf1Sdjm	$ADDU	$v0,$at
358ec07fdf1Sdjm	sltu	$t1,$v0,$at
359ec07fdf1Sdjm	$ST	$v0,0($a0)
360ec07fdf1Sdjm	$ADDU	$v0,$t1,$t0
361ec07fdf1Sdjm	beqz	$a2,.L_bn_mul_words_return
362ec07fdf1Sdjm
363ec07fdf1Sdjm	$LD	$t0,$BNSZ($a1)
364ec07fdf1Sdjm	$MULTU	$t0,$a3
365ec07fdf1Sdjm	subu	$a2,1
366ec07fdf1Sdjm	mflo	$at
367ec07fdf1Sdjm	mfhi	$t0
368ec07fdf1Sdjm	$ADDU	$v0,$at
369ec07fdf1Sdjm	sltu	$t1,$v0,$at
370ec07fdf1Sdjm	$ST	$v0,$BNSZ($a0)
371ec07fdf1Sdjm	$ADDU	$v0,$t1,$t0
372ec07fdf1Sdjm	beqz	$a2,.L_bn_mul_words_return
373ec07fdf1Sdjm
374ec07fdf1Sdjm	$LD	$t0,2*$BNSZ($a1)
375ec07fdf1Sdjm	$MULTU	$t0,$a3
376ec07fdf1Sdjm	mflo	$at
377ec07fdf1Sdjm	mfhi	$t0
378ec07fdf1Sdjm	$ADDU	$v0,$at
379ec07fdf1Sdjm	sltu	$t1,$v0,$at
380ec07fdf1Sdjm	$ST	$v0,2*$BNSZ($a0)
381ec07fdf1Sdjm	$ADDU	$v0,$t1,$t0
382ec07fdf1Sdjm
383ec07fdf1Sdjm.L_bn_mul_words_return:
384ec07fdf1Sdjm	.set	noreorder
385ec07fdf1Sdjm___
# NUBI flavours only: reload the saved temporaries and release the frame.
386ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
387ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
388ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
389ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
390ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
391ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
392ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
393ec07fdf1Sdjm___
# Finish bn_mul_words_internal (return with the carry in $v0, copied to $a0
# in the jr delay slot), then emit the public bn_sqr_words entry point,
# which branches to bn_sqr_words_internal only when the word count in $a2
# is positive.
394ec07fdf1Sdjm$code.=<<___;
395ec07fdf1Sdjm	jr	$ra
396ec07fdf1Sdjm	move	$a0,$v0
397ec07fdf1Sdjm.end	bn_mul_words_internal
398ec07fdf1Sdjm
399ec07fdf1Sdjm.align	5
400ec07fdf1Sdjm.globl	bn_sqr_words
401ec07fdf1Sdjm.ent	bn_sqr_words
402ec07fdf1Sdjmbn_sqr_words:
403ec07fdf1Sdjm	.set	noreorder
404ec07fdf1Sdjm	bgtz	$a2,bn_sqr_words_internal
405ec07fdf1Sdjm	move	$v0,$zero
406ec07fdf1Sdjm	jr	$ra
407ec07fdf1Sdjm	move	$a0,$v0
408ec07fdf1Sdjm.end	bn_sqr_words
409ec07fdf1Sdjm
410ec07fdf1Sdjm.align	5
411ec07fdf1Sdjm.ent	bn_sqr_words_internal
412ec07fdf1Sdjmbn_sqr_words_internal:
413ec07fdf1Sdjm___
# NUBI flavours only: save $ra, $t3..$t0 and $gp in a six-slot frame.
414ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
415ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
416ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
417ec07fdf1Sdjm	.set	noreorder
418ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
419ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
420ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
421ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
422ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
423ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
424ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
425ec07fdf1Sdjm___
# Main body.  Each word loaded through $a1 is multiplied by itself and the
# product is stored through $a0 as two consecutive words, low half first.
# The output pointer therefore advances by eight words for every four input
# words.  Four inputs are handled per loop iteration, then up to three
# remaining inputs in the tail.
426ec07fdf1Sdjm$code.=<<___;
427ec07fdf1Sdjm	.set	reorder
428ec07fdf1Sdjm	li	$minus4,-4
429ec07fdf1Sdjm	and	$ta0,$a2,$minus4
430ec07fdf1Sdjm	beqz	$ta0,.L_bn_sqr_words_tail
431ec07fdf1Sdjm
432ec07fdf1Sdjm.L_bn_sqr_words_loop:
4339eac5592Smiod	$LD	$t0,0($a1)
434ec07fdf1Sdjm	$MULTU	$t0,$t0
435ec07fdf1Sdjm	$LD	$t2,$BNSZ($a1)
436ec07fdf1Sdjm	$LD	$ta0,2*$BNSZ($a1)
437ec07fdf1Sdjm	$LD	$ta2,3*$BNSZ($a1)
438ec07fdf1Sdjm	mflo	$t1
439ec07fdf1Sdjm	mfhi	$t0
440ec07fdf1Sdjm	$ST	$t1,0($a0)
441ec07fdf1Sdjm	$ST	$t0,$BNSZ($a0)
442ec07fdf1Sdjm
443ec07fdf1Sdjm	$MULTU	$t2,$t2
444ec07fdf1Sdjm	subu	$a2,4
445ec07fdf1Sdjm	$PTR_ADD $a0,8*$BNSZ
446ec07fdf1Sdjm	$PTR_ADD $a1,4*$BNSZ
447ec07fdf1Sdjm	mflo	$t3
448ec07fdf1Sdjm	mfhi	$t2
449ec07fdf1Sdjm	$ST	$t3,-6*$BNSZ($a0)
450ec07fdf1Sdjm	$ST	$t2,-5*$BNSZ($a0)
451ec07fdf1Sdjm
452ec07fdf1Sdjm	$MULTU	$ta0,$ta0
453ec07fdf1Sdjm	mflo	$ta1
454ec07fdf1Sdjm	mfhi	$ta0
455ec07fdf1Sdjm	$ST	$ta1,-4*$BNSZ($a0)
456ec07fdf1Sdjm	$ST	$ta0,-3*$BNSZ($a0)
457ec07fdf1Sdjm
458ec07fdf1Sdjm
459ec07fdf1Sdjm	$MULTU	$ta2,$ta2
460ec07fdf1Sdjm	and	$ta0,$a2,$minus4
461ec07fdf1Sdjm	mflo	$ta3
462ec07fdf1Sdjm	mfhi	$ta2
463ec07fdf1Sdjm	$ST	$ta3,-2*$BNSZ($a0)
464ec07fdf1Sdjm
465ec07fdf1Sdjm	.set	noreorder
4669eac5592Smiod	bgtz	$ta0,.L_bn_sqr_words_loop
4679eac5592Smiod	$ST	$ta2,-$BNSZ($a0)
468ec07fdf1Sdjm
469ec07fdf1Sdjm	beqz	$a2,.L_bn_sqr_words_return
470ec07fdf1Sdjm	nop
471ec07fdf1Sdjm
472ec07fdf1Sdjm.L_bn_sqr_words_tail:
473ec07fdf1Sdjm	.set	reorder
474ec07fdf1Sdjm	$LD	$t0,0($a1)
475ec07fdf1Sdjm	$MULTU	$t0,$t0
476ec07fdf1Sdjm	subu	$a2,1
477ec07fdf1Sdjm	mflo	$t1
478ec07fdf1Sdjm	mfhi	$t0
479ec07fdf1Sdjm	$ST	$t1,0($a0)
480ec07fdf1Sdjm	$ST	$t0,$BNSZ($a0)
481ec07fdf1Sdjm	beqz	$a2,.L_bn_sqr_words_return
482ec07fdf1Sdjm
483ec07fdf1Sdjm	$LD	$t0,$BNSZ($a1)
484ec07fdf1Sdjm	$MULTU	$t0,$t0
485ec07fdf1Sdjm	subu	$a2,1
486ec07fdf1Sdjm	mflo	$t1
487ec07fdf1Sdjm	mfhi	$t0
488ec07fdf1Sdjm	$ST	$t1,2*$BNSZ($a0)
489ec07fdf1Sdjm	$ST	$t0,3*$BNSZ($a0)
490ec07fdf1Sdjm	beqz	$a2,.L_bn_sqr_words_return
491ec07fdf1Sdjm
492ec07fdf1Sdjm	$LD	$t0,2*$BNSZ($a1)
493ec07fdf1Sdjm	$MULTU	$t0,$t0
494ec07fdf1Sdjm	mflo	$t1
495ec07fdf1Sdjm	mfhi	$t0
496ec07fdf1Sdjm	$ST	$t1,4*$BNSZ($a0)
497ec07fdf1Sdjm	$ST	$t0,5*$BNSZ($a0)
498ec07fdf1Sdjm
499ec07fdf1Sdjm.L_bn_sqr_words_return:
500ec07fdf1Sdjm	.set	noreorder
501ec07fdf1Sdjm___
# NUBI flavours only: reload the saved temporaries and release the frame.
502ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
503ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
504ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
505ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
506ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
507ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
508ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
509ec07fdf1Sdjm___
# Finish bn_sqr_words_internal, then emit the public bn_add_words entry
# point.  Here the word count is in $a3: the entry branches to
# bn_add_words_internal only when it is positive and otherwise returns zero.
510ec07fdf1Sdjm$code.=<<___;
511ec07fdf1Sdjm	jr	$ra
512ec07fdf1Sdjm	move	$a0,$v0
513ec07fdf1Sdjm
514ec07fdf1Sdjm.end	bn_sqr_words_internal
515ec07fdf1Sdjm
516ec07fdf1Sdjm.align	5
517ec07fdf1Sdjm.globl	bn_add_words
518ec07fdf1Sdjm.ent	bn_add_words
519ec07fdf1Sdjmbn_add_words:
520ec07fdf1Sdjm	.set	noreorder
521ec07fdf1Sdjm	bgtz	$a3,bn_add_words_internal
522ec07fdf1Sdjm	move	$v0,$zero
523ec07fdf1Sdjm	jr	$ra
524ec07fdf1Sdjm	move	$a0,$v0
525ec07fdf1Sdjm.end	bn_add_words
526ec07fdf1Sdjm
527ec07fdf1Sdjm.align	5
528ec07fdf1Sdjm.ent	bn_add_words_internal
529ec07fdf1Sdjmbn_add_words_internal:
530ec07fdf1Sdjm___
# NUBI flavours only: save $ra, $t3..$t0 and $gp in a six-slot frame.
531ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
532ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
533ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
534ec07fdf1Sdjm	.set	noreorder
535ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
536ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
537ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
538ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
539ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
540ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
541ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
542ec07fdf1Sdjm___
# Main body.  Words loaded through $a1 and $a2 are added together with the
# running carry in $v0 and the sum is stored through $a0.  Each word needs
# two carry checks (one per addition, via sltu); their results are summed
# into the next carry.  Four words are handled per loop iteration, then up
# to three remaining words in the tail.
543ec07fdf1Sdjm$code.=<<___;
544ec07fdf1Sdjm	.set	reorder
545ec07fdf1Sdjm	li	$minus4,-4
546ec07fdf1Sdjm	and	$at,$a3,$minus4
547ec07fdf1Sdjm	beqz	$at,.L_bn_add_words_tail
548ec07fdf1Sdjm
549ec07fdf1Sdjm.L_bn_add_words_loop:
5509eac5592Smiod	$LD	$t0,0($a1)
551ec07fdf1Sdjm	$LD	$ta0,0($a2)
552ec07fdf1Sdjm	subu	$a3,4
553ec07fdf1Sdjm	$LD	$t1,$BNSZ($a1)
554ec07fdf1Sdjm	and	$at,$a3,$minus4
555ec07fdf1Sdjm	$LD	$t2,2*$BNSZ($a1)
556ec07fdf1Sdjm	$PTR_ADD $a2,4*$BNSZ
557ec07fdf1Sdjm	$LD	$t3,3*$BNSZ($a1)
558ec07fdf1Sdjm	$PTR_ADD $a0,4*$BNSZ
559ec07fdf1Sdjm	$LD	$ta1,-3*$BNSZ($a2)
560ec07fdf1Sdjm	$PTR_ADD $a1,4*$BNSZ
561ec07fdf1Sdjm	$LD	$ta2,-2*$BNSZ($a2)
562ec07fdf1Sdjm	$LD	$ta3,-$BNSZ($a2)
563ec07fdf1Sdjm	$ADDU	$ta0,$t0
564ec07fdf1Sdjm	sltu	$t8,$ta0,$t0
565ec07fdf1Sdjm	$ADDU	$t0,$ta0,$v0
566ec07fdf1Sdjm	sltu	$v0,$t0,$ta0
567ec07fdf1Sdjm	$ST	$t0,-4*$BNSZ($a0)
568ec07fdf1Sdjm	$ADDU	$v0,$t8
569ec07fdf1Sdjm
570ec07fdf1Sdjm	$ADDU	$ta1,$t1
571ec07fdf1Sdjm	sltu	$t9,$ta1,$t1
572ec07fdf1Sdjm	$ADDU	$t1,$ta1,$v0
573ec07fdf1Sdjm	sltu	$v0,$t1,$ta1
574ec07fdf1Sdjm	$ST	$t1,-3*$BNSZ($a0)
575ec07fdf1Sdjm	$ADDU	$v0,$t9
576ec07fdf1Sdjm
577ec07fdf1Sdjm	$ADDU	$ta2,$t2
578ec07fdf1Sdjm	sltu	$t8,$ta2,$t2
579ec07fdf1Sdjm	$ADDU	$t2,$ta2,$v0
580ec07fdf1Sdjm	sltu	$v0,$t2,$ta2
581ec07fdf1Sdjm	$ST	$t2,-2*$BNSZ($a0)
582ec07fdf1Sdjm	$ADDU	$v0,$t8
583ec07fdf1Sdjm
584ec07fdf1Sdjm	$ADDU	$ta3,$t3
585ec07fdf1Sdjm	sltu	$t9,$ta3,$t3
586ec07fdf1Sdjm	$ADDU	$t3,$ta3,$v0
587ec07fdf1Sdjm	sltu	$v0,$t3,$ta3
588ec07fdf1Sdjm	$ST	$t3,-$BNSZ($a0)
589ec07fdf1Sdjm
590ec07fdf1Sdjm	.set	noreorder
5919eac5592Smiod	bgtz	$at,.L_bn_add_words_loop
5929eac5592Smiod	$ADDU	$v0,$t9
593ec07fdf1Sdjm
594ec07fdf1Sdjm	beqz	$a3,.L_bn_add_words_return
595ec07fdf1Sdjm	nop
596ec07fdf1Sdjm
597ec07fdf1Sdjm.L_bn_add_words_tail:
598ec07fdf1Sdjm	.set	reorder
599ec07fdf1Sdjm	$LD	$t0,0($a1)
600ec07fdf1Sdjm	$LD	$ta0,0($a2)
601ec07fdf1Sdjm	$ADDU	$ta0,$t0
602ec07fdf1Sdjm	subu	$a3,1
603ec07fdf1Sdjm	sltu	$t8,$ta0,$t0
604ec07fdf1Sdjm	$ADDU	$t0,$ta0,$v0
605ec07fdf1Sdjm	sltu	$v0,$t0,$ta0
606ec07fdf1Sdjm	$ST	$t0,0($a0)
607ec07fdf1Sdjm	$ADDU	$v0,$t8
608ec07fdf1Sdjm	beqz	$a3,.L_bn_add_words_return
609ec07fdf1Sdjm
610ec07fdf1Sdjm	$LD	$t1,$BNSZ($a1)
611ec07fdf1Sdjm	$LD	$ta1,$BNSZ($a2)
612ec07fdf1Sdjm	$ADDU	$ta1,$t1
613ec07fdf1Sdjm	subu	$a3,1
614ec07fdf1Sdjm	sltu	$t9,$ta1,$t1
615ec07fdf1Sdjm	$ADDU	$t1,$ta1,$v0
616ec07fdf1Sdjm	sltu	$v0,$t1,$ta1
617ec07fdf1Sdjm	$ST	$t1,$BNSZ($a0)
618ec07fdf1Sdjm	$ADDU	$v0,$t9
619ec07fdf1Sdjm	beqz	$a3,.L_bn_add_words_return
620ec07fdf1Sdjm
621ec07fdf1Sdjm	$LD	$t2,2*$BNSZ($a1)
622ec07fdf1Sdjm	$LD	$ta2,2*$BNSZ($a2)
623ec07fdf1Sdjm	$ADDU	$ta2,$t2
624ec07fdf1Sdjm	sltu	$t8,$ta2,$t2
625ec07fdf1Sdjm	$ADDU	$t2,$ta2,$v0
626ec07fdf1Sdjm	sltu	$v0,$t2,$ta2
627ec07fdf1Sdjm	$ST	$t2,2*$BNSZ($a0)
628ec07fdf1Sdjm	$ADDU	$v0,$t8
629ec07fdf1Sdjm
630ec07fdf1Sdjm.L_bn_add_words_return:
631ec07fdf1Sdjm	.set	noreorder
632ec07fdf1Sdjm___
# NUBI flavours only: reload the saved temporaries and release the frame.
633ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
634ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
635ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
636ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
637ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
638ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
639ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
640ec07fdf1Sdjm___
# Finish bn_add_words_internal (return with the carry in $v0), then emit
# the public bn_sub_words entry point, which branches to
# bn_sub_words_internal only when the word count in $a3 is positive and
# otherwise returns zero.
641ec07fdf1Sdjm$code.=<<___;
642ec07fdf1Sdjm	jr	$ra
643ec07fdf1Sdjm	move	$a0,$v0
644ec07fdf1Sdjm
645ec07fdf1Sdjm.end	bn_add_words_internal
646ec07fdf1Sdjm
647ec07fdf1Sdjm.align	5
648ec07fdf1Sdjm.globl	bn_sub_words
649ec07fdf1Sdjm.ent	bn_sub_words
650ec07fdf1Sdjmbn_sub_words:
651ec07fdf1Sdjm	.set	noreorder
652ec07fdf1Sdjm	bgtz	$a3,bn_sub_words_internal
653ec07fdf1Sdjm	move	$v0,$zero
654ec07fdf1Sdjm	jr	$ra
655ec07fdf1Sdjm	move	$a0,$zero
656ec07fdf1Sdjm.end	bn_sub_words
657ec07fdf1Sdjm
658ec07fdf1Sdjm.align	5
659ec07fdf1Sdjm.ent	bn_sub_words_internal
660ec07fdf1Sdjmbn_sub_words_internal:
661ec07fdf1Sdjm___
# NUBI flavours only: save $ra, $t3..$t0 and $gp in a six-slot frame.
662ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
663ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
664ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
665ec07fdf1Sdjm	.set	noreorder
666ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
667ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
668ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
669ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
670ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
671ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
672ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
673ec07fdf1Sdjm___
# Main body.  The word loaded through $a2 and then the running borrow in
# $v0 are subtracted from the word loaded through $a1; the difference is
# stored through $a0.  A borrow is detected once per subtraction (sltu
# before the first, sgtu after the second) and the two results are summed
# into the next borrow.  Four words are handled per loop iteration, then
# up to three remaining words in the tail.
674ec07fdf1Sdjm$code.=<<___;
675ec07fdf1Sdjm	.set	reorder
676ec07fdf1Sdjm	li	$minus4,-4
677ec07fdf1Sdjm	and	$at,$a3,$minus4
678ec07fdf1Sdjm	beqz	$at,.L_bn_sub_words_tail
679ec07fdf1Sdjm
680ec07fdf1Sdjm.L_bn_sub_words_loop:
6819eac5592Smiod	$LD	$t0,0($a1)
682ec07fdf1Sdjm	$LD	$ta0,0($a2)
683ec07fdf1Sdjm	subu	$a3,4
684ec07fdf1Sdjm	$LD	$t1,$BNSZ($a1)
685ec07fdf1Sdjm	and	$at,$a3,$minus4
686ec07fdf1Sdjm	$LD	$t2,2*$BNSZ($a1)
687ec07fdf1Sdjm	$PTR_ADD $a2,4*$BNSZ
688ec07fdf1Sdjm	$LD	$t3,3*$BNSZ($a1)
689ec07fdf1Sdjm	$PTR_ADD $a0,4*$BNSZ
690ec07fdf1Sdjm	$LD	$ta1,-3*$BNSZ($a2)
691ec07fdf1Sdjm	$PTR_ADD $a1,4*$BNSZ
692ec07fdf1Sdjm	$LD	$ta2,-2*$BNSZ($a2)
693ec07fdf1Sdjm	$LD	$ta3,-$BNSZ($a2)
694ec07fdf1Sdjm	sltu	$t8,$t0,$ta0
695ec07fdf1Sdjm	$SUBU	$ta0,$t0,$ta0
696ec07fdf1Sdjm	$SUBU	$t0,$ta0,$v0
697ec07fdf1Sdjm	sgtu	$v0,$t0,$ta0
698ec07fdf1Sdjm	$ST	$t0,-4*$BNSZ($a0)
699ec07fdf1Sdjm	$ADDU	$v0,$t8
700ec07fdf1Sdjm
701ec07fdf1Sdjm	sltu	$t9,$t1,$ta1
702ec07fdf1Sdjm	$SUBU	$ta1,$t1,$ta1
703ec07fdf1Sdjm	$SUBU	$t1,$ta1,$v0
704ec07fdf1Sdjm	sgtu	$v0,$t1,$ta1
705ec07fdf1Sdjm	$ST	$t1,-3*$BNSZ($a0)
706ec07fdf1Sdjm	$ADDU	$v0,$t9
707ec07fdf1Sdjm
708ec07fdf1Sdjm
709ec07fdf1Sdjm	sltu	$t8,$t2,$ta2
710ec07fdf1Sdjm	$SUBU	$ta2,$t2,$ta2
711ec07fdf1Sdjm	$SUBU	$t2,$ta2,$v0
712ec07fdf1Sdjm	sgtu	$v0,$t2,$ta2
713ec07fdf1Sdjm	$ST	$t2,-2*$BNSZ($a0)
714ec07fdf1Sdjm	$ADDU	$v0,$t8
715ec07fdf1Sdjm
716ec07fdf1Sdjm	sltu	$t9,$t3,$ta3
717ec07fdf1Sdjm	$SUBU	$ta3,$t3,$ta3
718ec07fdf1Sdjm	$SUBU	$t3,$ta3,$v0
719ec07fdf1Sdjm	sgtu	$v0,$t3,$ta3
720ec07fdf1Sdjm	$ST	$t3,-$BNSZ($a0)
721ec07fdf1Sdjm
722ec07fdf1Sdjm	.set	noreorder
7239eac5592Smiod	bgtz	$at,.L_bn_sub_words_loop
7249eac5592Smiod	$ADDU	$v0,$t9
725ec07fdf1Sdjm
726ec07fdf1Sdjm	beqz	$a3,.L_bn_sub_words_return
727ec07fdf1Sdjm	nop
728ec07fdf1Sdjm
729ec07fdf1Sdjm.L_bn_sub_words_tail:
730ec07fdf1Sdjm	.set	reorder
731ec07fdf1Sdjm	$LD	$t0,0($a1)
732ec07fdf1Sdjm	$LD	$ta0,0($a2)
733ec07fdf1Sdjm	subu	$a3,1
734ec07fdf1Sdjm	sltu	$t8,$t0,$ta0
735ec07fdf1Sdjm	$SUBU	$ta0,$t0,$ta0
736ec07fdf1Sdjm	$SUBU	$t0,$ta0,$v0
737ec07fdf1Sdjm	sgtu	$v0,$t0,$ta0
738ec07fdf1Sdjm	$ST	$t0,0($a0)
739ec07fdf1Sdjm	$ADDU	$v0,$t8
740ec07fdf1Sdjm	beqz	$a3,.L_bn_sub_words_return
741ec07fdf1Sdjm
742ec07fdf1Sdjm	$LD	$t1,$BNSZ($a1)
743ec07fdf1Sdjm	subu	$a3,1
744ec07fdf1Sdjm	$LD	$ta1,$BNSZ($a2)
745ec07fdf1Sdjm	sltu	$t9,$t1,$ta1
746ec07fdf1Sdjm	$SUBU	$ta1,$t1,$ta1
747ec07fdf1Sdjm	$SUBU	$t1,$ta1,$v0
748ec07fdf1Sdjm	sgtu	$v0,$t1,$ta1
749ec07fdf1Sdjm	$ST	$t1,$BNSZ($a0)
750ec07fdf1Sdjm	$ADDU	$v0,$t9
751ec07fdf1Sdjm	beqz	$a3,.L_bn_sub_words_return
752ec07fdf1Sdjm
753ec07fdf1Sdjm	$LD	$t2,2*$BNSZ($a1)
754ec07fdf1Sdjm	$LD	$ta2,2*$BNSZ($a2)
755ec07fdf1Sdjm	sltu	$t8,$t2,$ta2
756ec07fdf1Sdjm	$SUBU	$ta2,$t2,$ta2
757ec07fdf1Sdjm	$SUBU	$t2,$ta2,$v0
758ec07fdf1Sdjm	sgtu	$v0,$t2,$ta2
759ec07fdf1Sdjm	$ST	$t2,2*$BNSZ($a0)
760ec07fdf1Sdjm	$ADDU	$v0,$t8
761ec07fdf1Sdjm
762ec07fdf1Sdjm.L_bn_sub_words_return:
763ec07fdf1Sdjm	.set	noreorder
764ec07fdf1Sdjm___
# NUBI flavours only: reload the saved temporaries and release the frame.
765ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
766ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
767ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
768ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
769ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
770ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
771ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
772ec07fdf1Sdjm___
# Finish bn_sub_words_internal (return with the borrow in $v0), then emit
# the public bn_div_3_words entry point.  It parks the incoming $a0 in $a3
# and $a1 in $ta2, loads the word at ($a3) into $a0 and the word one slot
# below it into $a1 (in the branch delay slot), and returns -1 when the
# first loaded word equals $a2.  Otherwise it continues in
# bn_div_3_words_internal.
773ec07fdf1Sdjm$code.=<<___;
774ec07fdf1Sdjm	jr	$ra
775ec07fdf1Sdjm	move	$a0,$v0
776ec07fdf1Sdjm.end	bn_sub_words_internal
777ec07fdf1Sdjm
778ec07fdf1Sdjm.align 5
779ec07fdf1Sdjm.globl	bn_div_3_words
780ec07fdf1Sdjm.ent	bn_div_3_words
781ec07fdf1Sdjmbn_div_3_words:
782ec07fdf1Sdjm	.set	noreorder
783ec07fdf1Sdjm	move	$a3,$a0		# we know that bn_div_words does not
784ec07fdf1Sdjm				# touch $a3, $ta2, $ta3 and preserves $a2
785ec07fdf1Sdjm				# so that we can save two arguments
786ec07fdf1Sdjm				# and return address in registers
787ec07fdf1Sdjm				# instead of stack:-)
788ec07fdf1Sdjm
789ec07fdf1Sdjm	$LD	$a0,($a3)
790ec07fdf1Sdjm	move	$ta2,$a1
791ec07fdf1Sdjm	bne	$a0,$a2,bn_div_3_words_internal
792ec07fdf1Sdjm	$LD	$a1,-$BNSZ($a3)
793ec07fdf1Sdjm	li	$v0,-1
794ec07fdf1Sdjm	jr	$ra
795ec07fdf1Sdjm	move	$a0,$v0
796ec07fdf1Sdjm.end	bn_div_3_words
797ec07fdf1Sdjm
798ec07fdf1Sdjm.align	5
799ec07fdf1Sdjm.ent	bn_div_3_words_internal
800ec07fdf1Sdjmbn_div_3_words_internal:
801ec07fdf1Sdjm___
# NUBI flavours only: save $ra, $t3..$t0 and $gp in a six-slot frame.
802ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
803ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
804ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
805ec07fdf1Sdjm	.set	noreorder
806ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
807ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
808ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
809ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
810ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
811ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
812ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
813ec07fdf1Sdjm___
# Main body.  The return address is kept in $ta3 across the call to
# bn_div_words_internal, whose result arrives in $v0.  That value is then
# multiplied by $ta2, and the inner loop compares the product against $a1
# and the word two slots below ($a3), decrementing $v0 on each further
# pass.  The loop-closing branch always executes the decrement in its delay
# slot, so the following add undoes it when the loop is left.
814ec07fdf1Sdjm$code.=<<___;
815ec07fdf1Sdjm	.set	reorder
816ec07fdf1Sdjm	move	$ta3,$ra
8179eac5592Smiod	bal	bn_div_words_internal
818ec07fdf1Sdjm	move	$ra,$ta3
819ec07fdf1Sdjm	$MULTU	$ta2,$v0
820ec07fdf1Sdjm	$LD	$t2,-2*$BNSZ($a3)
821ec07fdf1Sdjm	move	$ta0,$zero
822ec07fdf1Sdjm	mfhi	$t1
823ec07fdf1Sdjm	mflo	$t0
824ec07fdf1Sdjm	sltu	$t8,$t1,$a1
825ec07fdf1Sdjm.L_bn_div_3_words_inner_loop:
826ec07fdf1Sdjm	bnez	$t8,.L_bn_div_3_words_inner_loop_done
827ec07fdf1Sdjm	sgeu	$at,$t2,$t0
828ec07fdf1Sdjm	seq	$t9,$t1,$a1
829ec07fdf1Sdjm	and	$at,$t9
830ec07fdf1Sdjm	sltu	$t3,$t0,$ta2
831ec07fdf1Sdjm	$ADDU	$a1,$a2
832ec07fdf1Sdjm	$SUBU	$t1,$t3
833ec07fdf1Sdjm	$SUBU	$t0,$ta2
834ec07fdf1Sdjm	sltu	$t8,$t1,$a1
835ec07fdf1Sdjm	sltu	$ta0,$a1,$a2
836ec07fdf1Sdjm	or	$t8,$ta0
837ec07fdf1Sdjm	.set	noreorder
8389eac5592Smiod	beqz	$at,.L_bn_div_3_words_inner_loop
839ec07fdf1Sdjm	$SUBU	$v0,1
8409eac5592Smiod	$ADDU	$v0,1
841ec07fdf1Sdjm	.set	reorder
842ec07fdf1Sdjm.L_bn_div_3_words_inner_loop_done:
843ec07fdf1Sdjm	.set	noreorder
844ec07fdf1Sdjm___
# NUBI flavours only: reload the saved temporaries and release the frame.
845ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
846ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
847ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
848ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
849ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
850ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
851ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
852ec07fdf1Sdjm___
# Finish bn_div_3_words_internal (return with the result in $v0), then emit
# the public bn_div_words entry point, which branches to
# bn_div_words_internal when the divisor in $a2 is non-zero and otherwise
# returns -1.
853ec07fdf1Sdjm$code.=<<___;
854ec07fdf1Sdjm	jr	$ra
855ec07fdf1Sdjm	move	$a0,$v0
856ec07fdf1Sdjm.end	bn_div_3_words_internal
857ec07fdf1Sdjm
858ec07fdf1Sdjm.align	5
859ec07fdf1Sdjm.globl	bn_div_words
860ec07fdf1Sdjm.ent	bn_div_words
861ec07fdf1Sdjmbn_div_words:
862ec07fdf1Sdjm	.set	noreorder
863ec07fdf1Sdjm	bnez	$a2,bn_div_words_internal
864ec07fdf1Sdjm	li	$v0,-1		# I would rather signal div-by-zero
865ec07fdf1Sdjm				# which can be done with 'break 7'
866ec07fdf1Sdjm	jr	$ra
867ec07fdf1Sdjm	move	$a0,$v0
868ec07fdf1Sdjm.end	bn_div_words
869ec07fdf1Sdjm
870ec07fdf1Sdjm.align	5
871ec07fdf1Sdjm.ent	bn_div_words_internal
872ec07fdf1Sdjmbn_div_words_internal:
873ec07fdf1Sdjm___
# NUBI flavours only: save $ra, $t3..$t0 and $gp in a six-slot frame.
874ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
875ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
876ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
877ec07fdf1Sdjm	.set	noreorder
878ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
879ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
880ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
881ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
882ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
883ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
884ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
885ec07fdf1Sdjm___
# Normalisation.  If the top bit of the divisor $a2 is already set, jump
# straight to the body with a shift count of zero in $t9.  Otherwise shift
# $a2 left one bit at a time until its top bit is set, counting the shifts
# in $t9.  "break 6" is raised if the bits of $a0 that the same shift would
# push out are not all zero.  Finally the two-word value in $a0:$a1 is
# shifted left by $t9 as well.
886ec07fdf1Sdjm$code.=<<___;
887ec07fdf1Sdjm	move	$v1,$zero
888ec07fdf1Sdjm	bltz	$a2,.L_bn_div_words_body
889ec07fdf1Sdjm	move	$t9,$v1
890ec07fdf1Sdjm	$SLL	$a2,1
891ec07fdf1Sdjm	bgtz	$a2,.-4
892ec07fdf1Sdjm	addu	$t9,1
893ec07fdf1Sdjm
894ec07fdf1Sdjm	.set	reorder
895ec07fdf1Sdjm	negu	$t1,$t9
896ec07fdf1Sdjm	li	$t2,-1
897ec07fdf1Sdjm	$SLL	$t2,$t1
898ec07fdf1Sdjm	and	$t2,$a0
899ec07fdf1Sdjm	$SRL	$at,$a1,$t1
900ec07fdf1Sdjm	.set	noreorder
9019eac5592Smiod	beqz	$t2,.+12
9029eac5592Smiod	nop
903ec07fdf1Sdjm	break	6		# signal overflow
904ec07fdf1Sdjm	.set	reorder
905ec07fdf1Sdjm	$SLL	$a0,$t9
906ec07fdf1Sdjm	$SLL	$a1,$t9
907ec07fdf1Sdjm	or	$a0,$at
908ec07fdf1Sdjm___
# Perl-side names for registers used by the division body below: $QT is
# the current half-word quotient estimate, $HH the upper half of the
# running value in $a0, and $DH the upper half of the divisor.
909ec07fdf1Sdjm$QT=$ta0;
910ec07fdf1Sdjm$HH=$ta1;
911ec07fdf1Sdjm$DH=$v1;
# Division body: two rounds, each producing half a word of quotient.  A
# round starts from an estimate in $QT (all ones in the low half, or
# $a0 / $DH when $DH differs from $HH), multiplies it by the divisor, and
# then lowers the estimate in the inner loop while the product is too
# large.  The first round's result is shifted into the upper half of $v0
# and the second is OR-ed into the lower half.  At the end $v1 receives
# the remainder shifted back right by $t9, $a2 is restored, and $v1 is
# copied to $a1.
912ec07fdf1Sdjm$code.=<<___;
913ec07fdf1Sdjm.L_bn_div_words_body:
914ec07fdf1Sdjm	$SRL	$DH,$a2,4*$BNSZ	# bits
915ec07fdf1Sdjm	sgeu	$at,$a0,$a2
916ec07fdf1Sdjm	.set	noreorder
9179eac5592Smiod	beqz	$at,.+12
9189eac5592Smiod	nop
919ec07fdf1Sdjm	$SUBU	$a0,$a2
920ec07fdf1Sdjm	.set	reorder
921ec07fdf1Sdjm
922ec07fdf1Sdjm	li	$QT,-1
923ec07fdf1Sdjm	$SRL	$HH,$a0,4*$BNSZ	# bits
924ec07fdf1Sdjm	$SRL	$QT,4*$BNSZ	# q=0xffffffff
925ec07fdf1Sdjm	beq	$DH,$HH,.L_bn_div_words_skip_div1
926ec07fdf1Sdjm	$DIVU	$zero,$a0,$DH
927ec07fdf1Sdjm	mflo	$QT
928ec07fdf1Sdjm.L_bn_div_words_skip_div1:
929ec07fdf1Sdjm	$MULTU	$a2,$QT
930ec07fdf1Sdjm	$SLL	$t3,$a0,4*$BNSZ	# bits
931ec07fdf1Sdjm	$SRL	$at,$a1,4*$BNSZ	# bits
932ec07fdf1Sdjm	or	$t3,$at
933ec07fdf1Sdjm	mflo	$t0
934ec07fdf1Sdjm	mfhi	$t1
935ec07fdf1Sdjm.L_bn_div_words_inner_loop1:
936ec07fdf1Sdjm	sltu	$t2,$t3,$t0
937ec07fdf1Sdjm	seq	$t8,$HH,$t1
938ec07fdf1Sdjm	sltu	$at,$HH,$t1
939ec07fdf1Sdjm	and	$t2,$t8
940ec07fdf1Sdjm	sltu	$v0,$t0,$a2
941ec07fdf1Sdjm	or	$at,$t2
942ec07fdf1Sdjm	.set	noreorder
943ec07fdf1Sdjm	beqz	$at,.L_bn_div_words_inner_loop1_done
944ec07fdf1Sdjm	$SUBU	$t1,$v0
945ec07fdf1Sdjm	$SUBU	$t0,$a2
946ec07fdf1Sdjm	b	.L_bn_div_words_inner_loop1
947ec07fdf1Sdjm	$SUBU	$QT,1
948ec07fdf1Sdjm	.set	reorder
949ec07fdf1Sdjm.L_bn_div_words_inner_loop1_done:
950ec07fdf1Sdjm
951ec07fdf1Sdjm	$SLL	$a1,4*$BNSZ	# bits
952ec07fdf1Sdjm	$SUBU	$a0,$t3,$t0
953ec07fdf1Sdjm	$SLL	$v0,$QT,4*$BNSZ	# bits
954ec07fdf1Sdjm
955ec07fdf1Sdjm	li	$QT,-1
956ec07fdf1Sdjm	$SRL	$HH,$a0,4*$BNSZ	# bits
957ec07fdf1Sdjm	$SRL	$QT,4*$BNSZ	# q=0xffffffff
958ec07fdf1Sdjm	beq	$DH,$HH,.L_bn_div_words_skip_div2
959ec07fdf1Sdjm	$DIVU	$zero,$a0,$DH
960ec07fdf1Sdjm	mflo	$QT
961ec07fdf1Sdjm.L_bn_div_words_skip_div2:
962ec07fdf1Sdjm	$MULTU	$a2,$QT
963ec07fdf1Sdjm	$SLL	$t3,$a0,4*$BNSZ	# bits
964ec07fdf1Sdjm	$SRL	$at,$a1,4*$BNSZ	# bits
965ec07fdf1Sdjm	or	$t3,$at
966ec07fdf1Sdjm	mflo	$t0
967ec07fdf1Sdjm	mfhi	$t1
968ec07fdf1Sdjm.L_bn_div_words_inner_loop2:
969ec07fdf1Sdjm	sltu	$t2,$t3,$t0
970ec07fdf1Sdjm	seq	$t8,$HH,$t1
971ec07fdf1Sdjm	sltu	$at,$HH,$t1
972ec07fdf1Sdjm	and	$t2,$t8
973ec07fdf1Sdjm	sltu	$v1,$t0,$a2
974ec07fdf1Sdjm	or	$at,$t2
975ec07fdf1Sdjm	.set	noreorder
976ec07fdf1Sdjm	beqz	$at,.L_bn_div_words_inner_loop2_done
977ec07fdf1Sdjm	$SUBU	$t1,$v1
978ec07fdf1Sdjm	$SUBU	$t0,$a2
979ec07fdf1Sdjm	b	.L_bn_div_words_inner_loop2
980ec07fdf1Sdjm	$SUBU	$QT,1
981ec07fdf1Sdjm	.set	reorder
982ec07fdf1Sdjm.L_bn_div_words_inner_loop2_done:
983ec07fdf1Sdjm
984ec07fdf1Sdjm	$SUBU	$a0,$t3,$t0
985ec07fdf1Sdjm	or	$v0,$QT
986ec07fdf1Sdjm	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
987ec07fdf1Sdjm	$SRL	$a2,$t9		# restore $a2
988ec07fdf1Sdjm
989ec07fdf1Sdjm	.set	noreorder
990ec07fdf1Sdjm	move	$a1,$v1
991ec07fdf1Sdjm___
992ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
993ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
994ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
995ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
996ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
997ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
998ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
999ec07fdf1Sdjm___
1000ec07fdf1Sdjm$code.=<<___;
1001ec07fdf1Sdjm	jr	$ra
1002ec07fdf1Sdjm	move	$a0,$v0
1003ec07fdf1Sdjm.end	bn_div_words_internal
1004ec07fdf1Sdjm___
1005ec07fdf1Sdjmundef $HH; undef $QT; undef $DH;
1006ec07fdf1Sdjm
# Register aliases used by the comba multiplication routines emitted below.
#
# Words 0..3 of each operand are held in the temporaries $t0-$t3 (a) and
# $ta0-$ta3 (b).
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

# Words 4..6 are spread over $s0-$s5 (a on the even, b on the odd registers).
# Word 7 of each operand is loaded into the register that held the operand's
# own pointer, so it has to be the last load made through that pointer.
($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2

# $t_1/$t_2 receive the low/high half of each product (mflo/mfhi);
# $c_1..$c_3 are the three column accumulators the routines rotate through.
($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1014ec07fdf1Sdjm
# Open bn_mul_comba8: alignment directive, .globl/.ent declarations and the
# entry label.  Assembler reordering is switched off (.set noreorder) for the
# prologue that follows.
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# NUBI flavour: allocate a 12-slot frame and spill $ra, $s5-$s0, $t3-$t0 and
# $gp into slots 11..0.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Every other flavour: a 6-slot frame holding only $s5-$s0, the registers
# that the routine uses for a[4..6] and b[4..6].
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
# Body of bn_mul_comba8: r[0..15] = a[0..7] * b[0..7].
#
# As used below, $a0 is the result pointer (all stores go through it) while
# $a1 and $a2 point at the operands a and b (all loads go through them).
# $a_7 and $b_7 are aliases of $a1 and $a2, so the loads of a[7] and b[7]
# are the last accesses made through each pointer.
#
# The product is built one result column at a time.  For every partial
# product the sequence is:
#   mflo/mfhi     fetch the low/high half into $t_1/$t_2
#   $ADDU + sltu  add the low half to the column's low accumulator and
#                 recover the carry in $at
#   $MULTU        start the next product; its trailing mul_add_c() comment
#                 names that product and the accumulator order it will use
#   $ADDU + sltu  fold the carry into the high half, add it to the middle
#                 accumulator and propagate the carry to the top accumulator
# When a column is complete its low accumulator is stored to r[] and the
# three accumulators rotate roles.
#
# The text runs under .set reorder and switches back to .set noreorder at
# the end, ahead of the epilogue.  The stray space before some $MULTU lines
# is part of the emitted text and is kept exactly as it was.
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# 1ine with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				#		<appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# Epilogue of bn_mul_comba8, one variant per flavour; each mirrors the frame
# layout of the matching prologue.  The preceding body text ends with
# .set noreorder, so the $PTR_ADD written after `jr $ra` sits in the jump's
# delay slot and releases the frame on the way out.
#
# NUBI flavour: reload $s5-$s0, $t3-$t0 and $gp from slots 10..0.  Slot 11
# ($ra) is not reloaded here.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Every other flavour: reload $s5-$s0 from the 6-slot frame.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# Close bn_mul_comba8 (.end) and open bn_mul_comba4: alignment directive,
# .globl/.ent declarations and the entry label.
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# NUBI-only prologue of bn_mul_comba4: switch to .set noreorder, allocate a
# 6-slot frame and spill $ra, $t3-$t0 and $gp into slots 5..0.  The routine
# has no prologue for the other flavours; the unconditional body text
# follows directly.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body of bn_mul_comba4: r[0..7] = a[0..3] * b[0..3].
#
# As used below, $a0 is the result pointer (all stores go through it) while
# $a1 and $a2 point at the operands a and b.  Only the aliases $a_0..$a_3,
# $b_0..$b_3, $t_1/$t_2 and $c_1..$c_3 appear in this text.
#
# The product is built one result column at a time.  For every partial
# product, mflo/mfhi fetch the two halves into $t_1/$t_2; the low half is
# added to the column's low accumulator with the carry recovered by sltu
# into $at; the next $MULTU is started (its mul_add_c() comment names that
# product); then the carry is folded into the high half, which is added to
# the middle accumulator, and a final sltu propagates into the top
# accumulator.  After each column the low accumulator is stored and the
# three accumulators rotate roles.
#
# The text runs under .set reorder and switches back to .set noreorder at
# the end, ahead of the return sequence.  The stray space before some
# $MULTU lines is part of the emitted text and is kept exactly as it was.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
1859ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
1860ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
1861ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
1862ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
1863ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
1864ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
1865ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
1866ec07fdf1Sdjm___
1867ec07fdf1Sdjm$code.=<<___;
1868ec07fdf1Sdjm	jr	$ra
1869ec07fdf1Sdjm	nop
1870ec07fdf1Sdjm.end	bn_mul_comba4
1871ec07fdf1Sdjm___
1872ec07fdf1Sdjm
# The squaring routines below have no b[] operand, so bn_sqr_comba8 keeps
# a[4..7] in the registers that the multiplication code above names
# $b_0..$b_3.
1873ec07fdf1Sdjm($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1874ec07fdf1Sdjm
# add_c2 - append to $code one pipelined "accumulate twice" step, as used
# for the cross products of the comba squaring routines.
#
# The emitted code fetches the pending product with mflo/mfhi, issues the
# next multiplication, and adds the fetched product to the three-word
# accumulator $c2:$c1:$c0 twice.
#
# Arguments (all but $warm are register names spliced into the assembly):
#   $hi,$lo      scratch registers receiving the product; both are
#                clobbered while the carries are propagated
#   $c0,$c1,$c2  accumulator words, least significant first
#   $warm        false on the first call with a specific sequence of
#                $c_[XYZ], when there is no carry in $c2 to accumulate
#                yet: $c2 is then overwritten instead of added to
#   $an,$bn      operands of the multiplication whose result is used in
#                the *next* step ("forward multiplication")
#
# $at is used as an additional scratch register.
#
# The sub is deliberately declared without a prototype: it takes eight
# arguments, and the former empty prototype "()" only went unnoticed
# because every call uses the &add_c2(...) form, which bypasses
# prototype checking.
sub add_c2 {
my ($hi,$lo,$c0,$c1,$c2,$warm,$an,$bn)=@_;

# Fetch the pending product, start the next one, then add $lo to $c0
# twice.  The first carry is merged with $hi into $at, the second carry
# is added to $hi, and $at is added to $c1.
$code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	$an,$bn			# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
if ($warm) {
# $c2 already carries state from earlier steps: accumulate into it.
$code.=<<___;
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
} else {
# First step for this accumulator order: the carry initialises $c2.
$code.=<<___;
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
}
# Common tail: carry out of the second addition into $c1.
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
}
1909e611d49fSbcook
# bn_sqr_comba8: generate the fully unrolled 8-word squaring routine.
# The emitted code loads a[0..7] through the pointer in $a1 and stores the
# 16 result words through the pointer in $a0, one result word at a time,
# rotating the three accumulators $c_1,$c_2,$c_3.
#
# The multiplier is software-pipelined: each step issues the $MULTU for
# the *next* product before it finishes accumulating the current one, so
# the trailing comment on every add_c2() call names the product being
# issued, not the one being added.  Cross products are added twice by
# add_c2() (the very first one, a[0]*a[1], is instead doubled inline with
# shifts); the squares a[i]*a[i] are added once by the inline mflo/mfhi
# sequences.
1910ec07fdf1Sdjm$code.=<<___;
1911ec07fdf1Sdjm
1912ec07fdf1Sdjm.align	5
1913ec07fdf1Sdjm.globl	bn_sqr_comba8
1914ec07fdf1Sdjm.ent	bn_sqr_comba8
1915ec07fdf1Sdjmbn_sqr_comba8:
1916ec07fdf1Sdjm___
# nubi flavour only: open a 6-slot stack frame and save the registers
# named by $ra, $t3..$t0 and $gp.
1917ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
1918ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
1919ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
1920ec07fdf1Sdjm	.set	noreorder
1921ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
1922ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
1923ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
1924ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
1925ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
1926ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
1927ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
1928ec07fdf1Sdjm___
# Load a[0..7]; store r[0] (low word of a[0]*a[0]) and r[1], which comes
# from a[0]*a[1] doubled by shifting the two product words left by one.
1929ec07fdf1Sdjm$code.=<<___;
1930ec07fdf1Sdjm	.set	reorder
1931ec07fdf1Sdjm	$LD	$a_0,0($a1)
1932ec07fdf1Sdjm	$LD	$a_1,$BNSZ($a1)
1933ec07fdf1Sdjm	$LD	$a_2,2*$BNSZ($a1)
1934ec07fdf1Sdjm	$LD	$a_3,3*$BNSZ($a1)
1935ec07fdf1Sdjm
1936ec07fdf1Sdjm	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
1937ec07fdf1Sdjm	$LD	$a_4,4*$BNSZ($a1)
1938ec07fdf1Sdjm	$LD	$a_5,5*$BNSZ($a1)
1939ec07fdf1Sdjm	$LD	$a_6,6*$BNSZ($a1)
1940ec07fdf1Sdjm	$LD	$a_7,7*$BNSZ($a1)
1941ec07fdf1Sdjm	mflo	$c_1
1942ec07fdf1Sdjm	mfhi	$c_2
1943ec07fdf1Sdjm	$ST	$c_1,0($a0)
1944ec07fdf1Sdjm
1945ec07fdf1Sdjm	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
1946ec07fdf1Sdjm	mflo	$t_1
1947ec07fdf1Sdjm	mfhi	$t_2
1948ec07fdf1Sdjm	slt	$c_1,$t_2,$zero
1949ec07fdf1Sdjm	$SLL	$t_2,1
1950ec07fdf1Sdjm	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
1951ec07fdf1Sdjm	slt	$a2,$t_1,$zero
1952ec07fdf1Sdjm	$ADDU	$t_2,$a2
1953ec07fdf1Sdjm	$SLL	$t_1,1
1954ec07fdf1Sdjm	$ADDU	$c_2,$t_1
1955ec07fdf1Sdjm	sltu	$at,$c_2,$t_1
1956ec07fdf1Sdjm	$ADDU	$c_3,$t_2,$at
1957ec07fdf1Sdjm	$ST	$c_2,$BNSZ($a0)
1958e611d49fSbcook___
1959e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1960e611d49fSbcook		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
1961e611d49fSbcook$code.=<<___;
1962ec07fdf1Sdjm	mflo	$t_1
1963ec07fdf1Sdjm	mfhi	$t_2
1964ec07fdf1Sdjm	$ADDU	$c_3,$t_1
1965ec07fdf1Sdjm	sltu	$at,$c_3,$t_1
1966ec07fdf1Sdjm	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
1967ec07fdf1Sdjm	$ADDU	$t_2,$at
1968ec07fdf1Sdjm	$ADDU	$c_1,$t_2
1969ec07fdf1Sdjm	sltu	$at,$c_1,$t_2
1970ec07fdf1Sdjm	$ADDU	$c_2,$at
1971ec07fdf1Sdjm	$ST	$c_3,2*$BNSZ($a0)
1972e611d49fSbcook___
1973e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1974e611d49fSbcook		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
1975e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
1976e611d49fSbcook		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
1977e611d49fSbcook$code.=<<___;
1978ec07fdf1Sdjm	$ST	$c_1,3*$BNSZ($a0)
1979e611d49fSbcook___
1980e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
1981e611d49fSbcook		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
1982e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
1983e611d49fSbcook		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
1984e611d49fSbcook$code.=<<___;
1985ec07fdf1Sdjm	mflo	$t_1
1986ec07fdf1Sdjm	mfhi	$t_2
1987ec07fdf1Sdjm	$ADDU	$c_2,$t_1
1988ec07fdf1Sdjm	sltu	$at,$c_2,$t_1
1989ec07fdf1Sdjm	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
1990ec07fdf1Sdjm	$ADDU	$t_2,$at
1991ec07fdf1Sdjm	$ADDU	$c_3,$t_2
1992ec07fdf1Sdjm	sltu	$at,$c_3,$t_2
1993ec07fdf1Sdjm	$ADDU	$c_1,$at
1994ec07fdf1Sdjm	$ST	$c_2,4*$BNSZ($a0)
1995e611d49fSbcook___
1996e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1997e611d49fSbcook		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
1998e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
1999e611d49fSbcook		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
2000e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2001e611d49fSbcook		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
2002e611d49fSbcook$code.=<<___;
2003ec07fdf1Sdjm	$ST	$c_3,5*$BNSZ($a0)
2004e611d49fSbcook___
2005e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2006e611d49fSbcook		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
2007e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2008e611d49fSbcook		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
2009e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2010e611d49fSbcook		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
2011e611d49fSbcook$code.=<<___;
2012ec07fdf1Sdjm	mflo	$t_1
2013ec07fdf1Sdjm	mfhi	$t_2
2014ec07fdf1Sdjm	$ADDU	$c_1,$t_1
2015ec07fdf1Sdjm	sltu	$at,$c_1,$t_1
2016ec07fdf1Sdjm	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
2017ec07fdf1Sdjm	$ADDU	$t_2,$at
2018ec07fdf1Sdjm	$ADDU	$c_2,$t_2
2019ec07fdf1Sdjm	sltu	$at,$c_2,$t_2
2020ec07fdf1Sdjm	$ADDU	$c_3,$at
2021ec07fdf1Sdjm	$ST	$c_1,6*$BNSZ($a0)
2022e611d49fSbcook___
2023e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2024e611d49fSbcook		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
2025e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2026e611d49fSbcook		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
2027e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2028e611d49fSbcook		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
2029e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2030e611d49fSbcook		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
2031e611d49fSbcook$code.=<<___;
2032ec07fdf1Sdjm	$ST	$c_2,7*$BNSZ($a0)
2033e611d49fSbcook___
2034e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2035e611d49fSbcook		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
2036e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2037e611d49fSbcook		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
2038e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2039e611d49fSbcook		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
2040e611d49fSbcook$code.=<<___;
2041ec07fdf1Sdjm	mflo	$t_1
2042ec07fdf1Sdjm	mfhi	$t_2
2043ec07fdf1Sdjm	$ADDU	$c_3,$t_1
2044ec07fdf1Sdjm	sltu	$at,$c_3,$t_1
2045ec07fdf1Sdjm	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
2046ec07fdf1Sdjm	$ADDU	$t_2,$at
2047ec07fdf1Sdjm	$ADDU	$c_1,$t_2
2048ec07fdf1Sdjm	sltu	$at,$c_1,$t_2
2049ec07fdf1Sdjm	$ADDU	$c_2,$at
2050ec07fdf1Sdjm	$ST	$c_3,8*$BNSZ($a0)
2051e611d49fSbcook___
2052e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2053e611d49fSbcook		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
2054e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2055e611d49fSbcook		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
2056e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2057e611d49fSbcook		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
2058e611d49fSbcook$code.=<<___;
2059ec07fdf1Sdjm	$ST	$c_1,9*$BNSZ($a0)
2060e611d49fSbcook___
2061e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2062e611d49fSbcook		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
2063e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2064e611d49fSbcook		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
2065e611d49fSbcook$code.=<<___;
2066ec07fdf1Sdjm	mflo	$t_1
2067ec07fdf1Sdjm	mfhi	$t_2
2068ec07fdf1Sdjm	$ADDU	$c_2,$t_1
2069ec07fdf1Sdjm	sltu	$at,$c_2,$t_1
2070ec07fdf1Sdjm	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
2071ec07fdf1Sdjm	$ADDU	$t_2,$at
2072ec07fdf1Sdjm	$ADDU	$c_3,$t_2
2073ec07fdf1Sdjm	sltu	$at,$c_3,$t_2
2074ec07fdf1Sdjm	$ADDU	$c_1,$at
2075ec07fdf1Sdjm	$ST	$c_2,10*$BNSZ($a0)
2076e611d49fSbcook___
2077e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2078e611d49fSbcook		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
2079e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2080e611d49fSbcook		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
2081e611d49fSbcook$code.=<<___;
2082ec07fdf1Sdjm	$ST	$c_3,11*$BNSZ($a0)
2083e611d49fSbcook___
2084e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2085e611d49fSbcook		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
2086e611d49fSbcook$code.=<<___;
2087ec07fdf1Sdjm	mflo	$t_1
2088ec07fdf1Sdjm	mfhi	$t_2
2089ec07fdf1Sdjm	$ADDU	$c_1,$t_1
2090ec07fdf1Sdjm	sltu	$at,$c_1,$t_1
2091ec07fdf1Sdjm	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
2092ec07fdf1Sdjm	$ADDU	$t_2,$at
2093ec07fdf1Sdjm	$ADDU	$c_2,$t_2
2094ec07fdf1Sdjm	sltu	$at,$c_2,$t_2
2095ec07fdf1Sdjm	$ADDU	$c_3,$at
2096ec07fdf1Sdjm	$ST	$c_1,12*$BNSZ($a0)
2097e611d49fSbcook___
2098e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2099e611d49fSbcook		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
# Final square a[7]*a[7] needs no further carry word: store r[13]..r[15].
2100e611d49fSbcook$code.=<<___;
2101ec07fdf1Sdjm	$ST	$c_2,13*$BNSZ($a0)
2102ec07fdf1Sdjm
2103ec07fdf1Sdjm	mflo	$t_1
2104ec07fdf1Sdjm	mfhi	$t_2
2105ec07fdf1Sdjm	$ADDU	$c_3,$t_1
2106ec07fdf1Sdjm	sltu	$at,$c_3,$t_1
2107ec07fdf1Sdjm	$ADDU	$t_2,$at
2108ec07fdf1Sdjm	$ADDU	$c_1,$t_2
2109ec07fdf1Sdjm	$ST	$c_3,14*$BNSZ($a0)
2110ec07fdf1Sdjm	$ST	$c_1,15*$BNSZ($a0)
2111ec07fdf1Sdjm
2112ec07fdf1Sdjm	.set	noreorder
2113ec07fdf1Sdjm___
# nubi flavour only: reload $t3..$t0 and $gp and release the frame (the
# saved $ra slot is not reloaded).
2114ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
2115ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
2116ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
2117ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
2118ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
2119ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
2120ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
2121ec07fdf1Sdjm___
# Emit the return from bn_sqr_comba8, then open bn_sqr_comba4: the fully
# unrolled 4-word squaring routine.  It loads a[0..3] through the pointer
# in $a1 and stores the 8 result words through the pointer in $a0, using
# the same scheme as bn_sqr_comba8: three rotating accumulators
# $c_1,$c_2,$c_3, cross products added twice via add_c2(), squares added
# once inline.  The trailing comment on each add_c2() call names the
# product being *issued* by that step (forward multiplication), not the
# one being accumulated.
2122ec07fdf1Sdjm$code.=<<___;
2123ec07fdf1Sdjm	jr	$ra
2124ec07fdf1Sdjm	nop
2125ec07fdf1Sdjm.end	bn_sqr_comba8
2126ec07fdf1Sdjm
2127ec07fdf1Sdjm.align	5
2128ec07fdf1Sdjm.globl	bn_sqr_comba4
2129ec07fdf1Sdjm.ent	bn_sqr_comba4
2130ec07fdf1Sdjmbn_sqr_comba4:
2131ec07fdf1Sdjm___
# nubi flavour only: open a 6-slot stack frame and save the registers
# named by $ra, $t3..$t0 and $gp.
2132ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
2133ec07fdf1Sdjm	.frame	$sp,6*$SZREG,$ra
2134ec07fdf1Sdjm	.mask	0x8000f008,-$SZREG
2135ec07fdf1Sdjm	.set	noreorder
2136ec07fdf1Sdjm	$PTR_SUB $sp,6*$SZREG
2137ec07fdf1Sdjm	$REG_S	$ra,5*$SZREG($sp)
2138ec07fdf1Sdjm	$REG_S	$t3,4*$SZREG($sp)
2139ec07fdf1Sdjm	$REG_S	$t2,3*$SZREG($sp)
2140ec07fdf1Sdjm	$REG_S	$t1,2*$SZREG($sp)
2141ec07fdf1Sdjm	$REG_S	$t0,1*$SZREG($sp)
2142ec07fdf1Sdjm	$REG_S	$gp,0*$SZREG($sp)
2143ec07fdf1Sdjm___
# Load a[0..3]; store r[0] (low word of a[0]*a[0]) and r[1], which comes
# from a[0]*a[1] doubled by shifting the two product words left by one.
2144ec07fdf1Sdjm$code.=<<___;
2145ec07fdf1Sdjm	.set	reorder
2146ec07fdf1Sdjm	$LD	$a_0,0($a1)
2147ec07fdf1Sdjm	$LD	$a_1,$BNSZ($a1)
2148ec07fdf1Sdjm	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
2149ec07fdf1Sdjm	$LD	$a_2,2*$BNSZ($a1)
2150ec07fdf1Sdjm	$LD	$a_3,3*$BNSZ($a1)
2151ec07fdf1Sdjm	mflo	$c_1
2152ec07fdf1Sdjm	mfhi	$c_2
2153ec07fdf1Sdjm	$ST	$c_1,0($a0)
2154ec07fdf1Sdjm
2155ec07fdf1Sdjm	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
2156ec07fdf1Sdjm	mflo	$t_1
2157ec07fdf1Sdjm	mfhi	$t_2
2158ec07fdf1Sdjm	slt	$c_1,$t_2,$zero
2159ec07fdf1Sdjm	$SLL	$t_2,1
2160ec07fdf1Sdjm	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
2161ec07fdf1Sdjm	slt	$a2,$t_1,$zero
2162ec07fdf1Sdjm	$ADDU	$t_2,$a2
2163ec07fdf1Sdjm	$SLL	$t_1,1
2164ec07fdf1Sdjm	$ADDU	$c_2,$t_1
2165ec07fdf1Sdjm	sltu	$at,$c_2,$t_1
2166ec07fdf1Sdjm	$ADDU	$c_3,$t_2,$at
2167ec07fdf1Sdjm	$ST	$c_2,$BNSZ($a0)
2168e611d49fSbcook___
2169e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2170e611d49fSbcook		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
2171e611d49fSbcook$code.=<<___;
2172ec07fdf1Sdjm	mflo	$t_1
2173ec07fdf1Sdjm	mfhi	$t_2
2174ec07fdf1Sdjm	$ADDU	$c_3,$t_1
2175ec07fdf1Sdjm	sltu	$at,$c_3,$t_1
2176ec07fdf1Sdjm	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
2177ec07fdf1Sdjm	$ADDU	$t_2,$at
2178ec07fdf1Sdjm	$ADDU	$c_1,$t_2
2179ec07fdf1Sdjm	sltu	$at,$c_1,$t_2
2180ec07fdf1Sdjm	$ADDU	$c_2,$at
2181ec07fdf1Sdjm	$ST	$c_3,2*$BNSZ($a0)
2182e611d49fSbcook___
2183e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2184e611d49fSbcook		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
2185e611d49fSbcook	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2186e611d49fSbcook		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
2187e611d49fSbcook$code.=<<___;
2188ec07fdf1Sdjm	$ST	$c_1,3*$BNSZ($a0)
2189e611d49fSbcook___
2190e611d49fSbcook	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2191e611d49fSbcook		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
2192e611d49fSbcook$code.=<<___;
2193ec07fdf1Sdjm	mflo	$t_1
2194ec07fdf1Sdjm	mfhi	$t_2
2195ec07fdf1Sdjm	$ADDU	$c_2,$t_1
2196ec07fdf1Sdjm	sltu	$at,$c_2,$t_1
2197ec07fdf1Sdjm	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
2198ec07fdf1Sdjm	$ADDU	$t_2,$at
2199ec07fdf1Sdjm	$ADDU	$c_3,$t_2
2200ec07fdf1Sdjm	sltu	$at,$c_3,$t_2
2201ec07fdf1Sdjm	$ADDU	$c_1,$at
2202ec07fdf1Sdjm	$ST	$c_2,4*$BNSZ($a0)
2203e611d49fSbcook___
2204e611d49fSbcook	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2205e611d49fSbcook		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
# Final square a[3]*a[3] needs no further carry word: store r[5]..r[7].
2206e611d49fSbcook$code.=<<___;
2207ec07fdf1Sdjm	$ST	$c_3,5*$BNSZ($a0)
2208ec07fdf1Sdjm
2209ec07fdf1Sdjm	mflo	$t_1
2210ec07fdf1Sdjm	mfhi	$t_2
2211ec07fdf1Sdjm	$ADDU	$c_1,$t_1
2212ec07fdf1Sdjm	sltu	$at,$c_1,$t_1
2213ec07fdf1Sdjm	$ADDU	$t_2,$at
2214ec07fdf1Sdjm	$ADDU	$c_2,$t_2
2215ec07fdf1Sdjm	$ST	$c_1,6*$BNSZ($a0)
2216ec07fdf1Sdjm	$ST	$c_2,7*$BNSZ($a0)
2217ec07fdf1Sdjm
2218ec07fdf1Sdjm	.set	noreorder
2219ec07fdf1Sdjm___
# nubi flavour only: reload $t3..$t0 and $gp and release the frame (the
# saved $ra slot is not reloaded).
2220ec07fdf1Sdjm$code.=<<___ if ($flavour =~ /nubi/i);
2221ec07fdf1Sdjm	$REG_L	$t3,4*$SZREG($sp)
2222ec07fdf1Sdjm	$REG_L	$t2,3*$SZREG($sp)
2223ec07fdf1Sdjm	$REG_L	$t1,2*$SZREG($sp)
2224ec07fdf1Sdjm	$REG_L	$t0,1*$SZREG($sp)
2225ec07fdf1Sdjm	$REG_L	$gp,0*$SZREG($sp)
2226ec07fdf1Sdjm	$PTR_ADD $sp,6*$SZREG
2227ec07fdf1Sdjm___
2228ec07fdf1Sdjm$code.=<<___;
2229ec07fdf1Sdjm	jr	$ra
2230ec07fdf1Sdjm	nop
2231ec07fdf1Sdjm.end	bn_sqr_comba4
2232ec07fdf1Sdjm___
# Write the accumulated assembly to STDOUT.
print $code;
# Buffered output is only flushed at close, so a failed write (full disk,
# closed pipe) would otherwise pass silently and leave truncated assembly
# for the build to pick up.  Fail loudly instead.
close STDOUT or die "error closing STDOUT: $!";
2235