xref: /netbsd-src/crypto/external/bsd/openssl.old/lib/libcrypto/arch/sparc64/sparcv9a-mont.S (revision 4724848cf0da353df257f730694b7882798e5daf)
1#include "sparc_arch.h"
2
3.section	".text",#alloc,#execinstr
4
5.global bn_mul_mont_fpu
6.align  32
! bn_mul_mont_fpu: Montgomery multiplication done on the FPU (see the
! .asciz banner at the end of this file).
!
! Incoming registers, as they are used below:
!   %i0  result vector; written by .Lsub and .Lcopy
!   %i1  a[] operand, walked with the j index (%l6)
!   %i2  b[] operand, walked with the i index (%l5)
!   %i3  n[] modulus, walked with j; also the subtrahend in .Lsub
!   %i4  points at two 32-bit words of n0; later reused as the top carry
!   %i5  num, counted in 32-bit words; must be even and at least 4
! Result: %i0 is 1 on success and 0 when num is not supported.
!
! Stack area set up below (offsets from %sp+STACK_BIAS+STACK_FRAME):
!   +0..+24   four 64-bit column sums handed from the FPU to the integer unit
!   +32,+40   the two trailing column sums, used only when draining
!   +48       caller's %asi
!   +64       start of tp[] (%l0); %l1..%l4 point at the ends of the
!             double-format copies of a[] (low/high word) and n[] (low/high)
7bn_mul_mont_fpu:
8	save	%sp,-STACK_FRAME-64,%sp
9
10	cmp	%i5,4
11	bl,a,pn %icc,.Lret		! fewer than 4 words is not supported
12	clr	%i0
13	andcc	%i5,1,%g0		! %i5 has to be even...
14	bnz,a,pn %icc,.Lret
15	clr	%i0			! signal "unsupported input value"
16
17	srl	%i5,1,%i5		! num/=2, i.e. count 64-bit limbs
18	sethi	%hi(0xffff),%l7
19	ld	[%i4+0],%g4		! %g4 reassigned, remember?
20	or	%l7,%lo(0xffff),%l7	! %l7=0xffff, mask for one 16-bit column
21	ld	[%i4+4],%o0
22	sllx	%o0,32,%o0
23	or	%o0,%g4,%g4		! %g4=n0[1].n0[0]
24
25	sll	%i5,3,%i5		! num*=8
26
27	add	%sp,STACK_BIAS,%o0		! real top of stack
28	sll	%i5,2,%o1
29	add	%o1,%i5,%o1		! %o1=num*5
30	sub	%o0,%o1,%o0
31	and	%o0,-2048,%o0		! optimize TLB utilization
32	sub	%o0,STACK_BIAS,%sp		! alloca(5*num*8)
33
34	rd	%asi,%o7		! save %asi
35	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0
36	add	%l0,%i5,%l1
37	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
38	add	%l1,%i5,%l2
39	add	%l2,%i5,%l3
40	add	%l3,%i5,%l4
41
42	wr	%g0,210,%asi	! setup %asi for 16-bit FP loads
43
44	add	%i0,%i5,%i0		! readjust input pointers to point
45	add	%i1,%i5,%i1		! at the ends too...
46	add	%i2,%i5,%i2
47	add	%i3,%i5,%i3
48
49	stx	%o7,[%sp+STACK_BIAS+STACK_FRAME+48]	! save %asi
50
51	sub	%g0,%i5,%l5		! i=-num
52	sub	%g0,%i5,%l6		! j=-num
53
54	add	%i1,%l6,%o3
55	add	%i2,%l5,%o4
56
57	ld	[%o3+4],%g1		! ap[0], word at +4 (%o3=%i1+j)
58	ld	[%o3+0],%o0
59	ld	[%o4+4],%g5		! bp[0], word at +4 (%o4=%i2+i)
60	sllx	%g1,32,%g1
61	ld	[%o4+0],%o1
62	sllx	%g5,32,%g5
63	or	%g1,%o0,%o0
64	or	%g5,%o1,%o1
65
66	add	%i3,%l6,%o5
67
68	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
69	mulx	%g4,%o0,%o0		! ap[0]*bp[0]*n0
70	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]
71
72	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
73	.word	0xa1b00c20	! fzeros %f16
74	ld	[%o3+4],%f19
75	.word	0xa5b00c20	! fzeros %f18
76	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
77	.word	0xa9b00c20	! fzeros %f20
78	ld	[%o5+4],%f23
79	.word	0xadb00c20	! fzeros %f22
80
81	! transfer b[i] to FPU as 4x16-bit values
82	ldda	[%o4+2]%asi,%f0
83	fxtod	%f16,%f16
84	ldda	[%o4+0]%asi,%f2
85	fxtod	%f18,%f18
86	ldda	[%o4+6]%asi,%f4
87	fxtod	%f20,%f20
88	ldda	[%o4+4]%asi,%f6
89	fxtod	%f22,%f22
90
91	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
92	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
93	fxtod	%f0,%f0
94	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
95	fxtod	%f2,%f2
96	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
97	fxtod	%f4,%f4
98	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
99	fxtod	%f6,%f6
100
101	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
102	fxtod	%f8,%f8
103	std	%f18,[%l2+%l6]
104	fxtod	%f10,%f10
105	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
106	fxtod	%f12,%f12
107	std	%f22,[%l4+%l6]
108	fxtod	%f14,%f14
109
! First j-step: every 32-bit half of a[j] (%f16,%f18) times every 16-bit
! piece of b[i] (%f0..%f6), and likewise n[j] (%f20,%f22) times the 16-bit
! pieces in %f8..%f14; matching columns are then summed.
110		fmuld	%f16,%f0,%f32
111		fmuld	%f20,%f8,%f48
112		fmuld	%f16,%f2,%f34
113		fmuld	%f20,%f10,%f50
114		fmuld	%f16,%f4,%f36
115	faddd	%f32,%f48,%f48
116		fmuld	%f20,%f12,%f52
117		fmuld	%f16,%f6,%f38
118	faddd	%f34,%f50,%f50
119		fmuld	%f20,%f14,%f54
120		fmuld	%f18,%f0,%f40
121	faddd	%f36,%f52,%f52
122		fmuld	%f22,%f8,%f56
123		fmuld	%f18,%f2,%f42
124	faddd	%f38,%f54,%f54
125		fmuld	%f22,%f10,%f58
126		fmuld	%f18,%f4,%f44
127	faddd	%f40,%f56,%f56
128		fmuld	%f22,%f12,%f60
129		fmuld	%f18,%f6,%f46
130	faddd	%f42,%f58,%f58
131		fmuld	%f22,%f14,%f62
132
! %f24/%f26 hold the two top column sums; they are added into the first
! two columns of the following j-step.
133	faddd	%f44,%f60,%f24	! %f60
134	faddd	%f46,%f62,%f26	! %f62
135
136	faddd	%f52,%f56,%f52
137	faddd	%f54,%f58,%f54
138
139	fdtox	%f48,%f48
140	fdtox	%f50,%f50
141	fdtox	%f52,%f52
142	fdtox	%f54,%f54
143
144	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
145	add	%l6,8,%l6
146	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
147	add	%i1,%l6,%o4
148	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
149	add	%i3,%l6,%o5
150	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
151
! Second j-step: FP work for the new limb is interleaved with the integer
! carry propagation of the four column sums stored just above.
152	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
153	.word	0xa1b00c20	! fzeros %f16
154	ld	[%o4+4],%f19
155	.word	0xa5b00c20	! fzeros %f18
156	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
157	.word	0xa9b00c20	! fzeros %f20
158	ld	[%o5+4],%f23
159	.word	0xadb00c20	! fzeros %f22
160
161	fxtod	%f16,%f16
162	fxtod	%f18,%f18
163	fxtod	%f20,%f20
164	fxtod	%f22,%f22
165
166	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
167		fmuld	%f16,%f0,%f32
168	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
169		fmuld	%f20,%f8,%f48
170	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
171		fmuld	%f16,%f2,%f34
172	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
173		fmuld	%f20,%f10,%f50
174
175	srlx	%o0,16,%o7
176	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
177		fmuld	%f16,%f4,%f36
178	add	%o7,%o1,%o1
179	std	%f18,[%l2+%l6]
180		faddd	%f32,%f48,%f48
181		fmuld	%f20,%f12,%f52
182	srlx	%o1,16,%o7
183	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
184		fmuld	%f16,%f6,%f38
185	add	%o7,%o2,%o2
186	std	%f22,[%l4+%l6]
187		faddd	%f34,%f50,%f50
188		fmuld	%f20,%f14,%f54
189	srlx	%o2,16,%o7
190		fmuld	%f18,%f0,%f40
191	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
192		faddd	%f36,%f52,%f52
193		fmuld	%f22,%f8,%f56
! The nine instructions below are disabled: in this step the assembled
! 64-bit word is never stored, only the carry in %g1 is kept.
194	!and	%o0,%l7,%o0
195	!and	%o1,%l7,%o1
196	!and	%o2,%l7,%o2
197	!sllx	%o1,16,%o1
198	!sllx	%o2,32,%o2
199	!sllx	%o3,48,%o7
200	!or	%o1,%o0,%o0
201	!or	%o2,%o0,%o0
202	!or	%o7,%o0,%o0		! 64-bit result
203	srlx	%o3,16,%g1		! 34-bit carry
204		fmuld	%f18,%f2,%f42
205
206	faddd	%f38,%f54,%f54
207		fmuld	%f22,%f10,%f58
208		fmuld	%f18,%f4,%f44
209	faddd	%f40,%f56,%f56
210		fmuld	%f22,%f12,%f60
211		fmuld	%f18,%f6,%f46
212	faddd	%f42,%f58,%f58
213		fmuld	%f22,%f14,%f62
214
215	faddd	%f24,%f48,%f48
216	faddd	%f26,%f50,%f50
217	faddd	%f44,%f60,%f24	! %f60
218	faddd	%f46,%f62,%f26	! %f62
219
220	faddd	%f52,%f56,%f52
221	faddd	%f54,%f58,%f54
222
223	fdtox	%f48,%f48
224	fdtox	%f50,%f50
225	fdtox	%f52,%f52
226	fdtox	%f54,%f54
227
228	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
229	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
230	addcc	%l6,8,%l6
231	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
232	bz,pn	%icc,.L1stskip		! j reached 0: nothing left for .L1st
233	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
234
235.align	32			! incidentally already aligned !
! .L1st: steady-state loop of the first pass (i fixed at its first value).
! Each trip handles one 64-bit limb: a[j] and n[j] are converted to double
! format and saved at %l1..%l4, the column sums of the previous trip are
! folded into one 64-bit word, and that word is stored as tp[j-1] at [%l0].
! %g1 carries the overflow from one limb to the next.
236.L1st:
237	add	%i1,%l6,%o4
238	add	%i3,%l6,%o5
239	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
240	.word	0xa1b00c20	! fzeros %f16
241	ld	[%o4+4],%f19
242	.word	0xa5b00c20	! fzeros %f18
243	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
244	.word	0xa9b00c20	! fzeros %f20
245	ld	[%o5+4],%f23
246	.word	0xadb00c20	! fzeros %f22
247
248	fxtod	%f16,%f16
249	fxtod	%f18,%f18
250	fxtod	%f20,%f20
251	fxtod	%f22,%f22
252
253	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
254		fmuld	%f16,%f0,%f32
255	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
256		fmuld	%f20,%f8,%f48
257	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
258		fmuld	%f16,%f2,%f34
259	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
260		fmuld	%f20,%f10,%f50
261
262	srlx	%o0,16,%o7
263	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
264		fmuld	%f16,%f4,%f36
265	add	%o7,%o1,%o1
266	std	%f18,[%l2+%l6]
267		faddd	%f32,%f48,%f48
268		fmuld	%f20,%f12,%f52
269	srlx	%o1,16,%o7
270	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
271		fmuld	%f16,%f6,%f38
272	add	%o7,%o2,%o2
273	std	%f22,[%l4+%l6]
274		faddd	%f34,%f50,%f50
275		fmuld	%f20,%f14,%f54
276	srlx	%o2,16,%o7
277		fmuld	%f18,%f0,%f40
278	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
279	and	%o0,%l7,%o0
280		faddd	%f36,%f52,%f52
281		fmuld	%f22,%f8,%f56
282	and	%o1,%l7,%o1
283	and	%o2,%l7,%o2
284		fmuld	%f18,%f2,%f42
285	sllx	%o1,16,%o1
286		faddd	%f38,%f54,%f54
287		fmuld	%f22,%f10,%f58
288	sllx	%o2,32,%o2
289		fmuld	%f18,%f4,%f44
290	sllx	%o3,48,%o7
291	or	%o1,%o0,%o0
292		faddd	%f40,%f56,%f56
293		fmuld	%f22,%f12,%f60
294	or	%o2,%o0,%o0
295		fmuld	%f18,%f6,%f46
296	or	%o7,%o0,%o0		! 64-bit result
297		faddd	%f42,%f58,%f58
298		fmuld	%f22,%f14,%f62
299	addcc	%g1,%o0,%o0		! add carry left by the previous limb
300		faddd	%f24,%f48,%f48
301	srlx	%o3,16,%g1		! 34-bit carry
302		faddd	%f26,%f50,%f50
303	bcs,a	%xcc,.+8		! annulled: the add below runs only on carry
304	add	%g1,1,%g1
305
306	stx	%o0,[%l0]		! tp[j-1]=
307
308	faddd	%f44,%f60,%f24	! %f60
309	faddd	%f46,%f62,%f26	! %f62
310
311	faddd	%f52,%f56,%f52
312	faddd	%f54,%f58,%f54
313
314	fdtox	%f48,%f48
315	fdtox	%f50,%f50
316	fdtox	%f52,%f52
317	fdtox	%f54,%f54
318
319	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
320	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
321	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
322	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
323
324	addcc	%l6,8,%l6
325	bnz,pt	%icc,.L1st
326	add	%l0,8,%l0		! delay slot: advance tp on every trip
327
328.L1stskip:
! Drain after the last j of the first pass: fold the four stored column
! sums into one tp word, then fold the two pending sums (%f24,%f26) into
! the last tp word. The final carry is kept in %i4 for the later passes.
329	fdtox	%f24,%f24
330	fdtox	%f26,%f26
331
332	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
333	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
334	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
335	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
336
337	srlx	%o0,16,%o7
338	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
339	add	%o7,%o1,%o1
340	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
341	srlx	%o1,16,%o7
342	add	%o7,%o2,%o2
343	srlx	%o2,16,%o7
344	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
345	and	%o0,%l7,%o0
346	and	%o1,%l7,%o1
347	and	%o2,%l7,%o2
348	sllx	%o1,16,%o1
349	sllx	%o2,32,%o2
350	sllx	%o3,48,%o7
351	or	%o1,%o0,%o0
352	or	%o2,%o0,%o0
353	or	%o7,%o0,%o0		! 64-bit result
354	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
355	addcc	%g1,%o0,%o0
356	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
357	srlx	%o3,16,%g1		! 34-bit carry
358	bcs,a	%xcc,.+8
359	add	%g1,1,%g1
360
361	stx	%o0,[%l0]		! tp[j-1]=
362	add	%l0,8,%l0
363
364	srlx	%o4,16,%o7
365	add	%o7,%o5,%o5
366	and	%o4,%l7,%o4
367	sllx	%o5,16,%o7
368	or	%o7,%o4,%o4
369	addcc	%g1,%o4,%o4
370	srlx	%o5,48,%g1
371	bcs,a	%xcc,.+8
372	add	%g1,1,%g1
373
374	mov	%g1,%i4			! top carry; %i4 no longer points at n0
375	stx	%o4,[%l0]		! tp[num-1]=
376
377	ba	.Louter
378	add	%l5,8,%l5		! delay slot: step i to the next limb of b[]
379.align	32
! .Louter: one trip per remaining 64-bit limb b[i]; %l5 is stepped by 8
! until it reaches 0. a[j] and n[j] are read back from the double-format
! copies saved during the first pass, and every result limb is added to
! the tp[] contents left by the previous pass.
380.Louter:
381	sub	%g0,%i5,%l6		! j=-num
382	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0
383
384	add	%i1,%l6,%o3
385	add	%i2,%l5,%o4
386
387	ld	[%o3+4],%g1		! ap[0], word at +4 (%o3=%i1+j)
388	ld	[%o3+0],%o0
389	ld	[%o4+4],%g5		! bp[i], word at +4 (%o4=%i2+i)
390	sllx	%g1,32,%g1
391	ld	[%o4+0],%o1
392	sllx	%g5,32,%g5
393	or	%g1,%o0,%o0
394	or	%g5,%o1,%o1
395
396	ldx	[%l0],%o2		! tp[0]
397	mulx	%o1,%o0,%o0
398	addcc	%o2,%o0,%o0
399	mulx	%g4,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
400	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]
401
402	! transfer b[i] to FPU as 4x16-bit values
403	ldda	[%o4+2]%asi,%f0
404	ldda	[%o4+0]%asi,%f2
405	ldda	[%o4+6]%asi,%f4
406	ldda	[%o4+4]%asi,%f6
407
408	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
409	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
410	fxtod	%f0,%f0
411	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
412	fxtod	%f2,%f2
413	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
414	fxtod	%f4,%f4
415	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
416	fxtod	%f6,%f6
417	ldd	[%l1+%l6],%f16		! load a[j] in double format
418	fxtod	%f8,%f8
419	ldd	[%l2+%l6],%f18
420	fxtod	%f10,%f10
421	ldd	[%l3+%l6],%f20		! load n[j] in double format
422	fxtod	%f12,%f12
423	ldd	[%l4+%l6],%f22
424	fxtod	%f14,%f14
425
! First j-step of this pass: products and column sums only.
426		fmuld	%f16,%f0,%f32
427		fmuld	%f20,%f8,%f48
428		fmuld	%f16,%f2,%f34
429		fmuld	%f20,%f10,%f50
430		fmuld	%f16,%f4,%f36
431	faddd	%f32,%f48,%f48
432		fmuld	%f20,%f12,%f52
433		fmuld	%f16,%f6,%f38
434	faddd	%f34,%f50,%f50
435		fmuld	%f20,%f14,%f54
436		fmuld	%f18,%f0,%f40
437	faddd	%f36,%f52,%f52
438		fmuld	%f22,%f8,%f56
439		fmuld	%f18,%f2,%f42
440	faddd	%f38,%f54,%f54
441		fmuld	%f22,%f10,%f58
442		fmuld	%f18,%f4,%f44
443	faddd	%f40,%f56,%f56
444		fmuld	%f22,%f12,%f60
445		fmuld	%f18,%f6,%f46
446	faddd	%f42,%f58,%f58
447		fmuld	%f22,%f14,%f62
448
449	faddd	%f44,%f60,%f24	! %f60
450	faddd	%f46,%f62,%f26	! %f62
451
452	faddd	%f52,%f56,%f52
453	faddd	%f54,%f58,%f54
454
455	fdtox	%f48,%f48
456	fdtox	%f50,%f50
457	fdtox	%f52,%f52
458	fdtox	%f54,%f54
459
460	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
461	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
462	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
463	add	%l6,8,%l6
464	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
465
! Second j-step: FP work for the next limb interleaved with the integer
! fold of the column sums stored just above. The folded word is added to
! tp[0] but not stored; only the carry in %g1 is kept.
466	ldd	[%l1+%l6],%f16		! load a[j] in double format
467	ldd	[%l2+%l6],%f18
468	ldd	[%l3+%l6],%f20		! load n[j] in double format
469	ldd	[%l4+%l6],%f22
470
471		fmuld	%f16,%f0,%f32
472		fmuld	%f20,%f8,%f48
473		fmuld	%f16,%f2,%f34
474		fmuld	%f20,%f10,%f50
475		fmuld	%f16,%f4,%f36
476	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
477		faddd	%f32,%f48,%f48
478		fmuld	%f20,%f12,%f52
479	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
480		fmuld	%f16,%f6,%f38
481	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
482		faddd	%f34,%f50,%f50
483		fmuld	%f20,%f14,%f54
484	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
485		fmuld	%f18,%f0,%f40
486
487	srlx	%o0,16,%o7
488		faddd	%f36,%f52,%f52
489		fmuld	%f22,%f8,%f56
490	add	%o7,%o1,%o1
491		fmuld	%f18,%f2,%f42
492	srlx	%o1,16,%o7
493		faddd	%f38,%f54,%f54
494		fmuld	%f22,%f10,%f58
495	add	%o7,%o2,%o2
496		fmuld	%f18,%f4,%f44
497	srlx	%o2,16,%o7
498		faddd	%f40,%f56,%f56
499		fmuld	%f22,%f12,%f60
500	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
501	! why?
502	and	%o0,%l7,%o0
503		fmuld	%f18,%f6,%f46
504	and	%o1,%l7,%o1
505	and	%o2,%l7,%o2
506		faddd	%f42,%f58,%f58
507		fmuld	%f22,%f14,%f62
508	sllx	%o1,16,%o1
509		faddd	%f24,%f48,%f48
510	sllx	%o2,32,%o2
511		faddd	%f26,%f50,%f50
512	sllx	%o3,48,%o7
513	or	%o1,%o0,%o0
514		faddd	%f44,%f60,%f24	! %f60
515	or	%o2,%o0,%o0
516		faddd	%f46,%f62,%f26	! %f62
517	or	%o7,%o0,%o0		! 64-bit result
518	ldx	[%l0],%o7
519		faddd	%f52,%f56,%f52
520	addcc	%o7,%o0,%o0
521	! end-of-why?
522		faddd	%f54,%f58,%f54
523	srlx	%o3,16,%g1		! 34-bit carry
524		fdtox	%f48,%f48
525	bcs,a	%xcc,.+8
526	add	%g1,1,%g1
527
528	fdtox	%f50,%f50
529	fdtox	%f52,%f52
530	fdtox	%f54,%f54
531
532	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
533	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
534	addcc	%l6,8,%l6
535	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
536	bz,pn	%icc,.Linnerskip	! j reached 0: nothing left for .Linner
537	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
538
539	ba	.Linner
540	nop
541.align	32
! .Linner: steady-state loop of the later passes. Per trip: multiply the
! saved double-format a[j], n[j] by the 16-bit pieces in %f0..%f14, fold
! the previous trip's column sums into a 64-bit word, add the running
! carry (%g1) and the old tp[j], and store the sum as tp[j-1].
542.Linner:
543	ldd	[%l1+%l6],%f16		! load a[j] in double format
544	ldd	[%l2+%l6],%f18
545	ldd	[%l3+%l6],%f20		! load n[j] in double format
546	ldd	[%l4+%l6],%f22
547
548		fmuld	%f16,%f0,%f32
549		fmuld	%f20,%f8,%f48
550		fmuld	%f16,%f2,%f34
551		fmuld	%f20,%f10,%f50
552		fmuld	%f16,%f4,%f36
553	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
554		faddd	%f32,%f48,%f48
555		fmuld	%f20,%f12,%f52
556	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
557		fmuld	%f16,%f6,%f38
558	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
559		faddd	%f34,%f50,%f50
560		fmuld	%f20,%f14,%f54
561	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
562		fmuld	%f18,%f0,%f40
563
564	srlx	%o0,16,%o7
565		faddd	%f36,%f52,%f52
566		fmuld	%f22,%f8,%f56
567	add	%o7,%o1,%o1
568		fmuld	%f18,%f2,%f42
569	srlx	%o1,16,%o7
570		faddd	%f38,%f54,%f54
571		fmuld	%f22,%f10,%f58
572	add	%o7,%o2,%o2
573		fmuld	%f18,%f4,%f44
574	srlx	%o2,16,%o7
575		faddd	%f40,%f56,%f56
576		fmuld	%f22,%f12,%f60
577	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
578	and	%o0,%l7,%o0
579		fmuld	%f18,%f6,%f46
580	and	%o1,%l7,%o1
581	and	%o2,%l7,%o2
582		faddd	%f42,%f58,%f58
583		fmuld	%f22,%f14,%f62
584	sllx	%o1,16,%o1
585		faddd	%f24,%f48,%f48
586	sllx	%o2,32,%o2
587		faddd	%f26,%f50,%f50
588	sllx	%o3,48,%o7
589	or	%o1,%o0,%o0
590		faddd	%f44,%f60,%f24	! %f60
591	or	%o2,%o0,%o0
592		faddd	%f46,%f62,%f26	! %f62
593	or	%o7,%o0,%o0		! 64-bit result
594		faddd	%f52,%f56,%f52
595	addcc	%g1,%o0,%o0		! add carry left by the previous limb
596	ldx	[%l0+8],%o7		! tp[j]
597		faddd	%f54,%f58,%f54
598	srlx	%o3,16,%g1		! 34-bit carry
599		fdtox	%f48,%f48
600	bcs,a	%xcc,.+8
601	add	%g1,1,%g1
602		fdtox	%f50,%f50
603	addcc	%o7,%o0,%o0		! add tp[j] from the previous pass
604		fdtox	%f52,%f52
605	bcs,a	%xcc,.+8
606	add	%g1,1,%g1
607
608	stx	%o0,[%l0]		! tp[j-1]
609		fdtox	%f54,%f54
610
611	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
612	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
613	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
614	addcc	%l6,8,%l6
615	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
616	bnz,pt	%icc,.Linner
617	add	%l0,8,%l0		! delay slot: advance tp on every trip
618
619.Linnerskip:
! Drain after the last j of a later pass: same fold as .L1stskip, with the
! old tp[j] and the previous pass's top carry (%i4) added in. Then either
! loop back to .Louter for the next b[i] or fall into the final subtraction.
620	fdtox	%f24,%f24
621	fdtox	%f26,%f26
622
623	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
624	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
625	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
626	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
627
628	srlx	%o0,16,%o7
629	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
630	add	%o7,%o1,%o1
631	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
632	srlx	%o1,16,%o7
633	add	%o7,%o2,%o2
634	srlx	%o2,16,%o7
635	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
636	and	%o0,%l7,%o0
637	and	%o1,%l7,%o1
638	and	%o2,%l7,%o2
639	sllx	%o1,16,%o1
640	sllx	%o2,32,%o2
641	sllx	%o3,48,%o7
642	or	%o1,%o0,%o0
643	or	%o2,%o0,%o0
644	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
645	or	%o7,%o0,%o0		! 64-bit result
646	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
647	addcc	%g1,%o0,%o0
648	ldx	[%l0+8],%o7		! tp[j]
649	srlx	%o3,16,%g1		! 34-bit carry
650	bcs,a	%xcc,.+8
651	add	%g1,1,%g1
652
653	addcc	%o7,%o0,%o0
654	bcs,a	%xcc,.+8
655	add	%g1,1,%g1
656
657	stx	%o0,[%l0]		! tp[j-1]
658	add	%l0,8,%l0
659
660	srlx	%o4,16,%o7
661	add	%o7,%o5,%o5
662	and	%o4,%l7,%o4
663	sllx	%o5,16,%o7
664	or	%o7,%o4,%o4
665	addcc	%g1,%o4,%o4
666	srlx	%o5,48,%g1
667	bcs,a	%xcc,.+8
668	add	%g1,1,%g1
669
670	addcc	%i4,%o4,%o4		! add top carry of the previous pass
671	stx	%o4,[%l0]		! tp[num-1]
672	mov	%g1,%i4			! mov leaves the carry flag of the addcc intact
673	bcs,a	%xcc,.+8
674	add	%i4,1,%i4
675
676	addcc	%l5,8,%l5
677	bnz	%icc,.Louter		! more limbs of b[] to process
678	nop
679
680	add	%l0,8,%l0		! adjust tp to point at the end
681	orn	%g0,%g0,%g4
682	sub	%g0,%i5,%o7		! n=-num
683	ba	.Lsub
684	subcc	%g0,%g0,%g0		! clear %icc.c
685
686.align	32
! .Lsub: subtract n[] from tp[] and write the difference to the result
! vector at %i0, 32 bits at a time. The borrow lives in %icc.c across
! trips, so nothing inside the loop may touch the condition codes
! (the loop branch is brnz, which tests a register instead).
687.Lsub:
688	ldx	[%l0+%o7],%o0
689	add	%i3,%o7,%g1
690	ld	[%g1+0],%o2
691	ld	[%g1+4],%o3
692	srlx	%o0,32,%o1
693	subccc	%o0,%o2,%o2
694	add	%i0,%o7,%g1
695	subccc	%o1,%o3,%o3
696	st	%o2,[%g1+0]
697	add	%o7,8,%o7
698	brnz,pt	%o7,.Lsub
699	st	%o3,[%g1+4]
! %g4 = top carry - final borrow: all ones when tp[] must be kept,
! zero when the difference just stored is the result.
700	subc	%i4,0,%g4
701	sub	%g0,%i5,%o7		! n=-num
702	ba	.Lcopy
703	nop
704
705.align	32
! .Lcopy: branch-free select between tp[] and the difference already in
! the result vector, using the mask in %g4 (tp & mask) | (diff & ~mask).
! Each tp[] word is overwritten with zero as it is consumed.
706.Lcopy:
707	ldx	[%l0+%o7],%o0
708	add	%i0,%o7,%g1
709	ld	[%g1+0],%o2
710	ld	[%g1+4],%o3
711	stx	%g0,[%l0+%o7]		! wipe tp[] behind us
712	and	%o0,%g4,%o0
713	srlx	%o0,32,%o1
714	andn	%o2,%g4,%o2
715	andn	%o3,%g4,%o3
716	or	%o2,%o0,%o0
717	or	%o3,%o1,%o1
718	st	%o0,[%g1+0]
719	add	%o7,8,%o7
720	brnz,pt	%o7,.Lcopy
721	st	%o1,[%g1+4]
722	sub	%g0,%i5,%o7		! n=-num
723
724.Lzap:
! Overwrite the four double-format copies of a[] and n[] on the stack with
! zeros, then restore the caller's %asi and report success.
725	stx	%g0,[%l1+%o7]
726	stx	%g0,[%l2+%o7]
727	stx	%g0,[%l3+%o7]
728	stx	%g0,[%l4+%o7]
729	add	%o7,8,%o7
730	brnz,pt	%o7,.Lzap
731	nop
732
733	ldx	[%sp+STACK_BIAS+STACK_FRAME+48],%o7
734	wr	%g0,%o7,%asi		! restore %asi
735
736	mov	1,%i0			! return value: success
737.Lret:
! Common exit. %i0 already holds the return value (0 from the early
! argument checks, 1 from the normal path); restore runs in the delay
! slot of ret and hands it back to the caller.
738	ret
739	restore
740.type   bn_mul_mont_fpu,#function
741.size	bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
742.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
743.align	32
744