#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

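! bn_mul_mont_fpu is the UltraSPARC FPU-assisted Montgomery multiplication
! routine.  A non-authoritative reading of the code below (the argument names
! follow OpenSSL's bn_mul_mont() interface and are an assumption, not part of
! this file):
!	%i0	rp	result vector
!	%i1	ap	first multiplicand
!	%i2	bp	second multiplicand
!	%i3	np	modulus
!	%i4	n0	pointer to the Montgomery constant n0, read below as
!			two 32-bit words and assembled into %g4
!	%i5	num	vector length in 32-bit words; must be even and >= 4
! %i0 is returned as 1 on success and as 0 for unsupported input values.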
.global bn_mul_mont_fpu
.align  32
bn_mul_mont_fpu:
	save	%sp,-STACK_FRAME-64,%sp

	cmp	%i5,4
	bl,a,pn %icc,.Lret
	clr	%i0
	andcc	%i5,1,%g0		! %i5 has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	%i5,1,%i5
	sethi	%hi(0xffff),%l7
	ld	[%i4+0],%g4		! %g4 reassigned, remember?
	or	%l7,%lo(0xffff),%l7
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,%g4,%g4		! %g4=n0[1].n0[0]

	sll	%i5,3,%i5		! num*=8

	add	%sp,STACK_BIAS,%o0		! real top of stack
	sll	%i5,2,%o1
	add	%o1,%i5,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,STACK_BIAS,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0
	add	%l0,%i5,%l1
	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
	add	%l1,%i5,%l2
	add	%l2,%i5,%l3
	add	%l3,%i5,%l4

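! A hedged reading of the scratch area carved out above (5*num*8 bytes):
!	%l0		tp, the temporary/result vector
!	%l1/%l2		the two 32-bit halves of each ap[] word pair, stored
!			as exact doubles ("smashed ap[j]" below)
!	%l3/%l4		the same for np[]
! These, like %i0-%i3 below, are made to point at the vector ends so that the
! negative indices %l5 (i) and %l6 (j) can simply count up towards zero.
! %l7 holds the 0xffff mask used when the FP partial sums are recombined.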
	wr	%g0,210,%asi	! setup %asi for 16-bit FP loads

	add	%i0,%i5,%i0		! readjust input pointers to point
	add	%i1,%i5,%i1		! at the ends too...
	add	%i2,%i5,%i2
	add	%i3,%i5,%i3

	stx	%o7,[%sp+STACK_BIAS+STACK_FRAME+48]	! save %asi

	sub	%g0,%i5,%l5		! i=-num
	sub	%g0,%i5,%l6		! j=-num

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1		! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	%i3,%l6,%o5

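! First-iteration Montgomery multiplier: m = ap[0]*bp[0]*n0 mod 2^64.
! Only the low 64 bits are needed, so plain mulx is sufficient; the value is
! spilled to the stack so it can be re-read as 16-bit digits via %asi below.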
	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	%g4,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o3+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

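! The values fed to the FP multiplier are kept small on purpose: b[i] and the
! multiplier m are re-read as four 16-bit digits each (ldda with the ASI
! programmed above, 210 = 0xd2, which should be ASI_FL16_P: a zero-extended
! 16-bit load into an FP register pair), while a[j]/n[j] stay as 32-bit
! halves.  fxtod then converts every piece to an exact double, and the
! 32x16-bit products accumulated below remain well within the 53-bit
! mantissa for the supported lengths, so the FP arithmetic stays exact.
! (This rationale is inferred from the code; the authoritative commentary is
! in the sparcv9a-mont.pl source this file is generated from.)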
	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	fxtod	%f16,%f16
	ldda	[%o4+0]%asi,%f2
	fxtod	%f18,%f18
	ldda	[%o4+6]%asi,%f4
	fxtod	%f20,%f20
	ldda	[%o4+4]%asi,%f6
	fxtod	%f22,%f22

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6

	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
	fxtod	%f8,%f8
	std	%f18,[%l2+%l6]
	fxtod	%f10,%f10
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
	fxtod	%f12,%f12
	std	%f22,[%l4+%l6]
	fxtod	%f14,%f14

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
		fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
		fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
		fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
		fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
		fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	add	%l6,8,%l6
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	add	%i1,%l6,%o4
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%i3,%l6,%o5
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
		fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
		fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
		fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
		fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
		fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
		fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
		fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	!and	%o0,%l7,%o0
	!and	%o1,%l7,%o1
	!and	%o2,%l7,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
		fmuld	%f18,%f2,%f42

	faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
		fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
		fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62

	faddd	%f24,%f48,%f48
	faddd	%f26,%f50,%f50
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.L1stskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

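! First pass (i == 0) over j.  Each round multiplies the saved a[j]/n[j]
! doubles by the 16-bit digits of b[0] and m, accumulating into %f48-%f62,
! while in parallel the previous round's fdtox results are re-read from the
! stack and recombined: the and/sllx/or sequence with the 0xffff mask folds
! the four digit-weighted partial sums into one 64-bit tp[] word, and the
! overflow of the top sum becomes the running carry in %g1 (the "34-bit
! carry" noted below).  This is a reading of the code, not documentation.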
.align	32			! incidentally already aligned !
.L1st:
	add	%i1,%l6,%o4
	add	%i3,%l6,%o5
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
		fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
		fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
		fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
		fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
		fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
		fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
		fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		fmuld	%f18,%f2,%f42
	sllx	%o1,16,%o1
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	sllx	%o2,32,%o2
		fmuld	%f18,%f4,%f44
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	or	%o2,%o0,%o0
		fmuld	%f18,%f6,%f46
	or	%o7,%o0,%o0		! 64-bit result
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	addcc	%g1,%o0,%o0
		faddd	%f24,%f48,%f48
	srlx	%o3,16,%g1		! 34-bit carry
		faddd	%f26,%f50,%f50
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]=

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	addcc	%l6,8,%l6
	bnz,pt	%icc,.L1st
	add	%l0,8,%l0

.L1stskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]=
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,%i4
	stx	%o4,[%l0]		! tp[num-1]=

	ba	.Louter
	add	%l5,8,%l5
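! Outer loop over the remaining bp[i] words.  Each iteration computes a fresh
! multiplier m = (ap[0]*bp[i] + tp[0])*n0 mod 2^64, re-loads the a[j]/n[j]
! doubles saved during the first pass (no further integer-to-FP conversion is
! needed), and folds the existing tp[] contents into the running sum.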
.align	32
.Louter:
	sub	%g0,%i5,%l6		! j=-num
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1		! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[%l0],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	%g4,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	ldda	[%o4+0]%asi,%f2
	ldda	[%o4+6]%asi,%f4
	ldda	[%o4+4]%asi,%f6

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6
	ldd	[%l1+%l6],%f16		! load a[j] in double format
	fxtod	%f8,%f8
	ldd	[%l2+%l6],%f18
	fxtod	%f10,%f10
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	fxtod	%f12,%f12
	ldd	[%l4+%l6],%f22
	fxtod	%f14,%f14

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
		fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
		fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
		fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
		fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
		fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ldd	[%l1+%l6],%f16		! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	ldd	[%l4+%l6],%f22

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
		fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
		fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
		fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
		fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,%l7,%o0
		fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
		faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
		faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
		faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%l0],%o7
		faddd	%f52,%f56,%f52
	addcc	%o7,%o0,%o0
	! end-of-why?
		faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.Linnerskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ba	.Linner
	nop
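! Inner loop: the same multiply/accumulate/recombine pattern as .L1st, except
! that the previous outer iteration's tp[j] (the ldx [%l0+8] below) is added
! in before the recombined word is stored back over tp[j-1].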
.align	32
.Linner:
	ldd	[%l1+%l6],%f16		! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	ldd	[%l4+%l6],%f22

		fmuld	%f16,%f0,%f32
		fmuld	%f20,%f8,%f48
		fmuld	%f16,%f2,%f34
		fmuld	%f20,%f10,%f50
		fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
		fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
		fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
		fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
		fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
		fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
		faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
		faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
		faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0		! 64-bit result
		faddd	%f52,%f56,%f52
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7		! tp[j]
		faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
		fdtox	%f50,%f50
	addcc	%o7,%o0,%o0
		fdtox	%f52,%f52
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]
		fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	addcc	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
	bnz,pt	%icc,.Linner
	add	%l0,8,%l0

.Linnerskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%i4,%o4,%o4
	stx	%o4,[%l0]		! tp[num-1]
	mov	%g1,%i4
	bcs,a	%xcc,.+8
	add	%i4,1,%i4

	addcc	%l5,8,%l5
	bnz	%icc,.Louter
	nop

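! Final reduction.  tp may still be >= the modulus, so .Lsub computes tp - np
! into rp word by word, and the resulting borrow, combined with the top carry
! in %i4, turns %g4 into an all-ones or all-zeroes mask.  .Lcopy then uses
! that mask to write back either tp itself or the subtracted value (the usual
! Montgomery conditional subtraction, done with a mask rather than a branch),
! zeroing the consumed tp words as it goes.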
	add	%l0,8,%l0		! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,%i5,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c

.align	32
.Lsub:
	ldx	[%l0+%o7],%o0
	add	%i3,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	%i0,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	%i4,0,%g4
	sub	%g0,%i5,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[%l0+%o7],%o0
	add	%i0,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[%l0+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,%i5,%o7		! n=-num

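! .Lzap clears the double-format copies of ap[] and np[] left in the scratch
! area (tp was already zeroed by .Lcopy), presumably so that no key-dependent
! data survives on the stack after the function returns.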
.Lzap:
	stx	%g0,[%l1+%o7]
	stx	%g0,[%l2+%o7]
	stx	%g0,[%l3+%o7]
	stx	%g0,[%l4+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+STACK_BIAS+STACK_FRAME+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type   bn_mul_mont_fpu,#function
.size	bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
.align	32