xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/sparc/poly1305-sparcv9.S (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
! Poly1305 message authentication for SPARCv9 (CRYPTOGAMS-generated; see
! the .asciz tag at the end of the file).  Three interchangeable
! back-ends live here: an integer 32x32->64 one (poly1305_blocks/
! poly1305_emit), a VIS3 64-bit one (poly1305_blocks_vis3) and a
! floating-point FMA one (poly1305_*_fma).  poly1305_init picks one at
! run time and publishes function pointers through its 3rd argument.
1#include "sparc_arch.h"
2
3#ifdef	__arch64__
4.register	%g2,#scratch
5.register	%g3,#scratch
! STPTR/SIZE_T abstract over 32- vs 64-bit pointer stores, so the
! function-pointer table writes below work for both ABIs.
6# define	STPTR	stx
7# define	SIZE_T	8
8#else
9# define	STPTR	st
10# define	SIZE_T	4
11#endif
! Scratch area in our own stack frame, above the ABI bias/save area.
12#define	LOCALS	(STACK_BIAS+STACK_FRAME)
13
14.section	".text",#alloc,#execinstr
15
16#ifdef __PIC__
17SPARC_PIC_THUNK(%g1)
18#endif
19
! int poly1305_init(void *ctx=%i0, const unsigned char key[32]=%i1,
!                   void *func[2]=%i2)
! Zeroes the hash state at ctx+0..23, loads and clamps the 16-byte "r"
! half of the key into ctx+32, and, when the CPU has VIS3, publishes
! poly1305_blocks_vis3/poly1305_emit through func[].  Returns 1 when
! func[] was updated, 0 otherwise (NULL key, or no VIS3).  When only
! the FMADD capability is set, control transfers to the FP code path
! at .Lpoly1305_init_fma instead.
20.globl	poly1305_init
21.align	32
22poly1305_init:
23	save	%sp,-STACK_FRAME-16,%sp
24	nop
25
! Dispatch on the CPU capability word.
26	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
27	ld	[%g1],%g1
28
29	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
30	cmp	%g1,SPARCV9_FMADD
31	be	.Lpoly1305_init_fma
32	nop
33
34	stx	%g0,[%i0+0]
35	stx	%g0,[%i0+8]		! zero hash value
36	brz,pn	%i1,.Lno_key
37	stx	%g0,[%i0+16]
38
! Split the key pointer into an 8-byte-aligned base (%i1) and a bit
! shift pair (%i5 = misalignment*8, %i4 = -%i5) used to realign data
! loaded with 64-bit little-endian loads below.
39	and	%i1,7,%i5		! alignment factor
40	andn	%i1,7,%i1
41	sll	%i5,3,%i5		! *8
42	neg	%i5,%i4
43
! Build the Poly1305 clamping masks for the two 64-bit halves of r.
44	sethi	%hi(0x0ffffffc),%o4
45	set	8,%o1
46	or	%o4,%lo(0x0ffffffc),%o4
47	set	16,%o2
48	sllx	%o4,32,%o5
49	or	%o4,%o5,%o5		! 0x0ffffffc0ffffffc
50	or	%o5,3,%o4		! 0x0ffffffc0fffffff
51
! ASI 0x88 is the little-endian primary address space, countering
! SPARC's native big-endianness.
52	ldxa	[%i1+%g0]0x88,%o0	! load little-endian key
53	brz,pt	%i5,.Lkey_aligned
54	ldxa	[%i1+%o1]0x88,%o1
55
! Misaligned key: merge three 64-bit loads into two aligned values.
56	ldxa	[%i1+%o2]0x88,%o2
57	srlx	%o0,%i5,%o0
58	sllx	%o1,%i4,%o7
59	srlx	%o1,%i5,%o1
60	or	%o7,%o0,%o0
61	sllx	%o2,%i4,%o2
62	or	%o2,%o1,%o1
63
.Lkey_aligned:
! Clamp r: &= 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc.
64
65	and	%o4,%o0,%o0
66	and	%o5,%o1,%o1
67	stx	%o0,[%i0+32+0]		! store key
68	stx	%o1,[%i0+32+8]
69
70	andcc	%g1,SPARCV9_VIS3,%g0
71	be	.Lno_key
72	nop
73
! call .+8 deposits its own PC in %o7, giving us a PIC-safe anchor for
! computing the absolute addresses of the VIS3 entry points.
741:	call	.+8
75	add	%o7,poly1305_blocks_vis3-1b,%o7
76
77	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
78	STPTR	%o7,[%i2]
79	STPTR	%o5,[%i2+SIZE_T]
80
81	ret
82	restore	%g0,1,%o0		! return 1
83
84.Lno_key:
85	ret
86	restore	%g0,%g0,%o0		! return 0
87.type	poly1305_init,#function
88.size	poly1305_init,.-poly1305_init
89
! void poly1305_blocks(void *ctx=%i0, const void *inp=%i1,
!                      size_t len=%i2, int padbit=%i3)
! Integer-only path.  Hash is five 32-bit limbs at ctx+0..16 (%o0-%o3
! plus %l7 in registers); r is the four 32-bit limbs stored at ctx+32
! by poly1305_init.  Processes len/16 complete blocks; %i3 supplies the
! bit appended above each 128-bit block (1 for full blocks).
! Note the swapped word pairs in the loads/stores below: init stored r
! and h with 64-bit big-endian stx, so the word at offset +0 is the
! HIGH half of the first 64-bit value.
! "srln" is a shift-right macro (srl or srlx by word size) —
! presumably from sparc_arch.h; TODO confirm.
90.globl	poly1305_blocks
91.align	32
92poly1305_blocks:
93	save	%sp,-STACK_FRAME,%sp
94	srln	%i2,4,%i2
95
96	brz,pn	%i2,.Lno_data
97	nop
98
99	ld	[%i0+32+0],%l1		! load key
100	ld	[%i0+32+4],%l0
101	ld	[%i0+32+8],%l3
102	ld	[%i0+32+12],%l2
103
104	ld	[%i0+0],%o1		! load hash value
105	ld	[%i0+4],%o0
106	ld	[%i0+8],%o3
107	ld	[%i0+12],%o2
108	ld	[%i0+16],%l7
109
110	and	%i1,7,%i5		! alignment factor
111	andn	%i1,7,%i1
112	set	8,%g2
113	sll	%i5,3,%i5		! *8
114	set	16,%g3
115	neg	%i5,%i4
116
! Precompute %l4-%l6 = r[1..3] + (r[1..3]>>2).  The clamp cleared the
! low 2 bits of r1..r3, so this is exactly r*5/4 — the factor that
! folds the mod 2^130-5 reduction into the multiplication.
117	srl	%l1,2,%l4
118	srl	%l2,2,%l5
119	add	%l1,%l4,%l4
120	srl	%l3,2,%l6
121	add	%l2,%l5,%l5
122	add	%l3,%l6,%l6
123
.Loop:
124
125	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
126	brz,pt	%i5,.Linp_aligned
127	ldxa	[%i1+%g2]0x88,%g2
128
! Misaligned input: realign three 64-bit loads into two values.
129	ldxa	[%i1+%g3]0x88,%g3
130	srlx	%g1,%i5,%g1
131	sllx	%g2,%i4,%o5
132	srlx	%g2,%i5,%g2
133	or	%o5,%g1,%g1
134	sllx	%g3,%i4,%g3
135	or	%g3,%g2,%g2
136
.Linp_aligned:
! h += block, with %i3 (padbit) carried into the fifth limb.
137
138	srlx	%g1,32,%o4
139	addcc	%g1,%o0,%o0		! accumulate input
140	srlx	%g2,32,%o5
141	addccc	%o4,%o1,%o1
142	addccc	%g2,%o2,%o2
143	addccc	%o5,%o3,%o3
144	addc	%i3,%l7,%l7
145
! h *= r: schoolbook 32x32->64 products, accumulated per column into
! %g1-%g4.  Space-prefixed instructions are interleaved bookkeeping.
146	umul	%l0,%o0,%g1
147	umul	%l1,%o0,%g2
148	umul	%l2,%o0,%g3
149	umul	%l3,%o0,%g4
150	 sub	%i2,1,%i2
151	 add	%i1,16,%i1
152
153	umul	%l6,%o1,%o4
154	umul	%l0,%o1,%o5
155	umul	%l1,%o1,%o7
156	add	%o4,%g1,%g1
157	add	%o5,%g2,%g2
158	umul	%l2,%o1,%o4
159	add	%o7,%g3,%g3
160	add	%o4,%g4,%g4
161
162	umul	%l5,%o2,%o5
163	umul	%l6,%o2,%o7
164	umul	%l0,%o2,%o4
165	add	%o5,%g1,%g1
166	add	%o7,%g2,%g2
167	umul	%l1,%o2,%o5
168	add	%o4,%g3,%g3
169	add	%o5,%g4,%g4
170
171	umul	%l4,%o3,%o7
172	umul	%l5,%o3,%o4
173	umul	%l6,%o3,%o5
174	add	%o7,%g1,%g1
175	add	%o4,%g2,%g2
176	umul	%l0,%o3,%o7
177	add	%o5,%g3,%g3
178	add	%o7,%g4,%g4
179
180	umul	%l4,%l7,%o4
181	umul	%l5,%l7,%o5
182	umul	%l6,%l7,%o7
183	umul	%l0,%l7,%l7
184	add	%o4,%g2,%g2
185	add	%o5,%g3,%g3
186	srlx	%g1,32,%o1
187	add	%o7,%g4,%g4
188	srlx	%g2,32,%o2
189
! Propagate the high halves of the 64-bit column sums upward.
190	addcc	%g2,%o1,%o1
191	srlx	%g3,32,%o3
192	 set	8,%g2
193	addccc	%g3,%o2,%o2
194	srlx	%g4,32,%o4
195	 set	16,%g3
196	addccc	%g4,%o3,%o3
197	addc	%o4,%l7,%l7
198
! Fold everything above 2^130 back in as *5 (= value + value>>2 of the
! part above bit 2), keeping only 2 bits in the top limb.
199	srl	%l7,2,%o4		! final reduction step
200	andn	%l7,3,%o5
201	and	%l7,3,%l7
202	add	%o5,%o4,%o4
203
204	addcc	%o4,%g1,%o0
205	addccc	%g0,%o1,%o1
206	addccc	%g0,%o2,%o2
207	addccc	%g0,%o3,%o3
208	brnz,pt	%i2,.Loop
209	addc	%g0,%l7,%l7
210
211	st	%o1,[%i0+0]		! store hash value
212	st	%o0,[%i0+4]
213	st	%o3,[%i0+8]
214	st	%o2,[%i0+12]
215	st	%l7,[%i0+16]
216
.Lno_data:
217
218	ret
219	restore
220.type	poly1305_blocks,#function
221.size	poly1305_blocks,.-poly1305_blocks
! VIS3 variant of poly1305_blocks (same arguments).  The hash is kept
! as two 64-bit limbs (%o0,%o1) plus 2 spare bits in %o2; r is the two
! 64-bit limbs at ctx+32 (%o3,%o4) and %o5 = r1 + (r1>>2), the *5/4
! reduction factor.  The VIS3 instructions addxccc/umulxhi/addxc are
! emitted as raw .word encodings (decoded in the trailing comments) —
! presumably so the file assembles with pre-VIS3 toolchains; TODO
! confirm.  Do not touch the surrounding register assignments: the
! encodings hard-code them.
222.align	32
poly1305_blocks_vis3:
223
224	save	%sp,-STACK_FRAME,%sp
225	srln	%i2,4,%i2
226
227	brz,pn	%i2,.Lno_data
228	nop
229
230	ldx	[%i0+32+0],%o3		! load key
231	ldx	[%i0+32+8],%o4
232
233	ldx	[%i0+0],%o0		! load hash value
234	ldx	[%i0+8],%o1
235	ld	[%i0+16],%o2
236
237	and	%i1,7,%i5		! alignment factor
238	andn	%i1,7,%i1
239	set	8,%l1
240	sll	%i5,3,%i5		! *8
241	set	16,%l2
242	neg	%i5,%i4
243
! %o5 = s1 = r1 + r1>>2 (valid because clamping cleared r1's low bits).
244	srlx	%o4,2,%o5
245	b	.Loop_vis3
246	add	%o4,%o5,%o5
247
.Loop_vis3:
248
249	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
250	brz,pt	%i5,.Linp_aligned_vis3
251	ldxa	[%i1+%l1]0x88,%g2
252
253	ldxa	[%i1+%l2]0x88,%g3
254	srlx	%g1,%i5,%g1
255	sllx	%g2,%i4,%o7
256	srlx	%g2,%i5,%g2
257	or	%o7,%g1,%g1
258	sllx	%g3,%i4,%g3
259	or	%g3,%g2,%g2
260
.Linp_aligned_vis3:
! h += block (128-bit add with carry into the 2-bit limb, plus the
! padbit in %i3).
261
262	addcc	%g1,%o0,%o0		! accumulate input
263	 sub	%i2,1,%i2
264	.word	0x93b08269 !addxccc	%g2,%o1,%o1
265	 add	%i1,16,%i1
266
! h *= r via 64x64->128 partial products (mulx low, umulxhi high).
267	mulx	%o3,%o0,%g1		! r0*h0
268	.word	0x95b6c22a !addxc	%i3,%o2,%o2
269	.word	0x85b2c2c8 !umulxhi	%o3,%o0,%g2
270	mulx	%o5,%o1,%g4		! s1*h1
271	.word	0x9fb342c9 !umulxhi	%o5,%o1,%o7
272	addcc	%g4,%g1,%g1
273	mulx	%o4,%o0,%g4		! r1*h0
274	.word	0x85b3c222 !addxc	%o7,%g2,%g2
275	.word	0x87b302c8 !umulxhi	%o4,%o0,%g3
276	addcc	%g4,%g2,%g2
277	mulx	%o3,%o1,%g4		! r0*h1
278	.word	0x87b00223 !addxc	%g0,%g3,%g3
279	.word	0x9fb2c2c9 !umulxhi	%o3,%o1,%o7
280	addcc	%g4,%g2,%g2
281	mulx	%o5,%o2,%g4		! s1*h2
282	.word	0x87b3c223 !addxc	%o7,%g3,%g3
283	mulx	%o3,%o2,%o7		! r0*h2
284	addcc	%g4,%g2,%g2
285	.word	0x87b3c223 !addxc	%o7,%g3,%g3
286
! Fold bits above 2^130 back in as *5, keep 2 bits in %o2.
287	srlx	%g3,2,%g4		! final reduction step
288	andn	%g3,3,%o7
289	and	%g3,3,%o2
290	add	%o7,%g4,%g4
291
292	addcc	%g4,%g1,%o0
293	.word	0x93b00262 !addxccc	%g0,%g2,%o1
294	brnz,pt	%i2,.Loop_vis3
295	.word	0x95b0022a !addxc	%g0,%o2,%o2
296
297	stx	%o0,[%i0+0]		! store hash value
298	stx	%o1,[%i0+8]
299	st	%o2,[%i0+16]
300
301	ret
302	restore
303.type	poly1305_blocks_vis3,#function
304.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3
! void poly1305_emit(void *ctx=%i0, unsigned char mac[16]=%i1,
!                    const unsigned int nonce[4]=%i2)
! Final reduction mod 2^130-5: compute h+5, and keep that sum only if
! the addition carried into bit 2 of the fifth limb (i.e. h >= 2^130-5),
! selected branchlessly with movnz.  Then accumulate the 128-bit nonce
! and store the 16-byte tag little-endian, one byte at a time, so mac
! may have any alignment.  Word pairs are swapped on load for the same
! big-endian-storage reason as in poly1305_blocks.
305.globl	poly1305_emit
306.align	32
poly1305_emit:
307
308	save	%sp,-STACK_FRAME,%sp
309
310	ld	[%i0+0],%o1		! load hash value
311	ld	[%i0+4],%o0
312	ld	[%i0+8],%o3
313	ld	[%i0+12],%o2
314	ld	[%i0+16],%l7
315
316	addcc	%o0,5,%l0		! compare to modulus
317	addccc	%o1,0,%l1
318	addccc	%o2,0,%l2
319	addccc	%o3,0,%l3
320	addc	%l7,0,%l7
321	andcc	%l7,4,%g0		! did it carry/borrow?
322
! Branchless select of the reduced value, interleaved with nonce loads.
323	movnz	%icc,%l0,%o0
324	ld	[%i2+0],%l0		! load nonce
325	movnz	%icc,%l1,%o1
326	ld	[%i2+4],%l1
327	movnz	%icc,%l2,%o2
328	ld	[%i2+8],%l2
329	movnz	%icc,%l3,%o3
330	ld	[%i2+12],%l3
331
332	addcc	%l0,%o0,%o0		! accumulate nonce
333	addccc	%l1,%o1,%o1
334	addccc	%l2,%o2,%o2
335	addc	%l3,%o3,%o3
336
337	srl	%o0,8,%l0
338	stb	%o0,[%i1+0]		! store little-endian result
339	srl	%o0,16,%l1
340	stb	%l0,[%i1+1]
341	srl	%o0,24,%l2
342	stb	%l1,[%i1+2]
343	stb	%l2,[%i1+3]
344
345	srl	%o1,8,%l0
346	stb	%o1,[%i1+4]
347	srl	%o1,16,%l1
348	stb	%l0,[%i1+5]
349	srl	%o1,24,%l2
350	stb	%l1,[%i1+6]
351	stb	%l2,[%i1+7]
352
353	srl	%o2,8,%l0
354	stb	%o2,[%i1+8]
355	srl	%o2,16,%l1
356	stb	%l0,[%i1+9]
357	srl	%o2,24,%l2
358	stb	%l1,[%i1+10]
359	stb	%l2,[%i1+11]
360
361	srl	%o3,8,%l0
362	stb	%o3,[%i1+12]
363	srl	%o3,16,%l1
364	stb	%l0,[%i1+13]
365	srl	%o3,24,%l2
366	stb	%l1,[%i1+14]
367	stb	%l2,[%i1+15]
368
369	ret
370	restore
371.type	poly1305_emit,#function
372.size	poly1305_emit,.-poly1305_emit
! Floating-point (FMA) key setup, reached from poly1305_init when only
! the SPARCV9_FMADD capability bit is set.  The key and hash are
! represented as base-2^32 double-precision limbs kept *biased* by the
! 2^(52+k) constants in .Lconsts_fma.  This routine:
!   - stores the biased-zero initial hash into ctx+8*0..8*3,
!   - clamps the key and splits each r limb into hi/lo halves
!     (ctx+8*4..8*11) using add/sub against bias constants with %fsr
!     switched to truncation (restored before return),
!   - precomputes s1..s3 = r1..r3 scaled by the 5/2^130 constant,
!     split the same way (ctx+8*12..8*17),
!   - publishes poly1305_blocks_fma/poly1305_emit_fma through func[]
!     (%i2) and returns 1; returns 0 for a NULL key.
373.align	32
poly1305_init_fma:
374
375	save	%sp,-STACK_FRAME-16,%sp
376	nop
377
.Lpoly1305_init_fma:
378
! PIC-safe anchor: %o7 = address of .Lconsts_fma.
3791:	call	.+8
380	add	%o7,.Lconsts_fma-1b,%o7
381
382	ldd	[%o7+8*0],%f16			! load constants
383	ldd	[%o7+8*1],%f18
384	ldd	[%o7+8*2],%f20
385	ldd	[%o7+8*3],%f22
386	ldd	[%o7+8*5],%f26
387
388	std	%f16,[%i0+8*0]		! initial hash value, biased 0
389	std	%f18,[%i0+8*1]
390	std	%f20,[%i0+8*2]
391	std	%f22,[%i0+8*3]
392
393	brz,pn	%i1,.Lno_key_fma
394	nop
395
396	stx	%fsr,[%sp+LOCALS]		! save original %fsr
397	ldx	[%o7+8*6],%fsr			! load new %fsr
398
399	std	%f16,[%i0+8*4] 		! key "template"
400	std	%f18,[%i0+8*5]
401	std	%f20,[%i0+8*6]
402	std	%f22,[%i0+8*7]
403
404	and	%i1,7,%l2
405	andn	%i1,7,%i1			! align pointer
406	mov	8,%l0
407	sll	%l2,3,%l2
408	mov	16,%l1
409	neg	%l2,%l3
410
411	ldxa	[%i1+%g0]0x88,%o0		! load little-endian key
412	ldxa	[%i1+%l0]0x88,%o2
413
414	brz	%l2,.Lkey_aligned_fma
415	sethi	%hi(0xf0000000),%l0		!   0xf0000000
416
417	ldxa	[%i1+%l1]0x88,%o4
418
419	srlx	%o0,%l2,%o0			! align data
420	sllx	%o2,%l3,%o1
421	srlx	%o2,%l2,%o2
422	or	%o1,%o0,%o0
423	sllx	%o4,%l3,%o3
424	or	%o3,%o2,%o2
425
.Lkey_aligned_fma:
! Clamp via andn with inverted masks (equivalent to the integer path's
! and with 0x0fffffff/0x0ffffffc).
426
427	or	%l0,3,%l1			!   0xf0000003
428	srlx	%o0,32,%o1
429	andn	%o0,%l0,%o0			! &=0x0fffffff
430	andn	%o1,%l1,%o1			! &=0x0ffffffc
431	srlx	%o2,32,%o3
432	andn	%o2,%l1,%o2
433	andn	%o3,%l1,%o3
434
! Drop each 32-bit r word into the low mantissa word of a pre-stored
! biased-double template, then reload as doubles.
435	st	%o0,[%i0+36]		! fill "template"
436	st	%o1,[%i0+44]
437	st	%o2,[%i0+52]
438	st	%o3,[%i0+60]
439
440	ldd	[%i0+8*4],%f0 		! load [biased] key
441	ldd	[%i0+8*5],%f4
442	ldd	[%i0+8*6],%f8
443	ldd	[%i0+8*7],%f12
444
445	fsubd	%f0,%f16, %f0		! r0
446	 ldd	[%o7+8*7],%f16 		! more constants
447	fsubd	%f4,%f18,%f4		! r1
448	 ldd	[%o7+8*8],%f18
449	fsubd	%f8,%f20,%f8		! r2
450	 ldd	[%o7+8*9],%f20
451	fsubd	%f12,%f22,%f12		! r3
452	 ldd	[%o7+8*10],%f22
453
454	fmuld	%f26,%f4,%f52	! s1
455	fmuld	%f26,%f8,%f40	! s2
456	fmuld	%f26,%f12,%f44	! s3
457
! Split each r limb into hi+lo halves: add-then-subtract a 2^(52+16+k)
! bias rounds away the low bits (truncation mode is in effect), giving
! the hi part; subtracting that from the limb gives the lo part.
458	faddd	%f0,%f16, %f2
459	faddd	%f4,%f18,%f6
460	faddd	%f8,%f20,%f10
461	faddd	%f12,%f22,%f14
462
463	fsubd	%f2,%f16, %f2
464	 ldd	[%o7+8*11],%f16		! more constants
465	fsubd	%f6,%f18,%f6
466	 ldd	[%o7+8*12],%f18
467	fsubd	%f10,%f20,%f10
468	 ldd	[%o7+8*13],%f20
469	fsubd	%f14,%f22,%f14
470
471	fsubd	%f0,%f2,%f0
472	 std	%f2,[%i0+8*5] 		! r0hi
473	fsubd	%f4,%f6,%f4
474	 std	%f6,[%i0+8*7] 		! r1hi
475	fsubd	%f8,%f10,%f8
476	 std	%f10,[%i0+8*9] 		! r2hi
477	fsubd	%f12,%f14,%f12
478	 std	%f14,[%i0+8*11]		! r3hi
479
! Same hi/lo split for the s values, with smaller bias constants.
480	faddd	%f52,%f16, %f54
481	faddd	%f40,%f18,%f42
482	faddd	%f44,%f20,%f46
483
484	fsubd	%f54,%f16, %f54
485	fsubd	%f42,%f18,%f42
486	fsubd	%f46,%f20,%f46
487
488	fsubd	%f52,%f54,%f52
489	fsubd	%f40,%f42,%f40
490	fsubd	%f44,%f46,%f44
491
492	ldx	[%sp+LOCALS],%fsr		! restore %fsr
493
494	std	%f0,[%i0+8*4] 		! r0lo
495	std	%f4,[%i0+8*6] 		! r1lo
496	std	%f8,[%i0+8*8] 		! r2lo
497	std	%f12,[%i0+8*10]		! r3lo
498
499	std	%f54,[%i0+8*13]
500	std	%f42,[%i0+8*15]
501	std	%f46,[%i0+8*17]
502
503	std	%f52,[%i0+8*12]
504	std	%f40,[%i0+8*14]
505	std	%f44,[%i0+8*16]
506
507	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
508	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
509	STPTR	%o0,[%i2]
510	STPTR	%o1,[%i2+SIZE_T]
511
512	ret
513	restore	%g0,1,%o0			! return 1
514
.Lno_key_fma:
515
516	ret
517	restore	%g0,%g0,%o0			! return 0
518.type	poly1305_init_fma,#function
519.size	poly1305_init_fma,.-poly1305_init_fma
520
! FMA blocks routine (same arguments as poly1305_blocks).  The hash is
! carried as four biased double limbs (ctx+8*0..8*3).  Each 128-bit
! input block is turned into doubles by storing its 32-bit words into
! 2^(52+k)-biased "templates" in the LOCALS scratch area and de-biasing
! with fsubd; the 4th argument %i3 is OR-ed into the top template
! word's exponent field — presumably the padbit encoding; TODO confirm
! against the integer path.  The multiply uses fmaddd (emitted as raw
! .word, decoded in comments) against the hi/lo-split key powers from
! ctx+8*4..8*17, and the "base 2^48 -> base 2^32" sections renormalize
! carries via the same add/sub-bias trick as init (truncating %fsr,
! saved/restored around the loop).  Input loads are modulo-scheduled
! one iteration ahead; movrz freezes the input pointer advance on the
! last block.  Register assignments are hard-coded in the .word
! encodings — do not rename.
521.align	32
poly1305_blocks_fma:
522
523	save	%sp,-STACK_FRAME-48,%sp
524	srln	%i2,4,%i2
525
526	brz,pn	%i2,.Labort
527	sub	%i2,1,%i2
528
! PIC-safe anchor: %o7 = address of .Lconsts_fma.
5291:	call	.+8
530	add	%o7,.Lconsts_fma-1b,%o7
531
532	ldd	[%o7+8*0],%f16			! load constants
533	ldd	[%o7+8*1],%f18
534	ldd	[%o7+8*2],%f20
535	ldd	[%o7+8*3],%f22
536	ldd	[%o7+8*4],%f24
537	ldd	[%o7+8*5],%f26
538
539	ldd	[%i0+8*0],%f0 		! load [biased] hash value
540	ldd	[%i0+8*1],%f4
541	ldd	[%i0+8*2],%f8
542	ldd	[%i0+8*3],%f12
543
544	std	%f16,[%sp+LOCALS+8*0]		! input "template"
545	sethi	%hi((1023+52+96)<<20),%o3
546	std	%f18,[%sp+LOCALS+8*1]
547	or	%i3,%o3,%o3
548	std	%f20,[%sp+LOCALS+8*2]
549	st	%o3,[%sp+LOCALS+8*3]
550
551	and	%i1,7,%l2
552	andn	%i1,7,%i1			! align pointer
553	mov	8,%l0
554	sll	%l2,3,%l2
555	mov	16,%l1
556	neg	%l2,%l3
557
558	ldxa	[%i1+%g0]0x88,%o0		! load little-endian input
559	brz	%l2,.Linp_aligned_fma
560	ldxa	[%i1+%l0]0x88,%o2
561
562	ldxa	[%i1+%l1]0x88,%o4
563	add	%i1,8,%i1
564
565	srlx	%o0,%l2,%o0			! align data
566	sllx	%o2,%l3,%o1
567	srlx	%o2,%l2,%o2
568	or	%o1,%o0,%o0
569	sllx	%o4,%l3,%o3
570	srlx	%o4,%l2,%o4			! pre-shift
571	or	%o3,%o2,%o2
572
.Linp_aligned_fma:
573
574	srlx	%o0,32,%o1
575	movrz	%i2,0,%l1
576	srlx	%o2,32,%o3
577	add	%l1,%i1,%i1			! conditional advance
578
579	st	%o0,[%sp+LOCALS+8*0+4]		! fill "template"
580	st	%o1,[%sp+LOCALS+8*1+4]
581	st	%o2,[%sp+LOCALS+8*2+4]
582	st	%o3,[%sp+LOCALS+8*3+4]
583
584	ldd	[%i0+8*4],%f28 		! load key
585	ldd	[%i0+8*5],%f30
586	ldd	[%i0+8*6],%f32
587	ldd	[%i0+8*7],%f34
588	ldd	[%i0+8*8],%f36
589	ldd	[%i0+8*9],%f38
590	ldd	[%i0+8*10],%f48
591	ldd	[%i0+8*11],%f50
592	ldd	[%i0+8*12],%f52
593	ldd	[%i0+8*13],%f54
594	ldd	[%i0+8*14],%f40
595	ldd	[%i0+8*15],%f42
596	ldd	[%i0+8*16],%f44
597	ldd	[%i0+8*17],%f46
598
599	stx	%fsr,[%sp+LOCALS+8*4]		! save original %fsr
600	ldx	[%o7+8*6],%fsr			! load new %fsr
601
602	subcc	%i2,1,%i2
603	movrz	%i2,0,%l1
604
605	ldd	[%sp+LOCALS+8*0],%f56		! load biased input
606	ldd	[%sp+LOCALS+8*1],%f58
607	ldd	[%sp+LOCALS+8*2],%f60
608	ldd	[%sp+LOCALS+8*3],%f62
609
610	fsubd	%f0,%f16, %f0		! de-bias hash value
611	fsubd	%f4,%f18,%f4
612	 ldxa	[%i1+%g0]0x88,%o0		! modulo-scheduled input load
613	fsubd	%f8,%f20,%f8
614	fsubd	%f12,%f22,%f12
615	 ldxa	[%i1+%l0]0x88,%o2
616
617	fsubd	%f56,%f16, %f56  		! de-bias input
618	fsubd	%f58,%f18,%f58
619	fsubd	%f60,%f20,%f60
620	fsubd	%f62,%f22,%f62
621
622	brz	%l2,.Linp_aligned_fma2
623	add	%l1,%i1,%i1			! conditional advance
624
! Misaligned path reuses %o4 pre-shifted by the previous iteration.
625	sllx	%o0,%l3,%o1			! align data
626	srlx	%o0,%l2,%o3
627	or	%o1,%o4,%o0
628	sllx	%o2,%l3,%o1
629	srlx	%o2,%l2,%o4			! pre-shift
630	or	%o3,%o1,%o2
.Linp_aligned_fma2:
631
632	srlx	%o0,32,%o1
633	srlx	%o2,32,%o3
634
635	faddd	%f0,%f56,%f56			! accumulate input
636	 stw	%o0,[%sp+LOCALS+8*0+4]
637	faddd	%f4,%f58,%f58
638	 stw	%o1,[%sp+LOCALS+8*1+4]
639	faddd	%f8,%f60,%f60
640	 stw	%o2,[%sp+LOCALS+8*2+4]
641	faddd	%f12,%f62,%f62
642	 stw	%o3,[%sp+LOCALS+8*3+4]
643
644	b	.Lentry_fma
645	nop
646
647.align	16
.Loop_fma:
648
649	ldxa	[%i1+%g0]0x88,%o0		! modulo-scheduled input load
650	ldxa	[%i1+%l0]0x88,%o2
651	movrz	%i2,0,%l1
652
653	faddd	%f52,%f0,%f0 		! accumulate input
654	faddd	%f54,%f2,%f2
655	faddd	%f62,%f8,%f8
656	faddd	%f60,%f10,%f10
657
658	brz,pn	%l2,.Linp_aligned_fma3
659	add	%l1,%i1,%i1			! conditional advance
660
661	sllx	%o0,%l3,%o1			! align data
662	srlx	%o0,%l2,%o3
663	or	%o1,%o4,%o0
664	sllx	%o2,%l3,%o1
665	srlx	%o2,%l2,%o4			! pre-shift
666	or	%o3,%o1,%o2
667
.Linp_aligned_fma3:
668
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
669
670	faddd	%f20,%f4,%f52
671	 srlx	%o0,32,%o1
672	faddd	%f20,%f6,%f54
673	 srlx	%o2,32,%o3
674	faddd	%f24,%f12,%f60
675	 st	%o0,[%sp+LOCALS+8*0+4]		! fill "template"
676	faddd	%f24,%f14,%f62
677	 st	%o1,[%sp+LOCALS+8*1+4]
678	faddd	%f18,%f0,%f48
679	 st	%o2,[%sp+LOCALS+8*2+4]
680	faddd	%f18,%f2,%f50
681	 st	%o3,[%sp+LOCALS+8*3+4]
682	faddd	%f22,%f8,%f56
683	faddd	%f22,%f10,%f58
684
685	fsubd	%f52,%f20,%f52
686	fsubd	%f54,%f20,%f54
687	fsubd	%f60,%f24,%f60
688	fsubd	%f62,%f24,%f62
689	fsubd	%f48,%f18,%f48
690	fsubd	%f50,%f18,%f50
691	fsubd	%f56,%f22,%f56
692	fsubd	%f58,%f22,%f58
693
694	fsubd	%f4,%f52,%f4
695	fsubd	%f6,%f54,%f6
696	fsubd	%f12,%f60,%f12
697	fsubd	%f14,%f62,%f14
698	fsubd	%f8,%f56,%f8
699	fsubd	%f10,%f58,%f10
700	fsubd	%f0,%f48,%f0
701	fsubd	%f2,%f50,%f2
702
703	faddd	%f4,%f48,%f4
704	faddd	%f6,%f50,%f6
705	faddd	%f12,%f56,%f12
706	faddd	%f14,%f58,%f14
707	faddd	%f8,%f52,%f8
708	faddd	%f10,%f54,%f10
! Top-limb carry folded back as *(5/2^130) via fmaddd with %f26.
709	.word	0x81be805d !fmaddd	%f26,%f60,%f0,%f0
710	.word	0x85be845f !fmaddd	%f26,%f62,%f2,%f2
711
712	faddd	%f4,%f6,%f58
713	 ldd	[%i0+8*12],%f52		! reload constants
714	faddd	%f12,%f14,%f62
715	 ldd	[%i0+8*13],%f54
716	faddd	%f8,%f10,%f60
717	 ldd	[%i0+8*10],%f48
718	faddd	%f0,%f2,%f56
719	 ldd	[%i0+8*11],%f50
720
.Lentry_fma:
! h *= r: 4x4 limb products accumulated with fused multiply-adds.
721
722	fmuld	%f58,%f44,%f0
723	fmuld	%f58,%f46,%f2
724	fmuld	%f58,%f32,%f8
725	fmuld	%f58,%f34,%f10
726	fmuld	%f58,%f28,%f4
727	fmuld	%f58,%f30,%f6
728	fmuld	%f58,%f36,%f12
729	fmuld	%f58,%f38,%f14
730
731	.word	0x81bfc055 !fmaddd	%f62,%f52,%f0,%f0
732	.word	0x85bfc457 !fmaddd	%f62,%f54,%f2,%f2
733	.word	0x91bfd04d !fmaddd	%f62,%f44,%f8,%f8
734	.word	0x95bfd44f !fmaddd	%f62,%f46,%f10,%f10
735	.word	0x89bfc849 !fmaddd	%f62,%f40,%f4,%f4
736	.word	0x8dbfcc4b !fmaddd	%f62,%f42,%f6,%f6
737	.word	0x99bfd85c !fmaddd	%f62,%f28,%f12,%f12
738	.word	0x9dbfdc5e !fmaddd	%f62,%f30,%f14,%f14
739
740	.word	0x81bf4049 !fmaddd	%f60,%f40,%f0,%f0
741	.word	0x85bf444b !fmaddd	%f60,%f42,%f2,%f2
742	.word	0x91bf505c !fmaddd	%f60,%f28,%f8,%f8
743	.word	0x95bf545e !fmaddd	%f60,%f30,%f10,%f10
744	.word	0x89bf484d !fmaddd	%f60,%f44,%f4,%f4
745	 ldd	[%sp+LOCALS+8*0],%f52		! load [biased] input
746	.word	0x8dbf4c4f !fmaddd	%f60,%f46,%f6,%f6
747	 ldd	[%sp+LOCALS+8*1],%f54
748	.word	0x99bf5841 !fmaddd	%f60,%f32,%f12,%f12
749	 ldd	[%sp+LOCALS+8*2],%f62
750	.word	0x9dbf5c43 !fmaddd	%f60,%f34,%f14,%f14
751	 ldd	[%sp+LOCALS+8*3],%f60
752
753	.word	0x81be405c !fmaddd	%f56,%f28,%f0,%f0
754	 fsubd	%f52,%f16, %f52  		! de-bias input
755	.word	0x85be445e !fmaddd	%f56,%f30,%f2,%f2
756	 fsubd	%f54,%f18,%f54
757	.word	0x91be5045 !fmaddd	%f56,%f36,%f8,%f8
758	 fsubd	%f62,%f20,%f62
759	.word	0x95be5447 !fmaddd	%f56,%f38,%f10,%f10
760	 fsubd	%f60,%f22,%f60
761	.word	0x89be4841 !fmaddd	%f56,%f32,%f4,%f4
762	.word	0x8dbe4c43 !fmaddd	%f56,%f34,%f6,%f6
763	.word	0x99be5851 !fmaddd	%f56,%f48,%f12,%f12
764	.word	0x9dbe5c53 !fmaddd	%f56,%f50,%f14,%f14
765
! Loop while the block-count decrement did not borrow.
766	bcc	SIZE_T_CC,.Loop_fma
767	subcc	%i2,1,%i2
768
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
769
! Final renormalization, then re-bias before storing the hash.
770	faddd	%f0,%f18,%f48
771	faddd	%f2,%f18,%f50
772	faddd	%f8,%f22,%f56
773	faddd	%f10,%f22,%f58
774	faddd	%f4,%f20,%f52
775	faddd	%f6,%f20,%f54
776	faddd	%f12,%f24,%f60
777	faddd	%f14,%f24,%f62
778
779	fsubd	%f48,%f18,%f48
780	fsubd	%f50,%f18,%f50
781	fsubd	%f56,%f22,%f56
782	fsubd	%f58,%f22,%f58
783	fsubd	%f52,%f20,%f52
784	fsubd	%f54,%f20,%f54
785	fsubd	%f60,%f24,%f60
786	fsubd	%f62,%f24,%f62
787
788	fsubd	%f4,%f52,%f4
789	fsubd	%f6,%f54,%f6
790	fsubd	%f12,%f60,%f12
791	fsubd	%f14,%f62,%f14
792	fsubd	%f8,%f56,%f8
793	fsubd	%f10,%f58,%f10
794	fsubd	%f0,%f48,%f0
795	fsubd	%f2,%f50,%f2
796
797	faddd	%f4,%f48,%f4
798	faddd	%f6,%f50,%f6
799	faddd	%f12,%f56,%f12
800	faddd	%f14,%f58,%f14
801	faddd	%f8,%f52,%f8
802	faddd	%f10,%f54,%f10
803	.word	0x81be805d !fmaddd	%f26,%f60,%f0,%f0
804	.word	0x85be845f !fmaddd	%f26,%f62,%f2,%f2
805
806	faddd	%f4,%f6,%f58
807	faddd	%f12,%f14,%f62
808	faddd	%f8,%f10,%f60
809	faddd	%f0,%f2,%f56
810
811	faddd	%f58,%f18,%f58  		! bias
812	faddd	%f62,%f22,%f62
813	faddd	%f60,%f20,%f60
814	faddd	%f56,%f16, %f56
815
816	ldx	[%sp+LOCALS+8*4],%fsr		! restore saved %fsr
817
818	std	%f58,[%i0+8*1]			! store [biased] hash value
819	std	%f62,[%i0+8*3]
820	std	%f60,[%i0+8*2]
821	std	%f56,[%i0+8*0]
822
.Labort:
823
824	ret
825	restore
826.type	poly1305_blocks_fma,#function
827.size	poly1305_blocks_fma,.-poly1305_blocks_fma
! Emit routine for the FMA path (same arguments as poly1305_emit).
! The stored hash limbs are biased doubles whose low mantissa words
! hold the base-2^32 limbs; masking off the exponent field (0xfff00000)
! recovers them.  The top limb may still carry overflow bits, so a
! partial reduction folds limb3's bits above 2 back in (*5 implied via
! the mod 2^130-5 shortcut: value>>2 added after clearing low bits),
! then the usual add-5/select final reduction runs — here with an
! arithmetic mask instead of conditional moves — followed by nonce
! accumulation and byte-wise little-endian tag store.
828.align	32
poly1305_emit_fma:
829
830	save	%sp,-STACK_FRAME,%sp
831
832	ld	[%i0+8*0+0],%l5		! load hash
833	ld	[%i0+8*0+4],%l0
834	ld	[%i0+8*1+0],%o0
835	ld	[%i0+8*1+4],%l1
836	ld	[%i0+8*2+0],%o1
837	ld	[%i0+8*2+4],%l2
838	ld	[%i0+8*3+0],%o2
839	ld	[%i0+8*3+4],%l3
840
841	sethi	%hi(0xfff00000),%o3
842	andn	%l5,%o3,%l5			! mask exponent
843	andn	%o0,%o3,%o0
844	andn	%o1,%o3,%o1
845	andn	%o2,%o3,%o2			! can be partially reduced...
846	mov	3,%o3
847
848	srl	%o2,2,%i3			! ... so reduce
849	and	%o2,%o3,%l4
850	andn	%o2,%o3,%o2
851	add	%i3,%o2,%o2
852
! Add the inter-limb carries (each %oN holds bits 32.. of limb N).
853	addcc	%o2,%l0,%l0
854	addccc	%l5,%l1,%l1
855	addccc	%o0,%l2,%l2
856	addccc	%o1,%l3,%l3
857	addc	%g0,%l4,%l4
858
859	addcc	%l0,5,%l5			! compare to modulus
860	addccc	%l1,0,%o0
861	addccc	%l2,0,%o1
862	addccc	%l3,0,%o2
863	addc	%l4,0,%o3
864
! Turn bit 2 of the top limb into an all-ones/all-zeros mask and use
! it to select h+5 (reduced) or h branchlessly.
865	srl	%o3,2,%o3			! did it carry/borrow?
866	neg	%o3,%o3
867	sra	%o3,31,%o3			! mask
868
869	andn	%l0,%o3,%l0
870	and	%l5,%o3,%l5
871	andn	%l1,%o3,%l1
872	and	%o0,%o3,%o0
873	or	%l5,%l0,%l0
874	ld	[%i2+0],%l5			! load nonce
875	andn	%l2,%o3,%l2
876	and	%o1,%o3,%o1
877	or	%o0,%l1,%l1
878	ld	[%i2+4],%o0
879	andn	%l3,%o3,%l3
880	and	%o2,%o3,%o2
881	or	%o1,%l2,%l2
882	ld	[%i2+8],%o1
883	or	%o2,%l3,%l3
884	ld	[%i2+12],%o2
885
886	addcc	%l5,%l0,%l0			! accumulate nonce
887	addccc	%o0,%l1,%l1
888	addccc	%o1,%l2,%l2
889	addc	%o2,%l3,%l3
890
! Byte-wise little-endian store; mac may have any alignment.
891	stb	%l0,[%i1+0]			! write little-endian result
892	srl	%l0,8,%l0
893	stb	%l1,[%i1+4]
894	srl	%l1,8,%l1
895	stb	%l2,[%i1+8]
896	srl	%l2,8,%l2
897	stb	%l3,[%i1+12]
898	srl	%l3,8,%l3
899
900	stb	%l0,[%i1+1]
901	srl	%l0,8,%l0
902	stb	%l1,[%i1+5]
903	srl	%l1,8,%l1
904	stb	%l2,[%i1+9]
905	srl	%l2,8,%l2
906	stb	%l3,[%i1+13]
907	srl	%l3,8,%l3
908
909	stb	%l0,[%i1+2]
910	srl	%l0,8,%l0
911	stb	%l1,[%i1+6]
912	srl	%l1,8,%l1
913	stb	%l2,[%i1+10]
914	srl	%l2,8,%l2
915	stb	%l3,[%i1+14]
916	srl	%l3,8,%l3
917
918	stb	%l0,[%i1+3]
919	stb	%l1,[%i1+7]
920	stb	%l2,[%i1+11]
921	stb	%l3,[%i1+15]
922
923	ret
924	restore
925.type	poly1305_emit_fma,#function
926.size	poly1305_emit_fma,.-poly1305_emit_fma
! Bias/scale constant table for the FMA code path, addressed
! PC-relatively via the call-.+8 anchor.  Slots 0-4: 2^(52+k) limb
! biases; slot 5: 5/2^130 reduction factor; slot 6: %fsr image
! selecting round-toward-zero; slots 7-13: 2^(52+16+k) hi/lo split
! biases.  Offsets (8*N) are hard-coded in the code above — do not
! reorder.
927.align	64
.Lconsts_fma:
928
929.word	0x43300000,0x00000000		! 2^(52+0)
930.word	0x45300000,0x00000000		! 2^(52+32)
931.word	0x47300000,0x00000000		! 2^(52+64)
932.word	0x49300000,0x00000000		! 2^(52+96)
933.word	0x4b500000,0x00000000		! 2^(52+130)
934
935.word	0x37f40000,0x00000000		! 5/2^130
936.word	0,1<<30				! fsr: truncate, no exceptions
937
938.word	0x44300000,0x00000000		! 2^(52+16+0)
939.word	0x46300000,0x00000000		! 2^(52+16+32)
940.word	0x48300000,0x00000000		! 2^(52+16+64)
941.word	0x4a300000,0x00000000		! 2^(52+16+96)
942.word	0x3e300000,0x00000000		! 2^(52+16+0-96)
943.word	0x40300000,0x00000000		! 2^(52+16+32-96)
944.word	0x42300000,0x00000000		! 2^(52+16+64-96)
945.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro@openssl.org>"
946.align	4
947