#include "arm_asm.h"
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value
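	// the mask operations above clamp r as the Poly1305 spec requires,
	// i.e. r &= 0x0ffffffc0ffffffc0ffffffc0fffffff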

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq
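	// eq means the ARMV7_NEON capability bit is clear, so the scalar
	// entry points are chosen over the NEON ones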

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
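	// r1 is a multiple of 4 after clamping, and 2^128 is congruent
	// to 5/4 mod 2^130-5, so the h1*r1 term can be folded into the
	// low limb as h1*s1 with s1 = 5*r1/4 = r1 + (r1 >> 2)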
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
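	// x14>>2 holds the bits of h at 2^130 and above; 2^130 is
	// congruent to 5 mod 2^130-5, so they are folded back in as
	// 5*(x14>>2) = (x14&-4) + (x14>>2), leaving h2 two bits wide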

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq
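	// h+5 carries into bit 130 exactly when h >= 2^130-5, in which
	// case the reduced value h-(2^130-5) is kept; only the low
	// 128 bits are stored either way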

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
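	// h = h*r mod 2^130-5, the same computation as one iteration
	// of .Loop in poly1305_blocks above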
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
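	// split the base-2^64 value in x4-x6 into five 26-bit limbs and
	// store them, along with their 5x multiples, into the r^n table;
	// entries are 16 bytes apart, and the caller steps x0 back by 4
	// between powers, so each 16-byte group holds lanes r^4,r^3,r^2,r^1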
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks
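	// inputs shorter than 128 bytes are punted to the scalar code
	// unless the hash value is already in base 2^26 form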

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...
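	// h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104 repacked
	// into the 64-bit limbs x4,x5,x14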

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48
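	// the pad bit x3 carries weight 2^128, i.e. bit 24 of the fifth
	// 26-bit limb (128-104=24); x15 points at the r^n table stored
	// 48 bytes into the context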

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38
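	// v31 = 0x03ffffff in each doubleword: shifting all-ones right
	// by 38 leaves just the low 26 bits set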

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on the reduction from the previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4
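	// the carry out of h4 wraps around to h0 multiplied by 5,
	// computed above as carry + (carry << 2)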

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
.inst	0xd50323bf		// autiasp
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
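// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"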
.align	2
.align	2