#include "arm_asm.h"
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

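// poly1305_init(ctx, key, funcs): zeroes the hash state at x0, clamps
// and stores the 32-byte key from x1 and, judging by the stores below,
// writes pointers to the scalar or NEON blocks/emit routines into the
// table at x2, selected via OPENSSL_armcap_P.  Returns 1, or 0 if the
// key pointer is NULL.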
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

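// poly1305_blocks(ctx, inp, len, padbit): scalar path.  The hash is
// held as three 64-bit limbs h0,h1,h2 (a little over 2^130) and each
// 16-byte block is folded in as h = (h + m + padbit*2^128) * r
// mod 2^130-5.  Since r1 is clamped to a multiple of 4, the wrapped
// h1*r1 and h2*r1 terms use s1 = r1 + (r1>>2) = 5*r1/4.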
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

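// poly1305_emit(ctx, mac, nonce): final reduction and output.  h+5 is
// computed speculatively; if the sum carries past bit 130 (the
// tst x14,#-4 below), the reduced value h-(2^130-5) = (h+5) mod 2^128
// is kept, the 128-bit nonce is added and the tag is stored
// little-endian.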
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
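// poly1305_mult: one h = h*r mod 2^130-5 step on the scalar limbs
// x4-x6, with r0,r1,s1 preloaded in x7,x8,x9.  Same multiply sequence
// as the body of .Loop above, factored out as a subroutine for the
// NEON setup code below.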
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

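// poly1305_splat: split a power of r from two 64-bit limbs into five
// 26-bit limbs and scatter them, along with the premultiplied-by-5
// copies, into one 32-bit lane of the nine 16-byte rows at x0
// (r0,r1,s1,r2,s2,r3,s3,r4,s4).  The caller bumps x0 by -4 between
// powers, so lanes 0..3 of each row end up holding r^4,r^3,r^2,r^1.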
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

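// poly1305_blocks_neon: vectorized path.  The hash is kept as five
// 26-bit limbs and two interleaved streams are processed 64 bytes per
// iteration against the r^4..r^1 table built below.  Short inputs
// (<128 bytes) whose hash is still in base 2^64 form are handed
// straight to the scalar poly1305_blocks.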
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

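// .Lbase2_64_neon: hash still in base 2^64 on entry.  Absorb one
// scalar block first if the length is not a multiple of 32 (so the
// remainder splits evenly into block pairs), then convert the hash to
// base 2^26 and build the r^n table.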
.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

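// .Leven_neon: hash already in base 2^26 and length a multiple of 32.
// x16 points at inp[2:3]; once fewer than 64 bytes remain it is
// redirected to .Lzeros so the tail loads stay in bounds.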
.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

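// .Ldo_neon: split each pair of 16-byte blocks into five 26-bit limbs
// and pack the two blocks side by side in the 32-bit lanes of d14-d18
// (inp[2:3]) and d9-d13 (inp[0:1]), with the padbit placed at bit 24
// of the top limb.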
.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

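	// v14-v18 hold inp[2:3] and v9-v13 hold hash+inp[0:1]; given the
	// table layout from poly1305_splat, lane [2] of rows v0-v8
	// supplies r^2 for the former while lane [0] supplies r^4 for the
	// latter.  The interleaved scalar instructions convert the next
	// four blocks to base 2^26 while the multiplications run.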
	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

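	// Carries propagate h3->h4 and h0->h1 first, then h1->h2, h4->h0
	// (the wrapped carry multiplied by 5 via shl #2 plus add), h2->h3
	// and once more h0->h1 and h3->h4, leaving each limb only
	// slightly above 26 bits rather than fully reduced.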
	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

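// .Lskip_loop: 32 or 64 bytes left.  The last block pair is multiplied
// by r^2:r^1; in the 64-byte case, hash+inp[0:1] is additionally
// folded in with r^4:r^3 at .Long_tail before the horizontal add.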
.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

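// .Lshort_tail: both streams have now been multiplied by their final
// powers of r; addp folds the two 64-bit lanes of each accumulator
// together before a last carry pass.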
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

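// poly1305_emit_neon: like poly1305_emit, but when is_base2_26 is set
// it first converts the hash from five 26-bit limbs back to three
// 64-bit limbs, folding the spill from the partially reduced top limb
// back in before the final comparison against the modulus.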
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
866