#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
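	// This is the standard Poly1305 clamping of r: clear the top 4
	// bits of each 32-bit word and the low 2 bits of the upper
	// three words.  Roughly, in C (a sketch; le64() stands for an
	// unaligned little-endian 64-bit load, not a real helper here):
	//
	//	r0 = le64(key+0) & 0x0ffffffc0fffffff;
	//	r1 = le64(key+8) & 0x0ffffffc0ffffffc;
	//
	// Clearing the low 2 bits of r1 is what makes the
	// s1 = r1 + (r1 >> 2) trick used below exact.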
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif
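	// Runtime dispatch: if OPENSSL_armcap_P has ARMV7_NEON set, the
	// NEON entry points are published through the table at x2 (the
	// third argument to poly1305_init), otherwise the scalar ones;
	// the C glue is expected to call blocks/emit through these
	// pointers.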

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop
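	// Each iteration computes h = (h + m) * r mod 2^130-5, with h
	// kept in base 2^64 as x4:x5:x6 (x6 holding the bits above
	// 2^128) and the block's pad bit arriving in x3.  Since
	// 2^130 = 5 (mod 2^130-5), a product term at weight 2^128
	// folds back in multiplied by 5/4; clamping guarantees 4 | r1,
	// so s1 = r1 + (r1 >> 2) = 5*(r1/4) is exact and h1*s1 stands
	// in for the h1*r1 term that would overflow past 2^128.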

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
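	// The "final" reduction is only partial: the bits of x14 at
	// weight 2^130 and above are folded back in as 5*(x14 >> 2),
	// computed as (x14 & -4) + (x14 >> 2), again using
	// 2^130 = 5 (mod 2^130-5).  h stays only partially reduced;
	// the canonical residue is formed in poly1305_emit.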

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

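// poly1305_emit: add the encrypted nonce and emit the 16-byte tag.
// The conditional subtraction of p = 2^130-5 computes h+5 and tests
// whether that carries into bit 130 (tst x14,#-4): if it does, then
// h >= p and h+5 = h-p (mod 2^128) is selected; either way only the
// low 128 bits survive.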
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

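// poly1305_splat: split a base 2^64 value (r or a power of r) into
// five 26-bit limbs and store them, along with the 5*r_i values for
// i = 1..4, into one 32-bit lane of the r^n table.  Roughly, in C
// (a sketch; lo/hi/top are the x4/x5/x6 halves):
//
//	r0 = lo & 0x3ffffff;
//	r1 = (lo >> 26) & 0x3ffffff;
//	r2 = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
//	r3 = (hi >> 14) & 0x3ffffff;
//	r4 = (hi >> 40) | (top << 24);
//	s1 = 5*r1; ... s4 = 5*r4;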
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks
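	// Dispatch: if the input is short (< 128 bytes) and the hash is
	// still in base 2^64 (is_base2_26 at [x0,#24] is zero), the
	// NEON setup isn't worth it, so tail-call the scalar
	// poly1305_blocks instead.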

.Lblocks_neon:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...
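	// The conversion above just evaluates
	// h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104
	// in base 2^64.  Because the limbs are left partially reduced
	// by the NEON code, the result can spill past 2^130, hence the
	// extra fold below.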

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
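	// The table is built by repeatedly multiplying the running
	// value by r (poly1305_mult) and splatting the result:
	// poly1305_splat fills one 32-bit lane per call and the
	// sub x0,#4 between calls steps back one lane, so each .4s
	// vector ends up as {r^4, r^3, r^2, r^1} (lanes 0..3) of one
	// 26-bit limb, with the 5*r^n values interleaved alongside.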
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48
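	// x3 (the pad bit) is moved to bit 24 because in base 2^26 the
	// top limb covers bits 104..129, so the 2^128 pad bit lands at
	// bit 24 of that limb; x15 points at the r^n table at ctx+48.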

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38
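	// v31 = 0x03ffffff in each 64-bit lane (-1 >> 38 = 2^26-1),
	// the limb mask used by the lazy reduction below.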

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on the reduction from the previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
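	//
	// The h*5*r terms are again the 2^130 = 5 fold-back: partial
	// products at weight 2^130 or above re-enter the sum times 5,
	// which is why the s_i = 5*r_i values were precomputed.  Each
	// iteration absorbs 64 bytes: v14-v18 hold the inp[2:3] pair
	// and are multiplied by r^2 (table lane .s[2]), v9-v13 hold
	// hash+inp[0:1] and are multiplied by r^4 (lane .s[0]), while
	// the general-purpose side converts the next 64 bytes to base
	// 2^26 in parallel.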

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
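	//
	// In short: instead of fully normalizing every limb after the
	// multiply, single carries are propagated along the chain
	// h3->h4, h0->h1, h4->h0 (the h4 carry re-entering times 5),
	// h1->h2, h2->h3, h0->h1, h3->h4, leaving each limb just small
	// enough that the next iteration's products cannot overflow
	// 64 bits.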

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
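	// The two streams finish at different depths in r, so the tail
	// multiplies them by r^2:r^1 (and, on the long-tail path, the
	// leading hash+inp[0:1] pair by r^4:r^3) before the horizontal
	// add folds the two lanes together.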

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
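// (the .byte string above is the ASCII for
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>")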
.align	2
.align	2