#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef	__APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r12,r10
# else
#  ifdef __thumb2__
	itete	eq
#  endif
	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef	__thumb2__
	orr	r12,r12,#1	@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

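	@ Illustrative note (added commentary, not part of the generated code):
	@ on entry h is only partially reduced, but it is small enough that
	@ subtracting the modulus p = 2^130-5 at most once fully reduces it, so
	@ the answer is either h or h+5-2^130.  Adding 5 above and testing bit
	@ 130 (bit 2 of r7) decides which; the movne instructions below then
	@ pick the h+5 candidate when that bit is set.  A hedged C sketch of
	@ the same selection, with h[] and g[] as our own names for the five
	@ 32-bit hash words and the h+5 candidate:
	@
	@	uint32_t mask = (g[4] >> 2) & 1 ? 0xffffffff : 0;
	@	for (int i = 0; i < 4; i++)
	@		h[i] = (h[i] & ~mask) | (g[i] & mask);
	@	/* only the low 128 bits are emitted, so h[4]/g[4] are dropped */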
#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

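	@ Illustrative note (added commentary, not part of the generated code):
	@ the block above splits the clamped 128-bit key r, held as four 32-bit
	@ words, into five 26-bit limbs so that limb products fit in 64 bits.
	@ A hedged C sketch of the same conversion, with w0..w3 and l0..l4 as
	@ our own names for the little-endian words and the limbs:
	@
	@	uint32_t l0 =  w0                   & 0x03ffffff;
	@	uint32_t l1 = (w0 >> 26 | w1 <<  6) & 0x03ffffff;
	@	uint32_t l2 = (w1 >> 20 | w2 << 12) & 0x03ffffff;
	@	uint32_t l3 = (w2 >> 14 | w3 << 18) & 0x03ffffff;
	@	uint32_t l4 =  w3 >>  8;	/* top limb, at most 24 bits */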
	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

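	@ Illustrative note (added commentary, not part of the generated code):
	@ the 5* factors above come from reducing modulo p = 2^130-5.  A term
	@ h_i*r_j carries weight 2^(26*(i+j)); whenever i+j >= 5 the weight
	@ 2^130 wraps around to 5, since 2^130 == 5 (mod p).  For example
	@ h4*r1 has weight 2^130 and is therefore folded into d0 as h4*(5*r1).
	@ The 5*r_j values were precomputed above (x + x<<2 = 5*x), so the
	@ products below need only plain 32x32->64-bit multiply-accumulates.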
	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
	@ or four is n+2. The sum of 2^m (n-m)-bit numbers and one n-bit
	@ number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are bounded by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
	@ hashing the input H0 is limited by (5*4+1)*3 addends, or 58
	@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant?
	@ The vmlal.u32 instruction accepts 2x32-bit inputs and writes a
	@ 2x64-bit result. This means that the result of the reduction
	@ has to be compressed upon loop wrap-around. This can be done in
	@ the process of reduction to minimize the number of instructions
	@ [as well as the number of 128-bit instructions, which benefits
	@ low-end processors], but one has to watch for H2 (which is
	@ narrower than H0) and 5*H4 not being wider than 58 bits, so
	@ that the result of the right shift by 26 bits fits in 32 bits.
	@ This is also useful on x86, because it allows paddd to be used
	@ in place of paddq, which benefits Atom, where paddq is
	@ ridiculously slow.

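	@ Illustrative note (added commentary, not part of the generated code):
	@ a hedged C sketch of the carry chain performed below, with d0..d4 as
	@ our own names for the 64-bit per-limb accumulators.  Two independent
	@ chains are interleaved so no step stalls on the one just before it:
	@
	@	d4 += d3 >> 26; d3 &= 0x3ffffff;	/* h3 -> h4 */
	@	d1 += d0 >> 26; d0 &= 0x3ffffff;	/* h0 -> h1 */
	@	c   = d4 >> 26; d4 &= 0x3ffffff;
	@	d2 += d1 >> 26; d1 &= 0x3ffffff;	/* h1 -> h2 */
	@	d0 += c * 5;				/* h4 -> h0, 2^130 == 5 mod p */
	@	d3 += d2 >> 26; d2 &= 0x3ffffff;	/* h2 -> h3 */
	@	d1 += d0 >> 26; d0 &= 0x3ffffff;	/* h0 -> h1, final trim */
	@	d4 += d3 >> 26; d3 &= 0x3ffffff;	/* h3 -> h4, final trim */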
	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	poly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

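	@ Illustrative note (added commentary, not part of the generated code):
	@ the schedule above is a two-way split of the usual Horner evaluation.
	@ A hedged pseudocode sketch over whole blocks m[0..2n-1], working with
	@ arbitrary-precision values mod p for clarity (names are ours, and
	@ r2 stands for r*r mod p):
	@
	@	acc0 = h; acc1 = 0;
	@	for (j = 0; j < 2*n; j += 2) {
	@		acc0 = (acc0 + m[j])   % p;	/* even blocks */
	@		acc1 = (acc1 + m[j+1]) % p;	/* odd blocks  */
	@		if (j + 2 < 2*n) {		/* not the last pair */
	@			acc0 = acc0 * r2 % p;
	@			acc1 = acc1 * r2 % p;
	@		}
	@	}
	@	h = (acc0 * r2 + acc1 * r) % p;
	@
	@ The loop below does the same four blocks at a time, so the running
	@ lanes are advanced by r^4 per iteration while the freshly loaded
	@ pair only sees r^2, matching the expressions above.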
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

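	@ Illustrative note (added commentary, not part of the generated code):
	@ the code above packs the five 26-bit limbs back into four 32-bit
	@ words plus a small top word, then folds everything at or above 2^130
	@ back in multiplied by 5.  A hedged C sketch, with h0..h4 as the limbs
	@ and w0..w3/top as our own names for the packed words:
	@
	@	w0 =  h0        | h1 << 26;
	@	w1 = (h1 >>  6) | h2 << 20;		/* plus carry from w0 */
	@	w2 = (h2 >> 12) | h3 << 14;		/* plus carry from w1 */
	@	w3 = (h3 >> 18) | h4 <<  8;		/* plus carry from w2 */
	@	top = (h4 >> 24) + carry;		/* bits 128 and up */
	@	t = top & ~3; top &= 3;
	@	w0 += t + (t >> 2);			/* add 5*(top>>2), carry on up */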
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
