#include "arm_asm.h"
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
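@ A rough C-level view of the entry points below (a sketch inferred
@ from this file's code, not an authoritative header):
@
@   int  poly1305_init  (void *ctx, const unsigned char key[16],
@                        void *func[2]);  @ func[0]=blocks, func[1]=emit
@   void poly1305_blocks(void *ctx, const unsigned char *inp,
@                        size_t len, unsigned int padbit);
@   void poly1305_emit  (void *ctx, unsigned char mac[16],
@                        const unsigned int nonce[4]);
@
@ poly1305_init zeroes the hash, stores the key clamped with the usual
@ 0x0ffffffc0ffffffc0ffffffc0fffffff mask, and returns 1 if it filled
@ the function table at func (0 otherwise, or when key is NULL).
@
@ Context layout, as used by the offsets referenced throughout:
@   ctx+0..16   h0..h4, the 130-bit accumulator
@   ctx+20..32  r0..r3, the clamped key
@   ctx+36      is_base2_26 flag (set once the NEON code switches the
@               accumulator to base 2^26)
@   ctx+48...   NEON-only table of key powers (see poly1305_init_neon)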
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef	__APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r12,r10
# else
#  ifdef __thumb2__
	itete	eq
#  endif
	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef	__thumb2__
	orr	r12,r12,#1	@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	RET				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
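@ poly1305_blocks(ctx, inp, len, padbit): consumes len bytes from inp
@ (len is rounded down to a multiple of 16) and for each 16-byte block
@ computes h = (h + block + padbit*2^128) * r mod 2^130-5; a non-zero
@ padbit (r3) marks full blocks, a zero padbit is presumably used for
@ a final block the caller has already padded.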
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

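	@ The r+(r>>2) adds above rely on r1..r3 being multiples of 4
	@ (the key is clamped), so they compute 5*r/4. Product terms
	@ whose weight reaches 2^130 and beyond, e.g. h4*r1 at 2^160,
	@ are then folded back into the low limbs as h4*(5*r1/4)*2^32,
	@ because 2^130 = 5 mod 2^130-5.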
	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

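	@ 2^130 = 5 mod 2^130-5: bits 130 and up of the accumulator,
	@ i.e. h4>>2, are masked out (r1 = h4 & ~3) and folded back
	@ into h0 as 5*(h4>>2), which is what the *=5 add below
	@ computes from r1 = 4*(h4>>2).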
	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
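@ poly1305_emit(ctx, mac, nonce): reduces h fully modulo 2^130-5, adds
@ the 128-bit nonce and stores the 16-byte tag at mac (bytewise stores
@ on pre-ARMv7, which also keeps unaligned mac pointers safe).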
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
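	@ if h >= 2^130-5, then h+5 carries into bit 130, i.e. bit 2 of
	@ the top limb checked above; in that case the conditional moves
	@ below pick r8-r11, the low 128 bits of h+5, which equal
	@ h - (2^130-5) modulo 2^128.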

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	RET				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

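@ poly1305_init_neon: converts the clamped key at ctx+20 into five
@ 26-bit limbs and computes its powers up to r^4 (together with the
@ 5*r_i multiples used for reduction), storing them lane-interleaved
@ at ctx+48 for the two-blocks-per-lane main loop. Called lazily from
@ poly1305_blocks_neon the first time the NEON path is taken.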
.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number
	@ is n+m bits wide. However! Even though 2^n is an n+1-bit
	@ number, an m-bit number multiplied by 2^n is still n+m bits
	@ wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of
	@ three or four is n+2. The sum of 2^m n-m-bit numbers and an
	@ n-bit one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the
	@ *sum* of the products with these values can still be viewed
	@ as a sum of 52-bit numbers as long as the number of addends
	@ is not a power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
	@ hashing the input, H0 is limited by (5*4+1)*3 addends, or 58
	@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this
	@ relevant? The vmlal.u32 instruction accepts 2x32-bit inputs
	@ and writes a 2x64-bit result. This means that the result of
	@ reduction has to be compressed upon loop wrap-around. This
	@ can be done in the process of reduction to minimize the
	@ number of instructions [as well as the number of 128-bit
	@ instructions, which benefits low-end processors], but one has
	@ to watch for H2 (which is narrower than H0) and 5*H4 not
	@ being wider than 58 bits, so that the result of the right
	@ shift by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows paddd to be used in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	RET				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

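@ poly1305_blocks_neon: same arguments as poly1305_blocks. Short
@ inputs (<64 bytes) are handed to the scalar code while the hash is
@ still in base 2^32; otherwise the hash is converted to base 2^26
@ (five 26-bit limbs, is_base2_26 set) and four 16-byte blocks are
@ processed per iteration across two 64-bit lanes, using the power
@ table prepared by poly1305_init_neon.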
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
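	@ Put differently, this is Horner's rule,
	@   h = (...((h+inp[0])*r+inp[1])*r+...)*r  mod 2^130-5,
	@ evaluated two blocks per 64-bit lane: each lane advances by
	@ r^4 per iteration, and the lanes are recombined at the tail
	@ by the final multiplications by r^2 and r shown above.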
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	RET					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

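@ poly1305_emit_neon: as poly1305_emit, but when is_base2_26 is set it
@ first converts the hash from five 26-bit limbs back to base 2^32 and
@ folds the bits above 2^130 down (times 5) before the final reduction
@ and nonce addition; with the flag clear it simply branches to the
@ scalar emit code.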
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	RET				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
