// xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/armv8-mont.S (revision 1b3d6f93806f8821fe459e13ad13e605b37c6d43)
// Preamble: outside the kernel build, pull in arm_arch.h and keep the
// run-time CPU-capability flag OPENSSL_armv8_rsa_neonized hidden
// (local to this shared object).
#ifndef	__KERNEL__
# include "arm_arch.h"

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0, int num);
//
// AAPCS64: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num (64-bit limbs).
// Dispatcher: for num divisible by 8 and > 32 (and when the runtime flag
// OPENSSL_armv8_rsa_neonized is set, userland only) jumps to the NEON
// path; otherwise to __bn_sqr8x_mont / __bn_mul4x_mont when num is a
// multiple of 8 / 4; the fall-through is the generic word-by-word
// Montgomery multiply below.  Temporary tp[] lives on a stack alloca of
// num*8 bytes (16-byte aligned per the ABI).  Returns 1 in x0.
.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
.Lbn_mul_mont:
	tst	x5,#3
	b.ne	.Lmul_mont
	cmp	x5,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
// bn_mul8x_mont_neon — NEON Montgomery multiplication, reached only via
// the bn_mul_mont dispatcher (same arguments: x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0, x5=num; num is a multiple of 8 here).  The algorithm works on
// 32-bit half-limbs smeared across vector lanes ("smashed" b[] and m[]
// values), accumulating 16-bit digits in 2x64-bit lanes; the scalar tail
// performs the carry propagation, the conditional subtraction of the
// modulus, and wipes the stack frame.  d8-d15 are callee-saved per
// AAPCS64 and are preserved across the call.
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	x5,x5,#1
	eor	v14.16b,v14.16b,v14.16b

.align	4
.LNEON_8n:
	eor	v6.16b,v6.16b,v6.16b
	sub	x7,sp,#128
	eor	v7.16b,v7.16b,v7.16b
	sub	x7,x7,x5,lsl#4
	eor	v8.16b,v8.16b,v8.16b
	and	x7,x7,#-64
	eor	v9.16b,v9.16b,v9.16b
	mov	sp,x7		// alloca
	eor	v10.16b,v10.16b,v10.16b
	add	x7,x7,#256
	eor	v11.16b,v11.16b,v11.16b
	sub	x8,x5,#8
	eor	v12.16b,v12.16b,v12.16b
	eor	v13.16b,v13.16b,v13.16b

.LNEON_8n_init:
	st1	{v6.2d,v7.2d},[x7],#32
	subs	x8,x8,#8
	st1	{v8.2d,v9.2d},[x7],#32
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d,v13.2d},[x7],#32
	bne	.LNEON_8n_init

	add	x6,sp,#256
	ld1	{v0.4s,v1.4s},[x1],#32
	add	x10,sp,#8
	ldr	s30,[x4],#4
	mov	x9,x5
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	s28,[x2],#4   // *b++
	uxtl	v28.4s,v28.4h
	add	x7,sp,#128
	ld1	{v2.4s,v3.4s},[x3],#32

	umlal	v6.2d,v28.2s,v0.s[0]
	umlal	v7.2d,v28.2s,v0.s[1]
	umlal	v8.2d,v28.2s,v0.s[2]
	shl	v29.2d,v6.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v9.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v6.2d
	umlal	v10.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v11.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	v12.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v13.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v8.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v6.2d,#16
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	ext	v6.16b,v6.16b,v6.16b,#8
	add	v6.2d,v6.2d,v15.2d
	umlal	v11.2d,v29.2s,v3.s[1]
	ushr	v6.2d,v6.2d,#16
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	add	v16.2d,v7.2d,v6.2d
	ins	v7.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6],#16
	umlal	v8.2d,v28.2s,v0.s[1]
	umlal	v9.2d,v28.2s,v0.s[2]
	shl	v29.2d,v7.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v10.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v7.2d
	umlal	v11.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v12.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
	umlal	v13.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v6.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v9.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v7.2d,#16
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	umlal	v12.2d,v29.2s,v3.s[1]
	ushr	v7.2d,v7.2d,#16
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	add	v16.2d,v8.2d,v7.2d
	ins	v8.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6],#16
	umlal	v9.2d,v28.2s,v0.s[1]
	umlal	v10.2d,v28.2s,v0.s[2]
	shl	v29.2d,v8.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v11.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v8.2d
	umlal	v12.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v13.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
	umlal	v6.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v7.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v10.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v8.2d,#16
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	umlal	v13.2d,v29.2s,v3.s[1]
	ushr	v8.2d,v8.2d,#16
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	add	v16.2d,v9.2d,v8.2d
	ins	v9.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6],#16
	umlal	v10.2d,v28.2s,v0.s[1]
	umlal	v11.2d,v28.2s,v0.s[2]
	shl	v29.2d,v9.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v12.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v9.2d
	umlal	v13.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v6.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
	umlal	v7.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v8.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v11.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v9.2d,#16
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	umlal	v6.2d,v29.2s,v3.s[1]
	ushr	v9.2d,v9.2d,#16
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	add	v16.2d,v10.2d,v9.2d
	ins	v10.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6],#16
	umlal	v11.2d,v28.2s,v0.s[1]
	umlal	v12.2d,v28.2s,v0.s[2]
	shl	v29.2d,v10.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v13.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v10.2d
	umlal	v6.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v7.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
	umlal	v8.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v9.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v12.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v10.2d,#16
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	umlal	v7.2d,v29.2s,v3.s[1]
	ushr	v10.2d,v10.2d,#16
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	add	v16.2d,v11.2d,v10.2d
	ins	v11.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6],#16
	umlal	v12.2d,v28.2s,v0.s[1]
	umlal	v13.2d,v28.2s,v0.s[2]
	shl	v29.2d,v11.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v6.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v11.2d
	umlal	v7.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v8.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
	umlal	v9.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v10.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v13.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v11.2d,#16
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	umlal	v8.2d,v29.2s,v3.s[1]
	ushr	v11.2d,v11.2d,#16
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	add	v16.2d,v12.2d,v11.2d
	ins	v12.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6],#16
	umlal	v13.2d,v28.2s,v0.s[1]
	umlal	v6.2d,v28.2s,v0.s[2]
	shl	v29.2d,v12.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v7.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v12.2d
	umlal	v8.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v9.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
	umlal	v10.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v11.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v6.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v12.2d,#16
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	umlal	v9.2d,v29.2s,v3.s[1]
	ushr	v12.2d,v12.2d,#16
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	add	v16.2d,v13.2d,v12.2d
	ins	v13.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6],#16
	umlal	v6.2d,v28.2s,v0.s[1]
	umlal	v7.2d,v28.2s,v0.s[2]
	shl	v29.2d,v13.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v8.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v13.2d
	umlal	v9.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v10.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
	umlal	v11.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v12.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v6.2d,v29.2s,v2.s[1]
	umlal	v7.2d,v29.2s,v2.s[2]
	mov	v5.16b,v13.16b
	ushr	v5.2d,v5.2d,#16
	ext	v13.16b,v13.16b,v13.16b,#8
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	add	v13.2d,v13.2d,v5.2d
	umlal	v10.2d,v29.2s,v3.s[1]
	ushr	v13.2d,v13.2d,#16
	eor	v15.16b,v15.16b,v15.16b
	ins	v13.d[1],v15.d[0]
	umlal	v11.2d,v29.2s,v3.s[2]
	umlal	v12.2d,v29.2s,v3.s[3]
	add	v6.2d,v6.2d,v13.2d
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
	add	x10,sp,#8		// rewind
	sub	x8,x5,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	x8,x8,#8
	umlal	v6.2d,v28.2s,v0.s[0]
	ld1	{v13.2d},[x6]
	umlal	v7.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
	umlal	v8.2d,v28.2s,v0.s[2]
	ld1	{v2.4s,v3.4s},[x3],#32
	umlal	v9.2d,v28.2s,v0.s[3]
	b.eq	.LInner_jump
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump:
	umlal	v10.2d,v28.2s,v1.s[0]
	umlal	v11.2d,v28.2s,v1.s[1]
	umlal	v12.2d,v28.2s,v1.s[2]
	umlal	v13.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	umlal	v8.2d,v29.2s,v2.s[2]
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	umlal	v11.2d,v29.2s,v3.s[1]
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	st1	{v6.2d},[x7],#16
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6]
	umlal	v8.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
	umlal	v9.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump1
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump1:
	umlal	v10.2d,v28.2s,v0.s[3]
	umlal	v11.2d,v28.2s,v1.s[0]
	umlal	v12.2d,v28.2s,v1.s[1]
	umlal	v13.2d,v28.2s,v1.s[2]
	umlal	v6.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	umlal	v9.2d,v29.2s,v2.s[2]
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	umlal	v12.2d,v29.2s,v3.s[1]
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	st1	{v7.2d},[x7],#16
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6]
	umlal	v9.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
	umlal	v10.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump2
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump2:
	umlal	v11.2d,v28.2s,v0.s[3]
	umlal	v12.2d,v28.2s,v1.s[0]
	umlal	v13.2d,v28.2s,v1.s[1]
	umlal	v6.2d,v28.2s,v1.s[2]
	umlal	v7.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	umlal	v10.2d,v29.2s,v2.s[2]
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	umlal	v13.2d,v29.2s,v3.s[1]
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	st1	{v8.2d},[x7],#16
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6]
	umlal	v10.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
	umlal	v11.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump3
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump3:
	umlal	v12.2d,v28.2s,v0.s[3]
	umlal	v13.2d,v28.2s,v1.s[0]
	umlal	v6.2d,v28.2s,v1.s[1]
	umlal	v7.2d,v28.2s,v1.s[2]
	umlal	v8.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	umlal	v11.2d,v29.2s,v2.s[2]
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	umlal	v6.2d,v29.2s,v3.s[1]
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	st1	{v9.2d},[x7],#16
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6]
	umlal	v11.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
	umlal	v12.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump4
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump4:
	umlal	v13.2d,v28.2s,v0.s[3]
	umlal	v6.2d,v28.2s,v1.s[0]
	umlal	v7.2d,v28.2s,v1.s[1]
	umlal	v8.2d,v28.2s,v1.s[2]
	umlal	v9.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	umlal	v12.2d,v29.2s,v2.s[2]
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	umlal	v7.2d,v29.2s,v3.s[1]
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	st1	{v10.2d},[x7],#16
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6]
	umlal	v12.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
	umlal	v13.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump5
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump5:
	umlal	v6.2d,v28.2s,v0.s[3]
	umlal	v7.2d,v28.2s,v1.s[0]
	umlal	v8.2d,v28.2s,v1.s[1]
	umlal	v9.2d,v28.2s,v1.s[2]
	umlal	v10.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	umlal	v13.2d,v29.2s,v2.s[2]
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	umlal	v8.2d,v29.2s,v3.s[1]
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	st1	{v11.2d},[x7],#16
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6]
	umlal	v13.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
	umlal	v6.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump6
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump6:
	umlal	v7.2d,v28.2s,v0.s[3]
	umlal	v8.2d,v28.2s,v1.s[0]
	umlal	v9.2d,v28.2s,v1.s[1]
	umlal	v10.2d,v28.2s,v1.s[2]
	umlal	v11.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	umlal	v6.2d,v29.2s,v2.s[2]
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	umlal	v9.2d,v29.2s,v3.s[1]
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	st1	{v12.2d},[x7],#16
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6]
	umlal	v6.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
	umlal	v7.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump7
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump7:
	umlal	v8.2d,v28.2s,v0.s[3]
	umlal	v9.2d,v28.2s,v1.s[0]
	umlal	v10.2d,v28.2s,v1.s[1]
	umlal	v11.2d,v28.2s,v1.s[2]
	umlal	v12.2d,v28.2s,v1.s[3]
	b.ne	.LInner_after_rewind8
	sub	x1,x1,x5,lsl#2	// rewind
.LInner_after_rewind8:
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v6.2d,v29.2s,v2.s[1]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v7.2d,v29.2s,v2.s[2]
	add	x10,sp,#8		// rewind
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	umlal	v10.2d,v29.2s,v3.s[1]
	umlal	v11.2d,v29.2s,v3.s[2]
	st1	{v13.2d},[x7],#16
	umlal	v12.2d,v29.2s,v3.s[3]

	bne	.LNEON_8n_inner
	add	x6,sp,#128
	st1	{v6.2d,v7.2d},[x7],#32
	eor	v2.16b,v2.16b,v2.16b	// v2
	st1	{v8.2d,v9.2d},[x7],#32
	eor	v3.16b,v3.16b,v3.16b	// v3
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d},[x7]

	subs	x9,x9,#8
	ld1	{v6.2d,v7.2d},[x6],#32
	ld1	{v8.2d,v9.2d},[x6],#32
	ld1	{v10.2d,v11.2d},[x6],#32
	ld1	{v12.2d,v13.2d},[x6],#32

	b.eq	.LInner_8n_jump_2steps
	sub	x3,x3,x5,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	x7,sp,#128
	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	st1	{v2.2d,v3.2d}, [sp],#32
	add	v6.2d,v6.2d,v15.2d
	st1	{v2.2d,v3.2d}, [sp],#32
	ushr	v15.2d,v6.2d,#16
	st1	{v2.2d,v3.2d}, [sp],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

	mov	x8,x5
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	v6.2d,v6.2d,v15.2d
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	ld1	{v8.2d,v9.2d}, [x6],#32
	add	v6.2d,v6.2d,v15.2d
	ld1	{v10.2d,v11.2d}, [x6],#32
	ushr	v15.2d,v6.2d,#16
	ld1	{v12.2d,v13.2d}, [x6],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

.LNEON_tail_entry:
	add	v7.2d,v7.2d,v15.2d
	st1	{v6.s}[0], [x7],#4
	ushr	v15.2d,v7.2d,#16
	mov	v5.16b,v7.16b
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	ushr	v15.2d,v7.2d,#16
	zip1	v7.4h,v5.4h,v7.4h
	ins	v15.d[1],v14.d[0]
	add	v8.2d,v8.2d,v15.2d
	st1	{v7.s}[0], [x7],#4
	ushr	v15.2d,v8.2d,#16
	mov	v5.16b,v8.16b
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	ushr	v15.2d,v8.2d,#16
	zip1	v8.4h,v5.4h,v8.4h
	ins	v15.d[1],v14.d[0]
	add	v9.2d,v9.2d,v15.2d
	st1	{v8.s}[0], [x7],#4
	ushr	v15.2d,v9.2d,#16
	mov	v5.16b,v9.16b
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	ushr	v15.2d,v9.2d,#16
	zip1	v9.4h,v5.4h,v9.4h
	ins	v15.d[1],v14.d[0]
	add	v10.2d,v10.2d,v15.2d
	st1	{v9.s}[0], [x7],#4
	ushr	v15.2d,v10.2d,#16
	mov	v5.16b,v10.16b
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	ushr	v15.2d,v10.2d,#16
	zip1	v10.4h,v5.4h,v10.4h
	ins	v15.d[1],v14.d[0]
	add	v11.2d,v11.2d,v15.2d
	st1	{v10.s}[0], [x7],#4
	ushr	v15.2d,v11.2d,#16
	mov	v5.16b,v11.16b
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	ushr	v15.2d,v11.2d,#16
	zip1	v11.4h,v5.4h,v11.4h
	ins	v15.d[1],v14.d[0]
	add	v12.2d,v12.2d,v15.2d
	st1	{v11.s}[0], [x7],#4
	ushr	v15.2d,v12.2d,#16
	mov	v5.16b,v12.16b
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	ushr	v15.2d,v12.2d,#16
	zip1	v12.4h,v5.4h,v12.4h
	ins	v15.d[1],v14.d[0]
	add	v13.2d,v13.2d,v15.2d
	st1	{v12.s}[0], [x7],#4
	ushr	v15.2d,v13.2d,#16
	mov	v5.16b,v13.16b
	ext	v13.16b,v13.16b,v13.16b,#8
	add	v13.2d,v13.2d,v15.2d
	ushr	v15.2d,v13.2d,#16
	zip1	v13.4h,v5.4h,v13.4h
	ins	v15.d[1],v14.d[0]
	ld1	{v6.2d,v7.2d}, [x6],#32
	subs	x8,x8,#8
	st1	{v13.s}[0], [x7],#4
	bne	.LNEON_tail

	st1	{v15.s}[0], [x7],#4	// top-most bit
	sub	x3,x3,x5,lsl#2		// rewind x3
	subs	x1,sp,#0			// clear carry flag
	add	x2,sp,x5,lsl#2

.LNEON_sub:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x3],#8
	ldp	w10,w11,[x3],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,x2,x1
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [x1]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,x2,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	x1,sp
	sub	x0,x0,x11		// rewind x0
	mov	x3,x2		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	sub	x1,x1,#32
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [x1],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	sub	x17,x2,x1		// preserves carry
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	ret	// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
922.type	__bn_sqr8x_mont,%function
923.align	5
924__bn_sqr8x_mont:
925	cmp	x1,x2
926	b.ne	__bn_mul4x_mont
927.Lsqr8x_mont:
928.inst	0xd503233f		// paciasp
929	stp	x29,x30,[sp,#-128]!
930	add	x29,sp,#0
931	stp	x19,x20,[sp,#16]
932	stp	x21,x22,[sp,#32]
933	stp	x23,x24,[sp,#48]
934	stp	x25,x26,[sp,#64]
935	stp	x27,x28,[sp,#80]
936	stp	x0,x3,[sp,#96]	// offload rp and np
937
938	ldp	x6,x7,[x1,#8*0]
939	ldp	x8,x9,[x1,#8*2]
940	ldp	x10,x11,[x1,#8*4]
941	ldp	x12,x13,[x1,#8*6]
942
943	sub	x2,sp,x5,lsl#4
944	lsl	x5,x5,#3
945	ldr	x4,[x4]		// *n0
946	mov	sp,x2			// alloca
947	sub	x27,x5,#8*8
948	b	.Lsqr8x_zero_start
949
950.Lsqr8x_zero:
951	sub	x27,x27,#8*8
952	stp	xzr,xzr,[x2,#8*0]
953	stp	xzr,xzr,[x2,#8*2]
954	stp	xzr,xzr,[x2,#8*4]
955	stp	xzr,xzr,[x2,#8*6]
956.Lsqr8x_zero_start:
957	stp	xzr,xzr,[x2,#8*8]
958	stp	xzr,xzr,[x2,#8*10]
959	stp	xzr,xzr,[x2,#8*12]
960	stp	xzr,xzr,[x2,#8*14]
961	add	x2,x2,#8*16
962	cbnz	x27,.Lsqr8x_zero
963
964	add	x3,x1,x5
965	add	x1,x1,#8*8
966	mov	x19,xzr
967	mov	x20,xzr
968	mov	x21,xzr
969	mov	x22,xzr
970	mov	x23,xzr
971	mov	x24,xzr
972	mov	x25,xzr
973	mov	x26,xzr
974	mov	x2,sp
975	str	x4,[x29,#112]		// offload n0
976
977	// Multiply everything but a[i]*a[i]
978.align	4
979.Lsqr8x_outer_loop:
980        //                                                 a[1]a[0]	(i)
981        //                                             a[2]a[0]
982        //                                         a[3]a[0]
983        //                                     a[4]a[0]
984        //                                 a[5]a[0]
985        //                             a[6]a[0]
986        //                         a[7]a[0]
987        //                                         a[2]a[1]		(ii)
988        //                                     a[3]a[1]
989        //                                 a[4]a[1]
990        //                             a[5]a[1]
991        //                         a[6]a[1]
992        //                     a[7]a[1]
993        //                                 a[3]a[2]			(iii)
994        //                             a[4]a[2]
995        //                         a[5]a[2]
996        //                     a[6]a[2]
997        //                 a[7]a[2]
998        //                         a[4]a[3]				(iv)
999        //                     a[5]a[3]
1000        //                 a[6]a[3]
1001        //             a[7]a[3]
1002        //                 a[5]a[4]					(v)
1003        //             a[6]a[4]
1004        //         a[7]a[4]
1005        //         a[6]a[5]						(vi)
1006        //     a[7]a[5]
1007        // a[7]a[6]							(vii)
1008
1009	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
1010	mul	x15,x8,x6
1011	mul	x16,x9,x6
1012	mul	x17,x10,x6
1013	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
1014	mul	x14,x11,x6
1015	adcs	x21,x21,x15
1016	mul	x15,x12,x6
1017	adcs	x22,x22,x16
1018	mul	x16,x13,x6
1019	adcs	x23,x23,x17
1020	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
1021	adcs	x24,x24,x14
1022	umulh	x14,x8,x6
1023	adcs	x25,x25,x15
1024	umulh	x15,x9,x6
1025	adcs	x26,x26,x16
1026	umulh	x16,x10,x6
1027	stp	x19,x20,[x2],#8*2	// t[0..1]
1028	adc	x19,xzr,xzr		// t[8]
1029	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
1030	umulh	x17,x11,x6
1031	adcs	x22,x22,x14
1032	umulh	x14,x12,x6
1033	adcs	x23,x23,x15
1034	umulh	x15,x13,x6
1035	adcs	x24,x24,x16
1036	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
1037	adcs	x25,x25,x17
1038	mul	x17,x9,x7
1039	adcs	x26,x26,x14
1040	mul	x14,x10,x7
1041	adc	x19,x19,x15
1042
1043	mul	x15,x11,x7
1044	adds	x22,x22,x16
1045	mul	x16,x12,x7
1046	adcs	x23,x23,x17
1047	mul	x17,x13,x7
1048	adcs	x24,x24,x14
1049	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
1050	adcs	x25,x25,x15
1051	umulh	x15,x9,x7
1052	adcs	x26,x26,x16
1053	umulh	x16,x10,x7
1054	adcs	x19,x19,x17
1055	umulh	x17,x11,x7
1056	stp	x21,x22,[x2],#8*2	// t[2..3]
1057	adc	x20,xzr,xzr		// t[9]
1058	adds	x23,x23,x14
1059	umulh	x14,x12,x7
1060	adcs	x24,x24,x15
1061	umulh	x15,x13,x7
1062	adcs	x25,x25,x16
1063	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
1064	adcs	x26,x26,x17
1065	mul	x17,x10,x8
1066	adcs	x19,x19,x14
1067	mul	x14,x11,x8
1068	adc	x20,x20,x15
1069
1070	mul	x15,x12,x8
1071	adds	x24,x24,x16
1072	mul	x16,x13,x8
1073	adcs	x25,x25,x17
1074	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
1075	adcs	x26,x26,x14
1076	umulh	x14,x10,x8
1077	adcs	x19,x19,x15
1078	umulh	x15,x11,x8
1079	adcs	x20,x20,x16
1080	umulh	x16,x12,x8
1081	stp	x23,x24,[x2],#8*2	// t[4..5]
1082	adc	x21,xzr,xzr		// t[10]
1083	adds	x25,x25,x17
1084	umulh	x17,x13,x8
1085	adcs	x26,x26,x14
1086	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
1087	adcs	x19,x19,x15
1088	mul	x15,x11,x9
1089	adcs	x20,x20,x16
1090	mul	x16,x12,x9
1091	adc	x21,x21,x17
1092
1093	mul	x17,x13,x9
1094	adds	x26,x26,x14
1095	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
1096	adcs	x19,x19,x15
1097	umulh	x15,x11,x9
1098	adcs	x20,x20,x16
1099	umulh	x16,x12,x9
1100	adcs	x21,x21,x17
1101	umulh	x17,x13,x9
1102	stp	x25,x26,[x2],#8*2	// t[6..7]
1103	adc	x22,xzr,xzr		// t[11]
1104	adds	x19,x19,x14
1105	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
1106	adcs	x20,x20,x15
1107	mul	x15,x12,x10
1108	adcs	x21,x21,x16
1109	mul	x16,x13,x10
1110	adc	x22,x22,x17
1111
1112	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
1113	adds	x20,x20,x14
1114	umulh	x14,x12,x10
1115	adcs	x21,x21,x15
1116	umulh	x15,x13,x10
1117	adcs	x22,x22,x16
1118	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
1119	adc	x23,xzr,xzr		// t[12]
1120	adds	x21,x21,x17
1121	mul	x17,x13,x11
1122	adcs	x22,x22,x14
1123	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
1124	adc	x23,x23,x15
1125
1126	umulh	x15,x13,x11
1127	adds	x22,x22,x16
1128	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
1129	adcs	x23,x23,x17
1130	umulh	x17,x13,x12		// hi(a[7]*a[6])
1131	adc	x24,xzr,xzr		// t[13]
1132	adds	x23,x23,x14
1133	sub	x27,x3,x1	// done yet?
1134	adc	x24,x24,x15
1135
1136	adds	x24,x24,x16
1137	sub	x14,x3,x5	// rewinded ap
1138	adc	x25,xzr,xzr		// t[14]
1139	add	x25,x25,x17
1140
1141	cbz	x27,.Lsqr8x_outer_break
1142
1143	mov	x4,x6
1144	ldp	x6,x7,[x2,#8*0]
1145	ldp	x8,x9,[x2,#8*2]
1146	ldp	x10,x11,[x2,#8*4]
1147	ldp	x12,x13,[x2,#8*6]
1148	adds	x19,x19,x6
1149	adcs	x20,x20,x7
1150	ldp	x6,x7,[x1,#8*0]
1151	adcs	x21,x21,x8
1152	adcs	x22,x22,x9
1153	ldp	x8,x9,[x1,#8*2]
1154	adcs	x23,x23,x10
1155	adcs	x24,x24,x11
1156	ldp	x10,x11,[x1,#8*4]
1157	adcs	x25,x25,x12
1158	mov	x0,x1
1159	adcs	x26,xzr,x13
1160	ldp	x12,x13,[x1,#8*6]
1161	add	x1,x1,#8*8
1162	//adc	x28,xzr,xzr		// moved below
1163	mov	x27,#-8*8
1164
1165	//                                                         a[8]a[0]
1166	//                                                     a[9]a[0]
1167	//                                                 a[a]a[0]
1168	//                                             a[b]a[0]
1169	//                                         a[c]a[0]
1170	//                                     a[d]a[0]
1171	//                                 a[e]a[0]
1172	//                             a[f]a[0]
1173	//                                                     a[8]a[1]
1174	//                         a[f]a[1]........................
1175	//                                                 a[8]a[2]
1176	//                     a[f]a[2]........................
1177	//                                             a[8]a[3]
1178	//                 a[f]a[3]........................
1179	//                                         a[8]a[4]
1180	//             a[f]a[4]........................
1181	//                                     a[8]a[5]
1182	//         a[f]a[5]........................
1183	//                                 a[8]a[6]
1184	//     a[f]a[6]........................
1185	//                             a[8]a[7]
1186	// a[f]a[7]........................
1187.Lsqr8x_mul:
1188	mul	x14,x6,x4
1189	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1190	mul	x15,x7,x4
1191	add	x27,x27,#8
1192	mul	x16,x8,x4
1193	mul	x17,x9,x4
1194	adds	x19,x19,x14
1195	mul	x14,x10,x4
1196	adcs	x20,x20,x15
1197	mul	x15,x11,x4
1198	adcs	x21,x21,x16
1199	mul	x16,x12,x4
1200	adcs	x22,x22,x17
1201	mul	x17,x13,x4
1202	adcs	x23,x23,x14
1203	umulh	x14,x6,x4
1204	adcs	x24,x24,x15
1205	umulh	x15,x7,x4
1206	adcs	x25,x25,x16
1207	umulh	x16,x8,x4
1208	adcs	x26,x26,x17
1209	umulh	x17,x9,x4
1210	adc	x28,x28,xzr
1211	str	x19,[x2],#8
1212	adds	x19,x20,x14
1213	umulh	x14,x10,x4
1214	adcs	x20,x21,x15
1215	umulh	x15,x11,x4
1216	adcs	x21,x22,x16
1217	umulh	x16,x12,x4
1218	adcs	x22,x23,x17
1219	umulh	x17,x13,x4
1220	ldr	x4,[x0,x27]
1221	adcs	x23,x24,x14
1222	adcs	x24,x25,x15
1223	adcs	x25,x26,x16
1224	adcs	x26,x28,x17
1225	//adc	x28,xzr,xzr		// moved above
1226	cbnz	x27,.Lsqr8x_mul
1227					// note that carry flag is guaranteed
1228					// to be zero at this point
1229	cmp	x1,x3		// done yet?
1230	b.eq	.Lsqr8x_break
1231
1232	ldp	x6,x7,[x2,#8*0]
1233	ldp	x8,x9,[x2,#8*2]
1234	ldp	x10,x11,[x2,#8*4]
1235	ldp	x12,x13,[x2,#8*6]
1236	adds	x19,x19,x6
1237	ldur	x4,[x0,#-8*8]
1238	adcs	x20,x20,x7
1239	ldp	x6,x7,[x1,#8*0]
1240	adcs	x21,x21,x8
1241	adcs	x22,x22,x9
1242	ldp	x8,x9,[x1,#8*2]
1243	adcs	x23,x23,x10
1244	adcs	x24,x24,x11
1245	ldp	x10,x11,[x1,#8*4]
1246	adcs	x25,x25,x12
1247	mov	x27,#-8*8
1248	adcs	x26,x26,x13
1249	ldp	x12,x13,[x1,#8*6]
1250	add	x1,x1,#8*8
1251	//adc	x28,xzr,xzr		// moved above
1252	b	.Lsqr8x_mul
1253
1254.align	4
1255.Lsqr8x_break:
1256	ldp	x6,x7,[x0,#8*0]
1257	add	x1,x0,#8*8
1258	ldp	x8,x9,[x0,#8*2]
1259	sub	x14,x3,x1		// is it last iteration?
1260	ldp	x10,x11,[x0,#8*4]
1261	sub	x15,x2,x14
1262	ldp	x12,x13,[x0,#8*6]
1263	cbz	x14,.Lsqr8x_outer_loop
1264
1265	stp	x19,x20,[x2,#8*0]
1266	ldp	x19,x20,[x15,#8*0]
1267	stp	x21,x22,[x2,#8*2]
1268	ldp	x21,x22,[x15,#8*2]
1269	stp	x23,x24,[x2,#8*4]
1270	ldp	x23,x24,[x15,#8*4]
1271	stp	x25,x26,[x2,#8*6]
1272	mov	x2,x15
1273	ldp	x25,x26,[x15,#8*6]
1274	b	.Lsqr8x_outer_loop
1275
1276.align	4
1277.Lsqr8x_outer_break:
1278	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1279	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
1280	ldp	x15,x16,[sp,#8*1]
1281	ldp	x11,x13,[x14,#8*2]
1282	add	x1,x14,#8*4
1283	ldp	x17,x14,[sp,#8*3]
1284
1285	stp	x19,x20,[x2,#8*0]
1286	mul	x19,x7,x7
1287	stp	x21,x22,[x2,#8*2]
1288	umulh	x7,x7,x7
1289	stp	x23,x24,[x2,#8*4]
1290	mul	x8,x9,x9
1291	stp	x25,x26,[x2,#8*6]
1292	mov	x2,sp
1293	umulh	x9,x9,x9
1294	adds	x20,x7,x15,lsl#1
1295	extr	x15,x16,x15,#63
1296	sub	x27,x5,#8*4
1297
1298.Lsqr4x_shift_n_add:
1299	adcs	x21,x8,x15
1300	extr	x16,x17,x16,#63
1301	sub	x27,x27,#8*4
1302	adcs	x22,x9,x16
1303	ldp	x15,x16,[x2,#8*5]
1304	mul	x10,x11,x11
1305	ldp	x7,x9,[x1],#8*2
1306	umulh	x11,x11,x11
1307	mul	x12,x13,x13
1308	umulh	x13,x13,x13
1309	extr	x17,x14,x17,#63
1310	stp	x19,x20,[x2,#8*0]
1311	adcs	x23,x10,x17
1312	extr	x14,x15,x14,#63
1313	stp	x21,x22,[x2,#8*2]
1314	adcs	x24,x11,x14
1315	ldp	x17,x14,[x2,#8*7]
1316	extr	x15,x16,x15,#63
1317	adcs	x25,x12,x15
1318	extr	x16,x17,x16,#63
1319	adcs	x26,x13,x16
1320	ldp	x15,x16,[x2,#8*9]
1321	mul	x6,x7,x7
1322	ldp	x11,x13,[x1],#8*2
1323	umulh	x7,x7,x7
1324	mul	x8,x9,x9
1325	umulh	x9,x9,x9
1326	stp	x23,x24,[x2,#8*4]
1327	extr	x17,x14,x17,#63
1328	stp	x25,x26,[x2,#8*6]
1329	add	x2,x2,#8*8
1330	adcs	x19,x6,x17
1331	extr	x14,x15,x14,#63
1332	adcs	x20,x7,x14
1333	ldp	x17,x14,[x2,#8*3]
1334	extr	x15,x16,x15,#63
1335	cbnz	x27,.Lsqr4x_shift_n_add
1336	ldp	x1,x4,[x29,#104]	// pull np and n0
1337
1338	adcs	x21,x8,x15
1339	extr	x16,x17,x16,#63
1340	adcs	x22,x9,x16
1341	ldp	x15,x16,[x2,#8*5]
1342	mul	x10,x11,x11
1343	umulh	x11,x11,x11
1344	stp	x19,x20,[x2,#8*0]
1345	mul	x12,x13,x13
1346	umulh	x13,x13,x13
1347	stp	x21,x22,[x2,#8*2]
1348	extr	x17,x14,x17,#63
1349	adcs	x23,x10,x17
1350	extr	x14,x15,x14,#63
1351	ldp	x19,x20,[sp,#8*0]
1352	adcs	x24,x11,x14
1353	extr	x15,x16,x15,#63
1354	ldp	x6,x7,[x1,#8*0]
1355	adcs	x25,x12,x15
1356	extr	x16,xzr,x16,#63
1357	ldp	x8,x9,[x1,#8*2]
1358	adc	x26,x13,x16
1359	ldp	x10,x11,[x1,#8*4]
1360
1361	// Reduce by 512 bits per iteration
1362	mul	x28,x4,x19		// t[0]*n0
1363	ldp	x12,x13,[x1,#8*6]
1364	add	x3,x1,x5
1365	ldp	x21,x22,[sp,#8*2]
1366	stp	x23,x24,[x2,#8*4]
1367	ldp	x23,x24,[sp,#8*4]
1368	stp	x25,x26,[x2,#8*6]
1369	ldp	x25,x26,[sp,#8*6]
1370	add	x1,x1,#8*8
1371	mov	x30,xzr		// initial top-most carry
1372	mov	x2,sp
1373	mov	x27,#8
1374
1375.Lsqr8x_reduction:
1376	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
1377	mul	x15,x7,x28
1378	sub	x27,x27,#1
1379	mul	x16,x8,x28
1380	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
1381	mul	x17,x9,x28
1382	// (*)	adds	xzr,x19,x14
1383	subs	xzr,x19,#1		// (*)
1384	mul	x14,x10,x28
1385	adcs	x19,x20,x15
1386	mul	x15,x11,x28
1387	adcs	x20,x21,x16
1388	mul	x16,x12,x28
1389	adcs	x21,x22,x17
1390	mul	x17,x13,x28
1391	adcs	x22,x23,x14
1392	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
1393	adcs	x23,x24,x15
1394	umulh	x15,x7,x28
1395	adcs	x24,x25,x16
1396	umulh	x16,x8,x28
1397	adcs	x25,x26,x17
1398	umulh	x17,x9,x28
1399	adc	x26,xzr,xzr
1400	adds	x19,x19,x14
1401	umulh	x14,x10,x28
1402	adcs	x20,x20,x15
1403	umulh	x15,x11,x28
1404	adcs	x21,x21,x16
1405	umulh	x16,x12,x28
1406	adcs	x22,x22,x17
1407	umulh	x17,x13,x28
1408	mul	x28,x4,x19		// next t[0]*n0
1409	adcs	x23,x23,x14
1410	adcs	x24,x24,x15
1411	adcs	x25,x25,x16
1412	adc	x26,x26,x17
1413	cbnz	x27,.Lsqr8x_reduction
1414
1415	ldp	x14,x15,[x2,#8*0]
1416	ldp	x16,x17,[x2,#8*2]
1417	mov	x0,x2
1418	sub	x27,x3,x1	// done yet?
1419	adds	x19,x19,x14
1420	adcs	x20,x20,x15
1421	ldp	x14,x15,[x2,#8*4]
1422	adcs	x21,x21,x16
1423	adcs	x22,x22,x17
1424	ldp	x16,x17,[x2,#8*6]
1425	adcs	x23,x23,x14
1426	adcs	x24,x24,x15
1427	adcs	x25,x25,x16
1428	adcs	x26,x26,x17
1429	//adc	x28,xzr,xzr		// moved below
1430	cbz	x27,.Lsqr8x8_post_condition
1431
1432	ldur	x4,[x2,#-8*8]
1433	ldp	x6,x7,[x1,#8*0]
1434	ldp	x8,x9,[x1,#8*2]
1435	ldp	x10,x11,[x1,#8*4]
1436	mov	x27,#-8*8
1437	ldp	x12,x13,[x1,#8*6]
1438	add	x1,x1,#8*8
1439
1440.Lsqr8x_tail:
1441	mul	x14,x6,x4
1442	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1443	mul	x15,x7,x4
1444	add	x27,x27,#8
1445	mul	x16,x8,x4
1446	mul	x17,x9,x4
1447	adds	x19,x19,x14
1448	mul	x14,x10,x4
1449	adcs	x20,x20,x15
1450	mul	x15,x11,x4
1451	adcs	x21,x21,x16
1452	mul	x16,x12,x4
1453	adcs	x22,x22,x17
1454	mul	x17,x13,x4
1455	adcs	x23,x23,x14
1456	umulh	x14,x6,x4
1457	adcs	x24,x24,x15
1458	umulh	x15,x7,x4
1459	adcs	x25,x25,x16
1460	umulh	x16,x8,x4
1461	adcs	x26,x26,x17
1462	umulh	x17,x9,x4
1463	adc	x28,x28,xzr
1464	str	x19,[x2],#8
1465	adds	x19,x20,x14
1466	umulh	x14,x10,x4
1467	adcs	x20,x21,x15
1468	umulh	x15,x11,x4
1469	adcs	x21,x22,x16
1470	umulh	x16,x12,x4
1471	adcs	x22,x23,x17
1472	umulh	x17,x13,x4
1473	ldr	x4,[x0,x27]
1474	adcs	x23,x24,x14
1475	adcs	x24,x25,x15
1476	adcs	x25,x26,x16
1477	adcs	x26,x28,x17
1478	//adc	x28,xzr,xzr		// moved above
1479	cbnz	x27,.Lsqr8x_tail
1480					// note that carry flag is guaranteed
1481					// to be zero at this point
1482	ldp	x6,x7,[x2,#8*0]
1483	sub	x27,x3,x1	// done yet?
1484	sub	x16,x3,x5	// rewinded np
1485	ldp	x8,x9,[x2,#8*2]
1486	ldp	x10,x11,[x2,#8*4]
1487	ldp	x12,x13,[x2,#8*6]
1488	cbz	x27,.Lsqr8x_tail_break
1489
1490	ldur	x4,[x0,#-8*8]
1491	adds	x19,x19,x6
1492	adcs	x20,x20,x7
1493	ldp	x6,x7,[x1,#8*0]
1494	adcs	x21,x21,x8
1495	adcs	x22,x22,x9
1496	ldp	x8,x9,[x1,#8*2]
1497	adcs	x23,x23,x10
1498	adcs	x24,x24,x11
1499	ldp	x10,x11,[x1,#8*4]
1500	adcs	x25,x25,x12
1501	mov	x27,#-8*8
1502	adcs	x26,x26,x13
1503	ldp	x12,x13,[x1,#8*6]
1504	add	x1,x1,#8*8
1505	//adc	x28,xzr,xzr		// moved above
1506	b	.Lsqr8x_tail
1507
1508.align	4
1509.Lsqr8x_tail_break:
1510	ldr	x4,[x29,#112]		// pull n0
1511	add	x27,x2,#8*8		// end of current t[num] window
1512
1513	subs	xzr,x30,#1		// "move" top-most carry to carry bit
1514	adcs	x14,x19,x6
1515	adcs	x15,x20,x7
1516	ldp	x19,x20,[x0,#8*0]
1517	adcs	x21,x21,x8
1518	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
1519	adcs	x22,x22,x9
1520	ldp	x8,x9,[x16,#8*2]
1521	adcs	x23,x23,x10
1522	adcs	x24,x24,x11
1523	ldp	x10,x11,[x16,#8*4]
1524	adcs	x25,x25,x12
1525	adcs	x26,x26,x13
1526	ldp	x12,x13,[x16,#8*6]
1527	add	x1,x16,#8*8
1528	adc	x30,xzr,xzr	// top-most carry
1529	mul	x28,x4,x19
1530	stp	x14,x15,[x2,#8*0]
1531	stp	x21,x22,[x2,#8*2]
1532	ldp	x21,x22,[x0,#8*2]
1533	stp	x23,x24,[x2,#8*4]
1534	ldp	x23,x24,[x0,#8*4]
1535	cmp	x27,x29		// did we hit the bottom?
1536	stp	x25,x26,[x2,#8*6]
1537	mov	x2,x0			// slide the window
1538	ldp	x25,x26,[x0,#8*6]
1539	mov	x27,#8
1540	b.ne	.Lsqr8x_reduction
1541
1542	// Final step. We see if result is larger than modulus, and
1543	// if it is, subtract the modulus. But comparison implies
1544	// subtraction. So we subtract modulus, see if it borrowed,
1545	// and conditionally copy original value.
1546	ldr	x0,[x29,#96]		// pull rp
1547	add	x2,x2,#8*8
1548	subs	x14,x19,x6
1549	sbcs	x15,x20,x7
1550	sub	x27,x5,#8*8
1551	mov	x3,x0		// x0 copy
1552
1553.Lsqr8x_sub:
1554	sbcs	x16,x21,x8
1555	ldp	x6,x7,[x1,#8*0]
1556	sbcs	x17,x22,x9
1557	stp	x14,x15,[x0,#8*0]
1558	sbcs	x14,x23,x10
1559	ldp	x8,x9,[x1,#8*2]
1560	sbcs	x15,x24,x11
1561	stp	x16,x17,[x0,#8*2]
1562	sbcs	x16,x25,x12
1563	ldp	x10,x11,[x1,#8*4]
1564	sbcs	x17,x26,x13
1565	ldp	x12,x13,[x1,#8*6]
1566	add	x1,x1,#8*8
1567	ldp	x19,x20,[x2,#8*0]
1568	sub	x27,x27,#8*8
1569	ldp	x21,x22,[x2,#8*2]
1570	ldp	x23,x24,[x2,#8*4]
1571	ldp	x25,x26,[x2,#8*6]
1572	add	x2,x2,#8*8
1573	stp	x14,x15,[x0,#8*4]
1574	sbcs	x14,x19,x6
1575	stp	x16,x17,[x0,#8*6]
1576	add	x0,x0,#8*8
1577	sbcs	x15,x20,x7
1578	cbnz	x27,.Lsqr8x_sub
1579
1580	sbcs	x16,x21,x8
1581	mov	x2,sp
1582	add	x1,sp,x5
1583	ldp	x6,x7,[x3,#8*0]
1584	sbcs	x17,x22,x9
1585	stp	x14,x15,[x0,#8*0]
1586	sbcs	x14,x23,x10
1587	ldp	x8,x9,[x3,#8*2]
1588	sbcs	x15,x24,x11
1589	stp	x16,x17,[x0,#8*2]
1590	sbcs	x16,x25,x12
1591	ldp	x19,x20,[x1,#8*0]
1592	sbcs	x17,x26,x13
1593	ldp	x21,x22,[x1,#8*2]
1594	sbcs	xzr,x30,xzr	// did it borrow?
1595	ldr	x30,[x29,#8]		// pull return address
1596	stp	x14,x15,[x0,#8*4]
1597	stp	x16,x17,[x0,#8*6]
1598
1599	sub	x27,x5,#8*4
1600.Lsqr4x_cond_copy:
1601	sub	x27,x27,#8*4
1602	csel	x14,x19,x6,lo
1603	stp	xzr,xzr,[x2,#8*0]
1604	csel	x15,x20,x7,lo
1605	ldp	x6,x7,[x3,#8*4]
1606	ldp	x19,x20,[x1,#8*4]
1607	csel	x16,x21,x8,lo
1608	stp	xzr,xzr,[x2,#8*2]
1609	add	x2,x2,#8*4
1610	csel	x17,x22,x9,lo
1611	ldp	x8,x9,[x3,#8*6]
1612	ldp	x21,x22,[x1,#8*6]
1613	add	x1,x1,#8*4
1614	stp	x14,x15,[x3,#8*0]
1615	stp	x16,x17,[x3,#8*2]
1616	add	x3,x3,#8*4
1617	stp	xzr,xzr,[x1,#8*0]
1618	stp	xzr,xzr,[x1,#8*2]
1619	cbnz	x27,.Lsqr4x_cond_copy
1620
1621	csel	x14,x19,x6,lo
1622	stp	xzr,xzr,[x2,#8*0]
1623	csel	x15,x20,x7,lo
1624	stp	xzr,xzr,[x2,#8*2]
1625	csel	x16,x21,x8,lo
1626	csel	x17,x22,x9,lo
1627	stp	x14,x15,[x3,#8*0]
1628	stp	x16,x17,[x3,#8*2]
1629
1630	b	.Lsqr8x_done
1631
1632.align	4
1633.Lsqr8x8_post_condition:
1634	adc	x28,xzr,xzr
1635	ldr	x30,[x29,#8]		// pull return address
1636	// x19-7,x28 hold result, x6-7 hold modulus
1637	subs	x6,x19,x6
1638	ldr	x1,[x29,#96]		// pull rp
1639	sbcs	x7,x20,x7
1640	stp	xzr,xzr,[sp,#8*0]
1641	sbcs	x8,x21,x8
1642	stp	xzr,xzr,[sp,#8*2]
1643	sbcs	x9,x22,x9
1644	stp	xzr,xzr,[sp,#8*4]
1645	sbcs	x10,x23,x10
1646	stp	xzr,xzr,[sp,#8*6]
1647	sbcs	x11,x24,x11
1648	stp	xzr,xzr,[sp,#8*8]
1649	sbcs	x12,x25,x12
1650	stp	xzr,xzr,[sp,#8*10]
1651	sbcs	x13,x26,x13
1652	stp	xzr,xzr,[sp,#8*12]
1653	sbcs	x28,x28,xzr	// did it borrow?
1654	stp	xzr,xzr,[sp,#8*14]
1655
1656	// x6-7 hold result-modulus
1657	csel	x6,x19,x6,lo
1658	csel	x7,x20,x7,lo
1659	csel	x8,x21,x8,lo
1660	csel	x9,x22,x9,lo
1661	stp	x6,x7,[x1,#8*0]
1662	csel	x10,x23,x10,lo
1663	csel	x11,x24,x11,lo
1664	stp	x8,x9,[x1,#8*2]
1665	csel	x12,x25,x12,lo
1666	csel	x13,x26,x13,lo
1667	stp	x10,x11,[x1,#8*4]
1668	stp	x12,x13,[x1,#8*6]
1669
1670.Lsqr8x_done:
1671	ldp	x19,x20,[x29,#16]
1672	mov	sp,x29
1673	ldp	x21,x22,[x29,#32]
1674	mov	x0,#1
1675	ldp	x23,x24,[x29,#48]
1676	ldp	x25,x26,[x29,#64]
1677	ldp	x27,x28,[x29,#80]
1678	ldr	x29,[sp],#128
1679.inst	0xd50323bf		// autiasp
1680	ret
1681.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// __bn_mul4x_mont — scalar Montgomery multiplication processing 4 limbs
// of a[]/n[] per inner-loop pass.  Reached from bn_mul_mont when the limb
// count is a multiple of 4 (multiples of 8 are routed to __bn_sqr8x_mont,
// and the NEON path may take precedence — see the dispatcher at the top
// of the file).
//
// In:    x0 rp, x1 ap, x2 bp, x3 np, x4 &n0 (pointer; the Montgomery
//        constant itself is loaded with "ldr x4,[x4]" below), x5 num.
// Out:   x0 = 1; rp[0..num-1] receives the Montgomery product
//        ap[]*bp[]/2^(64*num) mod np[].
// Notes: x30 (lr) is recycled inside the outer loop as the top-most
//        carry word; the real return address is reloaded from [x29,#8].
//        Scratch: num*8 bytes of t[] plus a 4-slot area for the saved
//        t[0]*n0 values, allocated below the 128-byte register-save
//        frame and zeroed again before returning.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

// First 4 limbs of the first outer iteration: multiply-accumulate
// a[0..3]*b[i] and interleave the Montgomery reduction step.  x28 steps
// 8,16,24,0 (the "and #31" wraps it) and doubles as the b[]/t[0]*n0
// index and the loop counter; the loop exits when it wraps to 0.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	// The sum x19 + lo(n[0]*t[0]*n0) is zero by construction of n0;
	// only its carry matters, and it is set exactly when x19 != 0,
	// which "subs xzr,x19,#1" reproduces without the multiplication.
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	// x10 == 0 means x1 already reached &a[num], i.e. num==4 and the
	// whole result is still in x19-x22.
	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Remaining limbs of the first outer iteration: the t[0]*n0 values saved
// above are replayed from the stack (ldr x25,[sp,x28]).
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

// Advance to the next b[] word; x30 (lr) takes over as the running
// top-most carry for the remaining outer iterations.
.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

// Steady-state outer iterations: same structure as the 1st-reduction
// loop above, but accumulating into the previous t[] contents.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry iff x19 != 0, see note above
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

// t[] - n[], 4 limbs per iteration; the difference is streamed straight
// into rp[], the borrow decides later which version to keep.
.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Branch-free select: if the subtraction borrowed (lo), keep the
// original t[] limbs, otherwise keep t[]-n[] already written to rp[];
// the t[] scratch on the stack is zeroed as we go.
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// num==4 special case: the whole result never left the registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe stack scratch
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
2122.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2123.align	2
2124.align	4
2125