xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/armv8-mont.S (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1.text
2
// -----------------------------------------------------------------------
// int bn_mul_mont(BN_ULONG rp[], const BN_ULONG ap[], const BN_ULONG bp[],
//                 const BN_ULONG np[], const BN_ULONG *n0, int num);
//
// Montgomery multiplication: rp[] = ap[]*bp[]/R mod np[], R = 2^(64*num).
// AAPCS64 arguments:
//   x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus),
//   x4 = pointer to n0, the precomputed Montgomery constant (presumably
//        -1/np[0] mod 2^64, the standard bn_mul_mont contract -- confirm
//        against the C caller),
//   x5 = num, the number of 64-bit limbs.
// Always returns 1 (in x0).  Scratch tp[num] is carved out of the stack
// (alloca-style, 16-byte aligned as the ABI requires).
// Dispatch: num%8 == 0 -> __bn_sqr8x_mont (which itself only handles
// ap == bp and otherwise falls through to the 4x path);
// num%4 == 0 -> __bn_mul4x_mont; otherwise the generic loop below.
// -----------------------------------------------------------------------
3.globl	bn_mul_mont
4.type	bn_mul_mont,%function
5.align	5
6bn_mul_mont:
7	tst	x5,#7
8	b.eq	__bn_sqr8x_mont
9	tst	x5,#3
10	b.eq	__bn_mul4x_mont
11.Lmul_mont:
12	stp	x29,x30,[sp,#-64]!
13	add	x29,sp,#0
14	stp	x19,x20,[sp,#16]
15	stp	x21,x22,[sp,#32]
16	stp	x23,x24,[sp,#48]
	// Register roles in the generic path:
	//   x9  = current b word (bp[i]);  x15 = m1 = tp[0]*n0
	//   x22 = tp write pointer; x19 = upmost overflow ("carry") word
	//   x20 = outer counter i, x21 = inner counter j -- both byte-scaled
	//   (x5 is multiplied by 8 below, and counters step by 8 per limb)
17
18	ldr	x9,[x2],#8		// bp[0]
19	sub	x22,sp,x5,lsl#3
20	ldp	x7,x8,[x1],#16	// ap[0..1]
21	lsl	x5,x5,#3
22	ldr	x4,[x4]		// *n0
23	and	x22,x22,#-16		// ABI says so
24	ldp	x13,x14,[x3],#16	// np[0..1]
25
26	mul	x6,x7,x9		// ap[0]*bp[0]
27	sub	x21,x5,#16		// j=num-2
28	umulh	x7,x7,x9
29	mul	x10,x8,x9		// ap[1]*bp[0]
30	umulh	x11,x8,x9
31
32	mul	x15,x6,x4		// "tp[0]"*n0
33	mov	sp,x22			// alloca
34
35	// (*)	mul	x12,x13,x15	// np[0]*m1
36	umulh	x13,x13,x15
37	mul	x16,x14,x15		// np[1]*m1
38	// (*)	adds	x12,x12,x6	// discarded
39	// (*)	As for removal of first multiplication and addition
40	//	instructions. The outcome of first addition is
41	//	guaranteed to be zero, which leaves two computationally
42	//	significant outcomes: it either carries or not. Then
43	//	question is when does it carry? Is there alternative
44	//	way to deduce it? If you follow operations, you can
45	//	observe that condition for carry is quite simple:
46	//	x6 being non-zero. So that carry can be calculated
47	//	by adding -1 to x6. That's what next instruction does.
48	subs	xzr,x6,#1		// (*)
49	umulh	x17,x14,x15
50	adc	x13,x13,xzr
51	cbz	x21,.L1st_skip
52
	// First outer iteration (i=0): compute tp[] = ap[]*bp[0] with the
	// interleaved np[]*m1 reduction, one limb per pass.
53.L1st:
54	ldr	x8,[x1],#8
55	adds	x6,x10,x7
56	sub	x21,x21,#8		// j--
57	adc	x7,x11,xzr
58
59	ldr	x14,[x3],#8
60	adds	x12,x16,x13
61	mul	x10,x8,x9		// ap[j]*bp[0]
62	adc	x13,x17,xzr
63	umulh	x11,x8,x9
64
65	adds	x12,x12,x6
66	mul	x16,x14,x15		// np[j]*m1
67	adc	x13,x13,xzr
68	umulh	x17,x14,x15
69	str	x12,[x22],#8		// tp[j-1]
70	cbnz	x21,.L1st
71
72.L1st_skip:
73	adds	x6,x10,x7
74	sub	x1,x1,x5		// rewind x1
75	adc	x7,x11,xzr
76
77	adds	x12,x16,x13
78	sub	x3,x3,x5		// rewind x3
79	adc	x13,x17,xzr
80
81	adds	x12,x12,x6
82	sub	x20,x5,#8		// i=num-1
83	adcs	x13,x13,x7
84
85	adc	x19,xzr,xzr		// upmost overflow bit
86	stp	x12,x13,[x22]
87
	// Remaining outer iterations: tp[] += ap[]*bp[i], reduced on the fly.
88.Louter:
89	ldr	x9,[x2],#8		// bp[i]
90	ldp	x7,x8,[x1],#16
91	ldr	x23,[sp]		// tp[0]
92	add	x22,sp,#8
93
94	mul	x6,x7,x9		// ap[0]*bp[i]
95	sub	x21,x5,#16		// j=num-2
96	umulh	x7,x7,x9
97	ldp	x13,x14,[x3],#16
98	mul	x10,x8,x9		// ap[1]*bp[i]
99	adds	x6,x6,x23
100	umulh	x11,x8,x9
101	adc	x7,x7,xzr
102
103	mul	x15,x6,x4
104	sub	x20,x20,#8		// i--
105
106	// (*)	mul	x12,x13,x15	// np[0]*m1
107	umulh	x13,x13,x15
108	mul	x16,x14,x15		// np[1]*m1
109	// (*)	adds	x12,x12,x6
110	subs	xzr,x6,#1		// (*)
111	umulh	x17,x14,x15
112	cbz	x21,.Linner_skip
113
114.Linner:
115	ldr	x8,[x1],#8
116	adc	x13,x13,xzr
117	ldr	x23,[x22],#8		// tp[j]
118	adds	x6,x10,x7
119	sub	x21,x21,#8		// j--
120	adc	x7,x11,xzr
121
122	adds	x12,x16,x13
123	ldr	x14,[x3],#8
124	adc	x13,x17,xzr
125
126	mul	x10,x8,x9		// ap[j]*bp[i]
127	adds	x6,x6,x23
128	umulh	x11,x8,x9
129	adc	x7,x7,xzr
130
131	mul	x16,x14,x15		// np[j]*m1
132	adds	x12,x12,x6
133	umulh	x17,x14,x15
134	str	x12,[x22,#-16]		// tp[j-1]
135	cbnz	x21,.Linner
136
137.Linner_skip:
138	ldr	x23,[x22],#8		// tp[j]
139	adc	x13,x13,xzr
140	adds	x6,x10,x7
141	sub	x1,x1,x5		// rewind x1
142	adc	x7,x11,xzr
143
144	adds	x12,x16,x13
145	sub	x3,x3,x5		// rewind x3
146	adcs	x13,x17,x19
147	adc	x19,xzr,xzr
148
149	adds	x6,x6,x23
150	adc	x7,x7,xzr
151
152	adds	x12,x12,x6
153	adcs	x13,x13,x7
154	adc	x19,x19,xzr		// upmost overflow bit
155	stp	x12,x13,[x22,#-16]
156
157	cbnz	x20,.Louter
158
159	// Final step. We see if result is larger than modulus, and
160	// if it is, subtract the modulus. But comparison implies
161	// subtraction. So we subtract modulus, see if it borrowed,
162	// and conditionally copy original value.
163	ldr	x23,[sp]		// tp[0]
164	add	x22,sp,#8
165	ldr	x14,[x3],#8		// np[0]
166	subs	x21,x5,#8		// j=num-1 and clear borrow
167	mov	x1,x0
168.Lsub:
169	sbcs	x8,x23,x14		// tp[j]-np[j]
170	ldr	x23,[x22],#8
171	sub	x21,x21,#8		// j--
172	ldr	x14,[x3],#8
173	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
174	cbnz	x21,.Lsub
175
176	sbcs	x8,x23,x14
177	sbcs	x19,x19,xzr		// did it borrow?
178	str	x8,[x1],#8		// rp[num-1]
179
	// Constant-flow select between tp[] and the tentative rp[]=tp[]-np[]
	// (csel on the borrow flag), wiping the secret scratch area as we go.
180	ldr	x23,[sp]		// tp[0]
181	add	x22,sp,#8
182	ldr	x8,[x0],#8		// rp[0]
183	sub	x5,x5,#8		// num--
184	nop
185.Lcond_copy:
186	sub	x5,x5,#8		// num--
187	csel	x14,x23,x8,lo		// did it borrow?
188	ldr	x23,[x22],#8
189	ldr	x8,[x0],#8
190	str	xzr,[x22,#-16]		// wipe tp
191	str	x14,[x0,#-16]
192	cbnz	x5,.Lcond_copy
193
194	csel	x14,x23,x8,lo
195	str	xzr,[x22,#-8]		// wipe tp
196	str	x14,[x0,#-8]
197
198	ldp	x19,x20,[x29,#16]
199	mov	sp,x29
200	ldp	x21,x22,[x29,#32]
201	mov	x0,#1
202	ldp	x23,x24,[x29,#48]
203	ldr	x29,[sp],#64
204	ret
205.size	bn_mul_mont,.-bn_mul_mont
// -----------------------------------------------------------------------
// __bn_sqr8x_mont: squaring-optimized Montgomery path, entered from
// bn_mul_mont when num is a multiple of 8.  Branches to __bn_mul4x_mont
// unless ap == bp, i.e. it only handles true squaring.  Same argument
// registers and return value (1 in x0) as bn_mul_mont.
// Strategy: accumulate all cross products a[i]*a[j] (i!=j), double them
// with the shift-and-add pass, add the diagonal a[i]*a[i] terms, then
// Montgomery-reduce 512 bits (8 limbs) per iteration.
// Frame: rp and np offloaded at [x29,#96], n0 at [x29,#112].
// -----------------------------------------------------------------------
206.type	__bn_sqr8x_mont,%function
207.align	5
208__bn_sqr8x_mont:
209	cmp	x1,x2
210	b.ne	__bn_mul4x_mont
211.Lsqr8x_mont:
212	stp	x29,x30,[sp,#-128]!
213	add	x29,sp,#0
214	stp	x19,x20,[sp,#16]
215	stp	x21,x22,[sp,#32]
216	stp	x23,x24,[sp,#48]
217	stp	x25,x26,[sp,#64]
218	stp	x27,x28,[sp,#80]
219	stp	x0,x3,[sp,#96]	// offload rp and np
220
221	ldp	x6,x7,[x1,#8*0]
222	ldp	x8,x9,[x1,#8*2]
223	ldp	x10,x11,[x1,#8*4]
224	ldp	x12,x13,[x1,#8*6]
225
226	sub	x2,sp,x5,lsl#4
227	lsl	x5,x5,#3
228	ldr	x4,[x4]		// *n0
229	mov	sp,x2			// alloca
230	sub	x27,x5,#8*8
231	b	.Lsqr8x_zero_start
232
	// Zero the 2*num-limb t[] scratch vector, 16 limbs per pass.
233.Lsqr8x_zero:
234	sub	x27,x27,#8*8
235	stp	xzr,xzr,[x2,#8*0]
236	stp	xzr,xzr,[x2,#8*2]
237	stp	xzr,xzr,[x2,#8*4]
238	stp	xzr,xzr,[x2,#8*6]
239.Lsqr8x_zero_start:
240	stp	xzr,xzr,[x2,#8*8]
241	stp	xzr,xzr,[x2,#8*10]
242	stp	xzr,xzr,[x2,#8*12]
243	stp	xzr,xzr,[x2,#8*14]
244	add	x2,x2,#8*16
245	cbnz	x27,.Lsqr8x_zero
246
247	add	x3,x1,x5
248	add	x1,x1,#8*8
249	mov	x19,xzr
250	mov	x20,xzr
251	mov	x21,xzr
252	mov	x22,xzr
253	mov	x23,xzr
254	mov	x24,xzr
255	mov	x25,xzr
256	mov	x26,xzr
257	mov	x2,sp
258	str	x4,[x29,#112]		// offload n0
259
260	// Multiply everything but a[i]*a[i]
	// x19-x26 hold an eight-limb accumulator window; x2 walks the t[]
	// scratch vector; x3 = &a[num] end sentinel; x30 later carries the
	// top-most reduction carry between windows.
261.align	4
262.Lsqr8x_outer_loop:
263        //                                                 a[1]a[0]	(i)
264        //                                             a[2]a[0]
265        //                                         a[3]a[0]
266        //                                     a[4]a[0]
267        //                                 a[5]a[0]
268        //                             a[6]a[0]
269        //                         a[7]a[0]
270        //                                         a[2]a[1]		(ii)
271        //                                     a[3]a[1]
272        //                                 a[4]a[1]
273        //                             a[5]a[1]
274        //                         a[6]a[1]
275        //                     a[7]a[1]
276        //                                 a[3]a[2]			(iii)
277        //                             a[4]a[2]
278        //                         a[5]a[2]
279        //                     a[6]a[2]
280        //                 a[7]a[2]
281        //                         a[4]a[3]				(iv)
282        //                     a[5]a[3]
283        //                 a[6]a[3]
284        //             a[7]a[3]
285        //                 a[5]a[4]					(v)
286        //             a[6]a[4]
287        //         a[7]a[4]
288        //         a[6]a[5]						(vi)
289        //     a[7]a[5]
290        // a[7]a[6]							(vii)
291
292	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
293	mul	x15,x8,x6
294	mul	x16,x9,x6
295	mul	x17,x10,x6
296	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
297	mul	x14,x11,x6
298	adcs	x21,x21,x15
299	mul	x15,x12,x6
300	adcs	x22,x22,x16
301	mul	x16,x13,x6
302	adcs	x23,x23,x17
303	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
304	adcs	x24,x24,x14
305	umulh	x14,x8,x6
306	adcs	x25,x25,x15
307	umulh	x15,x9,x6
308	adcs	x26,x26,x16
309	umulh	x16,x10,x6
310	stp	x19,x20,[x2],#8*2	// t[0..1]
311	adc	x19,xzr,xzr		// t[8]
312	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
313	umulh	x17,x11,x6
314	adcs	x22,x22,x14
315	umulh	x14,x12,x6
316	adcs	x23,x23,x15
317	umulh	x15,x13,x6
318	adcs	x24,x24,x16
319	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
320	adcs	x25,x25,x17
321	mul	x17,x9,x7
322	adcs	x26,x26,x14
323	mul	x14,x10,x7
324	adc	x19,x19,x15
325
326	mul	x15,x11,x7
327	adds	x22,x22,x16
328	mul	x16,x12,x7
329	adcs	x23,x23,x17
330	mul	x17,x13,x7
331	adcs	x24,x24,x14
332	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
333	adcs	x25,x25,x15
334	umulh	x15,x9,x7
335	adcs	x26,x26,x16
336	umulh	x16,x10,x7
337	adcs	x19,x19,x17
338	umulh	x17,x11,x7
339	stp	x21,x22,[x2],#8*2	// t[2..3]
340	adc	x20,xzr,xzr		// t[9]
341	adds	x23,x23,x14
342	umulh	x14,x12,x7
343	adcs	x24,x24,x15
344	umulh	x15,x13,x7
345	adcs	x25,x25,x16
346	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
347	adcs	x26,x26,x17
348	mul	x17,x10,x8
349	adcs	x19,x19,x14
350	mul	x14,x11,x8
351	adc	x20,x20,x15
352
353	mul	x15,x12,x8
354	adds	x24,x24,x16
355	mul	x16,x13,x8
356	adcs	x25,x25,x17
357	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
358	adcs	x26,x26,x14
359	umulh	x14,x10,x8
360	adcs	x19,x19,x15
361	umulh	x15,x11,x8
362	adcs	x20,x20,x16
363	umulh	x16,x12,x8
364	stp	x23,x24,[x2],#8*2	// t[4..5]
365	adc	x21,xzr,xzr		// t[10]
366	adds	x25,x25,x17
367	umulh	x17,x13,x8
368	adcs	x26,x26,x14
369	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
370	adcs	x19,x19,x15
371	mul	x15,x11,x9
372	adcs	x20,x20,x16
373	mul	x16,x12,x9
374	adc	x21,x21,x17
375
376	mul	x17,x13,x9
377	adds	x26,x26,x14
378	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
379	adcs	x19,x19,x15
380	umulh	x15,x11,x9
381	adcs	x20,x20,x16
382	umulh	x16,x12,x9
383	adcs	x21,x21,x17
384	umulh	x17,x13,x9
385	stp	x25,x26,[x2],#8*2	// t[6..7]
386	adc	x22,xzr,xzr		// t[11]
387	adds	x19,x19,x14
388	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
389	adcs	x20,x20,x15
390	mul	x15,x12,x10
391	adcs	x21,x21,x16
392	mul	x16,x13,x10
393	adc	x22,x22,x17
394
395	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
396	adds	x20,x20,x14
397	umulh	x14,x12,x10
398	adcs	x21,x21,x15
399	umulh	x15,x13,x10
400	adcs	x22,x22,x16
401	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
402	adc	x23,xzr,xzr		// t[12]
403	adds	x21,x21,x17
404	mul	x17,x13,x11
405	adcs	x22,x22,x14
406	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
407	adc	x23,x23,x15
408
409	umulh	x15,x13,x11
410	adds	x22,x22,x16
411	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
412	adcs	x23,x23,x17
413	umulh	x17,x13,x12		// hi(a[7]*a[6])
414	adc	x24,xzr,xzr		// t[13]
415	adds	x23,x23,x14
416	sub	x27,x3,x1	// done yet?
417	adc	x24,x24,x15
418
419	adds	x24,x24,x16
420	sub	x14,x3,x5	// rewinded ap
421	adc	x25,xzr,xzr		// t[14]
422	add	x25,x25,x17
423
424	cbz	x27,.Lsqr8x_outer_break
425
426	mov	x4,x6
427	ldp	x6,x7,[x2,#8*0]
428	ldp	x8,x9,[x2,#8*2]
429	ldp	x10,x11,[x2,#8*4]
430	ldp	x12,x13,[x2,#8*6]
431	adds	x19,x19,x6
432	adcs	x20,x20,x7
433	ldp	x6,x7,[x1,#8*0]
434	adcs	x21,x21,x8
435	adcs	x22,x22,x9
436	ldp	x8,x9,[x1,#8*2]
437	adcs	x23,x23,x10
438	adcs	x24,x24,x11
439	ldp	x10,x11,[x1,#8*4]
440	adcs	x25,x25,x12
441	mov	x0,x1
442	adcs	x26,xzr,x13
443	ldp	x12,x13,[x1,#8*6]
444	add	x1,x1,#8*8
445	//adc	x28,xzr,xzr		// moved below
446	mov	x27,#-8*8
447
448	//                                                         a[8]a[0]
449	//                                                     a[9]a[0]
450	//                                                 a[a]a[0]
451	//                                             a[b]a[0]
452	//                                         a[c]a[0]
453	//                                     a[d]a[0]
454	//                                 a[e]a[0]
455	//                             a[f]a[0]
456	//                                                     a[8]a[1]
457	//                         a[f]a[1]........................
458	//                                                 a[8]a[2]
459	//                     a[f]a[2]........................
460	//                                             a[8]a[3]
461	//                 a[f]a[3]........................
462	//                                         a[8]a[4]
463	//             a[f]a[4]........................
464	//                                     a[8]a[5]
465	//         a[f]a[5]........................
466	//                                 a[8]a[6]
467	//     a[f]a[6]........................
468	//                             a[8]a[7]
469	// a[f]a[7]........................
470.Lsqr8x_mul:
471	mul	x14,x6,x4
472	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
473	mul	x15,x7,x4
474	add	x27,x27,#8
475	mul	x16,x8,x4
476	mul	x17,x9,x4
477	adds	x19,x19,x14
478	mul	x14,x10,x4
479	adcs	x20,x20,x15
480	mul	x15,x11,x4
481	adcs	x21,x21,x16
482	mul	x16,x12,x4
483	adcs	x22,x22,x17
484	mul	x17,x13,x4
485	adcs	x23,x23,x14
486	umulh	x14,x6,x4
487	adcs	x24,x24,x15
488	umulh	x15,x7,x4
489	adcs	x25,x25,x16
490	umulh	x16,x8,x4
491	adcs	x26,x26,x17
492	umulh	x17,x9,x4
493	adc	x28,x28,xzr
494	str	x19,[x2],#8
495	adds	x19,x20,x14
496	umulh	x14,x10,x4
497	adcs	x20,x21,x15
498	umulh	x15,x11,x4
499	adcs	x21,x22,x16
500	umulh	x16,x12,x4
501	adcs	x22,x23,x17
502	umulh	x17,x13,x4
503	ldr	x4,[x0,x27]
504	adcs	x23,x24,x14
505	adcs	x24,x25,x15
506	adcs	x25,x26,x16
507	adcs	x26,x28,x17
508	//adc	x28,xzr,xzr		// moved above
509	cbnz	x27,.Lsqr8x_mul
510					// note that carry flag is guaranteed
511					// to be zero at this point
512	cmp	x1,x3		// done yet?
513	b.eq	.Lsqr8x_break
514
515	ldp	x6,x7,[x2,#8*0]
516	ldp	x8,x9,[x2,#8*2]
517	ldp	x10,x11,[x2,#8*4]
518	ldp	x12,x13,[x2,#8*6]
519	adds	x19,x19,x6
520	ldr	x4,[x0,#-8*8]
521	adcs	x20,x20,x7
522	ldp	x6,x7,[x1,#8*0]
523	adcs	x21,x21,x8
524	adcs	x22,x22,x9
525	ldp	x8,x9,[x1,#8*2]
526	adcs	x23,x23,x10
527	adcs	x24,x24,x11
528	ldp	x10,x11,[x1,#8*4]
529	adcs	x25,x25,x12
530	mov	x27,#-8*8
531	adcs	x26,x26,x13
532	ldp	x12,x13,[x1,#8*6]
533	add	x1,x1,#8*8
534	//adc	x28,xzr,xzr		// moved above
535	b	.Lsqr8x_mul
536
537.align	4
538.Lsqr8x_break:
539	ldp	x6,x7,[x0,#8*0]
540	add	x1,x0,#8*8
541	ldp	x8,x9,[x0,#8*2]
542	sub	x14,x3,x1		// is it last iteration?
543	ldp	x10,x11,[x0,#8*4]
544	sub	x15,x2,x14
545	ldp	x12,x13,[x0,#8*6]
546	cbz	x14,.Lsqr8x_outer_loop
547
548	stp	x19,x20,[x2,#8*0]
549	ldp	x19,x20,[x15,#8*0]
550	stp	x21,x22,[x2,#8*2]
551	ldp	x21,x22,[x15,#8*2]
552	stp	x23,x24,[x2,#8*4]
553	ldp	x23,x24,[x15,#8*4]
554	stp	x25,x26,[x2,#8*6]
555	mov	x2,x15
556	ldp	x25,x26,[x15,#8*6]
557	b	.Lsqr8x_outer_loop
558
559.align	4
560.Lsqr8x_outer_break:
561	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
562	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
563	ldp	x15,x16,[sp,#8*1]
564	ldp	x11,x13,[x14,#8*2]
565	add	x1,x14,#8*4
566	ldp	x17,x14,[sp,#8*3]
567
568	stp	x19,x20,[x2,#8*0]
569	mul	x19,x7,x7
570	stp	x21,x22,[x2,#8*2]
571	umulh	x7,x7,x7
572	stp	x23,x24,[x2,#8*4]
573	mul	x8,x9,x9
574	stp	x25,x26,[x2,#8*6]
575	mov	x2,sp
576	umulh	x9,x9,x9
577	adds	x20,x7,x15,lsl#1
578	extr	x15,x16,x15,#63
579	sub	x27,x5,#8*4
580
	// Double the cross-product vector via extr (shift across limb
	// boundaries) while adding in the diagonal squares a[i]*a[i].
581.Lsqr4x_shift_n_add:
582	adcs	x21,x8,x15
583	extr	x16,x17,x16,#63
584	sub	x27,x27,#8*4
585	adcs	x22,x9,x16
586	ldp	x15,x16,[x2,#8*5]
587	mul	x10,x11,x11
588	ldp	x7,x9,[x1],#8*2
589	umulh	x11,x11,x11
590	mul	x12,x13,x13
591	umulh	x13,x13,x13
592	extr	x17,x14,x17,#63
593	stp	x19,x20,[x2,#8*0]
594	adcs	x23,x10,x17
595	extr	x14,x15,x14,#63
596	stp	x21,x22,[x2,#8*2]
597	adcs	x24,x11,x14
598	ldp	x17,x14,[x2,#8*7]
599	extr	x15,x16,x15,#63
600	adcs	x25,x12,x15
601	extr	x16,x17,x16,#63
602	adcs	x26,x13,x16
603	ldp	x15,x16,[x2,#8*9]
604	mul	x6,x7,x7
605	ldp	x11,x13,[x1],#8*2
606	umulh	x7,x7,x7
607	mul	x8,x9,x9
608	umulh	x9,x9,x9
609	stp	x23,x24,[x2,#8*4]
610	extr	x17,x14,x17,#63
611	stp	x25,x26,[x2,#8*6]
612	add	x2,x2,#8*8
613	adcs	x19,x6,x17
614	extr	x14,x15,x14,#63
615	adcs	x20,x7,x14
616	ldp	x17,x14,[x2,#8*3]
617	extr	x15,x16,x15,#63
618	cbnz	x27,.Lsqr4x_shift_n_add
619	ldp	x1,x4,[x29,#104]	// pull np and n0
620
621	adcs	x21,x8,x15
622	extr	x16,x17,x16,#63
623	adcs	x22,x9,x16
624	ldp	x15,x16,[x2,#8*5]
625	mul	x10,x11,x11
626	umulh	x11,x11,x11
627	stp	x19,x20,[x2,#8*0]
628	mul	x12,x13,x13
629	umulh	x13,x13,x13
630	stp	x21,x22,[x2,#8*2]
631	extr	x17,x14,x17,#63
632	adcs	x23,x10,x17
633	extr	x14,x15,x14,#63
634	ldp	x19,x20,[sp,#8*0]
635	adcs	x24,x11,x14
636	extr	x15,x16,x15,#63
637	ldp	x6,x7,[x1,#8*0]
638	adcs	x25,x12,x15
639	extr	x16,xzr,x16,#63
640	ldp	x8,x9,[x1,#8*2]
641	adc	x26,x13,x16
642	ldp	x10,x11,[x1,#8*4]
643
644	// Reduce by 512 bits per iteration
645	mul	x28,x4,x19		// t[0]*n0
646	ldp	x12,x13,[x1,#8*6]
647	add	x3,x1,x5
648	ldp	x21,x22,[sp,#8*2]
649	stp	x23,x24,[x2,#8*4]
650	ldp	x23,x24,[sp,#8*4]
651	stp	x25,x26,[x2,#8*6]
652	ldp	x25,x26,[sp,#8*6]
653	add	x1,x1,#8*8
654	mov	x30,xzr		// initial top-most carry
655	mov	x2,sp
656	mov	x27,#8
657
658.Lsqr8x_reduction:
659	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
660	mul	x15,x7,x28
661	sub	x27,x27,#1
662	mul	x16,x8,x28
663	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
664	mul	x17,x9,x28
665	// (*)	adds	xzr,x19,x14
666	subs	xzr,x19,#1		// (*)
667	mul	x14,x10,x28
668	adcs	x19,x20,x15
669	mul	x15,x11,x28
670	adcs	x20,x21,x16
671	mul	x16,x12,x28
672	adcs	x21,x22,x17
673	mul	x17,x13,x28
674	adcs	x22,x23,x14
675	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
676	adcs	x23,x24,x15
677	umulh	x15,x7,x28
678	adcs	x24,x25,x16
679	umulh	x16,x8,x28
680	adcs	x25,x26,x17
681	umulh	x17,x9,x28
682	adc	x26,xzr,xzr
683	adds	x19,x19,x14
684	umulh	x14,x10,x28
685	adcs	x20,x20,x15
686	umulh	x15,x11,x28
687	adcs	x21,x21,x16
688	umulh	x16,x12,x28
689	adcs	x22,x22,x17
690	umulh	x17,x13,x28
691	mul	x28,x4,x19		// next t[0]*n0
692	adcs	x23,x23,x14
693	adcs	x24,x24,x15
694	adcs	x25,x25,x16
695	adc	x26,x26,x17
696	cbnz	x27,.Lsqr8x_reduction
697
698	ldp	x14,x15,[x2,#8*0]
699	ldp	x16,x17,[x2,#8*2]
700	mov	x0,x2
701	sub	x27,x3,x1	// done yet?
702	adds	x19,x19,x14
703	adcs	x20,x20,x15
704	ldp	x14,x15,[x2,#8*4]
705	adcs	x21,x21,x16
706	adcs	x22,x22,x17
707	ldp	x16,x17,[x2,#8*6]
708	adcs	x23,x23,x14
709	adcs	x24,x24,x15
710	adcs	x25,x25,x16
711	adcs	x26,x26,x17
712	//adc	x28,xzr,xzr		// moved below
713	cbz	x27,.Lsqr8x8_post_condition
714
715	ldr	x4,[x2,#-8*8]
716	ldp	x6,x7,[x1,#8*0]
717	ldp	x8,x9,[x1,#8*2]
718	ldp	x10,x11,[x1,#8*4]
719	mov	x27,#-8*8
720	ldp	x12,x13,[x1,#8*6]
721	add	x1,x1,#8*8
722
723.Lsqr8x_tail:
724	mul	x14,x6,x4
725	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
726	mul	x15,x7,x4
727	add	x27,x27,#8
728	mul	x16,x8,x4
729	mul	x17,x9,x4
730	adds	x19,x19,x14
731	mul	x14,x10,x4
732	adcs	x20,x20,x15
733	mul	x15,x11,x4
734	adcs	x21,x21,x16
735	mul	x16,x12,x4
736	adcs	x22,x22,x17
737	mul	x17,x13,x4
738	adcs	x23,x23,x14
739	umulh	x14,x6,x4
740	adcs	x24,x24,x15
741	umulh	x15,x7,x4
742	adcs	x25,x25,x16
743	umulh	x16,x8,x4
744	adcs	x26,x26,x17
745	umulh	x17,x9,x4
746	adc	x28,x28,xzr
747	str	x19,[x2],#8
748	adds	x19,x20,x14
749	umulh	x14,x10,x4
750	adcs	x20,x21,x15
751	umulh	x15,x11,x4
752	adcs	x21,x22,x16
753	umulh	x16,x12,x4
754	adcs	x22,x23,x17
755	umulh	x17,x13,x4
756	ldr	x4,[x0,x27]
757	adcs	x23,x24,x14
758	adcs	x24,x25,x15
759	adcs	x25,x26,x16
760	adcs	x26,x28,x17
761	//adc	x28,xzr,xzr		// moved above
762	cbnz	x27,.Lsqr8x_tail
763					// note that carry flag is guaranteed
764					// to be zero at this point
765	ldp	x6,x7,[x2,#8*0]
766	sub	x27,x3,x1	// done yet?
767	sub	x16,x3,x5	// rewinded np
768	ldp	x8,x9,[x2,#8*2]
769	ldp	x10,x11,[x2,#8*4]
770	ldp	x12,x13,[x2,#8*6]
771	cbz	x27,.Lsqr8x_tail_break
772
773	ldr	x4,[x0,#-8*8]
774	adds	x19,x19,x6
775	adcs	x20,x20,x7
776	ldp	x6,x7,[x1,#8*0]
777	adcs	x21,x21,x8
778	adcs	x22,x22,x9
779	ldp	x8,x9,[x1,#8*2]
780	adcs	x23,x23,x10
781	adcs	x24,x24,x11
782	ldp	x10,x11,[x1,#8*4]
783	adcs	x25,x25,x12
784	mov	x27,#-8*8
785	adcs	x26,x26,x13
786	ldp	x12,x13,[x1,#8*6]
787	add	x1,x1,#8*8
788	//adc	x28,xzr,xzr		// moved above
789	b	.Lsqr8x_tail
790
791.align	4
792.Lsqr8x_tail_break:
793	ldr	x4,[x29,#112]		// pull n0
794	add	x27,x2,#8*8		// end of current t[num] window
795
796	subs	xzr,x30,#1		// "move" top-most carry to carry bit
797	adcs	x14,x19,x6
798	adcs	x15,x20,x7
799	ldp	x19,x20,[x0,#8*0]
800	adcs	x21,x21,x8
801	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
802	adcs	x22,x22,x9
803	ldp	x8,x9,[x16,#8*2]
804	adcs	x23,x23,x10
805	adcs	x24,x24,x11
806	ldp	x10,x11,[x16,#8*4]
807	adcs	x25,x25,x12
808	adcs	x26,x26,x13
809	ldp	x12,x13,[x16,#8*6]
810	add	x1,x16,#8*8
811	adc	x30,xzr,xzr	// top-most carry
812	mul	x28,x4,x19
813	stp	x14,x15,[x2,#8*0]
814	stp	x21,x22,[x2,#8*2]
815	ldp	x21,x22,[x0,#8*2]
816	stp	x23,x24,[x2,#8*4]
817	ldp	x23,x24,[x0,#8*4]
818	cmp	x27,x29		// did we hit the bottom?
819	stp	x25,x26,[x2,#8*6]
820	mov	x2,x0			// slide the window
821	ldp	x25,x26,[x0,#8*6]
822	mov	x27,#8
823	b.ne	.Lsqr8x_reduction
824
825	// Final step. We see if result is larger than modulus, and
826	// if it is, subtract the modulus. But comparison implies
827	// subtraction. So we subtract modulus, see if it borrowed,
828	// and conditionally copy original value.
829	ldr	x0,[x29,#96]		// pull rp
830	add	x2,x2,#8*8
831	subs	x14,x19,x6
832	sbcs	x15,x20,x7
833	sub	x27,x5,#8*8
834	mov	x3,x0		// x0 copy
835
836.Lsqr8x_sub:
837	sbcs	x16,x21,x8
838	ldp	x6,x7,[x1,#8*0]
839	sbcs	x17,x22,x9
840	stp	x14,x15,[x0,#8*0]
841	sbcs	x14,x23,x10
842	ldp	x8,x9,[x1,#8*2]
843	sbcs	x15,x24,x11
844	stp	x16,x17,[x0,#8*2]
845	sbcs	x16,x25,x12
846	ldp	x10,x11,[x1,#8*4]
847	sbcs	x17,x26,x13
848	ldp	x12,x13,[x1,#8*6]
849	add	x1,x1,#8*8
850	ldp	x19,x20,[x2,#8*0]
851	sub	x27,x27,#8*8
852	ldp	x21,x22,[x2,#8*2]
853	ldp	x23,x24,[x2,#8*4]
854	ldp	x25,x26,[x2,#8*6]
855	add	x2,x2,#8*8
856	stp	x14,x15,[x0,#8*4]
857	sbcs	x14,x19,x6
858	stp	x16,x17,[x0,#8*6]
859	add	x0,x0,#8*8
860	sbcs	x15,x20,x7
861	cbnz	x27,.Lsqr8x_sub
862
863	sbcs	x16,x21,x8
864	mov	x2,sp
865	add	x1,sp,x5
866	ldp	x6,x7,[x3,#8*0]
867	sbcs	x17,x22,x9
868	stp	x14,x15,[x0,#8*0]
869	sbcs	x14,x23,x10
870	ldp	x8,x9,[x3,#8*2]
871	sbcs	x15,x24,x11
872	stp	x16,x17,[x0,#8*2]
873	sbcs	x16,x25,x12
874	ldp	x19,x20,[x1,#8*0]
875	sbcs	x17,x26,x13
876	ldp	x21,x22,[x1,#8*2]
877	sbcs	xzr,x30,xzr	// did it borrow?
878	ldr	x30,[x29,#8]		// pull return address
879	stp	x14,x15,[x0,#8*4]
880	stp	x16,x17,[x0,#8*6]
881
	// Constant-flow select (csel on borrow) between t[] and t[]-n[],
	// wiping the secret scratch area as we go.
882	sub	x27,x5,#8*4
883.Lsqr4x_cond_copy:
884	sub	x27,x27,#8*4
885	csel	x14,x19,x6,lo
886	stp	xzr,xzr,[x2,#8*0]
887	csel	x15,x20,x7,lo
888	ldp	x6,x7,[x3,#8*4]
889	ldp	x19,x20,[x1,#8*4]
890	csel	x16,x21,x8,lo
891	stp	xzr,xzr,[x2,#8*2]
892	add	x2,x2,#8*4
893	csel	x17,x22,x9,lo
894	ldp	x8,x9,[x3,#8*6]
895	ldp	x21,x22,[x1,#8*6]
896	add	x1,x1,#8*4
897	stp	x14,x15,[x3,#8*0]
898	stp	x16,x17,[x3,#8*2]
899	add	x3,x3,#8*4
900	stp	xzr,xzr,[x1,#8*0]
901	stp	xzr,xzr,[x1,#8*2]
902	cbnz	x27,.Lsqr4x_cond_copy
903
904	csel	x14,x19,x6,lo
905	stp	xzr,xzr,[x2,#8*0]
906	csel	x15,x20,x7,lo
907	stp	xzr,xzr,[x2,#8*2]
908	csel	x16,x21,x8,lo
909	csel	x17,x22,x9,lo
910	stp	x14,x15,[x3,#8*0]
911	stp	x16,x17,[x3,#8*2]
912
913	b	.Lsqr8x_done
914
	// Special case num == 8: result fits in the register window, so the
	// final subtract/select runs entirely in registers.
915.align	4
916.Lsqr8x8_post_condition:
917	adc	x28,xzr,xzr
918	ldr	x30,[x29,#8]		// pull return address
919	// x19-7,x28 hold result, x6-7 hold modulus
920	subs	x6,x19,x6
921	ldr	x1,[x29,#96]		// pull rp
922	sbcs	x7,x20,x7
923	stp	xzr,xzr,[sp,#8*0]
924	sbcs	x8,x21,x8
925	stp	xzr,xzr,[sp,#8*2]
926	sbcs	x9,x22,x9
927	stp	xzr,xzr,[sp,#8*4]
928	sbcs	x10,x23,x10
929	stp	xzr,xzr,[sp,#8*6]
930	sbcs	x11,x24,x11
931	stp	xzr,xzr,[sp,#8*8]
932	sbcs	x12,x25,x12
933	stp	xzr,xzr,[sp,#8*10]
934	sbcs	x13,x26,x13
935	stp	xzr,xzr,[sp,#8*12]
936	sbcs	x28,x28,xzr	// did it borrow?
937	stp	xzr,xzr,[sp,#8*14]
938
939	// x6-7 hold result-modulus
940	csel	x6,x19,x6,lo
941	csel	x7,x20,x7,lo
942	csel	x8,x21,x8,lo
943	csel	x9,x22,x9,lo
944	stp	x6,x7,[x1,#8*0]
945	csel	x10,x23,x10,lo
946	csel	x11,x24,x11,lo
947	stp	x8,x9,[x1,#8*2]
948	csel	x12,x25,x12,lo
949	csel	x13,x26,x13,lo
950	stp	x10,x11,[x1,#8*4]
951	stp	x12,x13,[x1,#8*6]
952
953.Lsqr8x_done:
954	ldp	x19,x20,[x29,#16]
955	mov	sp,x29
956	ldp	x21,x22,[x29,#32]
957	mov	x0,#1
958	ldp	x23,x24,[x29,#48]
959	ldp	x25,x26,[x29,#64]
960	ldp	x27,x28,[x29,#80]
961	ldr	x29,[sp],#128
962	ret
963.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// -----------------------------------------------------------------------
// __bn_mul4x_mont: Montgomery multiplication processing 4 limbs of a[]
// and n[] per inner-loop pass.  Entered from bn_mul_mont when num is a
// multiple of 4, and from __bn_sqr8x_mont when ap != bp.  Same argument
// registers as bn_mul_mont; returns 1 in x0.
// Register roles: x24 = current b word; x25 = m1 = t[0]*n0 (saved to the
// tail area for later passes); x19-x22 = 4-limb accumulator window;
// x23 = carry limb; x0 = modulo-scheduled carry-in; x26 = result write
// pointer; x28 = byte offset (mod 32) of the next b word within the
// current 4-word window; x27 = &a[num] end sentinel.
// Frame: rp and &b[num] offloaded at [x29,#96].
// -----------------------------------------------------------------------
964.type	__bn_mul4x_mont,%function
965.align	5
966__bn_mul4x_mont:
967	stp	x29,x30,[sp,#-128]!
968	add	x29,sp,#0
969	stp	x19,x20,[sp,#16]
970	stp	x21,x22,[sp,#32]
971	stp	x23,x24,[sp,#48]
972	stp	x25,x26,[sp,#64]
973	stp	x27,x28,[sp,#80]
974
975	sub	x26,sp,x5,lsl#3
976	lsl	x5,x5,#3
977	ldr	x4,[x4]		// *n0
978	sub	sp,x26,#8*4		// alloca
979
980	add	x10,x2,x5
981	add	x27,x1,x5
982	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
983
984	ldr	x24,[x2,#8*0]		// b[0]
985	ldp	x6,x7,[x1,#8*0]	// a[0..3]
986	ldp	x8,x9,[x1,#8*2]
987	add	x1,x1,#8*4
988	mov	x19,xzr
989	mov	x20,xzr
990	mov	x21,xzr
991	mov	x22,xzr
992	ldp	x14,x15,[x3,#8*0]	// n[0..3]
993	ldp	x16,x17,[x3,#8*2]
994	adds	x3,x3,#8*4		// clear carry bit
995	mov	x0,xzr
996	mov	x28,#0
997	mov	x26,sp
998
999.Loop_mul4x_1st_reduction:
1000	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1001	adc	x0,x0,xzr	// modulo-scheduled
1002	mul	x11,x7,x24
1003	add	x28,x28,#8
1004	mul	x12,x8,x24
1005	and	x28,x28,#31
1006	mul	x13,x9,x24
1007	adds	x19,x19,x10
1008	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1009	adcs	x20,x20,x11
1010	mul	x25,x19,x4		// t[0]*n0
1011	adcs	x21,x21,x12
1012	umulh	x11,x7,x24
1013	adcs	x22,x22,x13
1014	umulh	x12,x8,x24
1015	adc	x23,xzr,xzr
1016	umulh	x13,x9,x24
1017	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1018	adds	x20,x20,x10
1019	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1020	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1021	adcs	x21,x21,x11
1022	mul	x11,x15,x25
1023	adcs	x22,x22,x12
1024	mul	x12,x16,x25
1025	adc	x23,x23,x13		// can't overflow
1026	mul	x13,x17,x25
1027	// (*)	adds	xzr,x19,x10
1028	subs	xzr,x19,#1		// (*)
1029	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1030	adcs	x19,x20,x11
1031	umulh	x11,x15,x25
1032	adcs	x20,x21,x12
1033	umulh	x12,x16,x25
1034	adcs	x21,x22,x13
1035	umulh	x13,x17,x25
1036	adcs	x22,x23,x0
1037	adc	x0,xzr,xzr
1038	adds	x19,x19,x10
1039	sub	x10,x27,x1
1040	adcs	x20,x20,x11
1041	adcs	x21,x21,x12
1042	adcs	x22,x22,x13
1043	//adc	x0,x0,xzr
1044	cbnz	x28,.Loop_mul4x_1st_reduction
1045
1046	cbz	x10,.Lmul4x4_post_condition
1047
1048	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1049	ldp	x8,x9,[x1,#8*2]
1050	add	x1,x1,#8*4
1051	ldr	x25,[sp]		// a[0]*n0
1052	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1053	ldp	x16,x17,[x3,#8*2]
1054	add	x3,x3,#8*4
1055
1056.Loop_mul4x_1st_tail:
1057	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1058	adc	x0,x0,xzr	// modulo-scheduled
1059	mul	x11,x7,x24
1060	add	x28,x28,#8
1061	mul	x12,x8,x24
1062	and	x28,x28,#31
1063	mul	x13,x9,x24
1064	adds	x19,x19,x10
1065	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1066	adcs	x20,x20,x11
1067	umulh	x11,x7,x24
1068	adcs	x21,x21,x12
1069	umulh	x12,x8,x24
1070	adcs	x22,x22,x13
1071	umulh	x13,x9,x24
1072	adc	x23,xzr,xzr
1073	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1074	adds	x20,x20,x10
1075	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
1076	adcs	x21,x21,x11
1077	mul	x11,x15,x25
1078	adcs	x22,x22,x12
1079	mul	x12,x16,x25
1080	adc	x23,x23,x13		// can't overflow
1081	mul	x13,x17,x25
1082	adds	x19,x19,x10
1083	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
1084	adcs	x20,x20,x11
1085	umulh	x11,x15,x25
1086	adcs	x21,x21,x12
1087	umulh	x12,x16,x25
1088	adcs	x22,x22,x13
1089	adcs	x23,x23,x0
1090	umulh	x13,x17,x25
1091	adc	x0,xzr,xzr
1092	ldr	x25,[sp,x28]		// next t[0]*n0
1093	str	x19,[x26],#8		// result!!!
1094	adds	x19,x20,x10
1095	sub	x10,x27,x1		// done yet?
1096	adcs	x20,x21,x11
1097	adcs	x21,x22,x12
1098	adcs	x22,x23,x13
1099	//adc	x0,x0,xzr
1100	cbnz	x28,.Loop_mul4x_1st_tail
1101
1102	sub	x11,x27,x5	// rewinded x1
1103	cbz	x10,.Lmul4x_proceed
1104
1105	ldp	x6,x7,[x1,#8*0]
1106	ldp	x8,x9,[x1,#8*2]
1107	add	x1,x1,#8*4
1108	ldp	x14,x15,[x3,#8*0]
1109	ldp	x16,x17,[x3,#8*2]
1110	add	x3,x3,#8*4
1111	b	.Loop_mul4x_1st_tail
1112
1113.align	5
1114.Lmul4x_proceed:
1115	ldr	x24,[x2,#8*4]!		// *++b
1116	adc	x30,x0,xzr
1117	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1118	sub	x3,x3,x5		// rewind np
1119	ldp	x8,x9,[x11,#8*2]
1120	add	x1,x11,#8*4
1121
1122	stp	x19,x20,[x26,#8*0]	// result!!!
1123	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1124	stp	x21,x22,[x26,#8*2]	// result!!!
1125	ldp	x21,x22,[sp,#8*6]
1126
1127	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1128	mov	x26,sp
1129	ldp	x16,x17,[x3,#8*2]
1130	adds	x3,x3,#8*4		// clear carry bit
1131	mov	x0,xzr
1132
1133.align	4
1134.Loop_mul4x_reduction:
1135	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1136	adc	x0,x0,xzr	// modulo-scheduled
1137	mul	x11,x7,x24
1138	add	x28,x28,#8
1139	mul	x12,x8,x24
1140	and	x28,x28,#31
1141	mul	x13,x9,x24
1142	adds	x19,x19,x10
1143	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1144	adcs	x20,x20,x11
1145	mul	x25,x19,x4		// t[0]*n0
1146	adcs	x21,x21,x12
1147	umulh	x11,x7,x24
1148	adcs	x22,x22,x13
1149	umulh	x12,x8,x24
1150	adc	x23,xzr,xzr
1151	umulh	x13,x9,x24
1152	ldr	x24,[x2,x28]		// next b[i]
1153	adds	x20,x20,x10
1154	// (*)	mul	x10,x14,x25
1155	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1156	adcs	x21,x21,x11
1157	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
1158	adcs	x22,x22,x12
1159	mul	x12,x16,x25
1160	adc	x23,x23,x13		// can't overflow
1161	mul	x13,x17,x25
1162	// (*)	adds	xzr,x19,x10
1163	subs	xzr,x19,#1		// (*)
1164	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
1165	adcs	x19,x20,x11
1166	umulh	x11,x15,x25
1167	adcs	x20,x21,x12
1168	umulh	x12,x16,x25
1169	adcs	x21,x22,x13
1170	umulh	x13,x17,x25
1171	adcs	x22,x23,x0
1172	adc	x0,xzr,xzr
1173	adds	x19,x19,x10
1174	adcs	x20,x20,x11
1175	adcs	x21,x21,x12
1176	adcs	x22,x22,x13
1177	//adc	x0,x0,xzr
1178	cbnz	x28,.Loop_mul4x_reduction
1179
1180	adc	x0,x0,xzr
1181	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1182	ldp	x12,x13,[x26,#8*6]
1183	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1184	ldp	x8,x9,[x1,#8*2]
1185	add	x1,x1,#8*4
1186	adds	x19,x19,x10
1187	adcs	x20,x20,x11
1188	adcs	x21,x21,x12
1189	adcs	x22,x22,x13
1190	//adc	x0,x0,xzr
1191
1192	ldr	x25,[sp]		// t[0]*n0
1193	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1194	ldp	x16,x17,[x3,#8*2]
1195	add	x3,x3,#8*4
1196
1197.align	4
1198.Loop_mul4x_tail:
1199	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1200	adc	x0,x0,xzr	// modulo-scheduled
1201	mul	x11,x7,x24
1202	add	x28,x28,#8
1203	mul	x12,x8,x24
1204	and	x28,x28,#31
1205	mul	x13,x9,x24
1206	adds	x19,x19,x10
1207	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1208	adcs	x20,x20,x11
1209	umulh	x11,x7,x24
1210	adcs	x21,x21,x12
1211	umulh	x12,x8,x24
1212	adcs	x22,x22,x13
1213	umulh	x13,x9,x24
1214	adc	x23,xzr,xzr
1215	ldr	x24,[x2,x28]		// next b[i]
1216	adds	x20,x20,x10
1217	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1218	adcs	x21,x21,x11
1219	mul	x11,x15,x25
1220	adcs	x22,x22,x12
1221	mul	x12,x16,x25
1222	adc	x23,x23,x13		// can't overflow
1223	mul	x13,x17,x25
1224	adds	x19,x19,x10
1225	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1226	adcs	x20,x20,x11
1227	umulh	x11,x15,x25
1228	adcs	x21,x21,x12
1229	umulh	x12,x16,x25
1230	adcs	x22,x22,x13
1231	umulh	x13,x17,x25
1232	adcs	x23,x23,x0
1233	ldr	x25,[sp,x28]		// next a[0]*n0
1234	adc	x0,xzr,xzr
1235	str	x19,[x26],#8		// result!!!
1236	adds	x19,x20,x10
1237	sub	x10,x27,x1		// done yet?
1238	adcs	x20,x21,x11
1239	adcs	x21,x22,x12
1240	adcs	x22,x23,x13
1241	//adc	x0,x0,xzr
1242	cbnz	x28,.Loop_mul4x_tail
1243
1244	sub	x11,x3,x5		// rewinded np?
1245	adc	x0,x0,xzr
1246	cbz	x10,.Loop_mul4x_break
1247
1248	ldp	x10,x11,[x26,#8*4]
1249	ldp	x12,x13,[x26,#8*6]
1250	ldp	x6,x7,[x1,#8*0]
1251	ldp	x8,x9,[x1,#8*2]
1252	add	x1,x1,#8*4
1253	adds	x19,x19,x10
1254	adcs	x20,x20,x11
1255	adcs	x21,x21,x12
1256	adcs	x22,x22,x13
1257	//adc	x0,x0,xzr
1258	ldp	x14,x15,[x3,#8*0]
1259	ldp	x16,x17,[x3,#8*2]
1260	add	x3,x3,#8*4
1261	b	.Loop_mul4x_tail
1262
1263.align	4
1264.Loop_mul4x_break:
1265	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
1266	adds	x19,x19,x30
1267	add	x2,x2,#8*4		// bp++
1268	adcs	x20,x20,xzr
1269	sub	x1,x1,x5		// rewind ap
1270	adcs	x21,x21,xzr
1271	stp	x19,x20,[x26,#8*0]	// result!!!
1272	adcs	x22,x22,xzr
1273	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1274	adc	x30,x0,xzr
1275	stp	x21,x22,[x26,#8*2]	// result!!!
1276	cmp	x2,x13			// done yet?
1277	ldp	x21,x22,[sp,#8*6]
1278	ldp	x14,x15,[x11,#8*0]	// n[0..3]
1279	ldp	x16,x17,[x11,#8*2]
1280	add	x3,x11,#8*4
1281	b.eq	.Lmul4x_post
1282
1283	ldr	x24,[x2]
1284	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1285	ldp	x8,x9,[x1,#8*2]
1286	adds	x1,x1,#8*4		// clear carry bit
1287	mov	x0,xzr
1288	mov	x26,sp
1289	b	.Loop_mul4x_reduction
1290
1291.align	4
1292.Lmul4x_post:
1293	// Final step. We see if result is larger than modulus, and
1294	// if it is, subtract the modulus. But comparison implies
1295	// subtraction. So we subtract modulus, see if it borrowed,
1296	// and conditionally copy original value.
1297	mov	x0,x12
1298	mov	x27,x12		// x0 copy
1299	subs	x10,x19,x14
1300	add	x26,sp,#8*8
1301	sbcs	x11,x20,x15
1302	sub	x28,x5,#8*4
1303
1304.Lmul4x_sub:
1305	sbcs	x12,x21,x16
1306	ldp	x14,x15,[x3,#8*0]
1307	sub	x28,x28,#8*4
1308	ldp	x19,x20,[x26,#8*0]
1309	sbcs	x13,x22,x17
1310	ldp	x16,x17,[x3,#8*2]
1311	add	x3,x3,#8*4
1312	ldp	x21,x22,[x26,#8*2]
1313	add	x26,x26,#8*4
1314	stp	x10,x11,[x0,#8*0]
1315	sbcs	x10,x19,x14
1316	stp	x12,x13,[x0,#8*2]
1317	add	x0,x0,#8*4
1318	sbcs	x11,x20,x15
1319	cbnz	x28,.Lmul4x_sub
1320
1321	sbcs	x12,x21,x16
1322	mov	x26,sp
1323	add	x1,sp,#8*4
1324	ldp	x6,x7,[x27,#8*0]
1325	sbcs	x13,x22,x17
1326	stp	x10,x11,[x0,#8*0]
1327	ldp	x8,x9,[x27,#8*2]
1328	stp	x12,x13,[x0,#8*2]
1329	ldp	x19,x20,[x1,#8*0]
1330	ldp	x21,x22,[x1,#8*2]
1331	sbcs	xzr,x30,xzr	// did it borrow?
1332	ldr	x30,[x29,#8]		// pull return address
1333
	// Constant-flow select (csel on borrow) between t[] and t[]-n[],
	// wiping the secret scratch area as we go.
1334	sub	x28,x5,#8*4
1335.Lmul4x_cond_copy:
1336	sub	x28,x28,#8*4
1337	csel	x10,x19,x6,lo
1338	stp	xzr,xzr,[x26,#8*0]
1339	csel	x11,x20,x7,lo
1340	ldp	x6,x7,[x27,#8*4]
1341	ldp	x19,x20,[x1,#8*4]
1342	csel	x12,x21,x8,lo
1343	stp	xzr,xzr,[x26,#8*2]
1344	add	x26,x26,#8*4
1345	csel	x13,x22,x9,lo
1346	ldp	x8,x9,[x27,#8*6]
1347	ldp	x21,x22,[x1,#8*6]
1348	add	x1,x1,#8*4
1349	stp	x10,x11,[x27,#8*0]
1350	stp	x12,x13,[x27,#8*2]
1351	add	x27,x27,#8*4
1352	cbnz	x28,.Lmul4x_cond_copy
1353
1354	csel	x10,x19,x6,lo
1355	stp	xzr,xzr,[x26,#8*0]
1356	csel	x11,x20,x7,lo
1357	stp	xzr,xzr,[x26,#8*2]
1358	csel	x12,x21,x8,lo
1359	stp	xzr,xzr,[x26,#8*3]
1360	csel	x13,x22,x9,lo
1361	stp	xzr,xzr,[x26,#8*4]
1362	stp	x10,x11,[x27,#8*0]
1363	stp	x12,x13,[x27,#8*2]
1364
1365	b	.Lmul4x_done
1366
	// Special case num == 4: result fits in x19-x22, so the final
	// subtract/select runs entirely in registers.
1367.align	4
1368.Lmul4x4_post_condition:
1369	adc	x0,x0,xzr
1370	ldr	x1,[x29,#96]		// pull rp
1371	// x19-3,x0 hold result, x14-7 hold modulus
1372	subs	x6,x19,x14
1373	ldr	x30,[x29,#8]		// pull return address
1374	sbcs	x7,x20,x15
1375	stp	xzr,xzr,[sp,#8*0]
1376	sbcs	x8,x21,x16
1377	stp	xzr,xzr,[sp,#8*2]
1378	sbcs	x9,x22,x17
1379	stp	xzr,xzr,[sp,#8*4]
1380	sbcs	xzr,x0,xzr		// did it borrow?
1381	stp	xzr,xzr,[sp,#8*6]
1382
1383	// x6-3 hold result-modulus
1384	csel	x6,x19,x6,lo
1385	csel	x7,x20,x7,lo
1386	csel	x8,x21,x8,lo
1387	csel	x9,x22,x9,lo
1388	stp	x6,x7,[x1,#8*0]
1389	stp	x8,x9,[x1,#8*2]
1390
1391.Lmul4x_done:
1392	ldp	x19,x20,[x29,#16]
1393	mov	sp,x29
1394	ldp	x21,x22,[x29,#32]
1395	mov	x0,#1
1396	ldp	x23,x24,[x29,#48]
1397	ldp	x25,x26,[x29,#64]
1398	ldp	x27,x28,[x29,#80]
1399	ldr	x29,[sp],#128
1400	ret
1401.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1402.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1403.align	2
1404.align	4
1405