xref: /netbsd-src/crypto/external/bsd/openssl.old/lib/libcrypto/arch/aarch64/armv8-mont.S (revision 4724848cf0da353df257f730694b7882798e5daf)
1.text
2
// -----------------------------------------------------------------------
// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
//
// Montgomery multiplication, rp[] = ap[]*bp[]/2^(64*num) mod np[].
// Register roles on entry (as used by the loads below; OpenSSL
// bn_mul_mont convention -- confirm against caller):
//	x0	rp   result
//	x1	ap   first operand
//	x2	bp   second operand
//	x3	np   modulus
//	x4	pointer to the one-word constant n0 (dereferenced below;
//		by OpenSSL convention n0 = -1/np[0] mod 2^64 -- NOTE(review):
//		not verifiable from this file alone)
//	x5	num, number of 64-bit limbs
// num%8==0 dispatches to __bn_sqr8x_mont, num%4==0 to __bn_mul4x_mont;
// the generic one-word-at-a-time path follows.  A num-limb temporary
// t[] is carved off sp (alloca-style) and wiped before return.
// Returns 1 in x0.  Clobbers x6-x17; x19-x24 are saved/restored.
// -----------------------------------------------------------------------
3.globl	bn_mul_mont
4.type	bn_mul_mont,%function
5.align	5
6bn_mul_mont:
7	tst	x5,#7
8	b.eq	__bn_sqr8x_mont
9	tst	x5,#3
10	b.eq	__bn_mul4x_mont
11.Lmul_mont:
12	stp	x29,x30,[sp,#-64]!
13	add	x29,sp,#0
14	stp	x19,x20,[sp,#16]
15	stp	x21,x22,[sp,#32]
16	stp	x23,x24,[sp,#48]
17
18	ldr	x9,[x2],#8		// bp[0]
19	sub	x22,sp,x5,lsl#3
20	ldp	x7,x8,[x1],#16	// ap[0..1]
21	lsl	x5,x5,#3
22	ldr	x4,[x4]		// *n0
23	and	x22,x22,#-16		// ABI says so
24	ldp	x13,x14,[x3],#16	// np[0..1]
25
26	mul	x6,x7,x9		// ap[0]*bp[0]
27	sub	x21,x5,#16		// j=num-2
28	umulh	x7,x7,x9
29	mul	x10,x8,x9		// ap[1]*bp[0]
30	umulh	x11,x8,x9
31
32	mul	x15,x6,x4		// "tp[0]"*n0
33	mov	sp,x22			// alloca
34
35	// (*)	mul	x12,x13,x15	// np[0]*m1
36	umulh	x13,x13,x15
37	mul	x16,x14,x15		// np[1]*m1
38	// (*)	adds	x12,x12,x6	// discarded
39	// (*)	As for removal of first multiplication and addition
40	//	instructions. The outcome of first addition is
41	//	guaranteed to be zero, which leaves two computationally
42	//	significant outcomes: it either carries or not. Then
43	//	question is when does it carry? Is there alternative
44	//	way to deduce it? If you follow operations, you can
45	//	observe that condition for carry is quite simple:
46	//	x6 being non-zero. So that carry can be calculated
47	//	by adding -1 to x6. That's what next instruction does.
48	subs	xzr,x6,#1		// (*)
49	umulh	x17,x14,x15
50	adc	x13,x13,xzr
51	cbz	x21,.L1st_skip
52
// First pass (i=0): walk a[] and n[] once, building
// t[] = a[]*b[0] + m1*n[], where m1 = t[0]*n0 computed above.
53.L1st:
54	ldr	x8,[x1],#8
55	adds	x6,x10,x7
56	sub	x21,x21,#8		// j--
57	adc	x7,x11,xzr
58
59	ldr	x14,[x3],#8
60	adds	x12,x16,x13
61	mul	x10,x8,x9		// ap[j]*bp[0]
62	adc	x13,x17,xzr
63	umulh	x11,x8,x9
64
65	adds	x12,x12,x6
66	mul	x16,x14,x15		// np[j]*m1
67	adc	x13,x13,xzr
68	umulh	x17,x14,x15
69	str	x12,[x22],#8		// tp[j-1]
70	cbnz	x21,.L1st
71
72.L1st_skip:
73	adds	x6,x10,x7
74	sub	x1,x1,x5		// rewind x1
75	adc	x7,x11,xzr
76
77	adds	x12,x16,x13
78	sub	x3,x3,x5		// rewind x3
79	adc	x13,x17,xzr
80
81	adds	x12,x12,x6
82	sub	x20,x5,#8		// i=num-1
83	adcs	x13,x13,x7
84
85	adc	x19,xzr,xzr		// upmost overflow bit
86	stp	x12,x13,[x22]
87
// Outer loop: one Montgomery step per remaining b[i]; x19 accumulates
// the top overflow bit across iterations.
88.Louter:
89	ldr	x9,[x2],#8		// bp[i]
90	ldp	x7,x8,[x1],#16
91	ldr	x23,[sp]		// tp[0]
92	add	x22,sp,#8
93
94	mul	x6,x7,x9		// ap[0]*bp[i]
95	sub	x21,x5,#16		// j=num-2
96	umulh	x7,x7,x9
97	ldp	x13,x14,[x3],#16
98	mul	x10,x8,x9		// ap[1]*bp[i]
99	adds	x6,x6,x23
100	umulh	x11,x8,x9
101	adc	x7,x7,xzr
102
103	mul	x15,x6,x4
104	sub	x20,x20,#8		// i--
105
106	// (*)	mul	x12,x13,x15	// np[0]*m1
107	umulh	x13,x13,x15
108	mul	x16,x14,x15		// np[1]*m1
109	// (*)	adds	x12,x12,x6
110	subs	xzr,x6,#1		// (*) same carry trick as in .Lmul_mont
111	umulh	x17,x14,x15
112	cbz	x21,.Linner_skip
113
114.Linner:
115	ldr	x8,[x1],#8
116	adc	x13,x13,xzr
117	ldr	x23,[x22],#8		// tp[j]
118	adds	x6,x10,x7
119	sub	x21,x21,#8		// j--
120	adc	x7,x11,xzr
121
122	adds	x12,x16,x13
123	ldr	x14,[x3],#8
124	adc	x13,x17,xzr
125
126	mul	x10,x8,x9		// ap[j]*bp[i]
127	adds	x6,x6,x23
128	umulh	x11,x8,x9
129	adc	x7,x7,xzr
130
131	mul	x16,x14,x15		// np[j]*m1
132	adds	x12,x12,x6
133	umulh	x17,x14,x15
134	str	x12,[x22,#-16]		// tp[j-1]
135	cbnz	x21,.Linner
136
137.Linner_skip:
138	ldr	x23,[x22],#8		// tp[j]
139	adc	x13,x13,xzr
140	adds	x6,x10,x7
141	sub	x1,x1,x5		// rewind x1
142	adc	x7,x11,xzr
143
144	adds	x12,x16,x13
145	sub	x3,x3,x5		// rewind x3
146	adcs	x13,x17,x19
147	adc	x19,xzr,xzr
148
149	adds	x6,x6,x23
150	adc	x7,x7,xzr
151
152	adds	x12,x12,x6
153	adcs	x13,x13,x7
154	adc	x19,x19,xzr		// upmost overflow bit
155	stp	x12,x13,[x22,#-16]
156
157	cbnz	x20,.Louter
158
159	// Final step. We see if result is larger than modulus, and
160	// if it is, subtract the modulus. But comparison implies
161	// subtraction. So we subtract modulus, see if it borrowed,
162	// and conditionally copy original value.
163	ldr	x23,[sp]		// tp[0]
164	add	x22,sp,#8
165	ldr	x14,[x3],#8		// np[0]
166	subs	x21,x5,#8		// j=num-1 and clear borrow
167	mov	x1,x0
168.Lsub:
169	sbcs	x8,x23,x14		// tp[j]-np[j]
170	ldr	x23,[x22],#8
171	sub	x21,x21,#8		// j--
172	ldr	x14,[x3],#8
173	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
174	cbnz	x21,.Lsub
175
176	sbcs	x8,x23,x14
177	sbcs	x19,x19,xzr		// did it borrow?
178	str	x8,[x1],#8		// rp[num-1]
179
180	ldr	x23,[sp]		// tp[0]
181	add	x22,sp,#8
182	ldr	x8,[x0],#8		// rp[0]
183	sub	x5,x5,#8		// num--
184	nop
// Branch-free select between t[] (no borrow) and the subtracted rp[]
// (borrow), wiping the secret-bearing t[] scratch as we go.
185.Lcond_copy:
186	sub	x5,x5,#8		// num--
187	csel	x14,x23,x8,lo		// did it borrow?
188	ldr	x23,[x22],#8
189	ldr	x8,[x0],#8
190	str	xzr,[x22,#-16]		// wipe tp
191	str	x14,[x0,#-16]
192	cbnz	x5,.Lcond_copy
193
194	csel	x14,x23,x8,lo
195	str	xzr,[x22,#-8]		// wipe tp
196	str	x14,[x0,#-8]
197
198	ldp	x19,x20,[x29,#16]
199	mov	sp,x29
200	ldp	x21,x22,[x29,#32]
201	mov	x0,#1
202	ldp	x23,x24,[x29,#48]
203	ldr	x29,[sp],#64
204	ret
205.size	bn_mul_mont,.-bn_mul_mont
// -----------------------------------------------------------------------
// __bn_sqr8x_mont: squaring path, used when num%8 == 0 and ap == bp
// (the pointer-equality check is the first thing below; otherwise it
// falls through to __bn_mul4x_mont).  Argument registers as in
// bn_mul_mont.  Phases:
//   1. zero a 2*num-limb t[] scratch area carved off sp,
//   2. accumulate all cross products a[i]*a[j], i<j (.Lsqr8x_outer_loop),
//   3. double them and add the diagonal squares a[i]^2
//      (.Lsqr4x_shift_n_add, shifting left one bit via extr),
//   4. Montgomery-reduce 512 bits (8 limbs of n) per pass
//      (.Lsqr8x_reduction / .Lsqr8x_tail),
//   5. final subtract and constant-time conditional copy, wiping t[].
// rp and np are stashed at [x29,#96] and [x29,#104], n0 at [x29,#112].
// Return-address signing via paciasp/autiasp (.inst-encoded so that
// older assemblers still accept the file).
// -----------------------------------------------------------------------
206.type	__bn_sqr8x_mont,%function
207.align	5
208__bn_sqr8x_mont:
209	cmp	x1,x2
210	b.ne	__bn_mul4x_mont
211.Lsqr8x_mont:
212.inst	0xd503233f		// paciasp
213	stp	x29,x30,[sp,#-128]!
214	add	x29,sp,#0
215	stp	x19,x20,[sp,#16]
216	stp	x21,x22,[sp,#32]
217	stp	x23,x24,[sp,#48]
218	stp	x25,x26,[sp,#64]
219	stp	x27,x28,[sp,#80]
220	stp	x0,x3,[sp,#96]	// offload rp and np
221
222	ldp	x6,x7,[x1,#8*0]
223	ldp	x8,x9,[x1,#8*2]
224	ldp	x10,x11,[x1,#8*4]
225	ldp	x12,x13,[x1,#8*6]
226
227	sub	x2,sp,x5,lsl#4
228	lsl	x5,x5,#3
229	ldr	x4,[x4]		// *n0
230	mov	sp,x2			// alloca
231	sub	x27,x5,#8*8
232	b	.Lsqr8x_zero_start
233
// Zero the t[] scratch area, 16 limbs per iteration (num is a
// multiple of 8, so 2*num limbs is a multiple of 16).
234.Lsqr8x_zero:
235	sub	x27,x27,#8*8
236	stp	xzr,xzr,[x2,#8*0]
237	stp	xzr,xzr,[x2,#8*2]
238	stp	xzr,xzr,[x2,#8*4]
239	stp	xzr,xzr,[x2,#8*6]
240.Lsqr8x_zero_start:
241	stp	xzr,xzr,[x2,#8*8]
242	stp	xzr,xzr,[x2,#8*10]
243	stp	xzr,xzr,[x2,#8*12]
244	stp	xzr,xzr,[x2,#8*14]
245	add	x2,x2,#8*16
246	cbnz	x27,.Lsqr8x_zero
247
248	add	x3,x1,x5
249	add	x1,x1,#8*8
250	mov	x19,xzr
251	mov	x20,xzr
252	mov	x21,xzr
253	mov	x22,xzr
254	mov	x23,xzr
255	mov	x24,xzr
256	mov	x25,xzr
257	mov	x26,xzr
258	mov	x2,sp
259	str	x4,[x29,#112]		// offload n0
260
261	// Multiply everything but a[i]*a[i]
262.align	4
263.Lsqr8x_outer_loop:
264        //                                                 a[1]a[0]	(i)
265        //                                             a[2]a[0]
266        //                                         a[3]a[0]
267        //                                     a[4]a[0]
268        //                                 a[5]a[0]
269        //                             a[6]a[0]
270        //                         a[7]a[0]
271        //                                         a[2]a[1]		(ii)
272        //                                     a[3]a[1]
273        //                                 a[4]a[1]
274        //                             a[5]a[1]
275        //                         a[6]a[1]
276        //                     a[7]a[1]
277        //                                 a[3]a[2]			(iii)
278        //                             a[4]a[2]
279        //                         a[5]a[2]
280        //                     a[6]a[2]
281        //                 a[7]a[2]
282        //                         a[4]a[3]				(iv)
283        //                     a[5]a[3]
284        //                 a[6]a[3]
285        //             a[7]a[3]
286        //                 a[5]a[4]					(v)
287        //             a[6]a[4]
288        //         a[7]a[4]
289        //         a[6]a[5]						(vi)
290        //     a[7]a[5]
291        // a[7]a[6]							(vii)
292
293	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
294	mul	x15,x8,x6
295	mul	x16,x9,x6
296	mul	x17,x10,x6
297	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
298	mul	x14,x11,x6
299	adcs	x21,x21,x15
300	mul	x15,x12,x6
301	adcs	x22,x22,x16
302	mul	x16,x13,x6
303	adcs	x23,x23,x17
304	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
305	adcs	x24,x24,x14
306	umulh	x14,x8,x6
307	adcs	x25,x25,x15
308	umulh	x15,x9,x6
309	adcs	x26,x26,x16
310	umulh	x16,x10,x6
311	stp	x19,x20,[x2],#8*2	// t[0..1]
312	adc	x19,xzr,xzr		// t[8]
313	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
314	umulh	x17,x11,x6
315	adcs	x22,x22,x14
316	umulh	x14,x12,x6
317	adcs	x23,x23,x15
318	umulh	x15,x13,x6
319	adcs	x24,x24,x16
320	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
321	adcs	x25,x25,x17
322	mul	x17,x9,x7
323	adcs	x26,x26,x14
324	mul	x14,x10,x7
325	adc	x19,x19,x15
326
327	mul	x15,x11,x7
328	adds	x22,x22,x16
329	mul	x16,x12,x7
330	adcs	x23,x23,x17
331	mul	x17,x13,x7
332	adcs	x24,x24,x14
333	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
334	adcs	x25,x25,x15
335	umulh	x15,x9,x7
336	adcs	x26,x26,x16
337	umulh	x16,x10,x7
338	adcs	x19,x19,x17
339	umulh	x17,x11,x7
340	stp	x21,x22,[x2],#8*2	// t[2..3]
341	adc	x20,xzr,xzr		// t[9]
342	adds	x23,x23,x14
343	umulh	x14,x12,x7
344	adcs	x24,x24,x15
345	umulh	x15,x13,x7
346	adcs	x25,x25,x16
347	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
348	adcs	x26,x26,x17
349	mul	x17,x10,x8
350	adcs	x19,x19,x14
351	mul	x14,x11,x8
352	adc	x20,x20,x15
353
354	mul	x15,x12,x8
355	adds	x24,x24,x16
356	mul	x16,x13,x8
357	adcs	x25,x25,x17
358	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
359	adcs	x26,x26,x14
360	umulh	x14,x10,x8
361	adcs	x19,x19,x15
362	umulh	x15,x11,x8
363	adcs	x20,x20,x16
364	umulh	x16,x12,x8
365	stp	x23,x24,[x2],#8*2	// t[4..5]
366	adc	x21,xzr,xzr		// t[10]
367	adds	x25,x25,x17
368	umulh	x17,x13,x8
369	adcs	x26,x26,x14
370	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
371	adcs	x19,x19,x15
372	mul	x15,x11,x9
373	adcs	x20,x20,x16
374	mul	x16,x12,x9
375	adc	x21,x21,x17
376
377	mul	x17,x13,x9
378	adds	x26,x26,x14
379	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
380	adcs	x19,x19,x15
381	umulh	x15,x11,x9
382	adcs	x20,x20,x16
383	umulh	x16,x12,x9
384	adcs	x21,x21,x17
385	umulh	x17,x13,x9
386	stp	x25,x26,[x2],#8*2	// t[6..7]
387	adc	x22,xzr,xzr		// t[11]
388	adds	x19,x19,x14
389	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
390	adcs	x20,x20,x15
391	mul	x15,x12,x10
392	adcs	x21,x21,x16
393	mul	x16,x13,x10
394	adc	x22,x22,x17
395
396	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
397	adds	x20,x20,x14
398	umulh	x14,x12,x10
399	adcs	x21,x21,x15
400	umulh	x15,x13,x10
401	adcs	x22,x22,x16
402	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
403	adc	x23,xzr,xzr		// t[12]
404	adds	x21,x21,x17
405	mul	x17,x13,x11
406	adcs	x22,x22,x14
407	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
408	adc	x23,x23,x15
409
410	umulh	x15,x13,x11
411	adds	x22,x22,x16
412	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
413	adcs	x23,x23,x17
414	umulh	x17,x13,x12		// hi(a[7]*a[6])
415	adc	x24,xzr,xzr		// t[13]
416	adds	x23,x23,x14
417	sub	x27,x3,x1	// done yet?
418	adc	x24,x24,x15
419
420	adds	x24,x24,x16
421	sub	x14,x3,x5	// rewinded ap
422	adc	x25,xzr,xzr		// t[14]
423	add	x25,x25,x17
424
425	cbz	x27,.Lsqr8x_outer_break
426
427	mov	x4,x6
428	ldp	x6,x7,[x2,#8*0]
429	ldp	x8,x9,[x2,#8*2]
430	ldp	x10,x11,[x2,#8*4]
431	ldp	x12,x13,[x2,#8*6]
432	adds	x19,x19,x6
433	adcs	x20,x20,x7
434	ldp	x6,x7,[x1,#8*0]
435	adcs	x21,x21,x8
436	adcs	x22,x22,x9
437	ldp	x8,x9,[x1,#8*2]
438	adcs	x23,x23,x10
439	adcs	x24,x24,x11
440	ldp	x10,x11,[x1,#8*4]
441	adcs	x25,x25,x12
442	mov	x0,x1
443	adcs	x26,xzr,x13
444	ldp	x12,x13,[x1,#8*6]
445	add	x1,x1,#8*8
446	//adc	x28,xzr,xzr		// moved below
447	mov	x27,#-8*8
448
449	//                                                         a[8]a[0]
450	//                                                     a[9]a[0]
451	//                                                 a[a]a[0]
452	//                                             a[b]a[0]
453	//                                         a[c]a[0]
454	//                                     a[d]a[0]
455	//                                 a[e]a[0]
456	//                             a[f]a[0]
457	//                                                     a[8]a[1]
458	//                         a[f]a[1]........................
459	//                                                 a[8]a[2]
460	//                     a[f]a[2]........................
461	//                                             a[8]a[3]
462	//                 a[f]a[3]........................
463	//                                         a[8]a[4]
464	//             a[f]a[4]........................
465	//                                     a[8]a[5]
466	//         a[f]a[5]........................
467	//                                 a[8]a[6]
468	//     a[f]a[6]........................
469	//                             a[8]a[7]
470	// a[f]a[7]........................
// Multiply-accumulate the 8-limb window a[8..f] by one a[i] at a time;
// x28 is the modulo-scheduled carry bit, x4 holds the current a[i].
471.Lsqr8x_mul:
472	mul	x14,x6,x4
473	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
474	mul	x15,x7,x4
475	add	x27,x27,#8
476	mul	x16,x8,x4
477	mul	x17,x9,x4
478	adds	x19,x19,x14
479	mul	x14,x10,x4
480	adcs	x20,x20,x15
481	mul	x15,x11,x4
482	adcs	x21,x21,x16
483	mul	x16,x12,x4
484	adcs	x22,x22,x17
485	mul	x17,x13,x4
486	adcs	x23,x23,x14
487	umulh	x14,x6,x4
488	adcs	x24,x24,x15
489	umulh	x15,x7,x4
490	adcs	x25,x25,x16
491	umulh	x16,x8,x4
492	adcs	x26,x26,x17
493	umulh	x17,x9,x4
494	adc	x28,x28,xzr
495	str	x19,[x2],#8
496	adds	x19,x20,x14
497	umulh	x14,x10,x4
498	adcs	x20,x21,x15
499	umulh	x15,x11,x4
500	adcs	x21,x22,x16
501	umulh	x16,x12,x4
502	adcs	x22,x23,x17
503	umulh	x17,x13,x4
504	ldr	x4,[x0,x27]
505	adcs	x23,x24,x14
506	adcs	x24,x25,x15
507	adcs	x25,x26,x16
508	adcs	x26,x28,x17
509	//adc	x28,xzr,xzr		// moved above
510	cbnz	x27,.Lsqr8x_mul
511					// note that carry flag is guaranteed
512					// to be zero at this point
513	cmp	x1,x3		// done yet?
514	b.eq	.Lsqr8x_break
515
516	ldp	x6,x7,[x2,#8*0]
517	ldp	x8,x9,[x2,#8*2]
518	ldp	x10,x11,[x2,#8*4]
519	ldp	x12,x13,[x2,#8*6]
520	adds	x19,x19,x6
521	ldr	x4,[x0,#-8*8]
522	adcs	x20,x20,x7
523	ldp	x6,x7,[x1,#8*0]
524	adcs	x21,x21,x8
525	adcs	x22,x22,x9
526	ldp	x8,x9,[x1,#8*2]
527	adcs	x23,x23,x10
528	adcs	x24,x24,x11
529	ldp	x10,x11,[x1,#8*4]
530	adcs	x25,x25,x12
531	mov	x27,#-8*8
532	adcs	x26,x26,x13
533	ldp	x12,x13,[x1,#8*6]
534	add	x1,x1,#8*8
535	//adc	x28,xzr,xzr		// moved above
536	b	.Lsqr8x_mul
537
538.align	4
539.Lsqr8x_break:
540	ldp	x6,x7,[x0,#8*0]
541	add	x1,x0,#8*8
542	ldp	x8,x9,[x0,#8*2]
543	sub	x14,x3,x1		// is it last iteration?
544	ldp	x10,x11,[x0,#8*4]
545	sub	x15,x2,x14
546	ldp	x12,x13,[x0,#8*6]
547	cbz	x14,.Lsqr8x_outer_loop
548
549	stp	x19,x20,[x2,#8*0]
550	ldp	x19,x20,[x15,#8*0]
551	stp	x21,x22,[x2,#8*2]
552	ldp	x21,x22,[x15,#8*2]
553	stp	x23,x24,[x2,#8*4]
554	ldp	x23,x24,[x15,#8*4]
555	stp	x25,x26,[x2,#8*6]
556	mov	x2,x15
557	ldp	x25,x26,[x15,#8*6]
558	b	.Lsqr8x_outer_loop
559
560.align	4
561.Lsqr8x_outer_break:
562	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
563	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
564	ldp	x15,x16,[sp,#8*1]
565	ldp	x11,x13,[x14,#8*2]
566	add	x1,x14,#8*4
567	ldp	x17,x14,[sp,#8*3]
568
569	stp	x19,x20,[x2,#8*0]
570	mul	x19,x7,x7
571	stp	x21,x22,[x2,#8*2]
572	umulh	x7,x7,x7
573	stp	x23,x24,[x2,#8*4]
574	mul	x8,x9,x9
575	stp	x25,x26,[x2,#8*6]
576	mov	x2,sp
577	umulh	x9,x9,x9
578	adds	x20,x7,x15,lsl#1
579	extr	x15,x16,x15,#63
580	sub	x27,x5,#8*4
581
// Double the off-diagonal sum one 63-bit chunk at a time (extr shifts
// a limb pair left by one) while adding the squares a[i]^2.
582.Lsqr4x_shift_n_add:
583	adcs	x21,x8,x15
584	extr	x16,x17,x16,#63
585	sub	x27,x27,#8*4
586	adcs	x22,x9,x16
587	ldp	x15,x16,[x2,#8*5]
588	mul	x10,x11,x11
589	ldp	x7,x9,[x1],#8*2
590	umulh	x11,x11,x11
591	mul	x12,x13,x13
592	umulh	x13,x13,x13
593	extr	x17,x14,x17,#63
594	stp	x19,x20,[x2,#8*0]
595	adcs	x23,x10,x17
596	extr	x14,x15,x14,#63
597	stp	x21,x22,[x2,#8*2]
598	adcs	x24,x11,x14
599	ldp	x17,x14,[x2,#8*7]
600	extr	x15,x16,x15,#63
601	adcs	x25,x12,x15
602	extr	x16,x17,x16,#63
603	adcs	x26,x13,x16
604	ldp	x15,x16,[x2,#8*9]
605	mul	x6,x7,x7
606	ldp	x11,x13,[x1],#8*2
607	umulh	x7,x7,x7
608	mul	x8,x9,x9
609	umulh	x9,x9,x9
610	stp	x23,x24,[x2,#8*4]
611	extr	x17,x14,x17,#63
612	stp	x25,x26,[x2,#8*6]
613	add	x2,x2,#8*8
614	adcs	x19,x6,x17
615	extr	x14,x15,x14,#63
616	adcs	x20,x7,x14
617	ldp	x17,x14,[x2,#8*3]
618	extr	x15,x16,x15,#63
619	cbnz	x27,.Lsqr4x_shift_n_add
620	ldp	x1,x4,[x29,#104]	// pull np and n0
621
622	adcs	x21,x8,x15
623	extr	x16,x17,x16,#63
624	adcs	x22,x9,x16
625	ldp	x15,x16,[x2,#8*5]
626	mul	x10,x11,x11
627	umulh	x11,x11,x11
628	stp	x19,x20,[x2,#8*0]
629	mul	x12,x13,x13
630	umulh	x13,x13,x13
631	stp	x21,x22,[x2,#8*2]
632	extr	x17,x14,x17,#63
633	adcs	x23,x10,x17
634	extr	x14,x15,x14,#63
635	ldp	x19,x20,[sp,#8*0]
636	adcs	x24,x11,x14
637	extr	x15,x16,x15,#63
638	ldp	x6,x7,[x1,#8*0]
639	adcs	x25,x12,x15
640	extr	x16,xzr,x16,#63
641	ldp	x8,x9,[x1,#8*2]
642	adc	x26,x13,x16
643	ldp	x10,x11,[x1,#8*4]
644
645	// Reduce by 512 bits per iteration
646	mul	x28,x4,x19		// t[0]*n0
647	ldp	x12,x13,[x1,#8*6]
648	add	x3,x1,x5
649	ldp	x21,x22,[sp,#8*2]
650	stp	x23,x24,[x2,#8*4]
651	ldp	x23,x24,[sp,#8*4]
652	stp	x25,x26,[x2,#8*6]
653	ldp	x25,x26,[sp,#8*6]
654	add	x1,x1,#8*8
655	mov	x30,xzr		// initial top-most carry
656	mov	x2,sp
657	mov	x27,#8
658
// Montgomery reduction: fold one multiple of n[0..7] into the window
// per iteration; uses the same "subs xzr,x19,#1" carry trick as the
// generic path (the discarded low product is known to be zero).
659.Lsqr8x_reduction:
660	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
661	mul	x15,x7,x28
662	sub	x27,x27,#1
663	mul	x16,x8,x28
664	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
665	mul	x17,x9,x28
666	// (*)	adds	xzr,x19,x14
667	subs	xzr,x19,#1		// (*)
668	mul	x14,x10,x28
669	adcs	x19,x20,x15
670	mul	x15,x11,x28
671	adcs	x20,x21,x16
672	mul	x16,x12,x28
673	adcs	x21,x22,x17
674	mul	x17,x13,x28
675	adcs	x22,x23,x14
676	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
677	adcs	x23,x24,x15
678	umulh	x15,x7,x28
679	adcs	x24,x25,x16
680	umulh	x16,x8,x28
681	adcs	x25,x26,x17
682	umulh	x17,x9,x28
683	adc	x26,xzr,xzr
684	adds	x19,x19,x14
685	umulh	x14,x10,x28
686	adcs	x20,x20,x15
687	umulh	x15,x11,x28
688	adcs	x21,x21,x16
689	umulh	x16,x12,x28
690	adcs	x22,x22,x17
691	umulh	x17,x13,x28
692	mul	x28,x4,x19		// next t[0]*n0
693	adcs	x23,x23,x14
694	adcs	x24,x24,x15
695	adcs	x25,x25,x16
696	adc	x26,x26,x17
697	cbnz	x27,.Lsqr8x_reduction
698
699	ldp	x14,x15,[x2,#8*0]
700	ldp	x16,x17,[x2,#8*2]
701	mov	x0,x2
702	sub	x27,x3,x1	// done yet?
703	adds	x19,x19,x14
704	adcs	x20,x20,x15
705	ldp	x14,x15,[x2,#8*4]
706	adcs	x21,x21,x16
707	adcs	x22,x22,x17
708	ldp	x16,x17,[x2,#8*6]
709	adcs	x23,x23,x14
710	adcs	x24,x24,x15
711	adcs	x25,x25,x16
712	adcs	x26,x26,x17
713	//adc	x28,xzr,xzr		// moved below
714	cbz	x27,.Lsqr8x8_post_condition
715
716	ldr	x4,[x2,#-8*8]
717	ldp	x6,x7,[x1,#8*0]
718	ldp	x8,x9,[x1,#8*2]
719	ldp	x10,x11,[x1,#8*4]
720	mov	x27,#-8*8
721	ldp	x12,x13,[x1,#8*6]
722	add	x1,x1,#8*8
723
// Tail: propagate the reduction through the remaining limbs of t[],
// structured like .Lsqr8x_mul with x28 as modulo-scheduled carry.
724.Lsqr8x_tail:
725	mul	x14,x6,x4
726	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
727	mul	x15,x7,x4
728	add	x27,x27,#8
729	mul	x16,x8,x4
730	mul	x17,x9,x4
731	adds	x19,x19,x14
732	mul	x14,x10,x4
733	adcs	x20,x20,x15
734	mul	x15,x11,x4
735	adcs	x21,x21,x16
736	mul	x16,x12,x4
737	adcs	x22,x22,x17
738	mul	x17,x13,x4
739	adcs	x23,x23,x14
740	umulh	x14,x6,x4
741	adcs	x24,x24,x15
742	umulh	x15,x7,x4
743	adcs	x25,x25,x16
744	umulh	x16,x8,x4
745	adcs	x26,x26,x17
746	umulh	x17,x9,x4
747	adc	x28,x28,xzr
748	str	x19,[x2],#8
749	adds	x19,x20,x14
750	umulh	x14,x10,x4
751	adcs	x20,x21,x15
752	umulh	x15,x11,x4
753	adcs	x21,x22,x16
754	umulh	x16,x12,x4
755	adcs	x22,x23,x17
756	umulh	x17,x13,x4
757	ldr	x4,[x0,x27]
758	adcs	x23,x24,x14
759	adcs	x24,x25,x15
760	adcs	x25,x26,x16
761	adcs	x26,x28,x17
762	//adc	x28,xzr,xzr		// moved above
763	cbnz	x27,.Lsqr8x_tail
764					// note that carry flag is guaranteed
765					// to be zero at this point
766	ldp	x6,x7,[x2,#8*0]
767	sub	x27,x3,x1	// done yet?
768	sub	x16,x3,x5	// rewinded np
769	ldp	x8,x9,[x2,#8*2]
770	ldp	x10,x11,[x2,#8*4]
771	ldp	x12,x13,[x2,#8*6]
772	cbz	x27,.Lsqr8x_tail_break
773
774	ldr	x4,[x0,#-8*8]
775	adds	x19,x19,x6
776	adcs	x20,x20,x7
777	ldp	x6,x7,[x1,#8*0]
778	adcs	x21,x21,x8
779	adcs	x22,x22,x9
780	ldp	x8,x9,[x1,#8*2]
781	adcs	x23,x23,x10
782	adcs	x24,x24,x11
783	ldp	x10,x11,[x1,#8*4]
784	adcs	x25,x25,x12
785	mov	x27,#-8*8
786	adcs	x26,x26,x13
787	ldp	x12,x13,[x1,#8*6]
788	add	x1,x1,#8*8
789	//adc	x28,xzr,xzr		// moved above
790	b	.Lsqr8x_tail
791
792.align	4
793.Lsqr8x_tail_break:
794	ldr	x4,[x29,#112]		// pull n0
795	add	x27,x2,#8*8		// end of current t[num] window
796
797	subs	xzr,x30,#1		// "move" top-most carry to carry bit
798	adcs	x14,x19,x6
799	adcs	x15,x20,x7
800	ldp	x19,x20,[x0,#8*0]
801	adcs	x21,x21,x8
802	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
803	adcs	x22,x22,x9
804	ldp	x8,x9,[x16,#8*2]
805	adcs	x23,x23,x10
806	adcs	x24,x24,x11
807	ldp	x10,x11,[x16,#8*4]
808	adcs	x25,x25,x12
809	adcs	x26,x26,x13
810	ldp	x12,x13,[x16,#8*6]
811	add	x1,x16,#8*8
812	adc	x30,xzr,xzr	// top-most carry
813	mul	x28,x4,x19
814	stp	x14,x15,[x2,#8*0]
815	stp	x21,x22,[x2,#8*2]
816	ldp	x21,x22,[x0,#8*2]
817	stp	x23,x24,[x2,#8*4]
818	ldp	x23,x24,[x0,#8*4]
819	cmp	x27,x29		// did we hit the bottom?
820	stp	x25,x26,[x2,#8*6]
821	mov	x2,x0			// slide the window
822	ldp	x25,x26,[x0,#8*6]
823	mov	x27,#8
824	b.ne	.Lsqr8x_reduction
825
826	// Final step. We see if result is larger than modulus, and
827	// if it is, subtract the modulus. But comparison implies
828	// subtraction. So we subtract modulus, see if it borrowed,
829	// and conditionally copy original value.
830	ldr	x0,[x29,#96]		// pull rp
831	add	x2,x2,#8*8
832	subs	x14,x19,x6
833	sbcs	x15,x20,x7
834	sub	x27,x5,#8*8
835	mov	x3,x0		// x0 copy
836
837.Lsqr8x_sub:
838	sbcs	x16,x21,x8
839	ldp	x6,x7,[x1,#8*0]
840	sbcs	x17,x22,x9
841	stp	x14,x15,[x0,#8*0]
842	sbcs	x14,x23,x10
843	ldp	x8,x9,[x1,#8*2]
844	sbcs	x15,x24,x11
845	stp	x16,x17,[x0,#8*2]
846	sbcs	x16,x25,x12
847	ldp	x10,x11,[x1,#8*4]
848	sbcs	x17,x26,x13
849	ldp	x12,x13,[x1,#8*6]
850	add	x1,x1,#8*8
851	ldp	x19,x20,[x2,#8*0]
852	sub	x27,x27,#8*8
853	ldp	x21,x22,[x2,#8*2]
854	ldp	x23,x24,[x2,#8*4]
855	ldp	x25,x26,[x2,#8*6]
856	add	x2,x2,#8*8
857	stp	x14,x15,[x0,#8*4]
858	sbcs	x14,x19,x6
859	stp	x16,x17,[x0,#8*6]
860	add	x0,x0,#8*8
861	sbcs	x15,x20,x7
862	cbnz	x27,.Lsqr8x_sub
863
864	sbcs	x16,x21,x8
865	mov	x2,sp
866	add	x1,sp,x5
867	ldp	x6,x7,[x3,#8*0]
868	sbcs	x17,x22,x9
869	stp	x14,x15,[x0,#8*0]
870	sbcs	x14,x23,x10
871	ldp	x8,x9,[x3,#8*2]
872	sbcs	x15,x24,x11
873	stp	x16,x17,[x0,#8*2]
874	sbcs	x16,x25,x12
875	ldp	x19,x20,[x1,#8*0]
876	sbcs	x17,x26,x13
877	ldp	x21,x22,[x1,#8*2]
878	sbcs	xzr,x30,xzr	// did it borrow?
879	ldr	x30,[x29,#8]		// pull return address
880	stp	x14,x15,[x0,#8*4]
881	stp	x16,x17,[x0,#8*6]
882
883	sub	x27,x5,#8*4
// Constant-time select between result and result-modulus, wiping the
// t[] scratch area as we go.
884.Lsqr4x_cond_copy:
885	sub	x27,x27,#8*4
886	csel	x14,x19,x6,lo
887	stp	xzr,xzr,[x2,#8*0]
888	csel	x15,x20,x7,lo
889	ldp	x6,x7,[x3,#8*4]
890	ldp	x19,x20,[x1,#8*4]
891	csel	x16,x21,x8,lo
892	stp	xzr,xzr,[x2,#8*2]
893	add	x2,x2,#8*4
894	csel	x17,x22,x9,lo
895	ldp	x8,x9,[x3,#8*6]
896	ldp	x21,x22,[x1,#8*6]
897	add	x1,x1,#8*4
898	stp	x14,x15,[x3,#8*0]
899	stp	x16,x17,[x3,#8*2]
900	add	x3,x3,#8*4
901	stp	xzr,xzr,[x1,#8*0]
902	stp	xzr,xzr,[x1,#8*2]
903	cbnz	x27,.Lsqr4x_cond_copy
904
905	csel	x14,x19,x6,lo
906	stp	xzr,xzr,[x2,#8*0]
907	csel	x15,x20,x7,lo
908	stp	xzr,xzr,[x2,#8*2]
909	csel	x16,x21,x8,lo
910	csel	x17,x22,x9,lo
911	stp	x14,x15,[x3,#8*0]
912	stp	x16,x17,[x3,#8*2]
913
914	b	.Lsqr8x_done
915
916.align	4
// Special case reached when num==8: the whole result is still in
// registers, so subtract/select happens without a memory loop.
917.Lsqr8x8_post_condition:
918	adc	x28,xzr,xzr
919	ldr	x30,[x29,#8]		// pull return address
920	// x19-7,x28 hold result, x6-7 hold modulus
921	subs	x6,x19,x6
922	ldr	x1,[x29,#96]		// pull rp
923	sbcs	x7,x20,x7
924	stp	xzr,xzr,[sp,#8*0]
925	sbcs	x8,x21,x8
926	stp	xzr,xzr,[sp,#8*2]
927	sbcs	x9,x22,x9
928	stp	xzr,xzr,[sp,#8*4]
929	sbcs	x10,x23,x10
930	stp	xzr,xzr,[sp,#8*6]
931	sbcs	x11,x24,x11
932	stp	xzr,xzr,[sp,#8*8]
933	sbcs	x12,x25,x12
934	stp	xzr,xzr,[sp,#8*10]
935	sbcs	x13,x26,x13
936	stp	xzr,xzr,[sp,#8*12]
937	sbcs	x28,x28,xzr	// did it borrow?
938	stp	xzr,xzr,[sp,#8*14]
939
940	// x6-7 hold result-modulus
941	csel	x6,x19,x6,lo
942	csel	x7,x20,x7,lo
943	csel	x8,x21,x8,lo
944	csel	x9,x22,x9,lo
945	stp	x6,x7,[x1,#8*0]
946	csel	x10,x23,x10,lo
947	csel	x11,x24,x11,lo
948	stp	x8,x9,[x1,#8*2]
949	csel	x12,x25,x12,lo
950	csel	x13,x26,x13,lo
951	stp	x10,x11,[x1,#8*4]
952	stp	x12,x13,[x1,#8*6]
953
954.Lsqr8x_done:
955	ldp	x19,x20,[x29,#16]
956	mov	sp,x29
957	ldp	x21,x22,[x29,#32]
958	mov	x0,#1
959	ldp	x23,x24,[x29,#48]
960	ldp	x25,x26,[x29,#64]
961	ldp	x27,x28,[x29,#80]
962	ldr	x29,[sp],#128
963.inst	0xd50323bf		// autiasp
964	ret
965.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// -----------------------------------------------------------------------
// __bn_mul4x_mont: multiplication path for num%4 == 0 (also the
// fallback from __bn_sqr8x_mont when ap != bp).  Argument registers as
// in bn_mul_mont.  Processes 4 limbs of a[]/n[] at a time; x28 cycles
// 8,16,24,0 as the byte offset of the next b word within the current
// 4-word window of b[], while x24 (current b word) and x25 (current
// t[0]*n0) are modulo-scheduled across the loops.  rp and &b[num] are
// stashed at [x29,#96]; a (num+4)-limb t[] area is carved off sp and
// wiped before return.  Ends with the usual subtract and constant-time
// conditional copy.  Return-address signing via paciasp/autiasp
// (.inst-encoded for old assemblers).
// -----------------------------------------------------------------------
966.type	__bn_mul4x_mont,%function
967.align	5
968__bn_mul4x_mont:
969.inst	0xd503233f		// paciasp
970	stp	x29,x30,[sp,#-128]!
971	add	x29,sp,#0
972	stp	x19,x20,[sp,#16]
973	stp	x21,x22,[sp,#32]
974	stp	x23,x24,[sp,#48]
975	stp	x25,x26,[sp,#64]
976	stp	x27,x28,[sp,#80]
977
978	sub	x26,sp,x5,lsl#3
979	lsl	x5,x5,#3
980	ldr	x4,[x4]		// *n0
981	sub	sp,x26,#8*4		// alloca
982
983	add	x10,x2,x5
984	add	x27,x1,x5
985	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
986
987	ldr	x24,[x2,#8*0]		// b[0]
988	ldp	x6,x7,[x1,#8*0]	// a[0..3]
989	ldp	x8,x9,[x1,#8*2]
990	add	x1,x1,#8*4
991	mov	x19,xzr
992	mov	x20,xzr
993	mov	x21,xzr
994	mov	x22,xzr
995	ldp	x14,x15,[x3,#8*0]	// n[0..3]
996	ldp	x16,x17,[x3,#8*2]
997	adds	x3,x3,#8*4		// clear carry bit
998	mov	x0,xzr
999	mov	x28,#0
1000	mov	x26,sp
1001
// First pass over the leading 4 limbs: accumulate a[0..3]*b[i] and
// fold in n[0..3]*(t[0]*n0); x0 is the modulo-scheduled carry.
1002.Loop_mul4x_1st_reduction:
1003	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1004	adc	x0,x0,xzr	// modulo-scheduled
1005	mul	x11,x7,x24
1006	add	x28,x28,#8
1007	mul	x12,x8,x24
1008	and	x28,x28,#31
1009	mul	x13,x9,x24
1010	adds	x19,x19,x10
1011	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1012	adcs	x20,x20,x11
1013	mul	x25,x19,x4		// t[0]*n0
1014	adcs	x21,x21,x12
1015	umulh	x11,x7,x24
1016	adcs	x22,x22,x13
1017	umulh	x12,x8,x24
1018	adc	x23,xzr,xzr
1019	umulh	x13,x9,x24
1020	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1021	adds	x20,x20,x10
1022	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1023	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1024	adcs	x21,x21,x11
1025	mul	x11,x15,x25
1026	adcs	x22,x22,x12
1027	mul	x12,x16,x25
1028	adc	x23,x23,x13		// can't overflow
1029	mul	x13,x17,x25
1030	// (*)	adds	xzr,x19,x10
1031	subs	xzr,x19,#1		// (*) same carry trick as in .Lmul_mont
1032	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1033	adcs	x19,x20,x11
1034	umulh	x11,x15,x25
1035	adcs	x20,x21,x12
1036	umulh	x12,x16,x25
1037	adcs	x21,x22,x13
1038	umulh	x13,x17,x25
1039	adcs	x22,x23,x0
1040	adc	x0,xzr,xzr
1041	adds	x19,x19,x10
1042	sub	x10,x27,x1
1043	adcs	x20,x20,x11
1044	adcs	x21,x21,x12
1045	adcs	x22,x22,x13
1046	//adc	x0,x0,xzr
1047	cbnz	x28,.Loop_mul4x_1st_reduction
1048
1049	cbz	x10,.Lmul4x4_post_condition
1050
1051	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1052	ldp	x8,x9,[x1,#8*2]
1053	add	x1,x1,#8*4
1054	ldr	x25,[sp]		// a[0]*n0
1055	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1056	ldp	x16,x17,[x3,#8*2]
1057	add	x3,x3,#8*4
1058
// First-pass tail: extend the b[0..3] passes across the remaining
// limbs of a[] and n[], reusing the t[0]*n0 values saved above.
1059.Loop_mul4x_1st_tail:
1060	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1061	adc	x0,x0,xzr	// modulo-scheduled
1062	mul	x11,x7,x24
1063	add	x28,x28,#8
1064	mul	x12,x8,x24
1065	and	x28,x28,#31
1066	mul	x13,x9,x24
1067	adds	x19,x19,x10
1068	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1069	adcs	x20,x20,x11
1070	umulh	x11,x7,x24
1071	adcs	x21,x21,x12
1072	umulh	x12,x8,x24
1073	adcs	x22,x22,x13
1074	umulh	x13,x9,x24
1075	adc	x23,xzr,xzr
1076	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1077	adds	x20,x20,x10
1078	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
1079	adcs	x21,x21,x11
1080	mul	x11,x15,x25
1081	adcs	x22,x22,x12
1082	mul	x12,x16,x25
1083	adc	x23,x23,x13		// can't overflow
1084	mul	x13,x17,x25
1085	adds	x19,x19,x10
1086	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
1087	adcs	x20,x20,x11
1088	umulh	x11,x15,x25
1089	adcs	x21,x21,x12
1090	umulh	x12,x16,x25
1091	adcs	x22,x22,x13
1092	adcs	x23,x23,x0
1093	umulh	x13,x17,x25
1094	adc	x0,xzr,xzr
1095	ldr	x25,[sp,x28]		// next t[0]*n0
1096	str	x19,[x26],#8		// result!!!
1097	adds	x19,x20,x10
1098	sub	x10,x27,x1		// done yet?
1099	adcs	x20,x21,x11
1100	adcs	x21,x22,x12
1101	adcs	x22,x23,x13
1102	//adc	x0,x0,xzr
1103	cbnz	x28,.Loop_mul4x_1st_tail
1104
1105	sub	x11,x27,x5	// rewinded x1
1106	cbz	x10,.Lmul4x_proceed
1107
1108	ldp	x6,x7,[x1,#8*0]
1109	ldp	x8,x9,[x1,#8*2]
1110	add	x1,x1,#8*4
1111	ldp	x14,x15,[x3,#8*0]
1112	ldp	x16,x17,[x3,#8*2]
1113	add	x3,x3,#8*4
1114	b	.Loop_mul4x_1st_tail
1115
1116.align	5
// Advance to the next 4-word window of b[] and restart the
// reduction/tail pair from rewound a[] and n[].
1117.Lmul4x_proceed:
1118	ldr	x24,[x2,#8*4]!		// *++b
1119	adc	x30,x0,xzr
1120	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1121	sub	x3,x3,x5		// rewind np
1122	ldp	x8,x9,[x11,#8*2]
1123	add	x1,x11,#8*4
1124
1125	stp	x19,x20,[x26,#8*0]	// result!!!
1126	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1127	stp	x21,x22,[x26,#8*2]	// result!!!
1128	ldp	x21,x22,[sp,#8*6]
1129
1130	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1131	mov	x26,sp
1132	ldp	x16,x17,[x3,#8*2]
1133	adds	x3,x3,#8*4		// clear carry bit
1134	mov	x0,xzr
1135
1136.align	4
1137.Loop_mul4x_reduction:
1138	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1139	adc	x0,x0,xzr	// modulo-scheduled
1140	mul	x11,x7,x24
1141	add	x28,x28,#8
1142	mul	x12,x8,x24
1143	and	x28,x28,#31
1144	mul	x13,x9,x24
1145	adds	x19,x19,x10
1146	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1147	adcs	x20,x20,x11
1148	mul	x25,x19,x4		// t[0]*n0
1149	adcs	x21,x21,x12
1150	umulh	x11,x7,x24
1151	adcs	x22,x22,x13
1152	umulh	x12,x8,x24
1153	adc	x23,xzr,xzr
1154	umulh	x13,x9,x24
1155	ldr	x24,[x2,x28]		// next b[i]
1156	adds	x20,x20,x10
1157	// (*)	mul	x10,x14,x25
1158	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1159	adcs	x21,x21,x11
1160	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
1161	adcs	x22,x22,x12
1162	mul	x12,x16,x25
1163	adc	x23,x23,x13		// can't overflow
1164	mul	x13,x17,x25
1165	// (*)	adds	xzr,x19,x10
1166	subs	xzr,x19,#1		// (*)
1167	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
1168	adcs	x19,x20,x11
1169	umulh	x11,x15,x25
1170	adcs	x20,x21,x12
1171	umulh	x12,x16,x25
1172	adcs	x21,x22,x13
1173	umulh	x13,x17,x25
1174	adcs	x22,x23,x0
1175	adc	x0,xzr,xzr
1176	adds	x19,x19,x10
1177	adcs	x20,x20,x11
1178	adcs	x21,x21,x12
1179	adcs	x22,x22,x13
1180	//adc	x0,x0,xzr
1181	cbnz	x28,.Loop_mul4x_reduction
1182
1183	adc	x0,x0,xzr
1184	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1185	ldp	x12,x13,[x26,#8*6]
1186	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1187	ldp	x8,x9,[x1,#8*2]
1188	add	x1,x1,#8*4
1189	adds	x19,x19,x10
1190	adcs	x20,x20,x11
1191	adcs	x21,x21,x12
1192	adcs	x22,x22,x13
1193	//adc	x0,x0,xzr
1194
1195	ldr	x25,[sp]		// t[0]*n0
1196	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1197	ldp	x16,x17,[x3,#8*2]
1198	add	x3,x3,#8*4
1199
1200.align	4
1201.Loop_mul4x_tail:
1202	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1203	adc	x0,x0,xzr	// modulo-scheduled
1204	mul	x11,x7,x24
1205	add	x28,x28,#8
1206	mul	x12,x8,x24
1207	and	x28,x28,#31
1208	mul	x13,x9,x24
1209	adds	x19,x19,x10
1210	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1211	adcs	x20,x20,x11
1212	umulh	x11,x7,x24
1213	adcs	x21,x21,x12
1214	umulh	x12,x8,x24
1215	adcs	x22,x22,x13
1216	umulh	x13,x9,x24
1217	adc	x23,xzr,xzr
1218	ldr	x24,[x2,x28]		// next b[i]
1219	adds	x20,x20,x10
1220	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1221	adcs	x21,x21,x11
1222	mul	x11,x15,x25
1223	adcs	x22,x22,x12
1224	mul	x12,x16,x25
1225	adc	x23,x23,x13		// can't overflow
1226	mul	x13,x17,x25
1227	adds	x19,x19,x10
1228	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1229	adcs	x20,x20,x11
1230	umulh	x11,x15,x25
1231	adcs	x21,x21,x12
1232	umulh	x12,x16,x25
1233	adcs	x22,x22,x13
1234	umulh	x13,x17,x25
1235	adcs	x23,x23,x0
1236	ldr	x25,[sp,x28]		// next a[0]*n0
1237	adc	x0,xzr,xzr
1238	str	x19,[x26],#8		// result!!!
1239	adds	x19,x20,x10
1240	sub	x10,x27,x1		// done yet?
1241	adcs	x20,x21,x11
1242	adcs	x21,x22,x12
1243	adcs	x22,x23,x13
1244	//adc	x0,x0,xzr
1245	cbnz	x28,.Loop_mul4x_tail
1246
1247	sub	x11,x3,x5		// rewinded np?
1248	adc	x0,x0,xzr
1249	cbz	x10,.Loop_mul4x_break
1250
1251	ldp	x10,x11,[x26,#8*4]
1252	ldp	x12,x13,[x26,#8*6]
1253	ldp	x6,x7,[x1,#8*0]
1254	ldp	x8,x9,[x1,#8*2]
1255	add	x1,x1,#8*4
1256	adds	x19,x19,x10
1257	adcs	x20,x20,x11
1258	adcs	x21,x21,x12
1259	adcs	x22,x22,x13
1260	//adc	x0,x0,xzr
1261	ldp	x14,x15,[x3,#8*0]
1262	ldp	x16,x17,[x3,#8*2]
1263	add	x3,x3,#8*4
1264	b	.Loop_mul4x_tail
1265
1266.align	4
// End of a b-window: store the top limbs, fold in the previous
// top-most carry (x30), and loop back unless b[] is exhausted.
1267.Loop_mul4x_break:
1268	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
1269	adds	x19,x19,x30
1270	add	x2,x2,#8*4		// bp++
1271	adcs	x20,x20,xzr
1272	sub	x1,x1,x5		// rewind ap
1273	adcs	x21,x21,xzr
1274	stp	x19,x20,[x26,#8*0]	// result!!!
1275	adcs	x22,x22,xzr
1276	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1277	adc	x30,x0,xzr
1278	stp	x21,x22,[x26,#8*2]	// result!!!
1279	cmp	x2,x13			// done yet?
1280	ldp	x21,x22,[sp,#8*6]
1281	ldp	x14,x15,[x11,#8*0]	// n[0..3]
1282	ldp	x16,x17,[x11,#8*2]
1283	add	x3,x11,#8*4
1284	b.eq	.Lmul4x_post
1285
1286	ldr	x24,[x2]
1287	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1288	ldp	x8,x9,[x1,#8*2]
1289	adds	x1,x1,#8*4		// clear carry bit
1290	mov	x0,xzr
1291	mov	x26,sp
1292	b	.Loop_mul4x_reduction
1293
1294.align	4
1295.Lmul4x_post:
1296	// Final step. We see if result is larger than modulus, and
1297	// if it is, subtract the modulus. But comparison implies
1298	// subtraction. So we subtract modulus, see if it borrowed,
1299	// and conditionally copy original value.
1300	mov	x0,x12
1301	mov	x27,x12		// x0 copy
1302	subs	x10,x19,x14
1303	add	x26,sp,#8*8
1304	sbcs	x11,x20,x15
1305	sub	x28,x5,#8*4
1306
1307.Lmul4x_sub:
1308	sbcs	x12,x21,x16
1309	ldp	x14,x15,[x3,#8*0]
1310	sub	x28,x28,#8*4
1311	ldp	x19,x20,[x26,#8*0]
1312	sbcs	x13,x22,x17
1313	ldp	x16,x17,[x3,#8*2]
1314	add	x3,x3,#8*4
1315	ldp	x21,x22,[x26,#8*2]
1316	add	x26,x26,#8*4
1317	stp	x10,x11,[x0,#8*0]
1318	sbcs	x10,x19,x14
1319	stp	x12,x13,[x0,#8*2]
1320	add	x0,x0,#8*4
1321	sbcs	x11,x20,x15
1322	cbnz	x28,.Lmul4x_sub
1323
1324	sbcs	x12,x21,x16
1325	mov	x26,sp
1326	add	x1,sp,#8*4
1327	ldp	x6,x7,[x27,#8*0]
1328	sbcs	x13,x22,x17
1329	stp	x10,x11,[x0,#8*0]
1330	ldp	x8,x9,[x27,#8*2]
1331	stp	x12,x13,[x0,#8*2]
1332	ldp	x19,x20,[x1,#8*0]
1333	ldp	x21,x22,[x1,#8*2]
1334	sbcs	xzr,x30,xzr	// did it borrow?
1335	ldr	x30,[x29,#8]		// pull return address
1336
1337	sub	x28,x5,#8*4
// Constant-time select between result and result-modulus, wiping the
// t[] scratch area as we go.
1338.Lmul4x_cond_copy:
1339	sub	x28,x28,#8*4
1340	csel	x10,x19,x6,lo
1341	stp	xzr,xzr,[x26,#8*0]
1342	csel	x11,x20,x7,lo
1343	ldp	x6,x7,[x27,#8*4]
1344	ldp	x19,x20,[x1,#8*4]
1345	csel	x12,x21,x8,lo
1346	stp	xzr,xzr,[x26,#8*2]
1347	add	x26,x26,#8*4
1348	csel	x13,x22,x9,lo
1349	ldp	x8,x9,[x27,#8*6]
1350	ldp	x21,x22,[x1,#8*6]
1351	add	x1,x1,#8*4
1352	stp	x10,x11,[x27,#8*0]
1353	stp	x12,x13,[x27,#8*2]
1354	add	x27,x27,#8*4
1355	cbnz	x28,.Lmul4x_cond_copy
1356
1357	csel	x10,x19,x6,lo
1358	stp	xzr,xzr,[x26,#8*0]
1359	csel	x11,x20,x7,lo
1360	stp	xzr,xzr,[x26,#8*2]
1361	csel	x12,x21,x8,lo
1362	stp	xzr,xzr,[x26,#8*3]
1363	csel	x13,x22,x9,lo
1364	stp	xzr,xzr,[x26,#8*4]
1365	stp	x10,x11,[x27,#8*0]
1366	stp	x12,x13,[x27,#8*2]
1367
1368	b	.Lmul4x_done
1369
1370.align	4
// Special case reached when num==4: the whole result is still in
// registers, so subtract/select happens without a memory loop.
1371.Lmul4x4_post_condition:
1372	adc	x0,x0,xzr
1373	ldr	x1,[x29,#96]		// pull rp
1374	// x19-3,x0 hold result, x14-7 hold modulus
1375	subs	x6,x19,x14
1376	ldr	x30,[x29,#8]		// pull return address
1377	sbcs	x7,x20,x15
1378	stp	xzr,xzr,[sp,#8*0]
1379	sbcs	x8,x21,x16
1380	stp	xzr,xzr,[sp,#8*2]
1381	sbcs	x9,x22,x17
1382	stp	xzr,xzr,[sp,#8*4]
1383	sbcs	xzr,x0,xzr		// did it borrow?
1384	stp	xzr,xzr,[sp,#8*6]
1385
1386	// x6-3 hold result-modulus
1387	csel	x6,x19,x6,lo
1388	csel	x7,x20,x7,lo
1389	csel	x8,x21,x8,lo
1390	csel	x9,x22,x9,lo
1391	stp	x6,x7,[x1,#8*0]
1392	stp	x8,x9,[x1,#8*2]
1393
1394.Lmul4x_done:
1395	ldp	x19,x20,[x29,#16]
1396	mov	sp,x29
1397	ldp	x21,x22,[x29,#32]
1398	mov	x0,#1
1399	ldp	x23,x24,[x29,#48]
1400	ldp	x25,x26,[x29,#64]
1401	ldp	x27,x28,[x29,#80]
1402	ldr	x29,[sp],#128
1403.inst	0xd50323bf		// autiasp
1404	ret
1405.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1406.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1407.align	2
1408.align	4
1409