xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/k8/sqr_basecase.asm (revision 70f7362772ba52b749c976fb5e86e39a8b2c9afc)
1dnl  AMD64 mpn_sqr_basecase.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C The inner loops of this code are the result of running a code generation and
36C optimization tool suite written by David Harvey and Torbjorn Granlund.
37
38C NOTES
39C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
40C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
41C     code which uses addmul_2s from the start, conditionally leaving a 1x1
42C     multiply to the end.  (In assembly code, one would stop invoking
43C     multiply to the end.  (In assembly code, one would stop invoking the
44C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
45C     save/restore carry, instead it can propagate into the high product word.
46C   * Align more labels, should shave off a few cycles.
47C   * We can safely use 32-bit size operations, since operands with (2^32)
48C     limbs will lead to non-termination in practice.
49C   * The jump table could probably be optimized, at least for non-pic.
50C   * The special code for n <= 4 was quickly written.  It is probably too
51C     large and unnecessarily slow.
52C   * Consider combining small cases code so that the n=k-1 code jumps into the
53C     middle of the n=k code.
54C   * Avoid saving registers for small cases code.
55C   * Needed variables:
56C    n   r11  input size
57C    i   r8   work left, initially n
58C    j   r9   inner loop count
59C        r15  unused
60C    v0  r13
61C    v1  r14
62C    rp  rdi
63C    up  rsi
64C    w0  rbx
65C    w1  rcx
66C    w2  rbp
67C    w3  r10
68C    tp  r12
69C    lo  rax
70C    hi  rdx
71C        rsp
72
73C INPUT PARAMETERS
74define(`rp',	  `%rdi')
75define(`up',	  `%rsi')
76define(`n_param', `%rdx')
77
78define(`n',	`%r11')
79define(`tp',	`%r12')
80define(`i',	`%r8')
81define(`j',	`%r9')
82define(`v0',	`%r13')
83define(`v1',	`%r14')
84define(`w0',	`%rbx')
85define(`w1',	`%rcx')
86define(`w2',	`%rbp')
87define(`w3',	`%r10')
88
89ABI_SUPPORT(DOS64)
90ABI_SUPPORT(STD64)
91
92ASM_START()
93	TEXT
94	ALIGN(16)
95PROLOGUE(mpn_sqr_basecase)
96	FUNC_ENTRY(3)
97	mov	R32(n_param), R32(%rcx)
98	mov	R32(n_param), R32(n)		C free original n register (rdx)
99
100	add	$-40, %rsp
101
102	and	$3, R32(%rcx)
103	cmp	$4, R32(n_param)
104	lea	4(%rcx), %r8
105
106	mov	%rbx, 32(%rsp)
107	mov	%rbp, 24(%rsp)
108	mov	%r12, 16(%rsp)
109	mov	%r13, 8(%rsp)
110	mov	%r14, (%rsp)
111
112	cmovg	%r8, %rcx
113
114	lea	L(tab)(%rip), %rax
115ifdef(`PIC',
116`	movslq	(%rax,%rcx,4), %r10
117	add	%r10, %rax
118	jmp	*%rax
119',`
120	jmp	*(%rax,%rcx,8)
121')
122	JUMPTABSECT
123	ALIGN(8)
124L(tab):	JMPENT(	L(4), L(tab))
125	JMPENT(	L(1), L(tab))
126	JMPENT(	L(2), L(tab))
127	JMPENT(	L(3), L(tab))
128	JMPENT(	L(0m4), L(tab))
129	JMPENT(	L(1m4), L(tab))
130	JMPENT(	L(2m4), L(tab))
131	JMPENT(	L(3m4), L(tab))
132	TEXT
133
134L(1):	mov	(up), %rax
135	mul	%rax
136	add	$40, %rsp
137	mov	%rax, (rp)
138	mov	%rdx, 8(rp)
139	FUNC_EXIT()
140	ret
141
142L(2):	mov	(up), %rax
143	mov	%rax, %r8
144	mul	%rax
145	mov	8(up), %r11
146	mov	%rax, (rp)
147	mov	%r11, %rax
148	mov	%rdx, %r9
149	mul	%rax
150	add	$40, %rsp
151	mov	%rax, %r10
152	mov	%r11, %rax
153	mov	%rdx, %r11
154	mul	%r8
155	xor	%r8, %r8
156	add	%rax, %r9
157	adc	%rdx, %r10
158	adc	%r8, %r11
159	add	%rax, %r9
160	mov	%r9, 8(rp)
161	adc	%rdx, %r10
162	mov	%r10, 16(rp)
163	adc	%r8, %r11
164	mov	%r11, 24(rp)
165	FUNC_EXIT()
166	ret
167
168L(3):	mov	(up), %rax
169	mov	%rax, %r10
170	mul	%rax
171	mov	8(up), %r11
172	mov	%rax, (rp)
173	mov	%r11, %rax
174	mov	%rdx, 8(rp)
175	mul	%rax
176	mov	16(up), %rcx
177	mov	%rax, 16(rp)
178	mov	%rcx, %rax
179	mov	%rdx, 24(rp)
180	mul	%rax
181	mov	%rax, 32(rp)
182	mov	%rdx, 40(rp)
183
184	mov	%r11, %rax
185	mul	%r10
186	mov	%rax, %r8
187	mov	%rcx, %rax
188	mov	%rdx, %r9
189	mul	%r10
190	xor	%r10, %r10
191	add	%rax, %r9
192	mov	%r11, %rax
193	mov	%r10, %r11
194	adc	%rdx, %r10
195
196	mul	%rcx
197	add	$40, %rsp
198	add	%rax, %r10
199	adc	%r11, %rdx
200	add	%r8, %r8
201	adc	%r9, %r9
202	adc	%r10, %r10
203	adc	%rdx, %rdx
204	adc	%r11, %r11
205	add	%r8, 8(rp)
206	adc	%r9, 16(rp)
207	adc	%r10, 24(rp)
208	adc	%rdx, 32(rp)
209	adc	%r11, 40(rp)
210	FUNC_EXIT()
211	ret
212
213L(4):	mov	(up), %rax
214	mov	%rax, %r11
215	mul	%rax
216	mov	8(up), %rbx
217	mov	%rax, (rp)
218	mov	%rbx, %rax
219	mov	%rdx, 8(rp)
220	mul	%rax
221	mov	%rax, 16(rp)
222	mov	%rdx, 24(rp)
223	mov	16(up), %rax
224	mul	%rax
225	mov	%rax, 32(rp)
226	mov	%rdx, 40(rp)
227	mov	24(up), %rax
228	mul	%rax
229	mov	%rax, 48(rp)
230	mov	%rbx, %rax
231	mov	%rdx, 56(rp)
232
233	mul	%r11
234	add	$32, %rsp
235	mov	%rax, %r8
236	mov	%rdx, %r9
237	mov	16(up), %rax
238	mul	%r11
239	xor	%r10, %r10
240	add	%rax, %r9
241	adc	%rdx, %r10
242	mov	24(up), %rax
243	mul	%r11
244	xor	%r11, %r11
245	add	%rax, %r10
246	adc	%rdx, %r11
247	mov	16(up), %rax
248	mul	%rbx
249	xor	%rcx, %rcx
250	add	%rax, %r10
251	adc	%rdx, %r11
252	adc	$0, %rcx
253	mov	24(up), %rax
254	mul	%rbx
255	pop	%rbx
256	add	%rax, %r11
257	adc	%rdx, %rcx
258	mov	16(up), %rdx
259	mov	24(up), %rax
260	mul	%rdx
261	add	%rax, %rcx
262	adc	$0, %rdx
263
264	add	%r8, %r8
265	adc	%r9, %r9
266	adc	%r10, %r10
267	adc	%r11, %r11
268	adc	%rcx, %rcx
269	mov	$0, R32(%rax)
270	adc	%rdx, %rdx
271
272	adc	%rax, %rax
273	add	%r8, 8(rp)
274	adc	%r9, 16(rp)
275	adc	%r10, 24(rp)
276	adc	%r11, 32(rp)
277	adc	%rcx, 40(rp)
278	adc	%rdx, 48(rp)
279	adc	%rax, 56(rp)
280	FUNC_EXIT()
281	ret
282
283
284L(0m4):
285	lea	-16(rp,n,8), tp		C point tp in middle of result operand
286	mov	(up), v0
287	mov	8(up), %rax
288	lea	(up,n,8), up		C point up at end of input operand
289
290	lea	-4(n), i
291C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
292	xor	R32(j), R32(j)
293	sub	n, j
294
295	mul	v0
296	xor	R32(w2), R32(w2)
297	mov	%rax, w0
298	mov	16(up,j,8), %rax
299	mov	%rdx, w3
300	jmp	L(L3)
301
302	ALIGN(16)
303L(mul_1_m3_top):
304	add	%rax, w2
305	mov	w3, (tp,j,8)
306	mov	(up,j,8), %rax
307	adc	%rdx, w1
308	xor	R32(w0), R32(w0)
309	mul	v0
310	xor	R32(w3), R32(w3)
311	mov	w2, 8(tp,j,8)
312	add	%rax, w1
313	adc	%rdx, w0
314	mov	8(up,j,8), %rax
315	mov	w1, 16(tp,j,8)
316	xor	R32(w2), R32(w2)
317	mul	v0
318	add	%rax, w0
319	mov	16(up,j,8), %rax
320	adc	%rdx, w3
321L(L3):	xor	R32(w1), R32(w1)
322	mul	v0
323	add	%rax, w3
324	mov	24(up,j,8), %rax
325	adc	%rdx, w2
326	mov	w0, 24(tp,j,8)
327	mul	v0
328	add	$4, j
329	js	L(mul_1_m3_top)
330
331	add	%rax, w2
332	mov	w3, (tp)
333	adc	%rdx, w1
334	mov	w2, 8(tp)
335	mov	w1, 16(tp)
336
337	lea	eval(2*8)(tp), tp	C tp += 2
338	lea	-8(up), up
339	jmp	L(dowhile)
340
341
342L(1m4):
343	lea	8(rp,n,8), tp		C point tp in middle of result operand
344	mov	(up), v0		C u0
345	mov	8(up), %rax		C u1
346	lea	8(up,n,8), up		C point up at end of input operand
347
348	lea	-3(n), i
349C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
350	lea	-3(n), j
351	neg	j
352
353	mov	%rax, v1		C u1
354	mul	v0			C u0 * u1
355	mov	%rdx, w1
356	xor	R32(w2), R32(w2)
357	mov	%rax, 8(rp)
358	jmp	L(m0)
359
360	ALIGN(16)
361L(mul_2_m0_top):
362	mul	v1
363	add	%rax, w0
364	adc	%rdx, w1
365	mov	-24(up,j,8), %rax
366	mov	$0, R32(w2)
367	mul	v0
368	add	%rax, w0
369	mov	-24(up,j,8), %rax
370	adc	%rdx, w1
371	adc	$0, R32(w2)
372	mul	v1			C v1 * u0
373	add	%rax, w1
374	mov	w0, -24(tp,j,8)
375	adc	%rdx, w2
376L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
377	mul	v0			C u0 * u2
378	mov	$0, R32(w3)
379	add	%rax, w1
380	adc	%rdx, w2
381	mov	-16(up,j,8), %rax
382	adc	$0, R32(w3)
383	mov	$0, R32(w0)
384	mov	w1, -16(tp,j,8)
385	mul	v1
386	add	%rax, w2
387	mov	-8(up,j,8), %rax
388	adc	%rdx, w3
389	mov	$0, R32(w1)
390	mul	v0
391	add	%rax, w2
392	mov	-8(up,j,8), %rax
393	adc	%rdx, w3
394	adc	$0, R32(w0)
395	mul	v1
396	add	%rax, w3
397	mov	w2, -8(tp,j,8)
398	adc	%rdx, w0
399L(m2x):	mov	(up,j,8), %rax
400	mul	v0
401	add	%rax, w3
402	adc	%rdx, w0
403	adc	$0, R32(w1)
404	add	$4, j
405	mov	-32(up,j,8), %rax
406	mov	w3, -32(tp,j,8)
407	js	L(mul_2_m0_top)
408
409	mul	v1
410	add	%rax, w0
411	adc	%rdx, w1
412	mov	w0, -8(tp)
413	mov	w1, (tp)
414
415	lea	-16(up), up
416	lea	eval(3*8-24)(tp), tp	C tp += 3
417	jmp	L(dowhile_end)
418
419
420L(2m4):
421	lea	-16(rp,n,8), tp		C point tp in middle of result operand
422	mov	(up), v0
423	mov	8(up), %rax
424	lea	(up,n,8), up		C point up at end of input operand
425
426	lea	-4(n), i
427C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
428	lea	-2(n), j
429	neg	j
430
431	mul	v0
432	mov	%rax, w2
433	mov	(up,j,8), %rax
434	mov	%rdx, w1
435	jmp	L(L1)
436
437	ALIGN(16)
438L(mul_1_m1_top):
439	add	%rax, w2
440	mov	w3, (tp,j,8)
441	mov	(up,j,8), %rax
442	adc	%rdx, w1
443L(L1):	xor	R32(w0), R32(w0)
444	mul	v0
445	xor	R32(w3), R32(w3)
446	mov	w2, 8(tp,j,8)
447	add	%rax, w1
448	adc	%rdx, w0
449	mov	8(up,j,8), %rax
450	mov	w1, 16(tp,j,8)
451	xor	R32(w2), R32(w2)
452	mul	v0
453	add	%rax, w0
454	mov	16(up,j,8), %rax
455	adc	%rdx, w3
456	xor	R32(w1), R32(w1)
457	mul	v0
458	add	%rax, w3
459	mov	24(up,j,8), %rax
460	adc	%rdx, w2
461	mov	w0, 24(tp,j,8)
462	mul	v0
463	add	$4, j
464	js	L(mul_1_m1_top)
465
466	add	%rax, w2
467	mov	w3, (tp)
468	adc	%rdx, w1
469	mov	w2, 8(tp)
470	mov	w1, 16(tp)
471
472	lea	eval(2*8)(tp), tp	C tp += 2
473	lea	-8(up), up
474	jmp	L(dowhile_mid)
475
476
477L(3m4):
478	lea	8(rp,n,8), tp		C point tp in middle of result operand
479	mov	(up), v0		C u0
480	mov	8(up), %rax		C u1
481	lea	8(up,n,8), up		C point up at end of input operand
482
483	lea	-5(n), i
484C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
485	lea	-1(n), j
486	neg	j
487
488	mov	%rax, v1		C u1
489	mul	v0			C u0 * u1
490	mov	%rdx, w3
491	xor	R32(w0), R32(w0)
492	xor	R32(w1), R32(w1)
493	mov	%rax, 8(rp)
494	jmp	L(m2)
495
496	ALIGN(16)
497L(mul_2_m2_top):
498	mul	v1
499	add	%rax, w0
500	adc	%rdx, w1
501	mov	-24(up,j,8), %rax
502	mov	$0, R32(w2)
503	mul	v0
504	add	%rax, w0
505	mov	-24(up,j,8), %rax
506	adc	%rdx, w1
507	adc	$0, R32(w2)
508	mul	v1			C v1 * u0
509	add	%rax, w1
510	mov	w0, -24(tp,j,8)
511	adc	%rdx, w2
512	mov	-16(up,j,8), %rax
513	mul	v0
514	mov	$0, R32(w3)
515	add	%rax, w1
516	adc	%rdx, w2
517	mov	-16(up,j,8), %rax
518	adc	$0, R32(w3)
519	mov	$0, R32(w0)
520	mov	w1, -16(tp,j,8)
521	mul	v1
522	add	%rax, w2
523	mov	-8(up,j,8), %rax
524	adc	%rdx, w3
525	mov	$0, R32(w1)
526	mul	v0
527	add	%rax, w2
528	mov	-8(up,j,8), %rax
529	adc	%rdx, w3
530	adc	$0, R32(w0)
531	mul	v1
532	add	%rax, w3
533	mov	w2, -8(tp,j,8)
534	adc	%rdx, w0
535L(m2):	mov	(up,j,8), %rax
536	mul	v0
537	add	%rax, w3
538	adc	%rdx, w0
539	adc	$0, R32(w1)
540	add	$4, j
541	mov	-32(up,j,8), %rax
542	mov	w3, -32(tp,j,8)
543	js	L(mul_2_m2_top)
544
545	mul	v1
546	add	%rax, w0
547	adc	%rdx, w1
548	mov	w0, -8(tp)
549	mov	w1, (tp)
550
551	lea	-16(up), up
552	jmp	L(dowhile_mid)
553
554L(dowhile):
555C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
556	lea	4(i), j
557	neg	j
558
559	mov	16(up,j,8), v0
560	mov	24(up,j,8), v1
561	mov	24(up,j,8), %rax
562	mul	v0
563	xor	R32(w3), R32(w3)
564	add	%rax, 24(tp,j,8)
565	adc	%rdx, w3
566	xor	R32(w0), R32(w0)
567	xor	R32(w1), R32(w1)
568	jmp	L(am2)
569
570	ALIGN(16)
571L(addmul_2_m2_top):
572	add	w3, (tp,j,8)
573	adc	%rax, w0
574	mov	8(up,j,8), %rax
575	adc	%rdx, w1
576	mov	$0, R32(w2)
577	mul	v0
578	add	%rax, w0
579	mov	8(up,j,8), %rax
580	adc	%rdx, w1
581	adc	$0, R32(w2)
582	mul	v1				C v1 * u0
583	add	w0, 8(tp,j,8)
584	adc	%rax, w1
585	adc	%rdx, w2
586	mov	16(up,j,8), %rax
587	mov	$0, R32(w3)
588	mul	v0				C v0 * u1
589	add	%rax, w1
590	mov	16(up,j,8), %rax
591	adc	%rdx, w2
592	adc	$0, R32(w3)
593	mul	v1				C v1 * u1
594	add	w1, 16(tp,j,8)
595	adc	%rax, w2
596	mov	24(up,j,8), %rax
597	adc	%rdx, w3
598	mul	v0
599	mov	$0, R32(w0)
600	add	%rax, w2
601	adc	%rdx, w3
602	mov	$0, R32(w1)
603	mov	24(up,j,8), %rax
604	adc	$0, R32(w0)
605	mul	v1
606	add	w2, 24(tp,j,8)
607	adc	%rax, w3
608	adc	%rdx, w0
609L(am2):	mov	32(up,j,8), %rax
610	mul	v0
611	add	%rax, w3
612	mov	32(up,j,8), %rax
613	adc	%rdx, w0
614	adc	$0, R32(w1)
615	mul	v1
616	add	$4, j
617	js	L(addmul_2_m2_top)
618
619	add	w3, (tp)
620	adc	%rax, w0
621	adc	%rdx, w1
622	mov	w0, 8(tp)
623	mov	w1, 16(tp)
624
625	lea	eval(2*8)(tp), tp	C tp += 2
626
627	add	$-2, R32(i)		C i -= 2
628
629L(dowhile_mid):
630C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
631	lea	2(i), j
632	neg	j
633
634	mov	(up,j,8), v0
635	mov	8(up,j,8), v1
636	mov	8(up,j,8), %rax
637	mul	v0
638	xor	R32(w1), R32(w1)
639	add	%rax, 8(tp,j,8)
640	adc	%rdx, w1
641	xor	R32(w2), R32(w2)
642	jmp	L(20)
643
644	ALIGN(16)
645L(addmul_2_m0_top):
646	add	w3, (tp,j,8)
647	adc	%rax, w0
648	mov	8(up,j,8), %rax
649	adc	%rdx, w1
650	mov	$0, R32(w2)
651	mul	v0
652	add	%rax, w0
653	mov	8(up,j,8), %rax
654	adc	%rdx, w1
655	adc	$0, R32(w2)
656	mul	v1				C v1 * u0
657	add	w0, 8(tp,j,8)
658	adc	%rax, w1
659	adc	%rdx, w2
660L(20):	mov	16(up,j,8), %rax
661	mov	$0, R32(w3)
662	mul	v0				C v0 * u1
663	add	%rax, w1
664	mov	16(up,j,8), %rax
665	adc	%rdx, w2
666	adc	$0, R32(w3)
667	mul	v1				C v1 * u1
668	add	w1, 16(tp,j,8)
669	adc	%rax, w2
670	mov	24(up,j,8), %rax
671	adc	%rdx, w3
672	mul	v0
673	mov	$0, R32(w0)
674	add	%rax, w2
675	adc	%rdx, w3
676	mov	$0, R32(w1)
677	mov	24(up,j,8), %rax
678	adc	$0, R32(w0)
679	mul	v1
680	add	w2, 24(tp,j,8)
681	adc	%rax, w3
682	adc	%rdx, w0
683	mov	32(up,j,8), %rax
684	mul	v0
685	add	%rax, w3
686	mov	32(up,j,8), %rax
687	adc	%rdx, w0
688	adc	$0, R32(w1)
689	mul	v1
690	add	$4, j
691	js	L(addmul_2_m0_top)
692
693	add	w3, (tp)
694	adc	%rax, w0
695	adc	%rdx, w1
696	mov	w0, 8(tp)
697	mov	w1, 16(tp)
698
699	lea	eval(2*8)(tp), tp	C tp += 2
700L(dowhile_end):
701
702	add	$-2, R32(i)		C i -= 2
703	jne	L(dowhile)
704
705C Function mpn_addmul_2s_2
706	mov	-16(up), v0
707	mov	-8(up), v1
708	mov	-8(up), %rax
709	mul	v0
710	xor	R32(w3), R32(w3)
711	add	%rax, -8(tp)
712	adc	%rdx, w3
713	xor	R32(w0), R32(w0)
714	xor	R32(w1), R32(w1)
715	mov	(up), %rax
716	mul	v0
717	add	%rax, w3
718	mov	(up), %rax
719	adc	%rdx, w0
720	mul	v1
721	add	w3, (tp)
722	adc	%rax, w0
723	adc	%rdx, w1
724	mov	w0, 8(tp)
725	mov	w1, 16(tp)
726
727C Function mpn_sqr_diag_addlsh1
728	lea	-4(n,n), j
729
730	mov	8(rp), %r11
731	lea	-8(up), up
732	lea	(rp,j,8), rp
733	neg	j
734	mov	(up,j,4), %rax
735	mul	%rax
736	test	$2, R8(j)
737	jnz	L(odd)
738
739L(evn):	add	%r11, %r11
740	sbb	R32(%rbx), R32(%rbx)		C save CF
741	add	%rdx, %r11
742	mov	%rax, (rp,j,8)
743	jmp	L(d0)
744
745L(odd):	add	%r11, %r11
746	sbb	R32(%rbp), R32(%rbp)		C save CF
747	add	%rdx, %r11
748	mov	%rax, (rp,j,8)
749	lea	-2(j), j
750	jmp	L(d1)
751
752	ALIGN(16)
753L(top):	mov	(up,j,4), %rax
754	mul	%rax
755	add	R32(%rbp), R32(%rbp)		C restore carry
756	adc	%rax, %r10
757	adc	%rdx, %r11
758	mov	%r10, (rp,j,8)
759L(d0):	mov	%r11, 8(rp,j,8)
760	mov	16(rp,j,8), %r10
761	adc	%r10, %r10
762	mov	24(rp,j,8), %r11
763	adc	%r11, %r11
764	nop
765	sbb	R32(%rbp), R32(%rbp)		C save CF
766	mov	8(up,j,4), %rax
767	mul	%rax
768	add	R32(%rbx), R32(%rbx)		C restore carry
769	adc	%rax, %r10
770	adc	%rdx, %r11
771	mov	%r10, 16(rp,j,8)
772L(d1):	mov	%r11, 24(rp,j,8)
773	mov	32(rp,j,8), %r10
774	adc	%r10, %r10
775	mov	40(rp,j,8), %r11
776	adc	%r11, %r11
777	sbb	R32(%rbx), R32(%rbx)		C save CF
778	add	$4, j
779	js	L(top)
780
781	mov	(up), %rax
782	mul	%rax
783	add	R32(%rbp), R32(%rbp)		C restore carry
784	adc	%rax, %r10
785	adc	%rdx, %r11
786	mov	%r10, (rp)
787	mov	%r11, 8(rp)
788	mov	16(rp), %r10
789	adc	%r10, %r10
790	sbb	R32(%rbp), R32(%rbp)		C save CF
791	neg	R32(%rbp)
792	mov	8(up), %rax
793	mul	%rax
794	add	R32(%rbx), R32(%rbx)		C restore carry
795	adc	%rax, %r10
796	adc	%rbp, %rdx
797	mov	%r10, 16(rp)
798	mov	%rdx, 24(rp)
799
800	pop	%r14
801	pop	%r13
802	pop	%r12
803	pop	%rbp
804	pop	%rbx
805	FUNC_EXIT()
806	ret
807EPILOGUE()
808