dnl  AMD64 mpn_mul_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.375
C AMD K10	 2.375
C Intel P4	15-16
C Intel core2	 4.45
C Intel corei	 4.35
C Intel atom	 ?
C VIA nano	 4.5

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Use fewer registers.  (how??? I can't see it -- david)
C  * Avoid some "mov $0,r" and instead use "xor r,r".  (Only where flags are
C    dead: xor clobbers the carry flag, so it cannot sit inside an adc chain.)
50C  * Can the top of each L(addmul_outer_n) prologue be folded into the
51C    mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
52C    case where vn = 1 or 2; is it worth it?
53
54C INPUT PARAMETERS
55define(`rp',      `%rdi')
56define(`up',      `%rsi')
57define(`un_param',`%rdx')
58define(`vp',      `%rcx')
59define(`vn',      `%r8')
60
61define(`v0', `%r12')
62define(`v1', `%r9')
63
64define(`w0', `%rbx')
65define(`w1', `%r15')
66define(`w2', `%rbp')
67define(`w3', `%r10')
68
69define(`n',  `%r11')
70define(`outer_addr', `%r14')
71define(`un',  `%r13')
72
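C For orientation: the function runs one pass of mul_1 (vn odd) or mul_2
C (vn even) first, then addmul_2 passes consume the remaining v limbs two
C at a time.  A rough C model of that driver follows (illustrative only,
C not GMP's actual code; "limb" and "dlimb" are assumed LP64/GCC-style
C types, and the helpers are sketched at the corresponding sections below):
C
C	typedef unsigned long limb;		/* assumed 64-bit limb */
C	typedef unsigned __int128 dlimb;	/* assumed double limb */
C
C	void mul_basecase (limb *rp, const limb *up, long un,
C			   const limb *vp, long vn)
C	{
C	  if (vn & 1) {				/* vn odd: one mul_1 pass */
C	    rp[un] = mul_1 (rp, up, un, vp[0]);	/* writes rp[0..un] */
C	    rp += 1; vp += 1; vn -= 1;
C	  } else {				/* vn even: one mul_2 pass */
C	    mul_2 (rp, up, un, vp);		/* writes rp[0..un+1] */
C	    rp += 2; vp += 2; vn -= 2;
C	  }
C	  while (vn > 0) {			/* two v limbs per pass */
C	    addmul_2 (rp, up, un, vp);		/* extends rp[] by 2 limbs */
C	    rp += 2; vp += 2; vn -= 2;
C	  }
C	}
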
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	xor	R32(un), R32(un)
	mov	(up), %rax
	mov	(vp), v0

	sub	un_param, un		C rdx used by mul
	mov	un, n
	mov	R32(un_param), R32(w0)

	lea	(rp,un_param,8), rp
	lea	(up,un_param,8), up

	mul	v0

	test	$1, R8(vn)
	jz	L(mul_2)

C ===========================================================
C     mul_1 for vp[0] if vn is odd

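C The loop below is 4-way unrolled; the prologues dispatch on un_param
C mod 4 into one of four entry points, and n climbs from a negative index
C toward zero so that "add $4, n / js" is the only loop control.  As a
C rough C model of the whole pass (illustrative only; types as above):
C
C	static limb mul_1 (limb *rp, const limb *up, long n, limb v0)
C	{
C	  limb cy = 0;
C	  for (long i = 0; i < n; i++) {
C	    dlimb t = (dlimb) up[i] * v0 + cy;	/* 64x64 -> 128 bit mul */
C	    rp[i] = (limb) t;
C	    cy = (limb) (t >> 64);		/* high half carries on */
C	  }
C	  return cy;				/* caller stores as rp[n] */
C	}
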
L(mul_1):
	and	$3, R32(w0)
	jz	L(mul_1_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)

L(mul_1_prologue_3):
	add	$-1, n
	lea	L(addmul_outer_3)(%rip), outer_addr
	mov	%rax, w3
	mov	%rdx, w0
	jmp	L(mul_1_entry_3)

L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note: already w0 == 0
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

L(mul_1_prologue_1):
	cmp	$-1, un
	jne	2f
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(ret)
2:	add	$1, n
	lea	L(addmul_outer_1)(%rip), outer_addr
	mov	%rax, w1
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	mov	(up,n,8), %rax
	jmp	L(mul_1_entry_1)

L(mul_1_prologue_2):
	add	$-2, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	xor	R32(w3), R32(w3)
	jmp	L(mul_1_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	xor	R32(w0), R32(w0)
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	xor	R32(w2), R32(w2)	C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)

	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	adc	%rdx, w2
	mov	w2, (rp)

	add	$-1, vn			C vn -= 1
	jz	L(ret)

	mov	8(vp), v0
	mov	16(vp), v1

	lea	8(vp), vp		C vp += 1
	lea	8(rp), rp		C rp += 1

	jmp	*outer_addr

C ===========================================================
C     mul_2 for vp[0], vp[1] if vn is even

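C Each mul_2 pass folds two v limbs into a single traversal of up[],
C rotating the roles of w0..w3 as a two-limb carry window.  As a rough
C C model (illustrative only; types as above):
C
C	static void mul_2 (limb *rp, const limb *up, long n, const limb *vp)
C	{
C	  limb c0 = 0, c1 = 0;			/* two-limb carry window */
C	  for (long i = 0; i < n; i++) {
C	    dlimb t0 = (dlimb) up[i] * vp[0] + c0;
C	    dlimb t1 = (dlimb) up[i] * vp[1] + c1 + (limb) (t0 >> 64);
C	    rp[i] = (limb) t0;
C	    c0 = (limb) t1;
C	    c1 = (limb) (t1 >> 64);
C	  }
C	  rp[n] = c0;				/* two fresh high limbs */
C	  rp[n + 1] = c1;
C	}
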
	ALIGN(16)
L(mul_2):
	mov	8(vp), v1

	and	$3, R32(w0)
	jz	L(mul_2_prologue_0)
	cmp	$2, R32(w0)
	jz	L(mul_2_prologue_2)
	jc	L(mul_2_prologue_1)

L(mul_2_prologue_3):
	lea	L(addmul_outer_3)(%rip), outer_addr
	add	$2, n
	mov	%rax, -16(rp,n,8)
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	mov	-16(up,n,8), %rax
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	add	$3, n
	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	-24(up,n,8), %rax
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	add	$1, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	$0, R32(w0)
	mov	$0, R32(w1)
	mov	%rax, w2
	mov	-8(up,n,8), %rax
	mov	%rdx, w3
	jmp	L(mul_2_entry_2)

	C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_2_top):
	mov	-32(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(mul_2_entry_0):
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,n,8)
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(rp,n,8)
L(mul_2_entry_3):
	mul	v1
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)	C adc $0, w0
L(mul_2_entry_2):
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,n,8)
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, n
	mov	w3, -32(rp,n,8)
	js	L(mul_2_top)

	mov	-32(up,n,8), %rax	C FIXME: n is constant
	mul	v1
	add	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	mov	16(vp), v0
	mov	24(vp), v1

	lea	16(vp), vp		C vp += 2
	lea	16(rp), rp		C rp += 2

	jmp	*outer_addr


C ===========================================================
C     addmul_2 for remaining vp's

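C Each addmul_2 pass accumulates up[] times the next two v limbs into the
C partial product built so far, then writes two fresh high limbs.  As a
C rough C model (illustrative only; types as above):
C
C	static void addmul_2 (limb *rp, const limb *up, long n,
C			      const limb *vp)
C	{
C	  limb c0 = 0, c1 = 0;			/* two-limb carry window */
C	  for (long i = 0; i < n; i++) {
C	    dlimb t0 = (dlimb) up[i] * vp[0] + c0 + rp[i];  /* add old limb */
C	    dlimb t1 = (dlimb) up[i] * vp[1] + c1 + (limb) (t0 >> 64);
C	    rp[i] = (limb) t0;
C	    c0 = (limb) t1;
C	    c1 = (limb) (t1 >> 64);
C	  }
C	  rp[n] = c0;				/* extends rp[] by 2 limbs */
C	  rp[n + 1] = c1;
C	}
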
	C In the following prologues, we reuse un to store the adjusted
	C value of n that is reloaded on each iteration.  Where that
	C adjustment is nonzero, "lea 0(%rip), outer_addr" captures the
	C address of the instruction after the lea, so that subsequent
	C "jmp *outer_addr" passes re-enter past the one-time add.

L(addmul_outer_0):
	add	$3, un
	lea	0(%rip), outer_addr	C addr of next insn

	mov	un, n
	mov	-24(up,un,8), %rax
	mul	v0
	mov	%rax, w0
	mov	-24(up,un,8), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(addmul_entry_0)

L(addmul_outer_1):
	mov	un, n
	mov	(up,un,8), %rax
	mul	v0
	mov	%rax, w3
	mov	(up,un,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(addmul_entry_1)

L(addmul_outer_2):
	add	$1, un
	lea	0(%rip), outer_addr	C addr of next insn

	mov	un, n
	mov	-8(up,un,8), %rax
	mul	v0
	xor	R32(w0), R32(w0)
	mov	%rax, w2
	xor	R32(w1), R32(w1)
	mov	%rdx, w3
	mov	-8(up,un,8), %rax
	jmp	L(addmul_entry_2)

L(addmul_outer_3):
	add	$2, un
	lea	0(%rip), outer_addr	C addr of next insn

	mov	un, n
	mov	-16(up,un,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,un,8), %rax
	mov	%rdx, w2
	jmp	L(addmul_entry_3)

	C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,n,8)
	adc	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)	C adc $0, w2
L(addmul_entry_0):
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,n,8)
	adc	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,n,8)
	adc	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	-8(up,n,8), %rax
	adc	R32(w1), R32(w0)	C adc $0, w0
L(addmul_entry_2):
	mul	v1
	add	w2, -8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, n
	js	L(addmul_top)

	add	w3, -8(rp)
	adc	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	lea	16(rp), rp		C rp += 2
	lea	16(vp), vp		C vp += 2

	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

	ALIGN(16)
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()