dnl  AMD64 mpn_mul_basecase optimised for AMD bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.5
C AMD K10	 4.5
C AMD bd1	 4.75
C AMD bobcat	 5
C Intel P4	17.7
C Intel core2	 5.5
C Intel NHM	 5.43
C Intel SBR	 3.92
C Intel atom	23
C VIA nano	 5.63

C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.
C
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.
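C
C As an illustration only (not this file's exact code path), the generic
C scheme corresponds to this portable C sketch built on the real GMP entry
C points mpn_mul_1 and mpn_addmul_1:
C
C	rp[un] = mpn_mul_1 (rp, up, un, vp[0]);	/* first row: plain mul */
C	for (i = 1; i < vn; i++)		/* remaining rows: accumulate */
C	  rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C
C Here those passes are instead inlined, 4-way unrolled, and paired with
C four wind-down variants selected by un mod 4.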

C TODO
C  * Tune un < 3 code.
C  * Fix slowdown for un=vn=3 (67->71) compared to default code.
C  * This is 1263 bytes, compared to 1099 bytes for default code.  Consider
C    combining addmul loops like that code.  Tolerable slowdown?
C  * Lots of space could be saved by replacing the "switch" code by gradual
C    jumps out from mul_1 wind-down code, perhaps with no added overhead.
C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
define(`rp',              `%rdi')
define(`up',              `%rsi')
define(`un_param',        `%rdx')
define(`vp',              `%rcx')
define(`vn',              `%r8')
C Standard allocations
define(`un',              `%rbx')
define(`w0',              `%r10')
define(`w1',              `%r11')
define(`w2',              `%r12')
define(`w3',              `%r13')
define(`n',               `%rbp')
define(`v0',              `%r9')
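C The w0/w1 and w2/w3 pairs hold the low/high halves of the two products
C kept in flight by the 4-way unrolled loops below.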

C Temp macro allowing control over indexing.
C Redefine to return $1 for more conservative ptr handling.
define(`X',`$2')
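C For example, in the stores below the macro currently selects its second
C argument, emitting 16(rp) rather than the equivalent indexed operand
C -8(rp,n,8); returning $1 instead would keep the indexed forms.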


ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')

	mov	(up), %rax
	mov	(vp), v0

	cmp	$2, un_param
	ja	L(ge3)
	jz	L(u2)

	mul	v0			C u0 x v0
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(u2):	mul	v0			C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rax
	mov	%rdx, w0
	mul	v0
	add	%rax, w0
	mov	%rdx, w1
	adc	$0, w1
	cmp	$1, R32(vn)
	jnz	L(u2v2)
	mov	w0, 8(rp)
	mov	w1, 16(rp)
	FUNC_EXIT()
	ret

L(u2v2):mov	8(vp), v0
	mov	(up), %rax
	mul	v0
	add	%rax, w0
	mov	w0, 8(rp)
	mov	%rdx, %r8		C CAUTION: r8 realloc
	adc	$0, %r8
	mov	8(up), %rax
	mul	v0
	add	w1, %r8
	adc	$0, %rdx
	add	%r8, %rax
	adc	$0, %rdx
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret


L(ge3):	push	%rbx
	push	%rbp
	push	%r12
	push	%r13

	lea	8(vp), vp

	lea	-24(rp,un_param,8), rp
	lea	-24(up,un_param,8), up
	xor	R32(un), R32(un)
	mov	$2, R32(n)
	sub	un_param, un
	sub	un_param, n
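C Now un = -un_param and n = 2 - un_param; with rp and up biased by
C 8*un_param-24 above, the (rp,n,8)/(up,n,8) operands use negative
C indices that count up toward zero.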

	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(L3)

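C 4-way unrolled mul_1 loop, always entered at L(L3); the wind-down code
C and the switch below handle the un mod 4 residue.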
	ALIGN(16)
L(top):	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, (rp,n,8)
	add	w1, w2
	adc	$0, w3
L(L3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, 8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(top)

	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3

C Switch on n into the right addmul_1 loop
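C Here n = (2 - un) mod 4, in [0,3]; L(r0)..L(r3) are the wind-downs
C for un congruent to 0..3 (mod 4) respectively.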
	test	n, n
	jz	L(r2)
	cmp	$2, R32(n)
	ja	L(r3)
	jz	L(r0)
	jmp	L(r1)


L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
	mov	w3, X((rp,n,8),24(rp))
	add	$2, un

C outer loop(3)
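C Each outer iteration consumes one v limb: fetch the next v0, reset the
C negative index n from un, and run the 4-way unrolled addmul_1 loop,
C advancing rp and vp by one limb.  The other outer loops below differ
C mainly in their feed-in and wind-down code.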
L(to3):	dec	vn
	jz	L(ret)
	mov	(vp), v0
	mov	8(up,un,8), %rax
	lea	8(vp), vp
	lea	8(rp), rp
	mov	un, n
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al3)

	ALIGN(16)
L(ta3):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta3)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(to3)


L(r2):	mov	X(0(up,n,8),(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),-8(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),(rp))
	add	w1, w2
	adc	$0, w3
	mov	X(16(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X(16(rp,n,8),16(rp))
	adc	$0, w3
	mov	w1, X(24(rp,n,8),24(rp))
	inc	un

C outer loop(2)
L(to2):	dec	vn
	jz	L(ret)
	mov	(vp), v0
	mov	16(up,un,8), %rax
	lea	8(vp), vp
	lea	8(rp), rp
	mov	un, n
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al2)

	ALIGN(16)
L(ta2):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al2):	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta2)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(to2)


L(r1):	mov	X(0(up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),8(rp))
	add	w1, w2
	adc	$0, w3
	mov	w2, X(8(rp,n,8),16(rp))
	mov	w3, X(16(rp,n,8),24(rp))
	add	$4, un

C outer loop(1)
L(to1):	dec	vn
	jz	L(ret)
	mov	(vp), v0
	mov	-8(up,un,8), %rax
	lea	8(vp), vp
	lea	8(rp), rp
	mov	un, n
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al1)

	ALIGN(16)
L(ta1):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al1):	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta1)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(to1)


L(r0):	mov	X((up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X((rp,n,8),16(rp))
	mov	w1, X(8(rp,n,8),24(rp))
	add	$3, un

C outer loop(0)
L(to0):	dec	vn
	jz	L(ret)
	mov	(vp), v0
	mov	(up,un,8), %rax
	lea	8(vp), vp
	lea	8(rp), rp
	mov	un, n
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al0)

	ALIGN(16)
L(ta0):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al0):	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta0)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(to0)


L(ret):	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()