xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bd1/mul_basecase.asm (revision 924795e69c8bb3f17afd8fcbb799710cc1719dc4)
dnl  AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

33include(`../config.m4')
34
35C cycles/limb	mul_1		mul_2		mul_3		addmul_2
36C AMD K8,K9
37C AMD K10
38C AMD bull	~4.8		~4.55		-		~4.3
39C AMD pile	~4.6		~4.55		-		~4.55
40C AMD bobcat
41C AMD jaguar
42C Intel P4
43C Intel core
44C Intel NHM
45C Intel SBR
46C Intel IBR
47C Intel HWL
48C Intel BWL
49C Intel atom
50C VIA nano
51
52C The inner loops of this code are the result of running a code generation and
53C optimisation tool suite written by David Harvey and Torbjorn Granlund.
54
55C TODO
56C  * Merge bull-specific mul_1, if it is not slower the TOOM22 range.
57C    Alternatively, we could tweak the present code (which was loopmixed for a
58C    different CPU).
59C  * Merge faster mul_2, such as the one in the same directory as this file.
60C  * Further micro-optimise.
61
62C When playing with pointers, set this to $2 to fall back to conservative
63C indexing in wind-down code.
64define(`I',`$1')
65
66
67define(`rp',      `%rdi')
68define(`up',      `%rsi')
69define(`un_param',`%rdx')
70define(`vp',      `%rcx')
71define(`vn',      `%r8')
72
73define(`un',      `%rbx')
74
75define(`w0',	`%r10')
76define(`w1',	`%r11')
77define(`w2',	`%r12')
78define(`w3',	`%r13')
79define(`n',	`%rbp')
80define(`v0',	`%r9')
81
82ABI_SUPPORT(DOS64)
83ABI_SUPPORT(STD64)
84
85ASM_START()
86	TEXT
87	ALIGN(16)
88PROLOGUE(mpn_mul_basecase)
89	FUNC_ENTRY(4)
90IFDOS(`	mov	56(%rsp), %r8d	')
91	push	%rbx
92	push	%rbp
93	mov	un_param, un		C free up rdx
94	neg	un
95
96	mov	(up), %rax		C shared for mul_1 and mul_2
97	lea	(up,un_param,8), up	C point at operand end
98	lea	(rp,un_param,8), rp	C point at rp[un-1]
99
100	mov	(vp), v0		C shared for mul_1 and mul_2
101	mul	v0			C shared for mul_1 and mul_2
102
103	test	$1, R8(vn)
104	jz	L(do_mul_2)
105
106L(do_mul_1):
107	test	$1, R8(un)
108	jnz	L(m1x1)
109
110L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
111	mov	%rdx, w1
112	mov	8(up,un,8), %rax
113	test	$2, R8(un)
114	jnz	L(m110)
115
116L(m100):lea	2(un), n		C un = 4, 8, 12, ...
117	jmp	L(m1l0)
118
119L(m110):lea	(un), n			C un = 2, 6, 10, ...
120	jmp	L(m1l2)
121
122L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
123	mov	%rdx, w0
124	test	$2, R8(un)
125	jz	L(m111)
126
127L(m101):lea	3(un), n		C un = 1, 5, 9, ...
128	test	n, n
129	js	L(m1l1)
130	mov	%rax, -8(rp)
131	mov	%rdx, (rp)
132	pop	%rbp
133	pop	%rbx
134	FUNC_EXIT()
135	ret
136
137L(m111):lea	1(un), n		C un = 3, 7, 11, ...
138	mov	8(up,un,8), %rax
139	jmp	L(m1l3)
140
141	ALIGN(16)
142L(m1tp):mov	%rdx, w0
143	add	%rax, w1
144L(m1l1):mov	-16(up,n,8), %rax
145	adc	$0, w0
146	mul	v0
147	add	%rax, w0
148	mov	w1, -24(rp,n,8)
149	mov	-8(up,n,8), %rax
150	mov	%rdx, w1
151	adc	$0, w1
152L(m1l0):mul	v0
153	mov	w0, -16(rp,n,8)
154	add	%rax, w1
155	mov	%rdx, w0
156	mov	(up,n,8), %rax
157	adc	$0, w0
158L(m1l3):mul	v0
159	mov	w1, -8(rp,n,8)
160	mov	%rdx, w1
161	add	%rax, w0
162	mov	8(up,n,8), %rax
163	adc	$0, w1
164L(m1l2):mul	v0
165	mov	w0, (rp,n,8)
166	add	$4, n
167	jnc	L(m1tp)
168
169L(m1ed):add	%rax, w1
170	adc	$0, %rdx
171	mov	w1, I(-8(rp),-24(rp,n,8))
172	mov	%rdx, I((rp),-16(rp,n,8))
173
174	dec	R32(vn)
175	jz	L(ret2)
176
177	lea	8(vp), vp
178	lea	8(rp), rp
179	push	%r12
180	push	%r13
181	push	%r14
182	jmp	L(do_addmul)
183
184L(do_mul_2):
185define(`v1',	`%r14')
186	push	%r12
187	push	%r13
188	push	%r14
189
190	mov	8(vp), v1
191
192	test	$1, R8(un)
193	jnz	L(m2b1)
194
195L(m2b0):lea	(un), n
196	mov	%rax, w2		C 0
197	mov	(up,un,8), %rax
198	mov	%rdx, w1		C 1
199	mul	v1
200	mov	%rax, w0		C 1
201	mov	w2, (rp,un,8)		C 0
202	mov	8(up,un,8), %rax
203	mov	%rdx, w2		C 2
204	jmp	L(m2l0)
205
206L(m2b1):lea	1(un), n
207	mov	%rax, w0		C 1
208	mov	%rdx, w3		C 2
209	mov	(up,un,8), %rax
210	mul	v1
211	mov	w0, (rp,un,8)		C 1
212	mov	%rdx, w0		C 3
213	mov	%rax, w2		C 0
214	mov	8(up,un,8), %rax
215	jmp	L(m2l1)
216
217	ALIGN(32)
218L(m2tp):add	%rax, w2		C 0
219	mov	(up,n,8), %rax
220	adc	$0, w0			C 1
221L(m2l1):mul	v0
222	add	%rax, w2		C 0
223	mov	(up,n,8), %rax
224	mov	%rdx, w1		C 1
225	adc	$0, w1			C 1
226	mul	v1
227	add	w3, w2			C 0
228	adc	$0, w1			C 1
229	add	%rax, w0		C 1
230	mov	w2, (rp,n,8)		C 0
231	mov	8(up,n,8), %rax
232	mov	%rdx, w2		C 2
233	adc	$0, w2			C 2
234L(m2l0):mul	v0
235	add	%rax, w0		C 1
236	mov	%rdx, w3		C 2
237	adc	$0, w3			C 2
238	add	w1, w0			C 1
239	adc	$0, w3			C 2
240	mov	8(up,n,8), %rax
241	mul	v1
242	add	$2, n
243	mov	w0, -8(rp,n,8)		C 1
244	mov	%rdx, w0		C 3
245	jnc	L(m2tp)
246
247L(m2ed):add	%rax, w2
248	adc	$0, %rdx
249	add	w3, w2
250	adc	$0, %rdx
251	mov	w2, I((rp),(rp,n,8))
252	mov	%rdx, I(8(rp),8(rp,n,8))
253
254	add	$-2, R32(vn)
255	jz	L(ret5)
256
257	lea	16(vp), vp
258	lea	16(rp), rp
259
260
261L(do_addmul):
262	push	%r15
263	push	vn			C save vn in new stack slot
264define(`vn',	`(%rsp)')
265define(`X0',	`%r14')
266define(`X1',	`%r15')
267define(`v1',	`%r8')
268
269L(outer):
270	mov	(vp), v0
271	mov	8(vp), v1
272
273	mov	(up,un,8), %rax
274	mul	v0
275
276	test	$1, R8(un)
277	jnz	L(bx1)
278
279L(bx0):	mov	%rax, X1
280	mov	(up,un,8), %rax
281	mov	%rdx, X0
282	mul	v1
283	test	$2, R8(un)
284	jnz	L(b10)
285
286L(b00):	lea	(un), n			C un = 4, 8, 12, ...
287	mov	(rp,un,8), w3
288	mov	%rax, w0
289	mov	8(up,un,8), %rax
290	mov	%rdx, w1
291	jmp	L(lo0)
292
293L(b10):	lea	2(un), n		C un = 2, 6, 10, ...
294	mov	(rp,un,8), w1
295	mov	%rdx, w3
296	mov	%rax, w2
297	mov	8(up,un,8), %rax
298	jmp	L(lo2)
299
300L(bx1):	mov	%rax, X0
301	mov	(up,un,8), %rax
302	mov	%rdx, X1
303	mul	v1
304	test	$2, R8(un)
305	jz	L(b11)
306
307L(b01):	lea	1(un), n		C un = 1, 5, 9, ...
308	mov	(rp,un,8), w2
309	mov	%rdx, w0
310	mov	%rax, w3
311	jmp	L(lo1)
312
313L(b11):	lea	-1(un), n		C un = 3, 7, 11, ...
314	mov	(rp,un,8), w0
315	mov	%rax, w1
316	mov	8(up,un,8), %rax
317	mov	%rdx, w2
318	jmp	L(lo3)
319
320	ALIGN(32)
321L(top):
322L(lo2):	mul	v0
323	add	w1, X1
324	mov	X1, -16(rp,n,8)
325	mov	%rdx, X1
326	adc	%rax, X0
327	adc	$0, X1
328	mov	-8(up,n,8), %rax
329	mul	v1
330	mov	-8(rp,n,8), w1
331	mov	%rdx, w0
332	add	w1, w2
333	adc	%rax, w3
334	adc	$0, w0
335L(lo1):	mov	(up,n,8), %rax
336	mul	v0
337	add	w2, X0
338	mov	X0, -8(rp,n,8)
339	mov	%rdx, X0
340	adc	%rax, X1
341	mov	(up,n,8), %rax
342	adc	$0, X0
343	mov	(rp,n,8), w2
344	mul	v1
345	add	w2, w3
346	adc	%rax, w0
347	mov	8(up,n,8), %rax
348	mov	%rdx, w1
349	adc	$0, w1
350L(lo0):	mul	v0
351	add	w3, X1
352	mov	X1, (rp,n,8)
353	adc	%rax, X0
354	mov	8(up,n,8), %rax
355	mov	%rdx, X1
356	adc	$0, X1
357	mov	8(rp,n,8), w3
358	mul	v1
359	add	w3, w0
360	adc	%rax, w1
361	mov	16(up,n,8), %rax
362	mov	%rdx, w2
363	adc	$0, w2
364L(lo3):	mul	v0
365	add	w0, X0
366	mov	X0, 8(rp,n,8)
367	mov	%rdx, X0
368	adc	%rax, X1
369	adc	$0, X0
370	mov	16(up,n,8), %rax
371	mov	16(rp,n,8), w0
372	mul	v1
373	mov	%rdx, w3
374	add	w0, w1
375	adc	%rax, w2
376	adc	$0, w3
377	mov	24(up,n,8), %rax
378	add	$4, n
379	jnc	L(top)
380
381L(end):	mul	v0
382	add	w1, X1
383	mov	X1, I(-16(rp),-16(rp,n,8))
384	mov	%rdx, X1
385	adc	%rax, X0
386	adc	$0, X1
387	mov	I(-8(up),-8(up,n,8)), %rax
388	mul	v1
389	mov	I(-8(rp),-8(rp,n,8)), w1
390	add	w1, w2
391	adc	%rax, w3
392	adc	$0, %rdx
393	add	w2, X0
394	adc	$0, X1
395	mov	X0, I(-8(rp),-8(rp,n,8))
396	add	w3, X1
397	mov	X1, I((rp),(rp,n,8))
398	adc	$0, %rdx
399	mov	%rdx, I(8(rp),8(rp,n,8))
400
401
402	addl	$-2, vn
403	lea	16(vp), vp
404	lea	16(rp), rp
405	jnz	L(outer)
406
407	pop	%rax		C deallocate vn slot
408	pop	%r15
409L(ret5):pop	%r14
410	pop	%r13
411	pop	%r12
412L(ret2):pop	%rbp
413	pop	%rbx
414	FUNC_EXIT()
415	ret
416EPILOGUE()
417