dnl  AMD64 mpn_mullo_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should start with mul_2 for any operand
C     size congruence class.
C   * Stop iterating addmul_2 earlier, falling into straight-line triangle code
C     for the last 2-3 iterations.
C   * Perhaps implement n=4 special code.
C   * The reload of the outer loop jump address hurts branch prediction.
C   * The addmul_2 loop ends with an MUL whose high part is not used upon loop
C     exit.

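C  What the routine computes, as a plain C sketch (illustration only, not
C  part of the build; the function name mullo_model is hypothetical, and
C  64-bit limbs plus a GCC-style unsigned __int128 are assumed):
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	/* {r,nn} = {u,nn} * {v,nn} mod B^nn, where B = 2^64 */
C	void mullo_model (uint64_t *r, const uint64_t *u,
C			  const uint64_t *v, size_t nn)
C	{
C	  for (size_t ii = 0; ii < nn; ii++)
C	    r[ii] = 0;
C	  for (size_t ii = 0; ii < nn; ii++)
C	    {
C	      uint64_t cy = 0;
C	      /* product limbs at or above limb nn are simply discarded */
C	      for (size_t jj = 0; ii + jj < nn; jj++)
C		{
C		  unsigned __int128 t = (unsigned __int128) u[ii] * v[jj]
C					+ r[ii + jj] + cy;
C		  r[ii + jj] = (uint64_t) t;
C		  cy = (uint64_t) (t >> 64);
C		}
C	    }
C	}
C
C  The assembly below reaches the same result while forming full products
C  only where the high half can still propagate; top-limb contributions
C  use plain imul.
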
C INPUT PARAMETERS
define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`vp_param', `%rdx')
define(`n',	   `%rcx')

define(`vp',	`%r11')
define(`outer_addr', `%r8')
define(`j',	`%r9')
define(`v0',	`%r13')
define(`v1',	`%r14')
define(`w0',	`%rbx')
define(`w1',	`%r15')
define(`w2',	`%rbp')
define(`w3',	`%r10')
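
C  v0/v1 hold the two multiplier (vp) limbs of the current pass; w0-w3
C  form a rotating four-limb accumulator window for the software-pipelined
C  inner loops below.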

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)
	cmp	$4, n
	jge	L(gen)
	mov	(up), %rax		C u0
	mov	(vp_param), %r8		C v0

	lea	L(tab)(%rip), %r9
ifdef(`PIC',
`	movslq	(%r9,%rcx,4), %r10
	add	%r10, %r9
	jmp	*%r9
',`
	jmp	*(%r9,n,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(tab), L(tab))			C not allowed
	JMPENT(	L(1), L(tab))			C 1
	JMPENT(	L(2), L(tab))			C 2
	JMPENT(	L(3), L(tab))			C 3
dnl	JMPENT(	L(0m4), L(tab))			C 4
dnl	JMPENT(	L(1m4), L(tab))			C 5
dnl	JMPENT(	L(2m4), L(tab))			C 6
dnl	JMPENT(	L(3m4), L(tab))			C 7
dnl	JMPENT(	L(0m4), L(tab))			C 8
dnl	JMPENT(	L(1m4), L(tab))			C 9
dnl	JMPENT(	L(2m4), L(tab))			C 10
dnl	JMPENT(	L(3m4), L(tab))			C 11
	TEXT
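
C  n < 4 is dispatched through the table above: with PIC the entries are
C  32-bit offsets relative to L(tab), otherwise 8-byte absolute pointers.
C  Entry 0 only points back at the table since n >= 1 is required.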

L(1):	imul	%r8, %rax
	mov	%rax, (rp)
	FUNC_EXIT()
	ret

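C  n = 2:  r0 = lo(u0*v0),  r1 = hi(u0*v0) + lo(u0*v1) + lo(u1*v0)  (mod B)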
L(2):	mov	8(vp_param), %r11
	imul	%rax, %r11		C u0 x v1
	mul	%r8			C u0 x v0
	mov	%rax, (rp)
	imul	8(up), %r8		C u1 x v0
	lea	(%r11, %rdx), %rax
	add	%r8, %rax
	mov	%rax, 8(rp)
	FUNC_EXIT()
	ret

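C  n = 3: u0*v0, u0*v1 and u1*v0 are needed in full, whereas for the top
C  limb r2 only the low halves of u1*v1, u2*v0 and u0*v2 matter, hence
C  the imul instructions.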
L(3):	mov	8(vp_param), %r9	C v1
	mov	16(vp_param), %r11
	mul	%r8			C u0 x v0 -> <r1,r0>
	mov	%rax, (rp)		C r0
	mov	(up), %rax		C u0
	mov	%rdx, %rcx		C r1
	mul	%r9			C u0 x v1 -> <r2,r1>
	imul	8(up), %r9		C u1 x v1 -> r2
	mov	16(up), %r10
	imul	%r8, %r10		C u2 x v0 -> r2
	add	%rax, %rcx
	adc	%rdx, %r9
	add	%r10, %r9
	mov	8(up), %rax		C u1
	mul	%r8			C u1 x v0 -> <r2,r1>
	add	%rax, %rcx
	adc	%rdx, %r9
	mov	%r11, %rax
	imul	(up), %rax		C u0 x v2 -> r2
	add	%rax, %r9
	mov	%rcx, 8(rp)
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret

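C  General case, n >= 4: first produce {rp,n} = {up,n} * v0 (mul_1, odd n)
C  or {up,n} * <v1,v0> (mul_2, even n) truncated at limb n, then iterate
C  addmul_2, folding in two more vp limbs per outer pass.  Each pass
C  corrects its top limb with imul, since only the low half is needed.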
L(0m4):
L(1m4):
L(2m4):
L(3m4):
L(gen):	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15

	mov	(up), %rax
	mov	(vp_param), v0
	mov	vp_param, vp

	lea	(rp,n,8), rp
	lea	(up,n,8), up
	neg	n
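C  Point rp and up just past their ends and run the (now negative) index
C  up towards zero; (up,n,8) addresses the original up[0].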

	mul	v0

	test	$1, R8(n)
	jz	L(mul_2)
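C  Odd n: do a mul_1 pass first, leaving an even count for the addmul_2
C  passes.  Even n: start directly with mul_2.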
165
166L(mul_1):
167	lea	-8(rp), rp
168	lea	-8(up), up
169	test	$2, R8(n)
170	jnz	L(mul_1_prologue_3)
171
172L(mul_1_prologue_2):		C n = 7, 11, 15, ...
173	lea	-1(n), j
174	lea	L(addmul_outer_1)(%rip), outer_addr
175	mov	%rax, w0
176	mov	%rdx, w1
177	xor	R32(w2), R32(w2)
178	xor	R32(w3), R32(w3)
179	mov	16(up,n,8), %rax
180	jmp	L(mul_1_entry_2)
181
182L(mul_1_prologue_3):		C n = 5, 9, 13, ...
183	lea	1(n), j
184	lea	L(addmul_outer_3)(%rip), outer_addr
185	mov	%rax, w2
186	mov	%rdx, w3
187	xor	R32(w0), R32(w0)
188	jmp	L(mul_1_entry_0)
189
190	ALIGN(16)
191L(mul_1_top):
192	mov	w0, -16(rp,j,8)
193	add	%rax, w1
194	mov	(up,j,8), %rax
195	adc	%rdx, w2
196	xor	R32(w0), R32(w0)
197	mul	v0
198	mov	w1, -8(rp,j,8)
199	add	%rax, w2
200	adc	%rdx, w3
201L(mul_1_entry_0):
202	mov	8(up,j,8), %rax
203	mul	v0
204	mov	w2, (rp,j,8)
205	add	%rax, w3
206	adc	%rdx, w0
207	mov	16(up,j,8), %rax
208	mul	v0
209	mov	w3, 8(rp,j,8)
210	xor	R32(w2), R32(w2)	C zero
211	mov	w2, w3			C zero
212	add	%rax, w0
213	mov	24(up,j,8), %rax
214	mov	w2, w1			C zero
215	adc	%rdx, w1
216L(mul_1_entry_2):
217	mul	v0
218	add	$4, j
219	js	L(mul_1_top)
220
221	mov	w0, -16(rp)
222	add	%rax, w1
223	mov	w1, -8(rp)
224	adc	%rdx, w2
225
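C  Top limb of the pass: only the low half of u[n-1] * v0 contributes,
C  hence imul.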
	imul	(up), v0
	add	v0, w2
	mov	w2, (rp)

	add	$1, n
	jz	L(ret)

	mov	8(vp), v0
	mov	16(vp), v1

	lea	16(up), up
	lea	8(vp), vp
	lea	24(rp), rp

	jmp	*outer_addr


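C  mul_2: form the initial partial product with two multiplier limbs at
C  once, {rp,n} = {up,n} * <v1,v0> mod B^n, with the top limb handled by
C  the imul corrections at the loop exit.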
L(mul_2):
	mov	8(vp), v1
	test	$2, R8(n)
	jz	L(mul_2_prologue_3)

	ALIGN(16)
L(mul_2_prologue_1):
	lea	0(n), j
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	mov	(up,n,8), %rax
	lea	L(addmul_outer_3)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_3):
	lea	2(n), j
	mov	$0, R32(w3)
	mov	%rax, w1
	mov	(up,n,8), %rax
	mov	%rdx, w2
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_top):
	mov	-32(up,j,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
L(mul_2_entry_3):
	mov	$0, R32(w0)
	mov	w1, -16(rp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,j,8)
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, j
	mov	w3, -32(rp,j,8)
	js	L(mul_2_top)

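C  The top limb needs just the low halves of u[n-2]*v1 and u[n-1]*v0.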
	imul	-16(up), v1
	add	v1, w0
	imul	-8(up), v0
	add	v0, w0
	mov	w0, -8(rp)

	add	$2, n
	jz	L(ret)

	mov	16(vp), v0
	mov	24(vp), v1

	lea	16(vp), vp
	lea	16(rp), rp

	jmp	*outer_addr


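C  addmul_2 outer loop: the pass for multiplier limbs v[k] and v[k+1]
C  adds <v[k+1],v[k]> * {up, n-k} into rp[k..n-1], still truncated at
C  limb n, so each successive pass touches two fewer result limbs.  The
C  two entry points alternate to match the trip-count residue.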
L(addmul_outer_1):
	lea	-2(n), j
	mov	-16(up,n,8), %rax
	mul	v0
	mov	%rax, w3
	mov	-16(up,n,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_3)(%rip), outer_addr
	jmp	L(addmul_entry_1)

L(addmul_outer_3):
	lea	0(n), j
	mov	-16(up,n,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,n,8), %rax
	mov	%rdx, w2
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(addmul_entry_3)

	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,j,8)
	adc	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,j,8)
	adc	%rax, w1
	mov	-16(up,j,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,j,8)
	adc	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	-8(up,j,8), %rax
	adc	R32(w1), R32(w0)
	mul	v1
	add	w2, -8(rp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, j
	js	L(addmul_top)

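C  Wind down: fold in the pending accumulators, then correct the top
C  limb with imul since only its low half is needed.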
	add	w3, -32(rp)
	adc	%rax, w0

	imul	-24(up), v0
	add	v0, w0
	add	w0, -24(rp)

	add	$2, n
	jns	L(ret)

	lea	16(vp), vp

	mov	(vp), v0
	mov	8(vp), v1

	lea	-16(up), up

	jmp	*outer_addr

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()