xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/mul_1.asm (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1dnl  AMD64 mpn_mul_1.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9	 2.5
35C AMD K10	 2.5
36C AMD bd1	 5.0
37C AMD bobcat	 5.5
38C Intel P4	12.3
39C Intel core2	 4.0
40C Intel NHM	 3.75
41C Intel SBR	 2.95
42C Intel atom	19.8
43C VIA nano	 4.25
44
45C The loop of this code is the result of running a code generation and
46C optimization tool suite written by David Harvey and Torbjorn Granlund.
47
48C TODO
49C  * The loop is great, but the prologue and epilogue code was quickly written.
50C    Tune it!
51
C Operand names used throughout the code below.  Each define gives the
C register used by default; the trailing comment on the line names the
C register that carries the same argument on entry under DOS64.
C   rp       destination pointer (result limbs are stored through it)
C   up       source pointer (limbs are loaded through it)
C   n_param  limb count as passed in
C   vl       the single-limb multiplier (operand of every mul)
52define(`rp',      `%rdi')   C rcx
53define(`up',      `%rsi')   C rdx
54define(`n_param', `%rdx')   C r8
55define(`vl',      `%rcx')   C r9
56
C Working copy of the limb count.  The setup code negates it and the loop
C uses it as an index that is stepped by 4 until it is no longer negative.
57define(`n',       `%r11')
58
C NOTE(review): ABI_SUPPORT, IFDOS and IFSTD are defined outside this file;
C presumably ABI_SUPPORT declares the calling conventions this file can be
C built for, and IFDOS/IFSTD emit their argument only for the matching one.
59ABI_SUPPORT(DOS64)
60ABI_SUPPORT(STD64)
61
C DOS64 overrides of the operand names:
C   rp -> rcx, vl -> r9, n -> r8   the argument registers are used in place
C   up -> rsi                      the prologues copy rdx into rsi
C The defines of the bare words r9 and r8 rename those registers wherever
C they are written literally in the code body (to rdi and r11), so that the
C scratch uses of r9 and r8 do not collide with vl and n.
C NOTE(review): the doubled quotes appear intended to leave one level of
C quoting in each definition, so that the expansion of vl and n is not itself
C renamed by the r9 and r8 defines -- confirm against m4 rescanning rules.
62IFDOS(`	define(`up', ``%rsi'')	') dnl
63IFDOS(`	define(`rp', ``%rcx'')	') dnl
64IFDOS(`	define(`vl', ``%r9'')	') dnl
65IFDOS(`	define(`r9', ``rdi'')	') dnl
66IFDOS(`	define(`n',  ``%r8'')	') dnl
67IFDOS(`	define(`r8', ``r11'')	') dnl
68
C NOTE(review): ASM_START, TEXT, ALIGN, PROLOGUE and EPILOGUE come from the
C included m4 configuration and are not visible in this file.
69ASM_START()
70	TEXT
71	ALIGN(16)
C mpn_mul_1c -- entry point that takes one additional argument, a carry-in
C limb.  It performs the same register saves as mpn_mul_1, places the
C carry-in in r10 and jumps to L(common) inside mpn_mul_1, where r10 is added
C to the first product.
C
C DOS64 only: rsi and rdi are callee-saved there, so they are pushed before
C being used, and the second argument (arriving in rdx) is copied to rsi.
72PROLOGUE(mpn_mul_1c)
73IFDOS(``push	%rsi		'')
74IFDOS(``push	%rdi		'')
75IFDOS(``mov	%rdx, %rsi	'')
C rbx is callee-saved and is used as scratch by the shared body.
76	push	%rbx
C Fetch the carry-in: fifth integer argument register under STD64, a stack
C slot under DOS64 (offset accounts for the three pushes above).
77IFSTD(`	mov	%r8, %r10')
78IFDOS(`	mov	64(%rsp), %r10')	C 40 + 3*8  (3 push insns)
79	jmp	L(common)
80EPILOGUE()
81
C mpn_mul_1 -- multiply the limb vector at up (n limbs) by the single limb
C vl, store the n low limbs of the product at rp, and return the remaining
C high limb in rax.  The first limb is read before n is examined, so n is
C assumed to be at least 1.
C
C Register use in the shared body (names as written here; under DOS64 the
C defines near the top of the file rename the literal r8 and r9):
C   rax, rdx          product of the current limb and vl (written by mul)
C   rbx, r8, r9, r10  rotating accumulators; each collects the low half of
C                     one product plus the high half of the previous one and
C                     is then stored as a result limb
C   n                 negated count, used as an index that climbs by 4
82PROLOGUE(mpn_mul_1)
C DOS64 only: save callee-saved rsi and rdi, then move up from rdx to rsi.
83IFDOS(``push	%rsi		'')
84IFDOS(``push	%rdi		'')
85IFDOS(``mov	%rdx, %rsi	'')
86
C rbx is callee-saved; r10 is the carry-in, which is zero for this entry.
87	push	%rbx
88	xor	%r10, %r10
C Shared body, also entered by jump from mpn_mul_1c with its carry-in in r10.
89L(common):
90	mov	(up), %rax		C read first u limb early
91IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
92IFDOS(`	mov	n, %rbx         ')
93	mul	vl
94IFSTD(`	mov	%rbx, n         ')
95
C Add the carry-in to the first product, keeping rdx:rax as a two-limb value.
96	add	%r10, %rax
97	adc	$0, %rdx
98
C Dispatch on n mod 4 to choose the entry point into the 4-way unrolled loop:
C 0 goes to L(b0), 2 to L(b2), 3 to L(b3), and 1 falls through to L(b1).
99	and	$3, R32(%rbx)
100	jz	L(b0)
101	cmp	$2, R32(%rbx)
102	jz	L(b2)
103	jg	L(b3)
104
C n mod 4 = 1.  When n is exactly 1 the single product is the whole result:
C store the low limb and return the high limb via L(ret).
105L(b1):	dec	n
106	jne	L(gt1)
107	mov	%rax, (rp)
108	jmp	L(ret)
C Every setup path from here on biases up and rp towards the end of the
C operands and negates n, so that with the negative index the first store in
C the loop lands on rp[0] and the first load in the loop fetches up[1].
C Here: r9 = pending low limb, r8 = high limb, r10 and rbx cleared, and rax is
C preloaded with up[1] because L(L1) starts with the mul.
109L(gt1):	lea	8(up,n,8), up
110	lea	-8(rp,n,8), rp
111	neg	n
112	xor	%r10, %r10
113	xor	R32(%rbx), R32(%rbx)
114	mov	%rax, %r9
115	mov	(up,n,8), %rax
116	mov	%rdx, %r8
117	jmp	L(L1)
118
C n mod 4 = 0: r8 = pending low limb, rbx = high limb, r10 cleared.
119L(b0):	lea	(up,n,8), up
120	lea	-16(rp,n,8), rp
121	neg	n
122	xor	%r10, %r10
123	mov	%rax, %r8
124	mov	%rdx, %rbx
125	jmp	 L(L0)
126
C n mod 4 = 3: rbx = pending low limb, r10 = high limb.  r8 and r9 are set
C inside the L(L3) section before they are used.
127L(b3):	lea	-8(up,n,8), up
128	lea	-24(rp,n,8), rp
129	neg	n
130	mov	%rax, %rbx
131	mov	%rdx, %r10
132	jmp	L(L3)
133
C n mod 4 = 2: r10 = pending low limb, r9 = high limb, r8 and rbx cleared,
C and rax is preloaded with up[1] because L(L2) starts with the mul.
134L(b2):	lea	-16(up,n,8), up
135	lea	-32(rp,n,8), rp
136	neg	n
137	xor	%r8, %r8
138	xor	R32(%rbx), R32(%rbx)
139	mov	%rax, %r10
140	mov	24(up,n,8), %rax
141	mov	%rdx, %r9
142	jmp	L(L2)
143
C Main loop, four limbs per pass.  L(L1), L(L0), L(L3) and L(L2) are the
C entry points used by the setup code above.  Each section stores one
C finished limb, then folds the new product into the next two accumulators.
144	ALIGN(16)
145L(top):	mov	%r10, (rp,n,8)
146	add	%rax, %r9
147	mov	(up,n,8), %rax
148	adc	%rdx, %r8
149	mov	$0, R32(%r10)
150L(L1):	mul	vl
151	mov	%r9, 8(rp,n,8)
152	add	%rax, %r8
153	adc	%rdx, %rbx
154L(L0):	mov	8(up,n,8), %rax
155	mul	vl
156	mov	%r8, 16(rp,n,8)
157	add	%rax, %rbx
158	adc	%rdx, %r10
C In this section the zeroing is done with mov, which does not modify flags,
C so the carry produced by the add is still intact for the adc.
159L(L3):	mov	16(up,n,8), %rax
160	mul	vl
161	mov	%rbx, 24(rp,n,8)
162	mov	$0, R32(%r8)		C zero
163	mov	%r8, %rbx		C zero
164	add	%rax, %r10
165	mov	24(up,n,8), %rax
166	mov	%r8, %r9		C zero
167	adc	%rdx, %r9
C Advance the index by the four limbs handled; loop while it is negative.
168L(L2):	mul	vl
169	add	$4, n
170	js	 L(top)
171
C Wind-down: the index is no longer negative.  Store the last two result
C limbs and form the return value in rdx.  r8 is zero on every path that
C reaches this point (cleared in L(b2) or in the L(L3) section), so the adc
C only folds the carry flag into rdx and the final add leaves rdx unchanged.
172	mov	%r10, (rp,n,8)
173	add	%rax, %r9
174	adc	%r8, %rdx
175	mov	%r9, 8(rp,n,8)
176	add	%r8, %rdx
C Common exit: the high limb in rdx is the return value.
177L(ret):	mov	%rdx, %rax
178
C Restore the saved registers in the reverse order of the pushes.
179	pop	%rbx
180IFDOS(``pop	%rdi		'')
181IFDOS(``pop	%rsi		'')
182	ret
183EPILOGUE()
184