dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bd1/mul_2.asm (revision 9fd8799cb5ceb66c69f2eb1a6d26a1d587ba1f1e)
dnl  AMD64 mpn_mul_2 optimised for AMD Bulldozer.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 6.78
C AMD K10	 6.78
C AMD bd1	 8.39	 8.65
C AMD bd2	 8.47
C AMD bd3
C AMD bd4
C AMD zen
C AMD bt1	12.1
C AMD bt2	11.5
C Intel P4	24.0
C Intel PNR	 8.14
C Intel NHM	 7.78
C Intel SBR	 6.34
C Intel IBR	 6.15
C Intel HWL	 6.04
C Intel BWL	 4.33
C Intel SKL	 4.41
C Intel atom	39.5
C Intel SLM	27.8
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C Incoming parameters, SysV AMD64 registers; the trailing comment on each
C line is the corresponding DOS64 (Win64) register, remapped by FUNC_ENTRY.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vp',      `%rcx')   C r9

C Working registers: v0/v1 hold the two multiplier limbs, w0-w3 are the
C rotating accumulator window, n is the (negated) loop index.
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mpn_mul_2(rp, up, n, vp) -- multiply {up,n} by the 2-limb number {vp,2},
C store the n+1 low result limbs at rp, and return the most significant
C limb in %rax (standard mpn_mul_2 convention -- see gmp-impl.h).
C Clobbers: rax, rdx, r8-r11; rbx/rbp are saved and restored.
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp

	mov	(up), %rax		C up[0], feeds the first mul below

	mov	(vp), v0
	mov	8(vp), v1

	lea	(up,n_param,8), up	C point up/rp past their last limb so a
	lea	(rp,n_param,8), rp	C negative index n can count up to zero

	mov	n_param, n
	mul	v0			C up[0] * v0 while we set up
	neg	n

	test	$1, R8(n)		C dispatch on n mod 4 to the matching
	jnz	L(bx1)			C entry point of the 4-way unrolled loop

L(bx0):	test	$2, R8(n)
	jnz	L(b10)

L(b00):	mov	%rax, w0		C n ≡ 0 (mod 4)
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	(up,n,8), %rax
	jmp	L(lo0)

L(b10):	mov	%rax, w2		C n ≡ 2 (mod 4)
	mov	%rdx, w3
	mov	(up,n,8), %rax
	xor	R32(w0), R32(w0)
	mul	v1
	add	$-2, n
	jmp	L(lo2)

L(bx1):	test	$2, R8(n)
	jz	L(b11)

L(b01):	mov	%rax, w3		C n ≡ 1 (mod 4)
	mov	%rdx, w0
	mov	(up,n,8), %rax
	mul	v1
	xor	R32(w1), R32(w1)
	inc	n
	jmp	L(lo1)

L(b11):	mov	%rax, w1		C n ≡ 3 (mod 4)
	mov	%rdx, w2
	mov	(up,n,8), %rax
	xor	R32(w3), R32(w3)
	dec	n
	jmp	L(lo3)

C Main loop, unrolled 4x.  Each quarter multiplies one up limb by both v0
C and v1, folding the products into the rotating window w0-w3 with add/adc
C carry chains; the exact instruction order is carry-critical -- do not
C reorder.  (Generated by the Harvey/Granlund tool suite, per header.)
	ALIGN(32)
L(top):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)
L(lo1):	add	%rax, w0
	mov	w3, -8(rp,n,8)
	adc	%rdx, w1
	mov	(up,n,8), %rax
	mul	v0
	mov	$0, R32(w2)
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
	mov	(up,n,8), %rax
L(lo0):	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	w0, (rp,n,8)
	mov	$0, R32(w3)
	mov	8(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(lo3):	mul	v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w0)
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	w1, 8(rp,n,8)
L(lo2):	add	%rax, w3
	adc	%rdx, w0
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	mov	$0, R32(w1)
	adc	$0, R32(w1)
	add	$4, n
	jnc	L(top)			C loop until the negative index wraps

L(end):	mov	-8(up), %rax		C wind down: fold in the final
	mul	v1			C up[n-1] * v1 product
	mov	w2, -16(rp)
	add	%rax, w0
	mov	w3, -8(rp)
	adc	%rdx, w1
	mov	w0, (rp)		C store limb rp[n]
	mov	w1, %rax		C return the most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
