dnl  AMD64 mpn_mul_2 optimised for AMD Bulldozer.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9
C AMD K10
C AMD bull	4.36		average, quite fluctuating
C AMD pile	4.38		slightly fluctuating
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
C Scheme: genxmul --mul

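C mpn_mul_2 (rp, up, n, vp) multiplies the n-limb number {up,n} by the 2-limb
C number {vp,2}, stores the n+1 least significant limbs of the product at rp,
C and returns the most significant limb.  A rough C reference sketch of that
C computation (not part of this file or of GMP's generic code; assumes 64-bit
C limbs and a compiler providing unsigned __int128):
C
C	mp_limb_t
C	ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  mp_limb_t v0 = vp[0], v1 = vp[1];
C	  mp_limb_t lo = 0, hi = 0;		/* pending product limbs */
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      unsigned __int128 p0 = (unsigned __int128) up[i] * v0;
C	      unsigned __int128 p1 = (unsigned __int128) up[i] * v1;
C	      p0 += lo;				/* add pending low limb */
C	      rp[i] = (mp_limb_t) p0;
C	      p1 += (unsigned __int128) hi + (mp_limb_t) (p0 >> 64);
C	      lo = (mp_limb_t) p1;		/* becomes rp[i+1] next round */
C	      hi = (mp_limb_t) (p1 >> 64);
C	    }
C	  rp[n] = lo;
C	  return hi;
C	}
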
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vp',      `%rcx')   C r9

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

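C w0..w3 hold a rotating window of partial-product limbs.  The pointers up
C and rp are pre-advanced past the ends of the operands and n is negated, so
C (up,n,8) and (rp,n,8) index them from the end while n counts up towards zero.
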
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

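C FUNC_ENTRY(4)/FUNC_EXIT() below take care of the DOS64 calling convention
C (the argument registers noted to the right of the defines above); under the
C standard SysV ABI they expand to nothing.
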
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp

	mov	(up), %rax

	mov	(vp), v0
	mov	8(vp), v1

	lea	(up,n_param,8), up
	lea	(rp,n_param,8), rp

	mov	n_param, n
	mul	v0
	neg	n

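C Dispatch on the operand count mod 4.  Each feed-in block below uses the
C up[0] * v0 product already in rdx:rax, clears the right window register,
C adjusts n (where needed) to a multiple of 4, and enters the unrolled loop
C at the matching L(lo0)..L(lo3) entry point.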
	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	jnz	L(b10)

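C Feed-in for n_param = 0 (mod 4)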
L(b00):	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	(up,n,8), %rax
	jmp	L(lo0)

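C Feed-in for n_param = 2 (mod 4)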
L(b10):	mov	%rax, w2
	mov	%rdx, w3
	mov	(up,n,8), %rax
	xor	R32(w0), R32(w0)
	mul	v1
	add	$-2, n
	jmp	L(lo2)

L(bx1):	test	$2, R8(n)
	jz	L(b11)

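C Feed-in for n_param = 1 (mod 4)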
L(b01):	mov	%rax, w3
	mov	%rdx, w0
	mov	(up,n,8), %rax
	mul	v1
	xor	R32(w1), R32(w1)
	inc	n
	jmp	L(lo1)

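C Feed-in for n_param = 3 (mod 4)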
L(b11):	mov	%rax, w1
	mov	%rdx, w2
	mov	(up,n,8), %rax
	xor	R32(w3), R32(w3)
	dec	n
	jmp	L(lo3)

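C Main loop: 4-way unrolled.  Each iteration multiplies four up[] limbs by
C both v0 and v1 (eight muls), accumulating into the rotating w0..w3 window
C and storing finished result limbs a few positions behind, then advances n
C by 4, looping while n remains negative.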
	ALIGN(32)
L(top):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)
L(lo1):	add	%rax, w0
	mov	w3, -8(rp,n,8)
	adc	%rdx, w1
	mov	(up,n,8), %rax
	mul	v0
	mov	$0, R32(w2)
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
	mov	(up,n,8), %rax
L(lo0):	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	w0, (rp,n,8)
	mov	$0, R32(w3)
	mov	8(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(lo3):	mul	v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w0)
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	w1, 8(rp,n,8)
L(lo2):	add	%rax, w3
	adc	%rdx, w0
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	mov	$0, R32(w1)
	adc	$0, R32(w1)
	add	$4, n
	jnc	L(top)

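C Wind-down: the loop exits with n = 0.  Finish the final v1 product, flush
C the three remaining window limbs to the top of rp, and return the most
C significant product limb in %rax.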
L(end):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)
	add	%rax, w0
	mov	w3, -8(rp,n,8)
	adc	%rdx, w1
	mov	w0, (rp,n,8)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()