xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bd1/mul_1.asm (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9
35C AMD K10
36C AMD bd1	 4
37C AMD bobcat
38C Intel P4
39C Intel core2
40C Intel NHM
41C Intel SBR
42C Intel atom
43C VIA nano
44
45C The loop of this code is the result of running a code generation and
46C optimisation tool suite written by David Harvey and Torbjorn Granlund.
47
48C TODO
49C  * Move loop code into feed-in blocks, to save insn for zeroing regs.
50
C Argument registers as they arrive under the STD64 ABI.  The trailing
C comment on each line names the register that carries the same argument
C under DOS64.
51define(`rp',      `%rdi')   C rcx
52define(`up',      `%rsi')   C rdx
53define(`n_param', `%rdx')   C r8
54define(`v0',      `%rcx')   C r9
55
C Working copy of the count.  The incoming count register rdx cannot be
C kept because mul writes rdx.
56define(`n',       `%rbx')
57
58ABI_SUPPORT(DOS64)
59ABI_SUPPORT(STD64)
60
C DOS64 overrides.  up stays in rsi (the entry code copies rdx there), and
C rp, v0 and n are used directly in their incoming registers rcx, r9 and r8.
C The bare names r9 and r8 are remapped to rdi and rbx so that the scratch
C uses of those two registers in the code body do not collide with v0 and n.
C NOTE(review): the doubled quotes presumably stop the register names inside
C up, rp, v0 and n from being rescanned and remapped - confirm.
61IFDOS(`	define(`up', ``%rsi'')	') dnl
62IFDOS(`	define(`rp', ``%rcx'')	') dnl
63IFDOS(`	define(`v0', ``%r9'')	') dnl
64IFDOS(`	define(`r9', ``rdi'')	') dnl
65IFDOS(`	define(`n',  ``%r8'')	') dnl
66IFDOS(`	define(`r8', ``rbx'')	') dnl
67
68ASM_START()
69	TEXT
70	ALIGN(16)
C mpn_mul_1c -- entry point that takes an extra carry-in limb.
C
C Multiplies the first limb at up by v0, adds the carry-in limb to the low
C half of that product and propagates the carry into the high half.  It then
C jumps to L(common) inside mpn_mul_1, which processes the remaining limbs,
C pops the registers pushed here and returns.
C
C The carry-in is read from r8 under STD64 and from the stack under DOS64.
C Assumes n >= 1, since the first limb is loaded unconditionally - TODO confirm.
71PROLOGUE(mpn_mul_1c)
72IFDOS(``push	%rsi		'')	C popped again on the shared exit path
73IFDOS(``push	%rdi		'')	C popped again on the shared exit path
74IFDOS(``mov	%rdx, %rsi	'')	C up arrives in rdx under DOS64
75
76	mov	(up), %rax		C read first u limb early
77	push	%rbx			C popped on the shared exit path
78IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
79IFDOS(`	mov	n, %r11		')
80	mul	v0			C rdx:rax = first limb times v0
81
82IFSTD(` add	%r8, %rax	')	C add carry-in to the low half
83IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
84	adc	$0, %rdx		C propagate the carry into the high half
85	jmp	L(common)		C continue in mpn_mul_1 with rdx:rax and r11 set up
86
87EPILOGUE()
88
89	ALIGN(16)
C mpn_mul_1 -- multiply the n limbs at up by the single limb v0, store the
C n low result limbs at rp and return the most significant limb in rax.
C
C Layout:
C   * The entry code computes the first product and keeps a copy of n in r11.
C   * L(common) dispatches on n mod 4 to one of four feed-in blocks
C     (b0, b1, b2, b3).  mpn_mul_1c also jumps to L(common).
C   * Each feed-in block moves the first product into the registers expected
C     at its loop entry point, advances up by n limbs and turns n into a
C     negative index that counts up towards zero.
C   * The loop at L(top) handles four limbs per iteration.
C   * L(end) finishes the last product, stores the final two limbs and
C     returns.  L(n1) is the shortcut used when n is 1.
C
C Assumes n >= 1, since the first limb is loaded unconditionally - TODO confirm.
90PROLOGUE(mpn_mul_1)
91IFDOS(``push	%rsi		'')
92IFDOS(``push	%rdi		'')
93IFDOS(``mov	%rdx, %rsi	'')	C up arrives in rdx under DOS64
94
95	mov	(up), %rax		C read first u limb early
96	push	%rbx
97IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
98IFDOS(`	mov	n, %r11		')
99	mul	v0			C rdx:rax = first limb times v0
100
C Shared by both entry points.  Here rdx:rax holds the first product and
C r11 holds n.
101L(common):
102IFSTD(`	mov	%r11, n		')
103
104	and	$3, R32(%r11)		C r11 = n mod 4, ZF set when it is 0
105	lea	-16(rp,n,8), rp		C rp = rp + 8*n - 16, lea leaves the flags intact
106	jz	L(b0)			C n mod 4 == 0
107	cmp	$2, R32(%r11)
108	jb	L(b1)			C n mod 4 == 1
109	jz	L(b2)			C n mod 4 == 2
110
C n mod 4 == 3: start the second product here and enter the loop at L(L3).
111L(b3):	mov	%rax, %r10
112	mov	%rdx, %r11
113	mov	8(up), %rax
114	mul	v0
115	lea	(up,n,8), up		C up now points just past the last source limb
116	not	n			C n = -n - 1
117	jmp	L(L3)
118
C n mod 4 == 0: enter the loop at L(L0).
119L(b0):	mov	%rax, %r9
120	mov	%rdx, %r10
121	mov	8(up), %rax
122	lea	(up,n,8), up		C up now points just past the last source limb
123	neg	n
124	jmp	L(L0)
125
C n mod 4 == 1: n == 1 is finished at L(n1), otherwise store the first limb
C and enter the loop at L(L1).
126L(b1):	mov	%rax, %r8
127	cmp	$1, n
128	jz	L(n1)			C single limb: r8 is the low limb, rdx the high limb
129	mov	%rdx, %r9
130	lea	(up,n,8), up		C up now points just past the last source limb
131	neg	n
132	mov	%r8, 16(rp,n,8)		C store the lowest result limb
133	inc	n
134	jmp	L(L1)
135
C n mod 4 == 2: fall through into the loop at L(top), or go straight to
C L(end) when n == 2 and no loop iteration is needed.
136L(b2):	mov	%rax, %r11
137	mov	%rdx, %r8
138	mov	8(up), %rax
139	lea	(up,n,8), up		C up now points just past the last source limb
140	neg	n
141	add	$2, n
142	jns	L(end)
143
C Main loop, four limbs per iteration.  n is a negative limb index that is
C advanced by 4 per pass, and the loop exits once it is no longer negative.
C At L(top) rax holds the next source limb, r8 the high limb still to be
C combined and r11 a finished limb waiting to be stored.
144	ALIGN(16)
145L(top):	mul	v0
146	mov	%rdx, %r9
147	add	%rax, %r8
148	adc	$0, %r9
149	mov	%r8, 8(rp,n,8)
150	mov	%r11, (rp,n,8)
151L(L1):	mov	(up,n,8), %rax
152	mul	v0
153	add	%rax, %r9
154	mov	%rdx, %r10
155	mov	8(up,n,8), %rax
156	adc	$0, %r10
157L(L0):	mul	v0
158	add	%rax, %r10
159	mov	%rdx, %r11
160	mov	16(up,n,8), %rax
161	adc	$0, %r11
162	mul	v0
163	mov	%r9, 16(rp,n,8)
164L(L3):	add	%rax, %r11
165	mov	%r10, 24(rp,n,8)
166	mov	%rdx, %r8
167	adc	$0, %r8
168	add	$4, n			C sets SF for the js below, the mov keeps flags
169	mov	-8(up,n,8), %rax	C load the next source limb
170	js	L(top)
171
C Wind-down: multiply the last limb and store the final two result limbs.
172L(end):	mul	v0
173	add	%rax, %r8
174	adc	$0, %rdx
175	mov	%r11, (rp)
176L(n1):	mov	%r8, 8(rp)		C top result limb, rp was biased by -16 at L(common)
177	mov	%rdx, %rax		C return the most significant limb
178
C Restore registers in the reverse order of the pushes at entry.
179	pop	%rbx
180IFDOS(``pop	%rdi		'')
181IFDOS(``pop	%rsi		'')
182	ret
183EPILOGUE()
184ASM_END()
185