xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/atom/mul_2.asm (revision 7bdf38e5b7a28439665f2fdeff81e36913eef7dd)
1dnl  AMD64 mpn_mul_2 optimised for Intel Atom.
2
3dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb	best
34C AMD K8,K9      5.78
35C AMD K10        5.78
36C AMD bull       9.10
37C AMD pile       9.17
38C AMD steam
39C AMD excavator
40C AMD bobcat    11.3
41C AMD jaguar    10.9
42C Intel P4      24.6
43C Intel core2    8.06
44C Intel NHM      7.65
45C Intel SBR      6.28
46C Intel IBR      6.10
47C Intel HWL      6.09
48C Intel BWL      4.73
49C Intel SKL      4.77
50C Intel atom    35.3
51C Intel SLM     25.6
52C VIA nano
53
54C The loop of this code is the result of running a code generation and
55C optimisation tool suite written by David Harvey and Torbjorn Granlund.
56
C Incoming arguments.  The register named in each trailing comment is
C presumably the DOS64 register carrying the same argument - TODO confirm
C against FUNC_ENTRY, which is defined outside this file.
57define(`rp',      `%rdi')   C rcx
58define(`up',      `%rsi')   C rdx
59define(`n_param', `%rdx')   C r8
60define(`vp',      `%rcx')   C r9
61
C Working registers.  w1 names the same physical register as vp (%rcx), so
C both limbs at vp have to be read before w1 is written for the first time.
62define(`v0', `%r8')	C multiplier limb read from (vp)
63define(`v1', `%r9')	C multiplier limb read from 8(vp)
64define(`w0', `%rbx')	C accumulator limb - %rbx is pushed on entry
65define(`w1', `%rcx')	C accumulator limb - shares %rcx with vp
66define(`w2', `%rbp')	C accumulator limb - %rbp is pushed on entry
67define(`w3', `%r10')	C accumulator limb
68define(`n',  `%r11')	C working copy of n_param
69
70ABI_SUPPORT(DOS64)
71ABI_SUPPORT(STD64)
72
C mpn_mul_2 -- multiply the n limbs at up by the two limbs at vp.
C
C Inputs:  rp = result pointer, up = source pointer, n_param = limb count,
C          vp = pointer to the two multiplier limbs (v0 at 0, v1 at 8).
C Effect:  stores n+1 result limbs starting at rp and returns the most
C          significant limb in %rax.
C
C Structure: the first product up[0]*v0 is formed up front, then the low two
C bits of n select one of four entry points (lo0..lo3) into a 4-way unrolled
C loop.  Each entry path places that product in the pair of accumulators the
C chosen entry point expects, zeroes the third one, and biases up and rp so
C that the fixed offsets used inside the loop land on the right limbs.
C
C Inside the loop w0..w3 act as a rotating window of accumulators.  Registers
C are cleared with mov $0 and pointers advanced with lea wherever the carry
C flag is live, since neither instruction modifies flags.
C
C NOTE(review): for n = 1 the b01 path decrements n to 0 and still runs one
C full loop pass, which reads limbs beyond up[0].  Presumably callers
C guarantee n >= 2 - confirm.
73ASM_START()
74	TEXT
75	ALIGN(16)
76PROLOGUE(mpn_mul_2)
77	FUNC_ENTRY(4)		C defined elsewhere - presumably remaps DOS64 arguments
78	push	%rbx		C %rbx is used as w0
79	push	%rbp		C %rbp is used as w2
80
81	mov	(up), %rax	C rax = up[0]
82
83	mov	(vp), v0	C read both multiplier limbs now - w1 later
84	mov	8(vp), v1	C overwrites %rcx which is vp
85
86	mov	n_param, n	C copy n out of %rdx before mul overwrites it
87	mul	v0		C rdx:rax = up[0] * v0
88
89	test	$1, R8(n)	C bit 0 of n
90	jnz	L(bx1)
91
92L(bx0):	test	$2, R8(n)	C bit 1 of n
93	jnz	L(b10)
94
C n mod 4 = 0: enter at lo0 with the product in w1:w0.
95L(b00):	mov	%rax, w0
96	mov	(up), %rax	C reload up[0] for the v1 multiply
97	mov	%rdx, w1
98	xor	R32(w2), R32(w2)
99	lea	-8(rp), rp	C lo0 stores at 8(rp)
100	jmp	L(lo0)
101
C n mod 4 = 2: enter at lo2 with the product in w3:w2.
102L(b10):	mov	%rax, w2
103	mov	(up), %rax	C reload up[0] for the v1 multiply
104	mov	%rdx, w3
105	xor	R32(w0), R32(w0)	C also clears CF which the adc at lo2 consumes
106	lea	-16(up), up	C lo2 loads from 24(up)
107	lea	-24(rp), rp	C lo2 stores at 24(rp)
108	jmp	L(lo2)
109
110L(bx1):	test	$2, R8(n)	C bit 1 of n
111	jnz	L(b11)
112
C n mod 4 = 1: enter at lo1 with the product in w0:w3.
113L(b01):	mov	%rax, w3
114	mov	%rdx, w0
115	mov	(up), %rax	C reload up[0] for the v1 multiply
116	xor	R32(w1), R32(w1)
117	lea	8(up), up	C lo1 loads the next limb from (up)
118	dec	n		C one limb is consumed before the first full pass
119	jmp	L(lo1)
120
C n mod 4 = 3: enter at lo3 with the product in w2:w1.
121L(b11):	mov	%rax, w1
122	mov	(up), %rax	C reload up[0] for the v1 multiply
123	mov	%rdx, w2
124	xor	R32(w3), R32(w3)
125	lea	-8(up), up	C lo3 loads from 16(up)
126	lea	-16(rp), rp	C lo3 stores at 16(rp)
127	jmp	L(lo3)
128
C Main loop.  Each quarter multiplies the limb already in %rax by v1, loads
C the next limb, stores one finished result limb, multiplies the new limb by
C v0, then reloads it so the following quarter can multiply it by v1.
129	ALIGN(16)
130L(top):
131L(lo1):	mul	v1
132	add	%rax, w0
133	mov	(up), %rax	C next source limb
134	mov	$0, R32(w2)	C clear w2 without disturbing CF
135	mov	w3, (rp)	C store finished limb
136	adc	%rdx, w1
137	mul	v0
138	add	%rax, w0
139	mov	(up), %rax	C same limb again for the v1 multiply
140	adc	%rdx, w1
141	adc	$0, R32(w2)	C capture carry out of w1
142L(lo0):	mul	v1
143	add	%rax, w1
144	mov	8(up), %rax	C next source limb
145	mov	w0, 8(rp)	C store finished limb
146	adc	%rdx, w2
147	mul	v0
148	add	%rax, w1
149	mov	8(up), %rax	C same limb again for the v1 multiply
150	adc	%rdx, w2
151	mov	$0, R32(w3)	C clear w3 without disturbing CF
152	adc	$0, R32(w3)	C capture carry out of w2
153L(lo3):	mul	v1
154	add	%rax, w2
155	mov	16(up), %rax	C next source limb
156	mov	w1, 16(rp)	C store finished limb
157	mov	$0, R32(w0)	C clear w0 without disturbing CF
158	adc	%rdx, w3
159	mul	v0
160	add	%rax, w2
161	mov	16(up), %rax	C same limb again for the v1 multiply
162	adc	%rdx, w3
163L(lo2):	mov	$0, R32(w1)	C clear w1 without disturbing CF
164	mov	w2, 24(rp)	C store finished limb
165	adc	$0, R32(w0)	C carry out of w3 - CF is 0 when entered from b10
166	mul	v1
167	add	%rax, w3
168	mov	24(up), %rax	C next source limb
169	lea	32(up), up	C advance source by four limbs - flags untouched
170	adc	%rdx, w0
171	mul	v0
172	lea	32(rp), rp	C advance result by four limbs - flags untouched
173	add	%rax, w3
174	adc	%rdx, w0
175	mov	-8(up), %rax	C limb formerly at 24(up) for the v1 multiply
176	adc	$0, R32(w1)	C capture carry out of w0
177	sub	$4, n
178	ja	L(top)		C repeat while n stays above zero with no borrow
179
C Wind down: multiply the last limb by v1, store the two pending limbs and
C return the top accumulator.
180L(end):	mul	v1
181	mov	w3, (rp)
182	add	%rax, w0
183	adc	%rdx, w1
184	mov	w0, 8(rp)
185	mov	w1, %rax	C return value = most significant limb
186	pop	%rbp		C restore in reverse order of the pushes
187	pop	%rbx
188	FUNC_EXIT()		C defined elsewhere - counterpart of FUNC_ENTRY
189	ret
190EPILOGUE()
191