xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/mul_2.asm (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
2dnl  store the result in a third limb vector.
3
4dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C	     cycles/limb
35C AMD K8,K9      4.53
36C AMD K10        4.53
37C AMD bull       9.76   10.37
38C AMD pile       9.22
39C AMD steam
40C AMD excavator
41C AMD bobcat    11.3
42C AMD jaguar    11.9
43C Intel P4      25.0
44C Intel core2    8.05
45C Intel NHM      7.72
46C Intel SBR      6.33
47C Intel IBR      6.15
48C Intel HWL      6.00
49C Intel BWL      4.44
50C Intel SKL      4.54
51C Intel atom    39.0
52C Intel SLM     24.0
53C VIA nano
54
55C This code is the result of running a code generation and optimization tool
56C suite written by David Harvey and Torbjorn Granlund.
57
58C TODO
59C  * Work on feed-in and wind-down code.
60C  * Convert "mov $0" to "xor".
61C  * Adjust initial lea to save some bytes.
62C  * Perhaps adjust n from n_param&3 value?
63C  * Replace with 2.25 c/l sequence.
64
65C INPUT PARAMETERS
C The four incoming arguments live in the first four System V AMD64 integer
C argument registers:
C   rp       destination limb vector
C   up       source limb vector of n_param limbs
C   n_param  number of limbs at up
C   vp       2-limb multiplier vector
66define(`rp',	 `%rdi')
67define(`up',	 `%rsi')
68define(`n_param',`%rdx')
69define(`vp',	 `%rcx')
70
C Working registers used by the function body:
C   v0, v1   the two multiplier limbs loaded from (vp) and 8(vp)
C   w0..w3   rotating accumulators for the partial product sums
C   n        negated limb index, counts up towards zero
C Note that w1 shares %rcx with vp, so both limbs of vp have to be loaded
C before w1 is written for the first time.  w0 (%rbx) and w2 (%rbp) are
C callee-saved registers, which is why the function pushes and pops them.
71define(`v0', `%r8')
72define(`v1', `%r9')
73define(`w0', `%rbx')
74define(`w1', `%rcx')
75define(`w2', `%rbp')
76define(`w3', `%r10')
77define(`n',  `%r11')
78
79ABI_SUPPORT(DOS64)
80ABI_SUPPORT(STD64)
81
82ASM_START()
83	TEXT
84	ALIGN(16)
C mpn_mul_2 (rp, up, n_param, vp)
C
C Multiply the n_param-limb vector at up by the 2-limb vector at vp.  The
C n_param+1 least significant limbs of the product are stored at
C rp[0] .. rp[n_param] and the most significant limb is returned in %rax.
C
C Structure: four feed-in stubs selected by n_param mod 4, a 4-way unrolled
C and software-pipelined main loop (m2top) with three extra entry points
C (m20, m23, m22), and a wind-down sequence that finishes the last limb.
C
C Registers written: %rax, %rdx, %rcx, %r8, %r9, %r10, %r11 plus the
C argument registers; %rbx and %rbp are saved and restored on the stack.
C
C The feed-in code has no special case for n_param < 2.
C NOTE(review): with n_param = 1 the m2p1 stub enters the full loop body
C with n = 0, so the body would read 8(up)..32(up) and store to
C 0(rp)..24(rp), i.e. past a one-limb operand.  Presumably callers always
C pass n_param >= 2 -- TODO confirm against the callers.
85PROLOGUE(mpn_mul_2)
C FUNC_ENTRY and FUNC_EXIT are defined outside this file; presumably they
C adapt the DOS64 calling convention to the register names used here --
C TODO confirm.
86	FUNC_ENTRY(4)
87	push	%rbx			C w0 is callee-saved
88	push	%rbp			C w2 is callee-saved
89
C Fetch both multiplier limbs now: vp is %rcx, which is reused as w1 below.
90	mov	(vp), v0
91	mov	8(vp), v1
92
93	mov	(up), %rax		C rax = up[0], operand of the first mul
94
C n = -n_param, and up/rp are moved to their last limb (index n_param-1),
C so k(up,n,8) and k(rp,n,8) index from the start with a negative n and
C plain (up), (rp) address the last limb once n has reached zero.
95	mov	n_param, n
96	neg	n
97	lea	-8(up,n_param,8), up
98	lea	-8(rp,n_param,8), rp
99
C Dispatch on n_param mod 4.  After the cmp against 2: carry set means 1,
C zero set means 2, otherwise fall through for 3.
100	and	$3, R32(n_param)
101	jz	L(m2p0)
102	cmp	$2, R32(n_param)
103	jc	L(m2p1)
104	jz	L(m2p2)
C n_param mod 4 == 3: seed w1:w2 with up[0]*v0, start up[0]*v1 and join the
C loop at m23 with the carry of the add still pending in CF.
105L(m2p3):
106	mul	v0			C rdx:rax = up[0] * v0
107	xor	R32(w3), R32(w3)
108	mov	%rax, w1
109	mov	%rdx, w2
110	mov	8(up,n,8), %rax		C reload up[0], n is still -n_param
111	add	$-1, n			C bias n so the m23 offsets hit limb 0
112	mul	v1			C rdx:rax = up[0] * v1
113	add	%rax, w2		C CF is consumed by the adc at m23
114	jmp	L(m23)
C n_param mod 4 == 0: seed w0:w1 with up[0]*v0 and join the loop at m20.
115L(m2p0):
116	mul	v0			C rdx:rax = up[0] * v0
117	xor	R32(w2), R32(w2)
118	mov	%rax, w0
119	mov	%rdx, w1
120	jmp	L(m20)
C n_param mod 4 == 1: clear the accumulators and enter at the loop top with
C up[0]*v0 still pending in rdx:rax.
121L(m2p1):
122	mul	v0			C rdx:rax = up[0] * v0, added at m2top
123	xor	R32(w3), R32(w3)
124	xor	R32(w0), R32(w0)
125	xor	R32(w1), R32(w1)
126	add	$1, n			C bias n so the m2top offsets hit limb 0
127	jmp	L(m2top)
C n_param mod 4 == 2: seed w2:w3 with up[0]*v0, reload up[0] for the v1
C product and join the loop at m22.
128L(m2p2):
129	mul	v0			C rdx:rax = up[0] * v0
130	xor	R32(w0), R32(w0)
131	xor	R32(w1), R32(w1)
132	mov	%rax, w2
133	mov	%rdx, w3
134	mov	8(up,n,8), %rax		C reload up[0], n is still -n_param
135	add	$-2, n			C bias n so the m22 offsets hit limb 0
136	jmp	L(m22)
137
138
C Main loop, four limbs of up per iteration.  In the comments below U0..U4
C are the limbs at 0,8,16,24,32(up,n,8) and R0..R3 the result limbs at
C 0,8,16,24(rp,n,8).  Each iteration is entered with U0*v0 already computed
C in rdx:rax.  Accumulators are cleared with mov rather than xor; at least
C the clear of w1 before m22 sits inside a live carry chain and therefore
C must not modify the flags.
139	ALIGN(32)
140L(m2top):
141	add	%rax, w3		C w3 += lo(U0*v0)
142	adc	%rdx, w0		C w0 += hi(U0*v0) + CF
143	mov	0(up,n,8), %rax		C rax = U0
144	adc	$0, R32(w1)		C w1 += CF
145	mov	$0, R32(w2)
146	mul	v1			C rdx:rax = U0 * v1
147	add	%rax, w0		C w0 += lo(U0*v1)
148	mov	w3, 0(rp,n,8)		C R0 is complete
149	adc	%rdx, w1		C w1 += hi(U0*v1) + CF
150	mov	8(up,n,8), %rax		C rax = U1
151	mul	v0			C rdx:rax = U1 * v0
152	add	%rax, w0		C w0 += lo(U1*v0)
153	adc	%rdx, w1		C w1 += hi(U1*v0) + CF
154	adc	$0, R32(w2)		C w2 += CF
155L(m20):	mov	8(up,n,8), %rax		C rax = U1
156	mul	v1			C rdx:rax = U1 * v1
157	add	%rax, w1		C w1 += lo(U1*v1)
158	adc	%rdx, w2		C w2 += hi(U1*v1) + CF
159	mov	16(up,n,8), %rax	C rax = U2
160	mov	$0, R32(w3)
161	mul	v0			C rdx:rax = U2 * v0
162	add	%rax, w1		C w1 += lo(U2*v0)
163	mov	16(up,n,8), %rax	C rax = U2 again, mov keeps CF intact
164	adc	%rdx, w2		C w2 += hi(U2*v0) + CF
165	adc	$0, R32(w3)		C w3 += CF
166	mul	v1			C rdx:rax = U2 * v1
167	add	%rax, w2		C w2 += lo(U2*v1)
168	mov	w0, 8(rp,n,8)		C R1 is complete
169L(m23):	adc	%rdx, w3		C w3 += hi(U2*v1) + CF
170	mov	24(up,n,8), %rax	C rax = U3
171	mul	v0			C rdx:rax = U3 * v0
172	mov	$0, R32(w0)
173	add	%rax, w2		C w2 += lo(U3*v0)
174	adc	%rdx, w3		C w3 += hi(U3*v0) + CF
175	mov	w1, 16(rp,n,8)		C R2 is complete
176	mov	24(up,n,8), %rax	C rax = U3
177	mov	$0, R32(w1)		C CF from the adc above is still live
178	adc	$0, R32(w0)		C w0 += CF
179L(m22):	mul	v1			C rdx:rax = U3 * v1
180	add	%rax, w3		C w3 += lo(U3*v1)
181	mov	w2, 24(rp,n,8)		C R3 is complete
182	adc	%rdx, w0		C w0 += hi(U3*v1) + CF
183	mov	32(up,n,8), %rax	C rax = U4, i.e. U0 of the next round
184	mul	v0			C rdx:rax = U4 * v0, consumed at m2top
185	add	$4, n
186	js	L(m2top)		C loop while n is still negative
187
188
C Wind-down.  n is zero here, so (up) and (rp) address limb n_param-1 and
C rdx:rax holds up[n_param-1] * v0 from the last mul of the loop.
189	add	%rax, w3		C w3 += lo(up[n_param-1] * v0)
190	adc	%rdx, w0		C w0 += hi(up[n_param-1] * v0) + CF
191	adc	$0, R32(w1)		C w1 += CF
192	mov	(up), %rax		C rax = up[n_param-1]
193	mul	v1			C rdx:rax = up[n_param-1] * v1
194	mov	w3, (rp)		C store rp[n_param-1]
195	add	%rax, w0
196	adc	%rdx, w1
197	mov	w0, 8(rp)		C store rp[n_param]
198	mov	w1, %rax		C return the most significant limb
199
200	pop	%rbp
201	pop	%rbx
202	FUNC_EXIT()
203	ret
204EPILOGUE()
205