dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.275
C AMD K10	 2.275
C Intel P4	13.5
C Intel core2	 4.0
C Intel corei	 3.8
C Intel atom	 ?
C VIA nano	 ?

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

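C  For reference, mpn_mul_2 computes {rp,n+1} = {up,n} * {vp,2} mod B^(n+1),
C  where B = 2^64 is the limb base, and returns the top limb of the full
C  (n+2)-limb product.  A minimal C sketch of these semantics (an
C  illustration only, not GMP's C code; it assumes a 64-bit limb and a
C  compiler with unsigned __int128):
C
C	mp_limb_t
C	mpn_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  mp_limb_t v0 = vp[0], v1 = vp[1];
C	  unsigned __int128 c = 0;	/* two-limb carry into position i */
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      unsigned __int128 s = (unsigned __int128) up[i] * v0 + (mp_limb_t) c;
C	      rp[i] = (mp_limb_t) s;	/* limb i is now complete */
C	      c = (c >> 64) + (s >> 64) + (unsigned __int128) up[i] * v1;
C	    }
C	  rp[n] = (mp_limb_t) c;	/* limb n of the result */
C	  return (mp_limb_t) (c >> 64);	/* top limb of the product */
C	}
C
C  Neither addition can overflow: s <= (B-1)^2 + (B-1) < B^2, and the new
C  c is at most (B-1) + (B-1) + (B-1)^2 < B^2.
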
C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor" (see the note below).
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.

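C  On the "mov $0"-to-"xor" item above: xor reg,reg is shorter and breaks
C  the register dependency, but unlike mov it clobbers the flags, so it
C  cannot replace a mov $0 that sits inside a live carry chain (e.g. the
C  mov $0, R32(w1) between adc %rdx, w3 and adc $0, R32(w0) in the loop
C  below); only the instances with no carry in flight are candidates.
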
C INPUT PARAMETERS
define(`rp',	 `%rdi')
define(`up',	 `%rsi')
define(`n_param',`%rdx')
define(`vp',	 `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

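C  v0 and v1 hold the two multiplier limbs for the whole run.  w0-w3 form
C  a rotating window of partial-product accumulators for the 4-way
C  unrolled loop, and n is kept as a negative index counting up toward 0.
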
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	(up), %rax

	mov	n_param, n
	neg	n
	lea	-8(up,n_param,8), up
	lea	-8(rp,n_param,8), rp

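C  Point up and rp at their last limbs and negate n, so the loop can step
C  with a single "add $4, n / js" and negative-index addressing, with no
C  separate pointer updates.  The and/cmp sequence below dispatches on
C  n mod 4 to the feed-in path matching the loop's 4-way unrolling.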
	and	$3, R32(n_param)
	jz	L(m2p0)
	cmp	$2, R32(n_param)
	jc	L(m2p1)
	jz	L(m2p2)
L(m2p3):			C n = 3 mod 4
	mul	v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	add	$-1, n
	mul	v1
	add	%rax, w2
	jmp	L(m23)
L(m2p0):			C n = 0 mod 4
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):			C n = 1 mod 4
	mul	v0
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):			C n = 2 mod 4
	mul	v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax
	add	$-2, n
	jmp	L(m22)


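C  Main loop: four limbs per iteration, two muls (by v0 and v1) per limb.
C  Products are summed into the rotating window w0-w3; for each limb one
C  freshly zeroed window register collects the "adc $0" carry-out.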
	ALIGN(32)
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0
	add	$4, n
	js	L(m2top)


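C  Wind-down: the last mul by v0 was issued just before the loop exited;
C  fold it in, do the final mul by v1, store the two remaining limbs at
C  rp[n-1] and rp[n], and return the product's top limb in %rax.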
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax
	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()