dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.275
C K10:		 2.275
C P4:		 ?
C P6 core2:	 4.0
C P6 corei7:	 3.8

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.

C INPUT PARAMETERS
define(`rp',	 `%rdi')
define(`up',	 `%rsi')
define(`n_param',`%rdx')
define(`vp',	 `%rcx')
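
C The four defines above name the SysV x86-64 argument registers after the
C operands of the C-level declaration this routine is presumably built
C against (as in gmp-impl.h):
C	mp_limb_t mpn_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)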

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

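C The routine forms the (n+2)-limb product {up,n} * {vp,2}, stores its low
C n+1 limbs at rp, and returns the most significant limb.  A rough C-level
C sketch of that behaviour, written in terms of the public mpn_mul_1 and
C mpn_addmul_1 primitives (illustrative only, ref_mul_2 is hypothetical and
C not part of this file):
C
C	mp_limb_t
C	ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);	       /* rp[0..n] = up * v0 */
C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);  /* add up * v1, shifted
C							  one limb; high limb
C							  is the return value */
C	}
C
C A caller such as mpn_mul_basecase can then place the returned limb at
C rp[n+1] to complete the product.
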
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	(up), %rax

	mov	n_param, n
	neg	n
	lea	-8(up,n_param,8), up
	lea	-8(rp,n_param,8), rp

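C The main loop below is unrolled four limbs per iteration, so the low two
C bits of n_param select one of four feed-in paths (m2p0/m2p1/m2p2/m2p3 for
C n = 0/1/2/3 mod 4).  up and rp have been biased by -8 and n negated so the
C (up,n,8)/(rp,n,8) addressing walks forward under control of the
C `add $4, n' / `js' loop counter.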
	and	$3, R32(n_param)
	jz	L(m2p0)
	cmp	$2, R32(n_param)
	jc	L(m2p1)
	jz	L(m2p2)
L(m2p3):
	mul	v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	add	$-1, n
	mul	v1
	add	%rax, w2
	jmp	L(m23)
L(m2p0):
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):
	mul	v0
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):
	mul	v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax
	add	$-2, n
	jmp	L(m22)


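C Main loop: four limbs of up are consumed per iteration.  Products by v0
C and v1 are interleaved, the sliding partial sums live in w0..w3, and one
C result limb is stored per source limb.  The L(m20)/L(m23)/L(m22) labels
C are the mid-loop entry points used by the feed-in paths above.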
	ALIGN(32)
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0
	add	$4, n
	js	L(m2top)


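C Wind-down: accumulate the v0 product issued just before the loop exit and
C the final v1 product, store the top two result limbs, and return the most
C significant limb of the product in %rax.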
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax
	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()