xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/k8/addmul_2.asm (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
2dnl  add the result to a third limb vector.
3
4dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C	     cycles/limb     cycles/limb cfg	cycles/limb am1+am1
35C AMD K8,K9	 2.375
36C AMD K10	 2.375
37C AMD bull	 5.2		<-		4.6-4.75		bad
38C AMD pile	 4.96		<-		4.6-4.75		bad
39C AMD steam	 ?
40C AMD excavator	 ?
41C AMD bobcat	 5.75				5.0			bad
42C AMD jaguar	 5.9				5.2-5.4			bad
43C Intel P4	15-16
44C Intel core2	 4.5				4.25-4.5		bad
45C Intel NHM	 4.33				4.55			bad
46C Intel SBR	 3.4		 2.93		3.24			bad
47C Intel IBR	 3.35		 2.6		2.95			bad
48C Intel HWL	 3.3		 2.15		2.3			bad
49C Intel BWL	 2.33		 2.33		1.65			bad
50C Intel SKL	 2.37		 2.21		1.64			bad
51C Intel atom	20		18.7
52C Intel SLM	 8		 8.5
53C VIA nano	 4.4
54
55C This code is the result of running a code generation and optimization tool
56C suite written by David Harvey and Torbjorn Granlund.
57
58C TODO
59C  * Tune feed-in and wind-down code.
60
61C INPUT PARAMETERS
C  The four arguments, named after the registers the function body reads
C  them from.
62define(`rp',     `%rdi')	C vector that is read, added to and written back
63define(`up',     `%rsi')	C source vector, only read
64define(`n_param',`%rdx')	C limb count as passed in; copied to n since mul overwrites %rdx
65define(`vp',     `%rcx')	C address of the two multiplier limbs, loaded into v0 and v1
66
C  Working registers.  w0-w3 hold partial sums and rotate roles through the
C  unrolled loop.
67define(`v0', `%r8')	C vp[0]
68define(`v1', `%r9')	C vp[1]
69define(`w0', `%rbx')	C pushed/popped by the function; also holds count mod 4 briefly at entry
70define(`w1', `%rcx')	C same register as vp, so vp must be fully consumed before w1 is written
71define(`w2', `%rbp')	C pushed/popped by the function
72define(`w3', `%r10')
73define(`n',  `%r11')	C negated limb index, stepped up towards zero
74
C  ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined outside this
C  file (brought in through the config.m4 include above).
C  NOTE(review): presumably the two ABI_SUPPORT lines declare that this code
C  may be built for both the DOS64 and STD64 calling conventions -- confirm
C  against the macro definitions.
75ABI_SUPPORT(DOS64)
76ABI_SUPPORT(STD64)
77
78ASM_START()
79	TEXT
80	ALIGN(16)
C mpn_addmul_2 (rp, up, n, vp)
C
C Multiplies the limb vector at up (n limbs of 8 bytes) by the two limbs
C vp[0], vp[1] and adds the product to the limbs at rp.  rp[0] .. rp[n-1]
C are read and updated in place, rp[n] is stored (overwritten, not added
C to), and the most significant limb of the result is returned in %rax.
C
C Structure:
C   * Entry: load v0/v1, point up and rp at their last limb, negate n so
C     that (up,n,8) addressing walks forward as n counts up to zero, and
C     start the first product up[0]*v0.
C   * Feed-in blocks L(b0)..L(b3), selected by count mod 4: place that first
C     product in the accumulator pair expected by the matching loop entry
C     L(lo0)..L(lo3), reload up[0] for the v1 multiply, and bias n so the
C     entry point addresses rp[0].
C   * Loop, unrolled 4 ways: each quarter multiplies the current source limb
C     by v1, adds the finished low accumulator into one limb of rp, then
C     multiplies the next source limb by v0.  w0-w3 rotate roles.
C   * L(end): finish the last source limb and write the two top limbs.
C
C %rbx and %rbp are saved on entry and restored before returning.
C
C NOTE(review): a count of 1 does not appear to be supported.  L(b1) would
C enter the loop at L(lo1) with n == 0 and run a full four-limb pass, reading
C and updating limbs beyond the operands.  Callers presumably guarantee a
C count of at least 2 -- confirm.
81PROLOGUE(mpn_addmul_2)
82	FUNC_ENTRY(4)	C macro defined elsewhere; NOTE(review): presumably sets up the 4 arguments for the DOS64 ABI -- confirm
83	mov	n_param, n	C keep the count in n; mul overwrites %rdx (n_param)
84	push	%rbx	C save register used as w0
85	push	%rbp	C save register used as w2
86
87	mov	0(vp), v0	C v0 = vp[0]
88	mov	8(vp), v1	C v1 = vp[1]; vp (%rcx) is dead from here on and is reused as w1
89
90	mov	R32(n_param), R32(%rbx)	C low 32 bits of the count, for the mod 4 dispatch below
91	mov	(up), %rax	C rax = up[0]
92	lea	-8(up,n_param,8), up	C up -> last source limb
93	lea	-8(rp,n_param,8), rp	C rp -> rp[count-1]
94	mul	v0	C rdx:rax = up[0] * v0
95	neg	n	C n = -count; 8(up,n,8) now addresses up[0]
96	and	$3, R32(%rbx)	C rbx = count mod 4
97	jz	L(b0)	C count mod 4 == 0
98	cmp	$2, R32(%rbx)
99	jc	L(b1)	C count mod 4 == 1
100	jz	L(b2)	C count mod 4 == 2; fall through when it is 3
101
C count mod 4 == 3: product goes to w1 (low) and w2 (high), w3 starts at 0.
102L(b3):	mov	%rax, w1
103	mov	%rdx, w2
104	xor	R32(w3), R32(w3)
105	mov	8(up,n,8), %rax	C reload up[0] for the v1 multiply
106	dec	n	C bias n so that 16(rp,n,8) at L(lo3) is rp[0]
107	jmp	L(lo3)
108
C count mod 4 == 2: product goes to w2 (low) and w3 (high), w0 starts at 0.
109L(b2):	mov	%rax, w2
110	mov	8(up,n,8), %rax	C reload up[0] for the v1 multiply
111	mov	%rdx, w3
112	xor	R32(w0), R32(w0)
113	add	$-2, n	C bias n so that 24(rp,n,8) at L(lo2) is rp[0]
114	jmp	L(lo2)
115
C count mod 4 == 1: product goes to w3 (low) and w0 (high), w1 starts at 0.
116L(b1):	mov	%rax, w3
117	mov	8(up,n,8), %rax	C reload up[0] for the v1 multiply
118	mov	%rdx, w0
119	xor	R32(w1), R32(w1)
120	inc	n	C bias n so that (rp,n,8) at L(lo1) is rp[0]
121	jmp	L(lo1)
122
C count mod 4 == 0: product goes to w0 (low) and w1 (high), w2 and w3 start
C at 0.  n needs no bias: 8(rp,n,8) at L(lo0) is already rp[0].
123L(b0):	mov	$0, R32(w3)
124	mov	%rax, w0
125	mov	8(up,n,8), %rax	C reload up[0] for the v1 multiply
126	mov	%rdx, w1
127	xor	R32(w2), R32(w2)
128	jmp	L(lo0)
129
C Main loop.  On arrival at L(top), rax holds the source limb at (up,n,8),
C loaded at the bottom of the previous pass before n was advanced.
130	ALIGN(32)
131L(top):	mov	$0, R32(w1)
132	mul	v0	C rdx:rax = source limb * v0
133	add	%rax, w3
134	mov	(up,n,8), %rax	C reload the same source limb for the v1 multiply
135	adc	%rdx, w0
136	adc	$0, R32(w1)	C w1 = carry out (w1 was zeroed above)
137L(lo1):	mul	v1	C rdx:rax = source limb * v1
138	add	w3, (rp,n,8)	C add the finished low limb into rp
139	mov	$0, R32(w3)	C mov rather than xor: CF from the add must reach the adc below
140	adc	%rax, w0
141	mov	$0, R32(w2)	C mov rather than xor: CF must reach the adc below
142	mov	8(up,n,8), %rax	C next source limb
143	adc	%rdx, w1	C carry out of this adc is not recorded (mul below rewrites CF)
144	mul	v0
145	add	%rax, w0
146	mov	8(up,n,8), %rax	C reload the same source limb for the v1 multiply
147	adc	%rdx, w1
148	adc	$0, R32(w2)	C w2 = carry out (w2 was zeroed above or in L(b0))
149L(lo0):	mul	v1
150	add	w0, 8(rp,n,8)	C add the finished low limb into rp
151	adc	%rax, w1
152	adc	%rdx, w2	C carry out of this adc is not recorded
153	mov	16(up,n,8), %rax	C next source limb
154	mul	v0
155	add	%rax, w1
156	adc	%rdx, w2
157	adc	$0, R32(w3)	C w3 = carry out (w3 was zeroed above or in L(b0))
158	mov	16(up,n,8), %rax	C reload the same source limb for the v1 multiply
159L(lo3):	mul	v1
160	add	w1, 16(rp,n,8)	C add the finished low limb into rp
161	adc	%rax, w2
162	adc	%rdx, w3
163	xor	R32(w0), R32(w0)	C w0 = 0; xor clears CF, so a carry out of the adc above is not recorded. NOTE(review): presumably none can occur given the bound on the partial sums -- confirm
164	mov	24(up,n,8), %rax	C next source limb
165	mul	v0
166	add	%rax, w2
167	mov	24(up,n,8), %rax	C reload the same source limb for the v1 multiply
168	adc	%rdx, w3
169	adc	$0, R32(w0)	C w0 = carry out
170L(lo2):	mul	v1
171	add	w2, 24(rp,n,8)	C add the finished low limb into rp
172	adc	%rax, w3
173	adc	%rdx, w0	C carry out of this adc is not recorded (add below rewrites CF)
174	mov	32(up,n,8), %rax	C source limb for the next pass, or the last limb on exit
175	add	$4, n	C four limbs handled per pass
176	js	L(top)	C loop while n is still negative
177
C Wind-down.  For the supported counts n is 0 here, so rax holds the last
C source limb and (up), (rp) address the last source limb and rp[count-1].
178L(end):	xor	R32(w1), R32(w1)	C w1 = 0
179	mul	v0	C last source limb * v0
180	add	%rax, w3
181	mov	(up), %rax	C reload the last source limb for the v1 multiply
182	adc	%rdx, w0
183	adc	R32(w1), R32(w1)	C w1 = carry out, since w1 was 0
184	mul	v1
185	add	w3, (rp)	C final update of rp[count-1]
186	adc	%rax, w0
187	adc	%rdx, w1
188	mov	w0, 8(rp)	C rp[count] is stored, not added to
189	mov	w1, %rax	C return the most significant limb
190
191	pop	%rbp	C restore in reverse order of the pushes
192	pop	%rbx
193	FUNC_EXIT()	C macro defined elsewhere; NOTE(review): presumably the counterpart of FUNC_ENTRY -- confirm
194	ret
195EPILOGUE()
196