xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/divrem_1.asm (revision fa28c6faa16e0b00edee7acdcaf4899797043def)
1dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
2
3dnl  Copyright 2004, 2005, 2007, 2008, 2009, 2010, 2011, 2012 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C		norm	unorm	frac
25C AMD K8,K9	13	13	12
26C AMD K10	13	13	12
27C Intel P4	43	44	43
28C Intel core2	24.5	24.5	19.5
29C Intel corei	20.5	19.5	18
30C Intel atom	43	46	36
31C VIA nano	25.5	25.5	24
32
33C mp_limb_t
34C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
35C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
36
37C mp_limb_t
38C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
39C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
40C                      mp_limb_t dinv, int cnt)
41
C ARGUMENT REGISTERS
C Listed in argument order.  With the DOS64 ABI, d and dinv arrive on the
C stack and are loaded into the registers below by the prologues.
define(`qp',		`%rdi')
define(`fn_param',	`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`d',		`%r8')
define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
C       shift passed on stack		C only for mpn_preinv_divrem_1

C WORKING REGISTERS
C un and fn are copies of un_param and fn_param made by the prologues.  The
C copies free the parameter registers for reuse: up lives in the register of
C fn_param and cnt in the register of un_param.
define(`un',		`%rbx')
define(`fn',		`%r12')
define(`up',		`%rsi')
define(`cnt',		`%rcx')
55
56
C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C     un  cnt     up  qp      d  dinv         fn
59
60ABI_SUPPORT(DOS64)
61ABI_SUPPORT(STD64)
62
C CNTOFF(base) addresses the shift count argument of mpn_preinv_divrem_1,
C its 7th argument, relative to base = %rsp once the prologue has pushed its
C four registers (32 bytes).
C   STD64:  8 (return address, assuming FUNC_ENTRY leaves %rsp alone)
C           + 32 (pushes)                                          =  40
C   DOS64: 72 (one slot above dinv, which the prologue reads from
C           64(%rsp) before its pushes) + 32 (pushes)              = 104
IFSTD(`define(`CNTOFF',	`40($1)')')
IFDOS(`define(`CNTOFF',	`104($1)')')
65
66ASM_START()
67	TEXT
68	ALIGN(16)
C mp_limb_t
C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
C                      mp_limb_t dinv, int cnt)
C
C Entry point for callers that already have the inverse dinv and the shift
C count cnt.  Only the register setup is done here; the division itself runs
C in the body of mpn_divrem_1, which is entered at
C   L(nent)  when d has its most significant bit set (cnt is not read), or
C   L(uent)  otherwise, after shifting d left by cnt, with cnt in %cl.
C Both entries are reached with %rax = 0, un, fn, up set up, qp pointing at
C the most significant quotient limb, and %r13, %r12, %rbp, %rbx pushed in
C this order, which is the order the shared exit path pops them back.
C
C NOTE(review): unlike mpn_divrem_1 there is no early exit for un + fn == 0,
C and L(uent) loads a numerator limb unconditionally.  Presumably callers
C guarantee nn >= 1; confirm against the callers.
PROLOGUE(mpn_preinv_divrem_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	64(%rsp), %r9	')	C dinv, 6th argument
IFDOS(`	mov	56(%rsp), %r8	')	C d, 5th argument
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	un_param, un
	mov	fn_param, fn
	add	fn_param, un_param	C un + fn = number of quotient limbs
	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
	mov	up_param, up		C reuses the register of fn_param
	xor	R32(%rax), R32(%rax)	C initial remainder

	test	d, d
	js	L(nent)			C top bit set, d is already normalized

	mov	CNTOFF(%rsp), R8(cnt)	C only the low byte of cnt is loaded
	shl	R8(cnt), d		C normalize d
	jmp	L(uent)
EPILOGUE()
93
94	ALIGN(16)
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
C
C Divides the nn limbs at np (called up and un below) by the single limb d
C and then develops fn further quotient limbs from zero numerator limbs.
C The un + fn quotient limbs are stored from qp[un+fn-1] downwards, and the
C remainder is returned in %rax.
C
C Outline:
C   1. d is normalized (most significant bit set).  The shift count is kept
C      in %cl, and is zero when d was normalized to begin with.
C   2. dinv = mpn_invert_limb (d) is computed once.
C   3. Integer part, one quotient limb per numerator limb:
C      L(ntop) for normalized d, L(utop) and L(uend) for unnormalized d,
C      where the numerator is shifted left by cnt on the fly.
C   4. Fraction part, L(ftop), fn limbs.
C   5. The remainder is shifted right by cnt and returned.
C
C Registers after the prologue:
C   qp %rdi   up %rsi   un %rbx   fn %r12   d %r8   dinv %r9   cnt %cl
C   %rax running remainder, %rbp %r10 %r11 %r13 temporaries
C
C mpn_preinv_divrem_1 jumps into this code at L(nent) and L(uent).  It
C relies on the register assignment above and on the push order of the
C prologue, so neither may change without updating that function.
PROLOGUE(mpn_divrem_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')	C d, 5th argument
	xor	R32(%rax), R32(%rax)	C return value if there is nothing to do
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C un + fn = number of quotient limbs
	mov	up_param, up		C reuses the register of fn_param
	je	L(ret)			C un + fn == 0, flags are from the add

	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
	xor	R32(%rbp), R32(%rbp)

	test	d, d
	jns	L(unnormalized)

C d has its most significant bit set.  The top quotient limb is then 0 or 1
C and is produced by a compare and subtract instead of a full division step.
L(normalized):
	test	un, un
	je	L(8)			C un == 0
	mov	-8(up,un,8), %rbp	C top numerator limb
	dec	un
	mov	%rbp, %rax
	sub	d, %rbp			C carry set if top limb < d
	cmovc	%rax, %rbp		C then keep the limb unchanged
	sbb	R32(%rax), R32(%rax)	C -1 if top limb < d, else 0
	inc	R32(%rax)		C quotient limb, 0 or 1
	mov	%rax, (qp)
	lea	-8(qp), qp

C dinv = mpn_invert_limb (d), keeping qp, up and d across the call.
C STD64: the three pushes leave %rsp 16-byte aligned at the call.
C DOS64: %rsp is aligned after the single push (assuming FUNC_ENTRY lowers
C %rsp by 16, as the 56(%rsp) offset of d implies), but the callee is
C entitled to 32 bytes of shadow space, which are reserved here.
L(8):
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$32, %rsp	')
IFDOS(`	mov	d, %rcx		')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')

	mov	%rax, dinv
	mov	%rbp, %rax		C %rax = running remainder
	jmp	L(nent)

C Integer part, normalized d.  One step, with r the running remainder and u
C the next numerator limb:  the two-limb value r*dinv + (r+1)*2^64 + u is
C formed; its high limb is the candidate quotient limb q (%r13), its low
C limb is q0 (%rbp).  The candidate remainder is u - q*d mod 2^64.  If that
C is not below q0, q is decremented and d is added back.  If the result is
C still >= d, L(nfx) increments q and subtracts d.
	ALIGN(16)
L(ntop):				C	    K8-K10  P6-CNR P6-NHM  P4
	mov	(up,un,8), %r10		C
	mul	dinv			C	      0,13   0,20   0,18   0,45
	add	%r10, %rax		C	      4      8      3     12
	adc	%rbp, %rdx		C	      5      9     10     13
	mov	%rax, %rbp		C	      5      9      4     13
	mov	%rdx, %r13		C	      6     11     12     23
	imul	d, %rdx			C	      6     11     11     23
	sub	%rdx, %r10		C	     10     16     14     33
	mov	d, %rax			C
	add	%r10, %rax		C	     11     17     15     34
	cmp	%rbp, %r10		C	     11     17     15     34
	cmovc	%r10, %rax		C	     12     18     16     35
	adc	$-1, %r13		C
	cmp	d, %rax			C
	jae	L(nfx)			C
L(nok):	mov	%r13, (qp)		C
	sub	$8, qp			C
L(nent):lea	1(%rax), %rbp		C
	dec	un			C
	jns	L(ntop)			C

	xor	R32(%rcx), R32(%rcx)	C cnt = 0, the remainder needs no shift
	jmp	L(87)

L(nfx):	sub	d, %rax
	inc	%r13
	jmp	L(nok)

C d has its most significant bit clear.  If the top numerator limb is below
C d, the top quotient limb is zero and that limb becomes the initial
C remainder, which saves one division step.
L(unnormalized):
	test	un, un
	je	L(44)
	mov	-8(up,un,8), %rax
	cmp	d, %rax
	jae	L(44)
	mov	%rbp, (qp)		C quotient limb 0, %rbp is still zero
	mov	%rax, %rbp		C top limb is the initial remainder
	lea	-8(qp), qp
C un_param still holds un + fn here.  If that was the only quotient limb,
C the limb in %rax is the remainder and there is nothing left to divide.
C (This test used to be a bare je on the flags of the cmp above, which could
C never be taken once the jae had fallen through.)
	dec	un_param
	je	L(ret)
	dec	un
L(44):
	bsr	d, %rcx			C index of the highest set bit of d
	not	R32(%rcx)		C low 6 bits = 63 - index = shift count
	shl	R8(%rcx), d		C normalize d
	shl	R8(%rcx), %rbp		C and the initial remainder

C dinv = mpn_invert_limb (d), as at L(8), but cnt is saved as well.  That
C extra push leaves %rsp 8 bytes off 16-byte alignment with either ABI, so
C 8 more bytes are reserved (in addition to the 32 bytes of shadow space
C with DOS64).
	push	%rcx
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	sub	$8, %rsp	')
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$40, %rsp	')
IFDOS(`	mov	d, %rcx		')
	CALL(	mpn_invert_limb)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')
	pop	%rcx

	mov	%rax, dinv
	mov	%rbp, %rax		C %rax = running remainder
	test	un, un
	je	L(87)			C no integer limbs left

C Entry for the integer part with unnormalized d.  The bits that the left
C shift by cnt pushes out of the top numerator limb are merged into the
C remainder.  Negating cnt gives a right shift by 64 - cnt, as only the low
C 6 bits of %cl count.
L(uent):dec	un
	mov	(up,un,8), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %rbp
	neg	R32(%rcx)
	or	%rbp, %rax
	jmp	L(ent)

C Integer part, unnormalized d.  Same step as in L(ntop), except that the
C numerator limb is assembled in %rbp from two adjacent limbs shifted by cnt,
C and q0 and r+1 live in %r11.
	ALIGN(16)
L(utop):mov	(up,un,8), %r10
	shl	R8(%rcx), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %r10
	neg	R32(%rcx)
	or	%r10, %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(ufx)
L(uok):	mov	%r13, (qp)
	sub	$8, qp
L(ent):	mov	(up,un,8), %rbp
	dec	un
	lea	1(%rax), %r11
	jns	L(utop)

C Last integer step.  There is no lower limb to take bits from, so the
C lowest numerator limb is only shifted left.
L(uend):shl	R8(%rcx), %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(efx)
L(eok):	mov	%r13, (qp)
	sub	$8, qp
	jmp	L(87)

L(ufx):	sub	d, %rax
	inc	%r13
	jmp	L(uok)
L(efx):	sub	d, %rax
	inc	%r13
	jmp	L(eok)

C Fraction part.  The numerator limbs are zero, so nothing is loaded from
C up.  %rbp holds -d, which makes the imul yield 0 - q*d directly.
L(87):	mov	d, %rbp
	neg	%rbp
	jmp	L(fent)

	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
L(ftop):mul	dinv			C	      0,12   0,17   0,17
	add	%r11, %rdx		C	      5      8     10
	mov	%rax, %r11		C	      4      8      3
	mov	%rdx, %r13		C	      6      9     11
	imul	%rbp, %rdx		C	      6      9     11
	mov	d, %rax			C
	add	%rdx, %rax		C	     10     14     14
	cmp	%r11, %rdx		C	     10     14     14
	cmovc	%rdx, %rax		C	     11     15     15
	adc	$-1, %r13		C
	mov	%r13, (qp)		C
	sub	$8, qp			C
L(fent):lea	1(%rax), %r11		C
	dec	fn			C
	jns	L(ftop)			C

	shr	R8(%rcx), %rax		C undo the normalization of the remainder
L(ret):	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	FUNC_EXIT()
	ret
EPILOGUE()
298