xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/divrem_2.asm (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
2
3dnl  Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C	     cycles/limb	best
35C AMD K8,K9	18
36C AMD K10	18
37C AMD bull
38C AMD pile
39C AMD bobcat
40C AMD jaguar
41C Intel P4	68
42C Intel core	34
43C Intel NHM	30.25
44C Intel SBR	21.3
45C Intel IBR	21.4
46C Intel HWL	20.6
47C Intel BWL
48C Intel atom	73
49C VIA nano	33
50
51
52C INPUT PARAMETERS
C Symbolic names for the five incoming arguments, in argument order.  The
C function body below does not use these names; it addresses the registers
C directly.  How each one is consumed there:
C   qp        copied to rbp; quotient limbs are stored through it
C   fn        copied to r13; added to un for the loop count and compared with
C             the loop index to decide when dividend limbs stop being loaded
C   up_param  base of the dividend limbs (combined with un_param in one lea)
C   un_param  number of dividend limbs
C   dp        points at the divisor: d0 is read from 0(dp), d1 from 8(dp)
53define(`qp',		`%rdi')
54define(`fn',		`%rsi')
55define(`up_param',	`%rdx')
56define(`un_param',	`%rcx')
57define(`dp',		`%r8')
58
C Two ABI variants are declared; the IFSTD/IFDOS lines inside the function
C carry the instructions that differ between them.
59ABI_SUPPORT(DOS64)
60ABI_SUPPORT(STD64)
61
C ASM_START, TEXT and ALIGN are macros from the included configuration and are
C not visible here.  Presumably they start the assembly output, select the code
C section and align the function entry to 16 bytes -- confirm against the m4
C definitions.
62ASM_START()
63	TEXT
64	ALIGN(16)
C mpn_divrem_2 (qp, fn, up, un, dp)
C
C Divides the un-limb number at up, followed by fn implicit zero limbs, by the
C two-limb divisor d1:d0 read from dp (d0 = dp[0], d1 = dp[1]).  The file title
C requires the divisor to be normalized -- presumably meaning the top bit of d1
C is set; confirm against the library documentation.
C
C What the code below produces:
C   * un+fn-2 quotient limbs, stored from qp[un+fn-3] down to qp[0];
C   * a two-limb remainder (low limb r10, high limb rbx), written back into the
C     dividend area through 8(%r12) and 16(%r12) at L(end);
C   * a return value in rax that is 0 or 1: it is 1 when the top two dividend
C     limbs were not smaller than d1:d0, in which case d is subtracted from
C     them once before the main loop.
C
C Callee-saved registers rbx, rbp and r12-r15 are pushed on entry and popped at
C L(end).  mpn_invert_limb is called once, and only when at least one quotient
C limb is going to be developed.
65PROLOGUE(mpn_divrem_2)
66	FUNC_ENTRY(4)
C DOS64 only: fetch the fifth argument (dp) from the stack into r8.
C NOTE(review): the offset 56 presumably accounts for what FUNC_ENTRY pushes on
C that ABI; FUNC_ENTRY is defined outside this file.
67IFDOS(`	mov	56(%rsp), %r8	')
68	push	%r15
69	push	%r14
70	push	%r13
71	push	%r12
72	lea	-24(%rdx,%rcx,8), %r12	C r12 = &up[un-3]; up[un-1] is 16(%r12), up[un-2] is 8(%r12)
73	mov	%rsi, %r13
74	push	%rbp
75	mov	%rdi, %rbp
76	push	%rbx
77	mov	8(%r8), %r11		C d1
78	mov	16(%r12), %rbx
79	mov	(%r8), %r8		C d0
80	mov	8(%r12), %r10
81
C Now rbx:r10 = top two dividend limbs (high:low) and r11:r8 = d1:d0.
C r15 collects the return value.  If rbx:r10 >= d1:d0, set r15 = 1 and subtract
C d once.  The test is split: d1 > rbx skips at once; otherwise dl = (d1 < rbx),
C al = (d0 <= r10), and the subtraction happens when either is set.
82	xor	R32(%r15), R32(%r15)
83	cmp	%rbx, %r11
84	ja	L(2)
85	setb	%dl
86	cmp	%r10, %r8
87	setbe	%al
88	orb	%al, %dl		C "orb" form to placate Sun tools
89	je	L(2)
90	inc	R32(%r15)
91	sub	%r8, %r10
92	sbb	%r11, %rbx
93L(2):
C r14 = index of the first quotient limb to store.  If it is negative there is
C nothing to develop and rbx:r10 already is the remainder.
94	lea	-3(%rcx,%r13), %r14	C un + fn - 3
95	test	%r14, %r14
96	js	L(end)
97
C Call mpn_invert_limb with d1 as its single argument (rdi on STD64, rcx plus
C 32 bytes of shadow space on DOS64).  r8, r10 and r11 are not callee-saved in
C either ABI, so they are preserved on the stack around the call; rbx and
C r12-r15 and rbp survive by convention.  rcx (un) is dead from here on.
C NOTE(review): ASSERT is defined outside this file; it evidently checks an
C alignment condition on rsp when assertions are enabled.  Read the macro
C before touching the condition.
98	push	%r8
99	push	%r10
100	push	%r11
101IFSTD(`	mov	%r11, %rdi	')
102IFDOS(`	mov	%r11, %rcx	')
103IFDOS(`	sub	$32, %rsp	')
104	ASSERT(nz, `test $15, %rsp')
105	CALL(	mpn_invert_limb)
106IFDOS(`	add	$32, %rsp	')
107	pop	%r11
108	pop	%r10
109	pop	%r8
110
C rax = v, the value returned by mpn_invert_limb for d1.  Adjust it for d0:
C   r9  = low(d1 * v) + d0 + high(v * d0)   (mod 2^64)
C   rcx = (number of carries out of those two additions) - 1
C While rcx is non-negative: decrement v (kept in rdi) and subtract d1 from
C rcx:r9.  The adjusted value stays in rdi for the loop (dinv in the table
C below).
111	mov	%r11, %rdx
112	mov	%rax, %rdi
113	imul	%rax, %rdx
114	mov	%rdx, %r9
115	mul	%r8
116	xor	R32(%rcx), R32(%rcx)
117	add	%r8, %r9
118	adc	$-1, %rcx
119	add	%rdx, %r9
120	adc	$0, %rcx
121	js	2f
1221:	dec	%rdi
123	sub	%r11, %r9
124	sbb	$0, %rcx
125	jns	1b
1262:
127
C rbp = &qp[un+fn-3], the slot for the first quotient limb; rsi = -d1 so that
C the loop can form n - q*d1 with an imul and a lea.
128	lea	(%rbp,%r14,8), %rbp
129	mov	%r11, %rsi
130	neg	%rsi			C -d1
131
132C rax rbx rcx rdx rsi rdi  rbp r8 r9 r10 r11 r12 r13 r14 r15
133C     n2  un      -d1 dinv qp  d0 q0     d1  up  fn      msl
C Supplement to the table above, as the loop actually uses the registers:
C   rax, rdx  scratch (mul results, then the conditional add-back of d)
C   rbx       high remainder limb        r10  low remainder limb
C   rcx       scratch inside the loop (low half of the quotient estimate);
C             it held un only before the loop
C   r9        quotient limb being developed
C   r12       address of the next dividend limb to read, stepping downwards
C   r14       loop index, counts down to 0; also selects load vs. zero limb
C   r15       return value (0 or 1)
C
C One iteration:
C   1. rdx:rax = dinv * rbx; rcx = r10 + low half; q = high half + rbx + carry.
C   2. rbx = r10 - q*d1 (mod 2^64); rdx:rax = d0 * q.
C   3. Next low limb into r10: loaded from (r12) while r14 >= fn, otherwise 0
C      (the fn fraction limbs).
C   4. rbx:r10 -= d1:d0, then rbx:r10 -= q*d0.
C   5. cmp sets CF when rbx < rcx.  With CF set q is incremented and nothing is
C      added back; with CF clear q is kept and d1:d0 is added back to rbx:r10.
C   6. If rbx >= d1 the rarely taken L(fix) path checks for one more
C      subtraction of d.
C   7. Store q at (rbp), step rbp down, loop while r14 stays non-negative.
C The trailing numbers in the existing comments are the authors scheduling
C annotations and are left untouched.
134
135	ALIGN(16)
136L(top):	mov	%rdi, %rax		C di		ncp
137	mul	%rbx			C		0, 17
138	mov	%r10, %rcx		C
139	add	%rax, %rcx		C		4
140	adc	%rbx, %rdx		C		5
141	mov	%rdx, %r9		C q		6
142	imul	%rsi, %rdx		C		6
143	mov	%r8, %rax		C		ncp
144	lea	(%rdx, %r10), %rbx	C n1 -= ...	10
145	xor	R32(%r10), R32(%r10)	C
146	mul	%r9			C		7
147	cmp	%r14, %r13		C
148	jg	L(19)			C
149	mov	(%r12), %r10		C
150	sub	$8, %r12		C
151L(19):	sub	%r8, %r10		C		ncp
152	sbb	%r11, %rbx		C		11
153	sub	%rax, %r10		C		11
154	sbb	%rdx, %rbx		C		12
155	xor	R32(%rax), R32(%rax)	C
156	xor	R32(%rdx), R32(%rdx)	C
157	cmp	%rcx, %rbx		C		13
158	cmovnc	%r8, %rax		C		14
159	cmovnc	%r11, %rdx		C		14
160	adc	$0, %r9			C adjust q	14
C The nop has no architectural effect; presumably it is scheduling or
C alignment padding -- keep it unless re-measured.
161	nop
162	add	%rax, %r10		C		15
163	adc	%rdx, %rbx		C		16
164	cmp	%r11, %rbx		C
165	jae	L(fix)			C
166L(bck):	mov	%r9, (%rbp)		C
167	sub	$8, %rbp		C
168	dec	%r14
169	jns	L(top)
170
C Write the remainder back.  r12 steps down by one limb per load in the loop.
C After a full run (un-2 loads starting from &up[un-3]) it equals up-8, so the
C stores hit up[0] (low) and up[1] (high).  On the early exit taken before the
C loop r12 is still &up[un-3], so they hit up[un-2] and up[un-1].
C Then restore the callee-saved registers in reverse push order and return
C r15 in rax.
171L(end):	mov	%r10, 8(%r12)
172	mov	%rbx, 16(%r12)
173	pop	%rbx
174	pop	%rbp
175	pop	%r12
176	pop	%r13
177	pop	%r14
178	mov	%r15, %rax
179	pop	%r15
180	FUNC_EXIT()
181	ret
182
C Out-of-line correction, entered with the flags of "cmp d1, rbx" and
C rbx >= d1.  dl = (rbx > d1), al = (r10 >= d0); if either holds the remainder
C is still >= d, so bump q and subtract d1:d0 once more.  Otherwise go back
C unchanged.
183L(fix):	seta	%dl
184	cmp	%r8, %r10
185	setae	%al
186	orb	%dl, %al		C "orb" form to placate Sun tools
187	je	L(bck)
188	inc	%r9
189	sub	%r8, %r10
190	sbb	%r11, %rbx
191	jmp	L(bck)
192EPILOGUE()
193