xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bdiv_q_1.asm (revision c5e820cae412164fcbee52f470436200af5358ea)
dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
dnl  1-limb divisor, returning quotient only.

dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	10
C K10:		10
C P4:		33
C P6 core2:	13.25
C P6 corei7:	14
C P6 atom:	42


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C d		rcx
C di		r8	just mpn_pi1_bdiv_q_1
C shift		r9	just mpn_pi1_bdiv_q_1


ASM_START()
	TEXT
	ALIGN(16)

C mpn_bdiv_q_1(rp, up, n, d) -- SysV AMD64 ABI (rdi, rsi, rdx, rcx).
C Computes the d-adic inverse of d on the fly: strip any trailing zero
C bits of d (L(evn)), seed an 8-bit inverse from binvert_limb_table,
C then three Newton steps inv = 2*inv - inv*inv*d double the precision
C 8 -> 16 -> 32 -> 64 bits.  Falls into L(com) in mpn_pi1_bdiv_q_1 with
C r8 = inverse, r11 = odd part of d, rcx = shift count, r10 = n.
PROLOGUE(mpn_bdiv_q_1)
	push	%rbx

	mov	%rcx, %rax
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r10

	bt	$0, R32(%rax)
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

ifdef(`PIC',`
	mov	binvert_limb_table@GOTPCREL(%rip), %rdx
',`
	movabs	$binvert_limb_table, %rdx
')

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r8	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits

	jmp	L(com)			C common loop lives in mpn_pi1_bdiv_q_1

L(evn):	bsf	%rax, %rcx		C count trailing zeros of d
	shr	R8(%rcx), %rax		C d >>= shift, now odd
	jmp	L(odd)
EPILOGUE()

C mpn_pi1_bdiv_q_1(rp, up, n, d, di, shift) -- SysV AMD64 ABI
C (rdi, rsi, rdx, rcx, r8, r9).  Caller supplies the precomputed
C inverse di (r8) and the trailing-zero shift (r9).  L(com) is the
C entry point shared with mpn_bdiv_q_1 above; at L(com):
C   rax = up[0], r8 = inverse, r10 = n, r11 = odd d, rcx = shift.
C Each iteration forms the next shifted dividend limb with shrd,
C subtracts the running borrow bit (rbx) and borrow limb (rdx from
C the previous mul), then multiplies by the inverse to get a quotient
C limb (Hensel/2-adic division, low limbs first).
PROLOGUE(mpn_pi1_bdiv_q_1)
	push	%rbx

	mov	%rcx, %r11		C d
	mov	%rdx, %r10		C n
	mov	%r9, %rcx		C shift
L(com):
	mov	(%rsi), %rax		C up[0]

	dec	%r10
	jz	L(one)			C single-limb operand: no loop needed

	mov	8(%rsi), %rdx		C up[1]
	lea	(%rsi,%r10,8), %rsi	C up end
	lea	(%rdi,%r10,8), %rdi	C rp end
	neg	%r10			C -n

	shrd	R8(%rcx), %rdx, %rax	C first shifted source limb

	xor	R32(%rbx), R32(%rbx)	C clear carry bit
	jmp	L(ent)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r10	counter, limbs, negative

	mul	%r11			C carry limb in rdx
	mov	(%rsi,%r10,8), %rax
	mov	8(%rsi,%r10,8), %r9
	shrd	R8(%rcx), %r9, %rax
	nop
	sub	%rbx, %rax		C apply carry bit
	setc	R8(%rbx)
	sub	%rdx, %rax		C apply carry limb
	adc	$0, %rbx
L(ent):	imul	%r8, %rax		C q = limb * inverse (mod 2^64)
	mov	%rax, (%rdi,%r10,8)
	inc	%r10
	jnz	L(top)

	C Final limb: shifted alone (no following limb to shrd in).
	mul	%r11			C carry limb in rdx
	mov	(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r8, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	ret

L(one):	shr	R8(%rcx), %rax
	imul	%r8, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	ret
EPILOGUE()
