xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bdiv_q_1.asm (revision b1bb3099bf4d47bbe8c7be5b78240a535263771f)
1dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
2dnl  1-limb divisor, returning quotient only.
3
4dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009, 2011, 2012 Free Software
5dnl  Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C	     cycles/limb
26C AMD K8,K9	10
27C AMD K10	10
28C Intel P4	33
29C Intel core2	13.25
30C Intel corei	14
31C Intel atom	42
32C VIA nano	 ?
33
34
35C INPUT PARAMETERS
C Register aliases for the arguments.  The roles noted on the first four are
C read off the code below: rp is only stored through, up is only loaded from,
C n ends up as the limb counter and d is the multiplier in the mul steps.
36define(`rp',		`%rdi')		C	quotient limbs are stored here
37define(`up',		`%rsi')		C	source limbs are loaded from here
38define(`n',		`%rdx')		C	limb count
39define(`d',		`%rcx')		C	divisor limb
40define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
41define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1
42
C NOTE(review): both ABIs are declared here; the only ABI specific code visible
C in this file is the pair of IFDOS lines in mpn_pi1_bdiv_q_1 plus whatever
C FUNC_ENTRY and FUNC_EXIT expand to -- confirm against the macro definitions.
43ABI_SUPPORT(DOS64)
44ABI_SUPPORT(STD64)
45
46ASM_START()
47	TEXT
48	ALIGN(16)
C mpn_bdiv_q_1 -- entry point that works out the shift count and the inverse
C for d itself, then continues in the shared code at L(com), which lives in
C mpn_pi1_bdiv_q_1 further down.
C
C State handed to L(com):
C   rcx  ncnt, count of low zero bits removed from d, 0 when d is odd
C   r8   inv, refined below from 8 to 64 bits
C   r10  n
C   r11  d without twos
C   rbx  already pushed here, popped by the shared exit code
49PROLOGUE(mpn_bdiv_q_1)
50	FUNC_ENTRY(4)
51	push	%rbx			C rbx is used as scratch below
52
53	mov	%rcx, %rax		C rax = d, frees rcx for the count
54	xor	R32(%rcx), R32(%rcx)	C ncnt count
55	mov	%rdx, %r10		C r10 = n, rdx becomes scratch below
56
57	bt	$0, R32(%rax)		C CF = low bit of d
58	jnc	L(evn)			C skip bsfq unless divisor is even
59
	C Here rax holds an odd divisor, either d itself or d with its low
	C zero bits shifted out by the L(evn) path.
60L(odd):	mov	%rax, %rbx		C rbx = odd divisor, kept for the refinement
61	shr	R32(%rax)
62	and	$127, R32(%rax)		C d/2, 7 bits
63
64	LEA(	binvert_limb_table, %rdx)	C rdx = table base, indexed just below
65
66	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
67
68	mov	%rbx, %r11		C d without twos
69
	C Three rounds of inv = 2*inv - inv*inv*d.  The first two use 32-bit
	C registers, the last one full 64-bit registers.
70	lea	(%rax,%rax), R32(%rdx)	C 2*inv
71	imul	R32(%rax), R32(%rax)	C inv*inv
72	imul	R32(%rbx), R32(%rax)	C inv*inv*d
73	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
74
75	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
76	imul	R32(%rdx), R32(%rdx)	C inv*inv
77	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
78	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
79
80	lea	(%rax,%rax), %r8	C 2*inv
81	imul	%rax, %rax		C inv*inv
82	imul	%rbx, %rax		C inv*inv*d
83	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
84
85	jmp	L(com)			C continue in the shared limb loop
86
	C Even divisor: count and remove the low zero bits, then rejoin.
	C NOTE(review): bsf leaves its destination undefined for a zero source,
	C so this path presumably relies on callers never passing d = 0 -- confirm.
87L(evn):	bsf	%rax, %rcx		C rcx = index of lowest set bit of d
88	shr	R8(%rcx), %rax		C rax = d with the low zero bits removed
89	jmp	L(odd)
90EPILOGUE()
91
C mpn_pi1_bdiv_q_1 -- the limb loop proper.  This entry point takes di and
C ncnt as arguments instead of computing them; mpn_bdiv_q_1 joins at L(com)
C with the same registers already set up.
C
C Register use from L(com) onwards:
C   rcx  ncnt, right shift applied to the source limbs
C   r8   di, each adjusted source limb times di gives a quotient limb
C   r10  n at L(com), then a negative limb counter running up to 0
C   r11  d, each quotient limb times d gives the next carry limb
C   rbx  carry bit
C   rp, up are moved to the last limb, so index r10 walks up from the low end
92PROLOGUE(mpn_pi1_bdiv_q_1)
93	FUNC_ENTRY(4)
94IFDOS(`	mov	56(%rsp), %r8	')	C di from the stack, presumably DOS64 builds only
95IFDOS(`	mov	64(%rsp), %r9	')	C ncnt from the stack, presumably DOS64 builds only
96	push	%rbx			C rbx holds the carry bit below
97
98	mov	%rcx, %r11		C d
99	mov	%rdx, %r10		C n
100	mov	%r9, %rcx		C ncnt
101
102L(com):	mov	(up), %rax		C up[0]
103
104	dec	%r10
105	jz	L(one)			C n = 1, no loop and no carries needed
106
107	mov	8(up), %rdx		C up[1]
108	lea	(up,%r10,8), up		C up end
109	lea	(rp,%r10,8), rp		C rp end
110	neg	%r10			C -n
111
112	shrd	R8(%rcx), %rdx, %rax	C rax = up[0] shifted right, top bits from up[1]
113
114	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
115	jmp	L(ent)			C first limb has no carry to apply
116
117	ALIGN(8)
118L(top):
119	C rax	q
120	C rbx	carry bit, 0 or 1
121	C rcx	ncnt
122	C rdx
123	C r10	counter, limbs, negative
124
	C On the last pass r10 is -1, so the 8(up,r10,8) load below reads (up),
	C the high limb, and stays inside the source operand.
125	mul	%r11			C carry limb in rdx
126	mov	(up,%r10,8), %rax	C next source limb
127	mov	8(up,%r10,8), %r9	C limb above it, supplies the bits shifted in
128	shrd	R8(%rcx), %r9, %rax	C rax = next limb of the source shifted right by ncnt
129	nop				C NOTE(review): no effect on state, presumably padding or scheduling
130	sub	%rbx, %rax		C apply carry bit
131	setc	R8(%rbx)		C rbx = borrow out of that subtract
132	sub	%rdx, %rax		C apply carry limb
133	adc	$0, %rbx		C add the borrow out of this subtract
134L(ent):	imul	%r8, %rax		C q = adjusted limb * di
135	mov	%rax, (rp,%r10,8)	C store quotient limb
136	inc	%r10
137	jnz	L(top)			C until the counter reaches 0
138
	C High limb: same steps, but nothing above it to shift in and no
	C further carry to record.
139	mul	%r11			C carry limb in rdx
140	mov	(up), %rax		C up high limb
141	shr	R8(%rcx), %rax		C zeros shift in at the top
142	sub	%rbx, %rax		C apply carry bit
143	sub	%rdx, %rax		C apply carry limb
144	imul	%r8, %rax		C last q
145	mov	%rax, (rp)		C store high quotient limb
146	pop	%rbx
147	FUNC_EXIT()
148	ret
149
	C Single limb operand: shift, multiply by di, store.
150L(one):	shr	R8(%rcx), %rax
151	imul	%r8, %rax
152	mov	%rax, (rp)
153	pop	%rbx
154	FUNC_EXIT()
155	ret
156EPILOGUE()
157