xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bdiv_q_1.asm (revision 32d1c65c71fbdb65a012e8392a62a757dd6853e9)
1dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
2
3dnl  Copyright 2001, 2002, 2004-2006, 2010-2012, 2017 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C	    cycles/limb    cycles/limb
35C	       norm	       unorm
36C AMD K8,K9	11		11
37C AMD K10	11		11
38C AMD bull	13.5		14
39C AMD pile	14		15
40C AMD steam
41C AMD excavator
42C AMD bobcat	14		14
43C AMD jaguar	14.5		15
44C Intel P4	33		33
45C Intel core2	13.5		13.25
46C Intel NHM	14		14
47C Intel SBR	8		8.25
48C Intel IBR	7.75		7.85
49C Intel HWL	8		8
50C Intel BWL	8		8
51C Intel SKL	8		8
52C Intel atom	34		36
53C Intel SLM	13.7		13.5
54C VIA nano	19.25		19.25	needs re-measuring
55
56C INPUT PARAMETERS
C The m4 names below alias the registers that hold the arguments on entry.
C rp, up, n and d are taken by both entry points; di and ncnt are arguments
C of mpn_pi1_bdiv_q_1 only.  After the entry sequences have moved the values
C around, the code mostly refers to the registers by their raw names.
C NOTE(review): presumably rp is the quotient destination, up the source
C limbs and n the limb count, following usual mpn naming -- the C prototypes
C are not visible in this file, confirm against them.
57define(`rp',		`%rdi')
58define(`up',		`%rsi')
59define(`n',		`%rdx')
60define(`d',		`%rcx')
61define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
62define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1
63
C ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined outside this file.
C NOTE(review): presumably they declare the supported calling conventions,
C open the code section and align the first entry point to 16 bytes.
64ABI_SUPPORT(DOS64)
65ABI_SUPPORT(STD64)
66
67ASM_START()
68	TEXT
69	ALIGN(16)
C mpn_bdiv_q_1 -- entry point that takes only rp, up, n and d.
C
C It derives the two extra values that mpn_pi1_bdiv_q_1 receives as arguments
C and then jumps into that function at L(pi1) with this register state:
C   rcx  shift count: 0 when d is odd, else the index of the lowest set bit
C   r8   inverse of the odd part of d, refined below from 8 to 64 bits
C   r10  n
C   r11  d with its low zero bits shifted out
C rdi and rsi are not touched here, so rp and up reach L(pi1) unchanged.
C rbx is pushed here and popped on the two exit paths of mpn_pi1_bdiv_q_1.
C NOTE(review): d = 0 is not checked; presumably callers guarantee d != 0.
C NOTE(review): FUNC_ENTRY is defined outside this file; presumably it moves
C DOS64 arguments into the STD64 argument registers -- confirm.
70PROLOGUE(mpn_bdiv_q_1)
71	FUNC_ENTRY(4)
72	push	%rbx
73
	C Move d to rax so that rcx is free to hold the shift count and
	C save n in r10 because rdx is used as scratch below.
74	mov	%rcx, %rax
75	xor	R32(%rcx), R32(%rcx)	C ncnt count
76	mov	%rdx, %r10
77
	C bt copies bit 0 of d into the carry flag; carry clear means even d.
78	bt	$0, R32(%rax)
79	jnc	L(evn)			C skip bsf unless divisor is even
80
	C From here rax is odd: either the original d or d shifted right
	C by rcx in L(evn).  rbx keeps a copy for the refinement steps.
81L(odd):	mov	%rax, %rbx
	C Table index is bits 1..7 of the odd divisor.
82	shr	R32(%rax)
83	and	$127, R32(%rax)		C d/2, 7 bits
84
	C LEA and binvert_limb_table are defined outside this file.
85	LEA(	binvert_limb_table, %rdx)
86
87	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
88
89	mov	%rbx, %r11		C d without twos
90
	C Three refinement steps of the form inv = 2*inv - inv*inv*d follow.
	C Per the existing annotations each step doubles the number of valid
	C low bits.  The first two steps use the R32 register forms, the
	C last one uses the full 64-bit registers and leaves the result in r8.
91	lea	(%rax,%rax), R32(%rdx)	C 2*inv
92	imul	R32(%rax), R32(%rax)	C inv*inv
93	imul	R32(%rbx), R32(%rax)	C inv*inv*d
94	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
95
96	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
97	imul	R32(%rdx), R32(%rdx)	C inv*inv
98	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
99	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
100
101	lea	(%rax,%rax), %r8	C 2*inv
102	imul	%rax, %rax		C inv*inv
103	imul	%rbx, %rax		C inv*inv*d
104	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
105
	C Continue in mpn_pi1_bdiv_q_1 with rcx, r8, r10 and r11 set up.
106	jmp	L(pi1)
107
	C Even d: bsf puts the index of the lowest set bit into rcx, the
	C shift then drops those low zero bits so that rax becomes odd.
108L(evn):	bsf	%rax, %rcx
109	shr	R8(%rcx), %rax
110	jmp	L(odd)
111EPILOGUE()
112
C mpn_pi1_bdiv_q_1 -- entry point that takes rp, up, n, d, di and ncnt.
C
C Register roles from L(pi1) onward, as used by the code below:
C   rax  limb being processed, then the quotient limb after imul by r8
C   rbx  borrow carried from one limb to the next
C   rcx  shift count; the unnormalized loop flips it between ncnt and -ncnt
C   rdx  high limb of the product of the quotient limb and r11
C   r8   di, multiplied into every adjusted limb to form the quotient limb
C   r9   shifted-down part of the next limb (unnormalized loop), or the
C        high source limb on the way into L(com)
C   r10  negative limb index that counts up to zero
C   r11  d
C One quotient limb is stored through rp per source limb.
C L(pi1) is also the target of the jmp at the end of mpn_bdiv_q_1, which
C arrives with rbx already pushed and rcx, r8, r10, r11 loaded.
C NOTE(review): FUNC_ENTRY, FUNC_EXIT and IFDOS are defined outside this
C file.  Presumably IFDOS emits its argument only for the DOS64 ABI, where
C the two loads below fetch di and ncnt from the stack, and the offsets 56
C and 64 depend on what FUNC_ENTRY has pushed -- confirm.
113PROLOGUE(mpn_pi1_bdiv_q_1)
114	FUNC_ENTRY(4)
115IFDOS(`	mov	56(%rsp), %r8	')
116IFDOS(`	mov	64(%rsp), %r9	')
117	push	%rbx
118
	C ncnt goes to rcx so that the shifts below can take their count
	C from the low byte of rcx.
119	mov	%rcx, %r11		C d
120	mov	%rdx, %r10		C n
121	mov	%r9, %rcx		C ncnt
122
123L(pi1):	mov	(up), %rax		C up[0]
124
	C r10 = n-1; a single-limb operand is handled separately at L(one).
125	dec	%r10
126	jz	L(one)
127
	C up now points just past the last source limb and rp at the last
	C destination limb; r10 = -(n-1) indexes backwards from those ends.
128	lea	8(up,%r10,8), up	C up end
129	lea	(rp,%r10,8), rp		C rp end
130	neg	%r10			C -n
131
	C A zero shift count selects the loop without any shifting.
132	test	R32(%rcx), R32(%rcx)
133	jnz	L(unorm)		C branch if count != 0
134	xor	R32(%rbx), R32(%rbx)
135	jmp	L(nent)
136
	C Loop for shift count 0.  On entry to L(ntop) rax holds the quotient
	C limb just stored; mul leaves the high limb of its product with r11
	C in rdx.  The next source limb has the previous borrow and that high
	C limb subtracted, and rbx collects the borrows of both subtractions.
137	ALIGN(8)
138L(ntop):mul	%r11			C carry limb in rdx	0 10
139	mov	-8(up,%r10,8), %rax	C
140	sub	%rbx, %rax		C apply carry bit
141	setc	R8(%rbx)		C
142	sub	%rdx, %rax		C apply carry limb	5
143	adc	$0, R32(%rbx)		C			6
144L(nent):imul	%r8, %rax		C			6
145	mov	%rax, (rp,%r10,8)	C
146	inc	%r10			C
147	jnz	L(ntop)
148
	C The loop stops one limb early; the last source limb is handed to
	C the shared tail in r9.
149	mov	-8(up), %r9		C up high limb
150	jmp	L(com)
151
	C Nonzero shift count.  Form the first shifted limb from up[0] shifted
	C right by ncnt combined with up[1] shifted left by -ncnt.  The left
	C shift relies on the CPU using only the low 6 bits of the count for
	C 64-bit shifts, so a count of -ncnt acts as 64-ncnt.
152L(unorm):
153	mov	(up,%r10,8), %r9	C up[1]
154	shr	R8(%rcx), %rax		C
155	neg	R32(%rcx)
156	shl	R8(%rcx), %r9		C
157	neg	R32(%rcx)
158	or	%r9, %rax
159	xor	R32(%rbx), R32(%rbx)
160	jmp	L(uent)
161
	C Loop for nonzero shift count.  At L(utop) rcx holds -ncnt and r9
	C holds the previous limb shifted right by ncnt.  The newly loaded
	C limb is shifted left and or-ed with r9 to give the shifted source
	C limb, after which the borrow handling matches the loop above.
	C From L(uent) the limb at index r10 is loaded again and shifted
	C right by ncnt for use in the following iteration, and rcx is
	C negated back to -ncnt.
162	ALIGN(8)
163L(utop):mul	%r11			C carry limb in rdx	0 10
164	mov	(up,%r10,8), %rax	C
165	shl	R8(%rcx), %rax		C
166	neg	R32(%rcx)
167	or	%r9, %rax
168	sub	%rbx, %rax		C apply carry bit
169	setc	R8(%rbx)		C
170	sub	%rdx, %rax		C apply carry limb	5
171	adc	$0, R32(%rbx)		C			6
172L(uent):imul	%r8, %rax		C			6
173	mov	(up,%r10,8), %r9	C
174	shr	R8(%rcx), %r9		C
175	neg	R32(%rcx)
176	mov	%rax, (rp,%r10,8)	C
177	inc	%r10			C
178	jnz	L(utop)
179
	C Shared tail for both loops: r9 is the last source limb, already
	C shifted right by ncnt when coming from the unnormalized loop.  No
	C borrow needs to be recorded because no further limb follows.
180L(com):	mul	%r11			C carry limb in rdx
181	sub	%rbx, %r9		C apply carry bit
182	sub	%rdx, %r9		C apply carry limb
183	imul	%r8, %r9
184	mov	%r9, (rp)
185	pop	%rbx
186	FUNC_EXIT()
187	ret
188
	C n = 1: shift the only limb right by the count in rcx, multiply by
	C r8 and store it.  rp has not been advanced on this path.
189L(one):	shr	R8(%rcx), %rax
190	imul	%r8, %rax
191	mov	%rax, (rp)
192	pop	%rbx
193	FUNC_EXIT()
194	ret
195EPILOGUE()
196