dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2011, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	43

C A quick adaptation of the 32-bit K7 code.


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

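C In C terms, the entry point implemented here corresponds to the
C GMP-internal declaration (as in gmp-impl.h):
C
C	void mpn_divexact_1 (mp_ptr rp, mp_srcptr up, mp_size_t n,
C			     mp_limb_t d);
C
C i.e. divide {up,n} by d, storing the n-limb quotient at rp; the
C caller guarantees that the division is exact (remainder zero).
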
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rcx, %rax
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8

	bt	$0, R32(%rax)
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits
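
C The three lea/imul/sub blocks above are Hensel-lifting (Newton)
C steps: if d*inv == 1 (mod 2^k), then inv' = 2*inv - inv*inv*d gives
C d*inv' == 1 (mod 2^(2k)), so the 8-bit table inverse is lifted to
C 16, 32 and finally 64 bits.  A minimal C sketch of the same
C computation, assuming a 64-bit mp_limb_t (binvert_limb_table is the
C GMP table of byte-sized inverses of the 128 odd byte values):
C
C	mp_limb_t inv = binvert_limb_table[(d >> 1) & 127]; /*  8 bits */
C	inv = 2*inv - inv*inv*d;                            /* 16 bits */
C	inv = 2*inv - inv*inv*d;                            /* 32 bits */
C	inv = 2*inv - inv*inv*d;                            /* 64 bits */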

	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n
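
C From here on both operands are addressed end-relative through one
C negative index: rsi points one past up[], rdi at &rp[n-1], and r8
C runs from -n towards zero, so (%rsi,%r8,8) is up[n+r8].  A hedged C
C rendering of the idiom (mp_size_t is signed in GMP):
C
C	const mp_limb_t *ue = up + n;	/* one past the end of up[] */
C	mp_size_t j;
C	for (j = -n; j != 0; j++)
C	  {
C	    mp_limb_t u = ue[j];	/* the same limb as up[n+j] */
C	    /* ... consume u ... */
C	  }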

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax

	xor	R32(%rbx), R32(%rbx)
	jmp	L(ent)

L(evn):	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
	jmp	L(odd)
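
C L(evn) strips an even divisor's trailing zero bits: bsf leaves the
C shift count in rcx and d is shifted down to its odd part, while the
C shrd in the loop shifts each dividend limb right by that count,
C pulling in low bits from the next limb.  A C sketch of one limb
C fetch (a fragment, valid for shift counts 1..63; the asm's shrd
C also handles a count of 0, where the C expression below would be
C undefined):
C
C	int c = __builtin_ctzll (d);	/* trailing zeros, like bsf */
C	mp_limb_t d0 = d >> c;		/* odd part of the divisor  */
C	mp_limb_t u = (up[i] >> c) | (up[i+1] << (64 - c));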

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	mov	(%rsi,%r8,8), %r9	C
	shrd	R8(%rcx), %r9, %rax	C
	nop				C
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(ent):	imul	%r10, %rax		C			6
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(top)

	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

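C The loop computes Hensel-style exact division: each quotient limb is
C q = (u - carry_bit - carry_limb) * inv (mod 2^64), and the next
C carry limb is the high half of q*d.  Roughly, in C (a sketch, not
C the scheduled code; d0 is the odd part of d, inv its inverse mod
C 2^64, up[] already shifted when d was even; unsigned __int128 is a
C GCC/clang extension used here only to take the high product half):
C
C	mp_limb_t cy = 0, cb = 0;	C carry limb, carry bit
C	for (mp_size_t i = 0; i < n; i++)
C	  {
C	    mp_limb_t u = up[i];
C	    mp_limb_t s = u - cb;  cb = u < cb;	 /* apply carry bit  */
C	    mp_limb_t t = s - cy;  cb += s < cy; /* apply carry limb */
C	    mp_limb_t q = t * inv;		 /* quotient limb    */
C	    rp[i] = q;
C	    cy = (mp_limb_t) (((unsigned __int128) q * d0) >> 64);
C	  }
C
C Because the division is exact, the final carries vanish and no
C remainder is returned.
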
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()