xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/dive_1.asm (revision 0953dc8744b62dfdecb2f203329e730593755659)
1dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	10
25C K10:		10
26C P4:		33
27C P6 core2:	13.25
28C P6 corei7:	14
29C P6 atom:	42
30
31C A quick adaptation of the 32-bit K7 code.
32
33
34C INPUT PARAMETERS
35C rp		rdi
36C up		rsi
37C n		rdx
38C divisor	rcx
39
40ASM_START()
41	TEXT
42	ALIGN(16)
C {rp,n} = {up,n} / divisor, assuming the division is exact (remainder 0).
C Method: write divisor = d * 2^shift with d odd; shift the dividend limbs
C down by `shift' (shrd pairs of limbs), and multiply each limb, minus the
C running carries, by inv = d^(-1) mod 2^64.  Multiplying the quotient limb
C by d regenerates the carry limb for the next iteration.
43PROLOGUE(mpn_divexact_1)
44	push	%rbx			C rbx is callee-saved; used as scratch
45
46	mov	%rcx, %rax		C rax = divisor
47	xor	R32(%rcx), R32(%rcx)	C shift count
48	mov	%rdx, %r8		C r8 = n, freeing rdx for mul below
49
50	bt	$0, R32(%rax)		C CF = low bit of divisor
51	jnc	L(evn)			C skip bsfq unless divisor is even
52
C Here rax = d (odd part of divisor), rcx = shift (trailing zero count).
53L(odd):	mov	%rax, %rbx	C save d
54	shr	R32(%rax)		C d >> 1, table index
55	and	$127, R32(%rax)		C d/2, 7 bits
56
C Fetch the 8-bit inverse seed from binvert_limb_table[(d >> 1) & 127].
57ifdef(`PIC',`
58	mov	binvert_limb_table@GOTPCREL(%rip), %rdx
59',`
60	movabs	$binvert_limb_table, %rdx
61')
62
63	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
64
65	mov	%rbx, %r11		C d without twos
66
C Three Newton steps, inv' = 2*inv - inv*inv*d, each doubling the number
C of correct low bits of the inverse: 8 -> 16 -> 32 -> 64.
67	lea	(%rax,%rax), R32(%rdx)	C 2*inv
68	imul	R32(%rax), R32(%rax)	C inv*inv
69	imul	R32(%rbx), R32(%rax)	C inv*inv*d
70	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
71
72	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
73	imul	R32(%rdx), R32(%rdx)	C inv*inv
74	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
75	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
76
77	lea	(%rax,%rax), %r10	C 2*inv
78	imul	%rax, %rax		C inv*inv
79	imul	%rbx, %rax		C inv*inv*d
80	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits
81
C Point rsi/rdi past the operands and index upward from -n toward 0.
82	lea	(%rsi,%r8,8), %rsi	C up end
83	lea	-8(%rdi,%r8,8), %rdi	C rp end
84	neg	%r8			C -n
85
86	mov	(%rsi,%r8,8), %rax	C up[0]
87
88	inc	%r8			C n == 1?
89	jz	L(one)
90
91	mov	(%rsi,%r8,8), %rdx	C up[1]
92
93	shrd	R8(%rcx), %rdx, %rax	C low limb of dividend >> shift
94
95	xor	R32(%rbx), R32(%rbx)	C clear carry bit
96	jmp	L(ent)
97
C Even divisor: count and shift out trailing zeros, then treat as odd.
98L(evn):	bsf	%rax, %rcx	C rcx = trailing zero count
99	shr	R8(%rcx), %rax		C rax = odd part of divisor
100	jmp	L(odd)
101
102	ALIGN(8)
103L(top):
104	C rax	q
105	C rbx	carry bit, 0 or 1
106	C rcx	shift
107	C rdx
108	C rsi	up end
109	C rdi	rp end
110	C r8	counter, limbs, negative
111	C r10	d^(-1) mod 2^64
112	C r11	d, shifted down
C Main loop, one quotient limb per iteration.  mul by d recreates the
C carry limb implied by the previous quotient limb.  Trailing digits on
C the comments are the original cycle-scheduling annotations.
113
114	mul	%r11			C carry limb in rdx	0 10
115	mov	-8(%rsi,%r8,8), %rax	C low limb of next shrd pair
116	mov	(%rsi,%r8,8), %r9	C high limb of next shrd pair
117	shrd	R8(%rcx), %r9, %rax	C current dividend limb, shifted
118	nop				C scheduling/alignment filler
119	sub	%rbx, %rax		C apply carry bit
120	setc	%bl			C record borrow from carry bit
121	sub	%rdx, %rax		C apply carry limb	5
122	adc	$0, %rbx		C fold in borrow	6
123L(ent):	imul	%r10, %rax		C q = limb * inv	6
124	mov	%rax, (%rdi,%r8,8)	C store quotient limb
125	inc	%r8			C step toward 0
126	jnz	L(top)
127
C Final limb: nothing left to shift in, so a plain shr suffices.
128	mul	%r11			C carry limb in rdx
129	mov	-8(%rsi), %rax		C up high limb
130	shr	R8(%rcx), %rax		C shift down final limb
131	sub	%rbx, %rax		C apply carry bit
132	sub	%rdx, %rax		C apply carry limb
133	imul	%r10, %rax		C q = limb * inv
134	mov	%rax, (%rdi)		C store top quotient limb
135	pop	%rbx
136	ret
137
C n == 1 case: quotient is (up[0] >> shift) * inv, no carries.
138L(one):	shr	R8(%rcx), %rax
139	imul	%r10, %rax
140	mov	%rax, (%rdi)
141	pop	%rbx
142	ret
143
144EPILOGUE()
145