dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/dive_1.asm (revision eceb233b9bd0dfebb902ed73b531ae6964fa3f9b)
dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C Measured throughput of the main loop, in cycles per limb.
C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	43

C A quick adoption of the 32-bit K7 code.


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

C Support both the Windows 64-bit (DOS64) and System V (STD64) calling
C conventions; FUNC_ENTRY/FUNC_EXIT below handle any argument shuffling.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)

C mpn_divexact_1(rp, up, n, divisor) -- divide {up,n} exactly by divisor,
C storing the n-limb quotient at rp.  (Per the header above, this is exact
C division: the result is the true quotient when divisor divides the input.)
C Strategy: strip the divisor's trailing zero bits with a shift, build the
C 64-bit inverse of the odd part mod 2^64 by table lookup plus Newton
C iteration, then multiply each (shifted, carry-adjusted) limb by that
C inverse instead of dividing.
PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx			C callee-saved; holds d, later the carry bit

	mov	%rcx, %rax		C rax = divisor
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8		C r8 = n

	bt	$0, R32(%rax)		C test divisor's low bit
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = odd divisor d
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)	C 8-bit inverse lookup table

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

C Three Newton steps, inv <- 2*inv - inv*inv*d, each doubling the number
C of correct low bits of the inverse: 8 -> 16 -> 32 -> 64.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

C Point rsi/rdi past the ends of the operands and count r8 up from -n,
C so the loop's index register doubles as the termination test.
	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)			C single-limb operand is special-cased

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax	C shift out divisor's trailing zeros

	xor	R32(%rbx), R32(%rbx)	C clear carry bit
	jmp	L(ent)

C Even divisor: count trailing zeros into rcx (the shift count) and
C reduce to the odd case.
L(evn):	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	mov	(%rsi,%r8,8), %r9	C next limb, shifted into rax below
	shrd	R8(%rcx), %r9, %rax	C
	nop				C presumably scheduling/alignment filler -- keep as is
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C record borrow as next carry bit
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(ent):	imul	%r10, %rax		C q = limb * d^(-1) mod 2^64	6
	mov	%rax, (%rdi,%r8,8)	C store quotient limb
	inc	%r8			C
	jnz	L(top)

C Final limb: plain shr (no following limb to shift in), then the same
C carry-bit/carry-limb adjustment and multiply by the inverse.
	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

C n == 1: quotient is just the shifted limb times the inverse.
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()
159