dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C	       norm	       unorm
C AMD K8,K9	11		11
C AMD K10	11		11
C Intel P4	 ?
C Intel core2	13.5		13.25
C Intel corei	14.25
C Intel atom	34		36
C VIA nano	19.25		19.25


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

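C This routine computes rp[] = up[]/divisor for an n-limb operand that is
C an exact multiple of divisor.  Rather than dividing, it multiplies by
C the divisor's inverse modulo 2^64: an even divisor first has its
C trailing zero bits counted (bsf) and shifted out, the odd remainder is
C inverted via binvert_limb_table plus Newton steps, and the quotient
C limbs are then produced low-to-high with a small borrow carried from
C limb to limb.
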
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rcx, %rax
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8

	bt	$0, R32(%rax)
	jc	L(odd)			C skip bsfq unless divisor is even
	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

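C Compute inv = 1/d mod 2^64 for the now-odd divisor d (in %rbx): an
C 8-bit inverse comes from binvert_limb_table, indexed by (d>>1) & 127,
C and each Newton step inv = 2*inv - inv*inv*d doubles the number of
C correct low bits (if inv*d == 1 mod 2^k, the new inv*d == 1 mod 2^(2k)),
C giving 16, 32 and finally 64 bits.  A rough C equivalent of the
C sequence below, assuming 64-bit limbs (illustrative sketch only):
C
C	inv = binvert_limb_table[(d >> 1) & 127];	/*  8 bits */
C	inv = 2*inv - inv*inv*d;			/* 16 bits */
C	inv = 2*inv - inv*inv*d;			/* 32 bits */
C	inv = 2*inv - inv*inv*d;			/* 64 bits */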
	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)

	test	R32(%rcx), R32(%rcx)
	jnz	L(unorm)		C branch if count != 0
	xor	R32(%rbx), R32(%rbx)
	jmp	L(nent)

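C Main loop, odd (normalized) divisor.  With B = 2^64 and inv = 1/d mod B,
C each quotient limb is obtained with a multiply instead of a divide:
C subtract the pending borrow bit and the carry limb (the high half of the
C previous quotient limb times d) from the next source limb, then multiply
C by inv.  Roughly, in C (illustrative sketch only, unsigned __int128 used
C for the high product half; the code below peels the final limb off into
C L(com)):
C
C	uint64_t cb = 0, cl = 0;	/* borrow bit, carry limb */
C	for (i = 0; i < n; i++)
C	  {
C	    uint64_t u = up[i], t = u - cb;
C	    uint64_t b = (u < cb) + (t < cl);
C	    t -= cl;
C	    rp[i] = t * inv;		/* quotient limb, mod 2^64 */
C	    cl = (uint64_t) (((unsigned __int128) rp[i] * d) >> 64);
C	    cb = b;
C	  }
C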
	ALIGN(8)
L(ntop):mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(nent):imul	%r10, %rax		C			6
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(ntop)

	mov	-8(%rsi), %r9		C up high limb
	jmp	L(com)

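C Unnormalized path, used when the original divisor was even: writing
C divisor = d * 2^c with d odd and c the trailing zero count found with
C bsf above, the quotient equals (up[] >> c) / d, so each source limb is
C assembled on the fly as
C
C	x = (up[i] >> c) | (up[i+1] << (64-c))
C
C and fed through the same multiply-by-inverse recurrence as the
C normalized loop.  The repeated "neg R32(%rcx)" flips %cl between c and
C -c; since shift counts are taken mod 64, this gives the companion
C 64-c shift without needing a second count register.
C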
L(unorm):
	mov	(%rsi,%r8,8), %r9	C up[1]
	shr	R8(%rcx), %rax		C
	neg	R32(%rcx)
	shl	R8(%rcx), %r9		C
	neg	R32(%rcx)
	or	%r9, %rax
	xor	R32(%rbx), R32(%rbx)
	jmp	L(uent)

	ALIGN(8)
L(utop):mul	%r11			C carry limb in rdx	0 10
	mov	(%rsi,%r8,8), %rax	C
	shl	R8(%rcx), %rax		C
	neg	R32(%rcx)
	or	%r9, %rax
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(uent):imul	%r10, %rax		C			6
	mov	(%rsi,%r8,8), %r9	C
	shr	R8(%rcx), %r9		C
	neg	R32(%rcx)
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(utop)

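C Common tail: %r9 holds the (possibly shifted-down) most significant
C source limb, %rax the previous quotient limb and %rbx the pending
C borrow bit.  One last carry-limb subtraction and multiply by the
C inverse yields the top quotient limb; exactness guarantees there is
C nothing left to propagate.
C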
L(com):	mul	%r11			C carry limb in rdx
	sub	%rbx, %r9		C apply carry bit
	sub	%rdx, %r9		C apply carry limb
	imul	%r10, %r9
	mov	%r9, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

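C n = 1: shift the divisor's factors of two out of the single source limb
C and multiply by the inverse of the odd part.
C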
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()