xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/dive_1.asm (revision ae87de8892f277bece3527c15b186ebcfa188227)
1dnl  x86 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C     cycles/limb
35C P54    30.0
36C P55    29.0
37C P6     13.0 odd divisor, 12.0 even (strangely)
38C K6     14.0
39C K7     12.0
40C P4     42.0
41
42
43C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
44C                      mp_limb_t divisor);
45C
46
C Stack argument slots.  The offsets 4, 8, 12, 16 follow the order dst, src,
C size, divisor.  defframe is defined outside this file; presumably each
C name expands to an esp-relative operand adjusted by the current FRAME
C value -- confirm against the m4 definitions.
47defframe(PARAM_DIVISOR,16)
48defframe(PARAM_SIZE,   12)
49defframe(PARAM_SRC,    8)
50defframe(PARAM_DST,    4)
51
52dnl  re-use parameter space
C VAR_INVERSE is an alias for the src argument slot.  mpn_divexact_1 copies
C PARAM_SRC into esi before it stores the inverse there, so the pointer is
C not lost.
53define(VAR_INVERSE,`PARAM_SRC')
54
55	TEXT
56
57	ALIGN(16)
C -----------------------------------------------------------------------
C mpn_divexact_1 -- divide the size-limb number at src by divisor and
C store the size-limb quotient at dst, on the assumption that the division
C is exact.
C
C Registers: ebp, edi, ebx and esi are pushed on entry and popped before
C the ret.  eax, ecx and edx are used without being preserved.
C
C Requirements visible in the code:
C   size >= 1      src[0] is loaded before the limb count is tested
C   divisor != 0   the strip_twos loop ends only once a 1 bit has been
C                  shifted out of the divisor
C
C Argument area side effects: the divisor slot is overwritten with the
C divisor stripped of its low zero bits, and the src slot is reused to
C hold the inverse (VAR_INVERSE).
C
C Method: let d be the divisor with its low zero bits removed, shift the
C count of those bits, and inv the inverse of d mod 2^32.  For each limb,
C low to high, with s the source shifted right by shift bits and c the
C carry from the limb below (zero for the first limb):
C     q = (s - c) * inv   mod 2^32
C     c = high 32 bits of q*d, plus 1 if s - c borrowed
C
C No return value is set up explicitly; at the ret, eax still holds the
C last quotient limb that was stored.
C -----------------------------------------------------------------------
58PROLOGUE(mpn_divexact_1)
59deflit(`FRAME',0)
60
	C Save the four registers restored at the end, interleaved with the
	C first argument loads.  FRAME_pushl is defined elsewhere; presumably
	C it bumps FRAME so later PARAM_ references stay correct -- confirm.
61	movl	PARAM_DIVISOR, %eax
62	pushl	%ebp	FRAME_pushl()
63
64	movl	PARAM_SIZE, %ebp
65	pushl	%edi	FRAME_pushl()
66
67	pushl	%ebx	FRAME_pushl()
68	movl	$-1, %ecx		C shift count
69
70	pushl	%esi	FRAME_pushl()
71
	C Count the low zero bits of the divisor in ecx.  The loop exits with
	C carry set, that is after the lowest 1 bit has been shifted out as
	C well, so eax is left holding (d without twos) >> 1.
72L(strip_twos):
73	incl	%ecx
74
75	shrl	%eax
76	jnc	L(strip_twos)
77
	C Rebuild the odd divisor as 2*eax+1, then keep 7 bits of d/2 as the
	C index into binvert_limb_table (defined elsewhere), which supplies
	C the 8-bit starting inverse.
78	leal	1(%eax,%eax), %ebx	C d without twos
79	andl	$127, %eax		C d/2, 7 bits
80
81ifdef(`PIC',`
82	LEA(	binvert_limb_table, %edx)
83	movzbl	(%eax,%edx), %eax		C inv 8 bits
84',`
85	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
86')
87
	C Two rounds of inv = 2*inv - inv*inv*d.  Each round doubles the
	C number of correct low bits of the inverse: 8 -> 16 -> 32.  The
	C pointer and counter setup is interleaved between the multiplies.
88	leal	(%eax,%eax), %edx	C 2*inv
89	movl	%ebx, PARAM_DIVISOR	C d without twos
90
91	imull	%eax, %eax		C inv*inv
92
93	movl	PARAM_SRC, %esi
94	movl	PARAM_DST, %edi
95
96	imull	%ebx, %eax		C inv*inv*d
97
98	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
99	leal	(%edx,%edx), %eax	C 2*inv
100
101	imull	%edx, %edx		C inv*inv
102
	C Point esi and edi just past the last limb and index them with a
	C negative counter that runs up towards zero.
103	leal	(%esi,%ebp,4), %esi	C src end
104	leal	(%edi,%ebp,4), %edi	C dst end
105	negl	%ebp			C -size
106
107	imull	%ebx, %edx		C inv*inv*d
108
109	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
110
111	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
112	pushl	%eax	FRAME_pushl()
113	imull	PARAM_DIVISOR, %eax
114	cmpl	$1, %eax
115	popl	%eax	FRAME_popl()')
116
	C PARAM_SRC is already in esi, so its stack slot can take the inverse.
117	movl	%eax, VAR_INVERSE
118	movl	(%esi,%ebp,4), %eax	C src[0]
119
	C No carry bit and no carry limb before the first limb.
120	xorl	%ebx, %ebx
121	xorl	%edx, %edx
122
	C ebp becomes 1-size.  Zero means src has a single limb, which is
	C handled entirely by the code at L(one).
123	incl	%ebp
124	jz	L(one)
125
	C First limb of a multi-limb operand: shift in the low bits of
	C src[1] and join the loop at the multiply, as there is no carry to
	C subtract yet.
126	movl	(%esi,%ebp,4), %edx	C src[1]
127
128	shrdl(	%cl, %edx, %eax)
129
130	movl	VAR_INVERSE, %edx
131	jmp	L(entry)
132
133
134	ALIGN(8)
135	nop	C k6 code alignment
136	nop
137L(top):
138	C eax	q
139	C ebx	carry bit, 0 or -1
140	C ecx	shift
141	C edx	carry limb
142	C esi	src end
143	C edi	dst end
144	C ebp	counter, limbs, negative
145
	C -4(%esi,%ebp,4) is the limb being divided and (%esi,%ebp,4) is the
	C limb above it, which supplies the bits shifted in at the top.
	C Subtracting ebx (0 or -1) from edx adds the previous borrow to the
	C carry limb.
146	movl	-4(%esi,%ebp,4), %eax
147	subl	%ebx, %edx		C accumulate carry bit
148
149	movl	(%esi,%ebp,4), %ebx
150
151	shrdl(	%cl, %ebx, %eax)
152
153	subl	%edx, %eax		C apply carry limb
154	movl	VAR_INVERSE, %edx
155
	C Capture the borrow of the subl above as 0 or -1.  The movl in
	C between does not change the flags.
156	sbbl	%ebx, %ebx
157
158L(entry):
	C q = eax * inverse, low 32 bits.  Store it as the quotient limb.
159	imull	%edx, %eax
160
161	movl	%eax, -4(%edi,%ebp,4)
162	movl	PARAM_DIVISOR, %edx
163
	C edx:eax = q * d.  The high half left in edx is the carry limb to
	C subtract from the next limb.
164	mull	%edx
165
166	incl	%ebp
167	jnz	L(top)
168
169
	C Most significant limb.  There is no limb above it, so a plain shrl
	C is used instead of shrdl.  L(one) is also reached directly for
	C size 1, with eax = src[0] and ebx = edx = 0.
170	movl	-4(%esi), %eax		C src high limb
171L(one):
172	shrl	%cl, %eax
173	popl	%esi	FRAME_popl()
174
	C ebx is 0 or -1, so this add subtracts the pending borrow.
175	addl	%ebx, %eax		C apply carry bit
176	popl	%ebx	FRAME_popl()
177
178	subl	%edx, %eax		C apply carry limb
179
180	imull	VAR_INVERSE, %eax
181
182	movl	%eax, -4(%edi)
183
184	popl	%edi
185	popl	%ebp
186
187	ret
188
189EPILOGUE()
190ASM_END()
191