xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/divrem_1.asm (revision 7e30e94394d0994ab9534f68a8f91665045c91ce)
1dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
4dnl  Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C K6: 20 cycles/limb
25
26
27C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
28C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
29C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
30C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
31C                          mp_limb_t carry);
32C
33C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
34C instead of decl+jnz, since it comes out 2 cycles/limb faster.
35C
36C A test is done to see if the high limb is less than the divisor, and if so
37C one less div is done.  A div is 20 cycles, so assuming high<divisor about
38C half the time, then this test saves half that amount.  The branch
39C misprediction penalty is less than that.
40C
41C Back-to-back div instructions run at 20 cycles, the same as the loop here,
42C so it seems there's nothing to gain by rearranging the loop.  Pairing the
43C mov and loop instructions was found to gain nothing.
44C
45C Enhancements:
46C
47C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
48C that algorithm has been found to suffer from the relatively poor carry
49C handling on K6 and too many auxiliary instructions.  The fractional part
50C however could be done at about 13 c/l, if it mattered enough.
51
52defframe(PARAM_CARRY,  24)
53defframe(PARAM_DIVISOR,20)
54defframe(PARAM_SIZE,   16)
55defframe(PARAM_SRC,    12)
56defframe(PARAM_XSIZE,  8)
57defframe(PARAM_DST,    4)
58
59	TEXT
60
61	ALIGN(32)
62PROLOGUE(mpn_divrem_1c)
63deflit(`FRAME',0)
64
65	movl	PARAM_SIZE, %ecx
66	pushl	%edi		FRAME_pushl()
67
68	movl	PARAM_SRC, %edi
69	pushl	%esi		FRAME_pushl()
70
71	movl	PARAM_DIVISOR, %esi
72	pushl	%ebx		FRAME_pushl()
73
74	movl	PARAM_DST, %ebx
75	pushl	%ebp		FRAME_pushl()
76
77	movl	PARAM_XSIZE, %ebp
78	orl	%ecx, %ecx		C size
79
80	movl	PARAM_CARRY, %edx
81	jz	L(fraction)		C if size==0
82
83	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
84	jmp	L(integer_top)
85
86EPILOGUE()
87
88
89	ALIGN(16)
90PROLOGUE(mpn_divrem_1)
91deflit(`FRAME',0)
92
93	movl	PARAM_SIZE, %ecx
94	pushl	%edi		FRAME_pushl()
95
96	movl	PARAM_SRC, %edi
97	pushl	%esi		FRAME_pushl()
98
99	movl	PARAM_DIVISOR, %esi
100	orl	%ecx,%ecx		C size
101
102	jz	L(size_zero)
103	pushl	%ebx		FRAME_pushl()
104
105	movl	-4(%edi,%ecx,4), %eax	C src high limb
106	xorl	%edx, %edx
107
108	movl	PARAM_DST, %ebx
109	pushl	%ebp		FRAME_pushl()
110
111	movl	PARAM_XSIZE, %ebp
112	cmpl	%esi, %eax
113
114	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
115	jae	L(integer_entry)
116
117
118	C high<divisor, so high of dst is zero, and avoid one div
119
120	movl	%edx, (%ebx,%ecx,4)
121	decl	%ecx
122
123	movl	%eax, %edx
124	jz	L(fraction)
125
126
127L(integer_top):
128	C eax	scratch (quotient)
129	C ebx	dst+4*xsize-4
130	C ecx	counter
131	C edx	scratch (remainder)
132	C esi	divisor
133	C edi	src
134	C ebp	xsize
135
136	movl	-4(%edi,%ecx,4), %eax
137L(integer_entry):
138
139	divl	%esi
140
141	movl	%eax, (%ebx,%ecx,4)
142	loop	L(integer_top)
143
144
145L(fraction):
146	orl	%ebp, %ecx
147	jz	L(done)
148
149	movl	PARAM_DST, %ebx
150
151
152L(fraction_top):
153	C eax	scratch (quotient)
154	C ebx	dst
155	C ecx	counter
156	C edx	scratch (remainder)
157	C esi	divisor
158	C edi
159	C ebp
160
161	xorl	%eax, %eax
162
163	divl	%esi
164
165	movl	%eax, -4(%ebx,%ecx,4)
166	loop	L(fraction_top)
167
168
169L(done):
170	popl	%ebp
171	movl	%edx, %eax
172	popl	%ebx
173	popl	%esi
174	popl	%edi
175	ret
176
177
178L(size_zero):
179deflit(`FRAME',8)
180	movl	PARAM_XSIZE, %ecx
181	xorl	%eax, %eax
182
183	movl	PARAM_DST, %edi
184
185	cld	C better safe than sorry, see mpn/x86/README
186
187	rep
188	stosl
189
190	popl	%esi
191	popl	%edi
192	ret
193EPILOGUE()
194