xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/divrem_1.asm (revision 901e7e84758515fbf39dfc064cb0b45ab146d8b0)
1dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
2
3dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: 20 cycles/limb
35
36
37C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
38C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
39C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
40C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
41C                          mp_limb_t carry);
42C
43C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
44C instead of decl+jnz, since it comes out 2 cycles/limb faster.
45C
46C A test is done to see if the high limb is less than the divisor, and if so
47C one less div is done.  A div is 20 cycles, so assuming high<divisor about
48C half the time, then this test saves half that amount.  The branch
49C misprediction penalty is less than that.
50C
51C Back-to-back div instructions run at 20 cycles, the same as the loop here,
52C so it seems there's nothing to gain by rearranging the loop.  Pairing the
53C mov and loop instructions was found to gain nothing.
54C
55C Enhancements:
56C
57C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
58C that algorithm has been found to suffer from the relatively poor carry
59C handling on K6 and too many auxiliary instructions.  The fractional part
60C however could be done at about 13 c/l, if it mattered enough.
61
62defframe(PARAM_CARRY,  24)
63defframe(PARAM_DIVISOR,20)
64defframe(PARAM_SIZE,   16)
65defframe(PARAM_SRC,    12)
66defframe(PARAM_XSIZE,  8)
67defframe(PARAM_DST,    4)
68
69	TEXT
70
71	ALIGN(32)
72PROLOGUE(mpn_divrem_1c)
73deflit(`FRAME',0)
74
75	movl	PARAM_SIZE, %ecx
76	pushl	%edi		FRAME_pushl()
77
78	movl	PARAM_SRC, %edi
79	pushl	%esi		FRAME_pushl()
80
81	movl	PARAM_DIVISOR, %esi
82	pushl	%ebx		FRAME_pushl()
83
84	movl	PARAM_DST, %ebx
85	pushl	%ebp		FRAME_pushl()
86
87	movl	PARAM_XSIZE, %ebp
88	orl	%ecx, %ecx		C size
89
90	movl	PARAM_CARRY, %edx
91	jz	L(fraction)		C if size==0
92
93	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
94	jmp	L(integer_top)
95
96EPILOGUE()
97
98
99	ALIGN(16)
100PROLOGUE(mpn_divrem_1)
101deflit(`FRAME',0)
102
103	movl	PARAM_SIZE, %ecx
104	pushl	%edi		FRAME_pushl()
105
106	movl	PARAM_SRC, %edi
107	pushl	%esi		FRAME_pushl()
108
109	movl	PARAM_DIVISOR, %esi
110	orl	%ecx,%ecx		C size
111
112	jz	L(size_zero)
113	pushl	%ebx		FRAME_pushl()
114
115	movl	-4(%edi,%ecx,4), %eax	C src high limb
116	xorl	%edx, %edx
117
118	movl	PARAM_DST, %ebx
119	pushl	%ebp		FRAME_pushl()
120
121	movl	PARAM_XSIZE, %ebp
122	cmpl	%esi, %eax
123
124	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
125	jae	L(integer_entry)
126
127
128	C high<divisor, so high of dst is zero, and avoid one div
129
130	movl	%edx, (%ebx,%ecx,4)
131	decl	%ecx
132
133	movl	%eax, %edx
134	jz	L(fraction)
135
136
137L(integer_top):
138	C eax	scratch (quotient)
139	C ebx	dst+4*xsize-4
140	C ecx	counter
141	C edx	scratch (remainder)
142	C esi	divisor
143	C edi	src
144	C ebp	xsize
145
146	movl	-4(%edi,%ecx,4), %eax
147L(integer_entry):
148
149	divl	%esi
150
151	movl	%eax, (%ebx,%ecx,4)
152	loop	L(integer_top)
153
154
155L(fraction):
156	orl	%ebp, %ecx
157	jz	L(done)
158
159	movl	PARAM_DST, %ebx
160
161
162L(fraction_top):
163	C eax	scratch (quotient)
164	C ebx	dst
165	C ecx	counter
166	C edx	scratch (remainder)
167	C esi	divisor
168	C edi
169	C ebp
170
171	xorl	%eax, %eax
172
173	divl	%esi
174
175	movl	%eax, -4(%ebx,%ecx,4)
176	loop	L(fraction_top)
177
178
179L(done):
180	popl	%ebp
181	movl	%edx, %eax
182	popl	%ebx
183	popl	%esi
184	popl	%edi
185	ret
186
187
188L(size_zero):
189deflit(`FRAME',8)
190	movl	PARAM_XSIZE, %ecx
191	xorl	%eax, %eax
192
193	movl	PARAM_DST, %edi
194
195	cld	C better safe than sorry, see mpn/x86/README
196
197	rep
198	stosl
199
200	popl	%esi
201	popl	%edi
202	ret
203EPILOGUE()
204