dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C         divisor
C       odd   even
C K6:   10.0  12.0  cycles/limb
C K6-2: 10.0  11.5


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C A simple divl is used for size==1.  This is about 10 cycles faster for an
C odd divisor, and about 20 cycles faster for an even one.
C
C The loops are quite sensitive to code alignment; speeds should be
C rechecked (odd and even divisor, PIC and non-PIC) if contemplating
C changing anything.
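C
C The per-limb step both loops implement can be sketched in C roughly as
C follows.  This is only an illustration of the technique (32-bit limbs
C assumed), not code taken from GMP; umulhi() is a hypothetical helper
C returning the high 32 bits of the 32x32->64 bit product.
C
C	mp_limb_t  c = 0;                      /* borrow/carry limb */
C	for (i = 0; i < size; i++)
C	  {
C	    mp_limb_t  s = src[i];             /* for even d, limb of src>>twos */
C	    mp_limb_t  q = (s - c) * inverse;  /* exact quotient limb, mod 2^32 */
C	    dst[i] = q;
C	    c = umulhi (q, divisor) + (s < c); /* high product, plus borrow */
C	  }
C
C Since divisor*inverse == 1 mod 2^32, q*divisor has s-c as its low limb,
C and its high limb is the carry to subtract from the next source limb.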
49
50defframe(PARAM_DIVISOR,16)
51defframe(PARAM_SIZE,   12)
52defframe(PARAM_SRC,    8)
53defframe(PARAM_DST,    4)
54
55dnl  re-use parameter space
56define(VAR_INVERSE,`PARAM_DST')
57
58	TEXT
59
60	ALIGN(32)
61PROLOGUE(mpn_divexact_1)
62deflit(`FRAME',0)
63
64	movl	PARAM_SIZE, %ecx
65
66	movl	PARAM_SRC, %eax
67	xorl	%edx, %edx
68
69	cmpl	$1, %ecx
70	jnz	L(two_or_more)
71
72	movl	(%eax), %eax
73
74	divl	PARAM_DIVISOR
75
76	movl	PARAM_DST, %ecx
77	movl	%eax, (%ecx)
78
79	ret
80
81
82L(two_or_more):
83	movl	PARAM_DIVISOR, %eax
84	pushl	%ebx		FRAME_pushl()
85
86	movl	PARAM_SRC, %ebx
87	pushl	%ebp		FRAME_pushl()
88
89L(strip_twos):
90	shrl	%eax
91	incl	%edx			C will get shift+1
92
93	jnc	L(strip_twos)
94	pushl	%esi		FRAME_pushl()
95
96	leal	1(%eax,%eax), %esi	C d without twos
97	andl	$127, %eax		C d/2, 7 bits
98
99ifdef(`PIC',`
100	LEA(	binvert_limb_table, %ebp)
101Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
102',`
103	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
104')
105	pushl	%edi		FRAME_pushl()
106
107	leal	(%eax,%eax), %ebp	C 2*inv
108
109	imull	%eax, %eax		C inv*inv
110
111	movl	PARAM_DST, %edi
112
113	imull	%esi, %eax		C inv*inv*d
114
115	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
116	leal	(%ebp,%ebp), %eax	C 2*inv
117
118	imull	%ebp, %ebp		C inv*inv
119
120	movl	%esi, PARAM_DIVISOR	C d without twos
121	leal	(%ebx,%ecx,4), %ebx	C src end
122
123	imull	%esi, %ebp		C inv*inv*d
124
125	leal	(%edi,%ecx,4), %edi	C dst end
126	negl	%ecx			C -size
127
128	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
129	subl	$1, %edx		C shift amount, and clear carry
130
131	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
132	pushl	%eax	FRAME_pushl()
133	imull	PARAM_DIVISOR, %eax
134	cmpl	$1, %eax
135	popl	%eax	FRAME_popl()')
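
C The binvert_limb_table lookup above gives an inverse of d mod 2^8 for odd
C d, and each Newton step inv = 2*inv - inv*inv*d doubles the number of
C correct low bits: if d*inv == 1 + k*2^n then
C d*(2*inv - inv*inv*d) == 1 - k^2*2^(2*n).  Two steps therefore lift the
C 8 correct bits to the full 32.  A rough C rendering of the computation
C (illustrative only, not GMP's binvert_limb macro):
C
C	mp_limb_t  inv = binvert_limb_table[(d >> 1) & 127]; /* 8 bits  */
C	inv = 2*inv - inv*inv*d;                             /* 16 bits */
C	inv = 2*inv - inv*inv*d;                             /* 32 bits */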

	movl	%eax, VAR_INVERSE
	jnz	L(even)

	movl	(%ebx,%ecx,4), %esi	C src low limb
	jmp	L(odd_entry)


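C The odd-divisor loop is a software-pipelined rendering of the sketch at
C the top of the file: imull by the inverse forms the quotient limb, mull
C recovers the high half of q*d as the next carry, and sbbl materializes
C the borrow from the carry subtraction as 0 or -1 in %ebp so the next
C pass can fold it back in with subl %ebp, %edx.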
	ALIGN(16)
	nop	C code alignment
L(odd_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1

	imull	%edx, %esi

	movl	PARAM_DIVISOR, %eax
	movl	%esi, -4(%edi,%ecx,4)

	mull	%esi			C carry limb in edx

	subl	%ebp, %edx		C apply carry bit
	movl	(%ebx,%ecx,4), %esi

L(odd_entry):
	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(odd_top)


	imull	%edx, %esi

	movl	%esi, -4(%edi,%ecx,4)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


L(even):
	C eax
	C ebx	src end
	C ecx	-size
	C edx	twos
	C esi
	C edi	dst end
	C ebp

	xorl	%ebp, %ebp
Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]

	movd	%edx, %mm7
	movl	VAR_INVERSE, %edx

	addl	$2, %ecx
	psrlq	%mm7, %mm0

	movd	%mm0, %esi
	jz	L(even_two)		C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of
C 12 on K6-2.  In fact there are only 11 cycles of decode, but nothing
C running at 11 has been found.  Maybe the fact that every second movq is
C unaligned costs the extra 0.5.
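C
C Each movq in the loop loads two adjacent source limbs and psrlq shifts
C that 64-bit pair right by the twos count, so %esi picks up successive
C limbs of src>>twos.  In C terms (an illustration only, 32-bit limbs
C assumed):
C
C	uint64_t   pair = (uint64_t) src[i] | ((uint64_t) src[i+1] << 32);
C	mp_limb_t  next = (mp_limb_t) (pair >> twos); /* limb i of src>>twos */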

L(even_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1
	C
	C mm0	scratch, source limbs
	C mm7	twos

	imull	%edx, %esi

	movl	%esi, -8(%edi,%ecx,4)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movq	-4(%ebx,%ecx,4), %mm0
	psrlq	%mm7, %mm0

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(even_top)


L(even_two):
	movd	-4(%ebx), %mm0		C src high limb
	psrlq	%mm7, %mm0

	imull	%edx, %esi

	movl	%esi, -8(%edi)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	movl	VAR_INVERSE, %eax
	subl	%edx, %esi		C apply carry limb

	imull	%eax, %esi

	movl	%esi, -4(%edi)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	emms_or_femms

	ret

EPILOGUE()
ASM_END()