xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/mode1o.asm (revision 8ecbf5f02b752fcb7debe1a8fab1dc82602bc760)
1dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 23.0 cycles/limb
35
36
37C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
38C                               mp_limb_t divisor);
39C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40C                                mp_limb_t divisor, mp_limb_t carry);
41C
42C There seems no way to pair up the two lone instructions in the main loop.
43C
44C The special case for size==1 saves about 20 cycles (non-PIC), making it
45C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
46C all sizes.
47C
48C Alternatives:
49C
50C Using mmx for the multiplies might be possible, with pmullw and pmulhw
51C having just 3 cycle latencies, but carry bit handling would probably be
52C complicated.
53
54defframe(PARAM_CARRY,  16)
55defframe(PARAM_DIVISOR,12)
56defframe(PARAM_SIZE,   8)
57defframe(PARAM_SRC,    4)
58
59dnl  re-using parameter space
60define(VAR_INVERSE,`PARAM_SIZE')
61
62	TEXT
63
64	ALIGN(16)
65PROLOGUE(mpn_modexact_1c_odd)
66deflit(`FRAME',0)
67
68	movl	PARAM_DIVISOR, %eax
69	movl	PARAM_CARRY, %edx
70
71	jmp	L(start_1c)
72
73EPILOGUE()
74
75	ALIGN(16)
76PROLOGUE(mpn_modexact_1_odd)
77deflit(`FRAME',0)
78
79	movl	PARAM_DIVISOR, %eax
80	xorl	%edx, %edx		C carry
81
82L(start_1c):
83
84ifdef(`PIC',`
85ifdef(`DARWIN',`
86	shrl	%eax			C d/2
87	LEA(	binvert_limb_table, %ecx)
88	pushl	%ebx		FRAME_pushl()
89	movl	PARAM_SIZE, %ebx
90
91	andl	$127, %eax
92	subl	$2, %ebx
93
94	movb	(%eax,%ecx), %cl
95	jc	L(one_limb)
96',`
97	call	L(here)		FRAME_pushl()
98L(here):
99
100	shrl	%eax			C d/2
101	movl	(%esp), %ecx		C eip
102
103	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
104	movl	%ebx, (%esp)		C push ebx
105
106	andl	$127, %eax
107	movl	PARAM_SIZE, %ebx
108
109	movl	binvert_limb_table@GOT(%ecx), %ecx
110	subl	$2, %ebx
111
112	movb	(%eax,%ecx), %cl			C inv 8 bits
113	jc	L(one_limb)
114')
115',`
116dnl non-PIC
117	shrl	%eax			C d/2
118	pushl	%ebx		FRAME_pushl()
119
120	movl	PARAM_SIZE, %ebx
121	andl	$127, %eax
122
123	subl	$2, %ebx
124	jc	L(one_limb)
125
126	movb	binvert_limb_table(%eax), %cl		C inv 8 bits
127')
128
129	movl	%ecx, %eax
130	addl	%ecx, %ecx		C 2*inv
131
132	imull	%eax, %eax		C inv*inv
133
134	imull	PARAM_DIVISOR, %eax	C inv*inv*d
135
136	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
137
138	movl	%ecx, %eax
139	addl	%ecx, %ecx		C 2*inv
140
141	imull	%eax, %eax		C inv*inv
142
143	imull	PARAM_DIVISOR, %eax	C inv*inv*d
144
145	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
146	pushl	%esi		FRAME_pushl()
147
148	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
149	movl	%ecx, %eax
150	imull	PARAM_DIVISOR, %eax
151	cmpl	$1, %eax')
152
153	movl	PARAM_SRC, %esi
154	movl	%ecx, VAR_INVERSE
155
156	movl	(%esi), %eax		C src[0]
157	leal	4(%esi,%ebx,4), %esi	C &src[size-1]
158
159	xorl	$-1, %ebx		C -(size-1)
160	ASSERT(nz)
161	jmp	L(entry)
162
163
164C The use of VAR_INVERSE means only a store is needed for that value, rather
165C than a push and pop of say %edi.
166
167	ALIGN(16)
168L(top):
169	C eax	scratch, low product
170	C ebx	counter, limbs, negative
171	C ecx	carry bit
172	C edx	scratch, high product
173	C esi	&src[size-1]
174	C edi
175	C ebp
176
177	mull	PARAM_DIVISOR		C h:dummy = q*d
178
179	movl	(%esi,%ebx,4), %eax	C src[i]
180	subl	%ecx, %edx		C h -= -c
181
182L(entry):
183	subl	%edx, %eax		C s = src[i] - h
184
185	sbbl	%ecx, %ecx		C new -c (0 or -1)
186
187	imull	VAR_INVERSE, %eax	C q = s*i
188
189	incl	%ebx
190	jnz	L(top)
191
192
193	mull	PARAM_DIVISOR
194
195	movl	(%esi), %eax		C src high
196	subl	%ecx, %edx		C h -= -c
197
198	cmpl	PARAM_DIVISOR, %eax
199
200	jbe	L(skip_last)
201deflit(FRAME_LAST,FRAME)
202
203
204	subl	%edx, %eax		C s = src[i] - h
205	popl	%esi		FRAME_popl()
206
207	sbbl	%ecx, %ecx		C c (0 or -1)
208	popl	%ebx		FRAME_popl()
209
210	imull	VAR_INVERSE, %eax	C q = s*i
211
212	mull	PARAM_DIVISOR		C h:dummy = q*d
213
214	movl	%edx, %eax
215
216	subl	%ecx, %eax
217
218	ret
219
220
221C When high<divisor can skip last step.
222
223L(skip_last):
224deflit(`FRAME',FRAME_LAST)
225	C eax	src high
226	C ebx
227	C ecx
228	C edx	r
229	C esi
230
231	subl	%eax, %edx	C r-s
232	popl	%esi		FRAME_popl()
233
234	sbbl	%eax, %eax	C -1 if underflow
235	movl	PARAM_DIVISOR, %ebx
236
237	andl	%ebx, %eax	C divisor if underflow
238	popl	%ebx		FRAME_popl()
239
240	addl	%edx, %eax	C addback if underflow
241
242	ret
243
244
245C Special case for size==1 using a division for r = c-a mod d.
246C Could look for a-c<d and save a division sometimes, but that doesn't seem
247C worth bothering about.
248
249L(one_limb):
250deflit(`FRAME',4)
251	C eax
252	C ebx	size-2 (==-1)
253	C ecx
254	C edx	carry
255	C esi	src end
256	C edi
257	C ebp
258
259	movl	%edx, %eax
260	movl	PARAM_SRC, %edx
261
262	movl	PARAM_DIVISOR, %ecx
263	popl	%ebx		FRAME_popl()
264
265	subl	(%edx), %eax		C c-a
266
267	sbbl	%edx, %edx
268	decl	%ecx			C d-1
269
270	andl	%ecx, %edx		C b*d+c-a if c<a, or c-a if c>=a
271
272	divl	PARAM_DIVISOR
273
274	movl	%edx, %eax
275
276	ret
277
278EPILOGUE()
279ASM_END()
280