dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mod_1_1.asm (revision aceb213538ec08a74028e213127af18aa17bf1cf)
dnl  x86-32 mpn_mod_1_1p, requiring cmov.

dnl  Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
dnl
dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

22include(`../config.m4')
23
24C			    cycles/limb
25C P5				 ?
26C P6 model 0-8,10-12		 ?
27C P6 model 9  (Banias)		 ?
28C P6 model 13 (Dothan)		 ?
29C P4 model 0  (Willamette)	 ?
30C P4 model 1  (?)		 ?
31C P4 model 2  (Northwood)	 ?
32C P4 model 3  (Prescott)	 ?
33C P4 model 4  (Nocona)		 ?
34C AMD K6			 ?
35C AMD K7			 7
36C AMD K8			 ?
37
38define(`B2mb', `%ebx')
39define(`r0', `%esi')
40define(`r2', `%ebp')
41define(`t0', `%edi')
42define(`ap', `%ecx')  C Also shift count
43
44C Stack frame
45C	pre	36(%esp)
46C	b	32(%esp)
47C	n	28(%esp)
48C	ap	24(%esp)
49C	return	20(%esp)
50C	%ebp	16(%esp)
51C	%edi	12(%esp)
52C	%esi	8(%esp)
53C	%ebx	4(%esp)
54C	B2mod	(%esp)
55
56define(`B2modb', `(%esp)')
57define(`n', `28(%esp)')
58define(`b', `32(%esp)')
59define(`pre', `36(%esp)')
60
61C mp_limb_t
62C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
63C
64C The pre array contains bi, cnt, B1modb, B2modb
65C Note: This implementation needs B1modb only when cnt > 0
66
67ASM_START()
68	TEXT
69	ALIGN(8)
70PROLOGUE(mpn_mod_1_1p)
71	push	%ebp
72	push	%edi
73	push	%esi
74	push	%ebx
75	mov	32(%esp), %ebp		C pre[]
76
77	mov	12(%ebp), %eax		C B2modb
78	push	%eax			C Put it on stack
79
80	mov	n, %edx
81	mov	24(%esp), ap
82
83	lea	(ap, %edx, 4), ap
84	mov	-4(ap), %eax
85	cmp	$3, %edx
86	jnc	L(first)
87	mov	-8(ap), r0
88	jmp	L(reduce_two)
89
90L(first):
91	C First iteration, no r2
92	mull	B2modb
93	mov	-12(ap), r0
94	add	%eax, r0
95	mov	-8(ap), %eax
96	adc	%edx, %eax
97	sbb	r2, r2
98	sub	$3, n
99	lea	-16(ap), ap
100	jz	L(reduce_three)
101
102	mov	B2modb, B2mb
103	sub	b, B2mb
104	lea	(B2mb, r0), t0
105	jmp	L(mid)
106
107	ALIGN(16)
108L(top): C Loopmixed to 7 c/l on k7
109	add	%eax, r0
110	lea	(B2mb, r0), t0
111	mov	r2, %eax
112	adc	%edx, %eax
113	sbb	r2, r2
114L(mid):	mull	B2modb
115	and	B2modb, r2
116	add	r0, r2
117	decl	n
118	mov	(ap), r0
119	cmovc(	t0, r2)
120	lea	-4(ap), ap
121	jnz	L(top)
122
123	add	%eax, r0
124	mov	r2, %eax
125	adc	%edx, %eax
126	sbb	r2, r2
127
128L(reduce_three):
129	C Eliminate r2
130	and	b, r2
131	sub	r2, %eax
132
133L(reduce_two):
134	mov	pre, %ebp
135	movb	4(%ebp), %cl
136	test	%cl, %cl
137	jz	L(normalized)
138
139	C Unnormalized, use B1modb to reduce to size < B b
140	mull	8(%ebp)
141	xor	t0, t0
142	add	%eax, r0
143	adc	%edx, t0
144	mov	t0, %eax
145
146	C Left-shift to normalize
147	shld	%cl, r0, %eax C Always use shld?
148
149	shl	%cl, r0
150	jmp	L(udiv)
151
152L(normalized):
153	mov	%eax, t0
154	sub	b, t0
155	cmovnc(	t0, %eax)
156
157L(udiv):
158	lea	1(%eax), t0
159	mull	(%ebp)
160	mov	b, %ebx		C Needed in register for lea
161	add	r0, %eax
162	adc	t0, %edx
163	imul	%ebx, %edx
164	sub	%edx, r0
165	cmp	r0, %eax
166	lea	(%ebx, r0), %eax
167	cmovnc(	r0, %eax)
168	cmp	%ebx, %eax
169	jnc	L(fix)
170L(ok):	shr	%cl, %eax
171
172	add	$4, %esp
173	pop	%ebx
174	pop	%esi
175	pop	%edi
176	pop	%ebp
177
178	ret
179L(fix):	sub	%ebx, %eax
180	jmp	L(ok)
181EPILOGUE()
182
183PROLOGUE(mpn_mod_1_1p_cps)
184	push	%ebp
185	mov	12(%esp), %ebp
186	push	%esi
187	bsr	%ebp, %ecx
188	push	%ebx
189	xor	$31, %ecx
190	mov	16(%esp), %esi
191	sal	%cl, %ebp
192	mov	%ebp, %edx
193	not	%edx
194	mov	$-1, %eax
195	div	%ebp			C On K7, invert_limb would be a few cycles faster.
196	mov	%eax, (%esi)		C store bi
197	mov	%ecx, 4(%esi)		C store cnt
198	neg	%ebp
199	mov	$1, %edx
200	shld	%cl, %eax, %edx
201	imul	%ebp, %edx
202	shr	%cl, %edx
203	imul	%ebp, %eax
204	mov	%edx, 8(%esi)		C store B1modb
205	mov	%eax, 12(%esi)		C store B2modb
206	pop	%ebx
207	pop	%esi
208	pop	%ebp
209	ret
210EPILOGUE()
211