dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/rshift.asm (revision 7d3af8c6a2070d16ec6d1aef203d052d6683100d)
dnl  Intel Pentium mpn_rshift -- mpn right shift.

dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
dnl  Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C P5,P54:    6.0
C P55:       5.375


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.

C Byte offsets of the four arguments, in prototype order from 4 upwards.
C NOTE(review): defframe is defined outside this file; presumably it names
C a stack slot relative to the entry stack pointer and is adjusted by the
C current FRAME value -- confirm against the x86 m4 definitions.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_rshift)

C Shift {src,size} right by shift bits and store the size result limbs at
C dst.  The return value in %eax holds the bits shifted out of the least
C significant source limb, placed at the most significant end of the limb.
C
C Registers once the parameters have been loaded:
C	%edi	dst pointer
C	%esi	src pointer
C	%ebp	size, then loop counters
C	%ecx	shift count, used through %cl
C	%edx	source limb carried from one step to the next
C	%eax, %ebx	scratch
C
C Two code paths follow.  L(normal) handles any shift count and walks from
C the least significant limb upwards using shrdl.  L(special) handles a
C shift of exactly 1 and walks from the most significant limb downwards,
C passing the shifted-out bit along in the carry flag with rcrl.
C
C NOTE(review): the instruction order inside the unrolled loops is
C presumably scheduled for the cycle counts quoted at the top of the file;
C keep the order as it is unless the timing is measured again.

	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
C Four registers, 16 bytes, have been pushed so far.
deflit(`FRAME',16)

	movl	PARAM_DST,%edi
	movl	PARAM_SRC,%esi
	movl	PARAM_SIZE,%ebp
	movl	PARAM_SHIFT,%ecx

C We can use faster code for shift-by-1 under certain conditions.
C Both comparisons are unsigned: jnc is taken when the subtraction done by
C cmpl does not borrow.
	cmpl	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
	leal	(%edi,%ebp,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		C jump if s_ptr >= res_ptr + size

C General path, any shift count, least significant limb first.
C NOTE(review): shrdl() is a macro defined outside this file; presumably it
C emits a shrdl by %cl, i.e. shifts the last operand right and fills the
C vacated high bits from the low bits of the middle operand.
L(normal):
	movl	(%esi),%edx		C least significant source limb
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

	decl	%ebp			C size-1 double-limb shifts remain
	pushl	%ebp			C keep the count for the leftover loop
	shrl	$3,%ebp			C number of 8-limb unrolled passes
	jz	L(end)

	movl	(%edi),%eax		C fetch destination cache line

C Eight limbs per pass.  %edx enters holding the limb not yet stored and
C leaves holding the last limb loaded, rotating through %ebx and %eax.
	ALIGN(4)
L(oop):	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	shrdl(	%cl, %ebx, %edx)
	shrdl(	%cl, %eax, %ebx)
	movl	%edx,8(%edi)
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	shrdl(	%cl, %edx, %eax)
	shrdl(	%cl, %ebx, %edx)
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebp
	jnz	L(oop)

C Leftover limbs: (size-1) mod 8, one per pass.
L(end):	popl	%ebp			C size-1 saved before the unrolled loop
	andl	$7,%ebp
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shrdl(	%cl,%eax,%edx)		C compute result limb
	movl	%edx,(%edi)
	movl	%eax,%edx		C this limb feeds the next step
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		C compute most significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


C Shift-by-1 path.  Unlike L(normal), this loop runs from the most
C significant end of the arrays down to the least significant end.  The
C function is documented to work for overlapping source and destination, so
C this direction is only permissible under the conditions tested at entry:
C res_ptr + 1 >= s_ptr, or s_ptr >= res_ptr + size.  Each store below comes
C after the load of the source limb that sits one position lower.

L(special):
	leal	-4(%edi,%ebp,4),%edi	C most significant destination limb
	leal	-4(%esi,%ebp,4),%esi	C most significant source limb

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebp			C size-1 rcrl steps remain
	pushl	%ebp			C keep the count for the leftover loop
	shrl	$3,%ebp			C number of 8-limb unrolled passes

	shrl	%edx			C top result limb, bit 0 goes to carry
	incl	%ebp			C inc then dec leaves %ebp and the carry
	decl	%ebp			C unchanged but sets ZF from %ebp
	jz	L(Lend)

	movl	(%edi),%eax		C fetch destination cache line

C Eight limbs per pass.  The carry flag links every rcrl to the next one,
C so nothing between them may modify it: only movl, leal and decl are used.
	ALIGN(4)
L(Loop):
	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	%eax
	movl	%ebx,(%edi)
	rcrl	%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	rcrl	%ebx
	movl	%edx,-8(%edi)
	rcrl	%eax
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	rcrl	%edx
	movl	%eax,-16(%edi)
	rcrl	%ebx
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	%eax
	movl	%ebx,-24(%edi)
	rcrl	%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		C use leal not to clobber carry
	leal	-32(%edi),%edi
	decl	%ebp			C decl leaves the carry alone as well
	jnz	L(Loop)

C Leftover limbs: (size-1) mod 8.  The andl would destroy the carry, so it
C is parked in %eax as 0 or -1 and regenerated afterwards by doubling %eax.
L(Lend):
	popl	%ebp			C size-1 saved before the unrolled loop
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	rcrl	%edx
	movl	%ebx,(%edi)

	leal	-4(%esi),%esi		C use leal not to clobber carry
	leal	-4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

	movl	$0,%eax			C movl rather than xorl keeps the carry
	rcrl	%eax			C return limb: shifted-out bit in bit 31

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

EPILOGUE()
234