xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/rshift.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  Intel Pentium mpn_rshift -- mpn right shift.
2
3dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C         cycles/limb
35C P5,P54:    6.0
36C P55:       5.375
37
38
39C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40C                       unsigned shift);
41C
42C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
43C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
44
C Stack-frame offsets of the four mpn_rshift arguments, 4 bytes apart and in
C declaration order: dst, src, size, shift.
C NOTE(review): defframe comes from config.m4, which is not visible here;
C presumably the offsets are relative to the stack pointer at function entry
C and get adjusted by the FRAME value set in the function body -- confirm.
45defframe(PARAM_SHIFT,16)
46defframe(PARAM_SIZE, 12)
47defframe(PARAM_SRC,  8)
48defframe(PARAM_DST,  4)
49
C mpn_rshift: shift the size-limb number at src right by shift bits, store
C the size result limbs at dst, and return in %eax a limb whose high bits
C hold the bits shifted out of the least significant source limb.
C
C Register use after the prologue:
C   %edi		destination pointer
C   %esi		source pointer
C   %ebp		limb / iteration counter
C   %ecx		shift count, %cl being the count operand of the shifts
C   %eax,%ebx,%edx	limbs in flight
C
C Two code paths follow.  L(normal) handles any shift count and walks from
C the least significant limb upwards.  L(special) handles shift-by-1 only,
C walks from the most significant limb downwards and uses rcrl, passing the
C bit between limbs in the carry flag.
50	TEXT
51	ALIGN(8)
52PROLOGUE(mpn_rshift)
53
C Save the four registers this function modifies besides %eax, %ecx, %edx.
C Four 4-byte pushes give the FRAME value of 16 declared below.
54	pushl	%edi
55	pushl	%esi
56	pushl	%ebx
57	pushl	%ebp
58deflit(`FRAME',16)
59
60	movl	PARAM_DST,%edi
61	movl	PARAM_SRC,%esi
62	movl	PARAM_SIZE,%ebp
63	movl	PARAM_SHIFT,%ecx
64
65C We can use faster code for shift-by-1 under certain conditions.
C The two unsigned pointer comparisons below select L(special) when either
C condition holds; otherwise a shift-by-1 falls through to L(normal).
66	cmp	$1,%ecx
67	jne	L(normal)
68	leal	4(%edi),%eax
69	cmpl	%esi,%eax
70	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
71	leal	(%edi,%ebp,4),%eax
72	cmpl	%eax,%esi
73	jnc	L(special)		C jump if s_ptr >= res_ptr + size
74
C General path.  %edx always holds the most recently loaded source limb whose
C result limb has not been stored yet.
75L(normal):
76	movl	(%esi),%edx
77	addl	$4,%esi
78	xorl	%eax,%eax
79	shrdl(	%cl, %edx, %eax)	C compute carry limb
80	pushl	%eax			C push carry limb onto stack
81
C Push size-1 for later, then turn %ebp into the number of 8-limb iterations.
82	decl	%ebp
83	pushl	%ebp
84	shrl	$3,%ebp
85	jz	L(end)
86
87	movl	(%edi),%eax		C fetch destination cache line
88
C Main loop, unrolled 8 times.  Each shrdl pair combines a limb with the next
C higher one; %eax, %ebx and %edx rotate through the roles of current and
C next limb, and the last limb loaded stays in %edx for the next iteration.
C The value loaded from 28(%edi) is overwritten by the very next load into
C %eax, so that instruction only touches the destination memory.
89	ALIGN(4)
90L(oop):	movl	28(%edi),%eax		C fetch destination cache line
91	movl	%edx,%ebx
92
93	movl	(%esi),%eax
94	movl	4(%esi),%edx
95	shrdl(	%cl, %eax, %ebx)
96	shrdl(	%cl, %edx, %eax)
97	movl	%ebx,(%edi)
98	movl	%eax,4(%edi)
99
100	movl	8(%esi),%ebx
101	movl	12(%esi),%eax
102	shrdl(	%cl, %ebx, %edx)
103	shrdl(	%cl, %eax, %ebx)
104	movl	%edx,8(%edi)
105	movl	%ebx,12(%edi)
106
107	movl	16(%esi),%edx
108	movl	20(%esi),%ebx
109	shrdl(	%cl, %edx, %eax)
110	shrdl(	%cl, %ebx, %edx)
111	movl	%eax,16(%edi)
112	movl	%edx,20(%edi)
113
114	movl	24(%esi),%eax
115	movl	28(%esi),%edx
116	shrdl(	%cl, %eax, %ebx)
117	shrdl(	%cl, %edx, %eax)
118	movl	%ebx,24(%edi)
119	movl	%eax,28(%edi)
120
121	addl	$32,%esi
122	addl	$32,%edi
123	decl	%ebp
124	jnz	L(oop)
125
C Recover size-1 and handle the (size-1) mod 8 limbs the unrolled loop did not
C cover, one limb per iteration.
126L(end):	popl	%ebp
127	andl	$7,%ebp
128	jz	L(end2)
129L(oop2):
130	movl	(%esi),%eax
131	shrdl(	%cl,%eax,%edx)		C compute result limb
132	movl	%edx,(%edi)
133	movl	%eax,%edx
134	addl	$4,%esi
135	addl	$4,%edi
136	decl	%ebp
137	jnz	L(oop2)
138
139L(end2):
140	shrl	%cl,%edx		C compute most significant limb
141	movl	%edx,(%edi)		C store it
142
143	popl	%eax			C pop carry limb
144
C Restore the saved registers in reverse order of the pushes and return with
C the carry limb in %eax.
145	popl	%ebp
146	popl	%ebx
147	popl	%esi
148	popl	%edi
149	ret
150
151
152C We loop from the most significant end of the arrays, which is only
153C permissible if the source and destination don't overlap, since the
154C function is documented to work for overlapping source and destination.
155
C Shift-by-1 path.  Both pointers are moved to the most significant limb and
C stepped downwards.  The bit leaving each limb travels to the next lower
C limb in the carry flag, so nothing between the rcrl instructions may
C modify carry.
156L(special):
157	leal	-4(%edi,%ebp,4),%edi
158	leal	-4(%esi,%ebp,4),%esi
159
160	movl	(%esi),%edx
161	subl	$4,%esi
162
C Push size-1 for later, then turn %ebp into the number of 8-limb iterations.
163	decl	%ebp
164	pushl	%ebp
165	shrl	$3,%ebp
166
C The shrl puts the low bit of the top limb into carry.  The incl/decl pair
C then sets the zero flag from %ebp without disturbing carry, which a
C testl or cmpl would clear.
167	shrl	%edx
168	incl	%ebp
169	decl	%ebp
170	jz	L(Lend)
171
172	movl	(%edi),%eax		C fetch destination cache line
173
C Main loop, unrolled 8 times.  Stores are interleaved with the rcrl chain;
C %edx holds the shifted limb still to be stored when the next iteration or
C the tail code starts.  The value loaded from -28(%edi) is overwritten by
C the next load into %eax.
174	ALIGN(4)
175L(Loop):
176	movl	-28(%edi),%eax		C fetch destination cache line
177	movl	%edx,%ebx
178
179	movl	(%esi),%eax
180	movl	-4(%esi),%edx
181	rcrl	%eax
182	movl	%ebx,(%edi)
183	rcrl	%edx
184	movl	%eax,-4(%edi)
185
186	movl	-8(%esi),%ebx
187	movl	-12(%esi),%eax
188	rcrl	%ebx
189	movl	%edx,-8(%edi)
190	rcrl	%eax
191	movl	%ebx,-12(%edi)
192
193	movl	-16(%esi),%edx
194	movl	-20(%esi),%ebx
195	rcrl	%edx
196	movl	%eax,-16(%edi)
197	rcrl	%ebx
198	movl	%edx,-20(%edi)
199
200	movl	-24(%esi),%eax
201	movl	-28(%esi),%edx
202	rcrl	%eax
203	movl	%ebx,-24(%edi)
204	rcrl	%edx
205	movl	%eax,-28(%edi)
206
207	leal	-32(%esi),%esi		C use leal not to clobber carry
208	leal	-32(%edi),%edi
209	decl	%ebp
210	jnz	L(Loop)
211
C Tail.  The andl below destroys carry, so it is first captured as 0 or -1
C in %eax by the sbbl and later regenerated by adding %eax to itself.
212L(Lend):
213	popl	%ebp
214	sbbl	%eax,%eax		C save carry in %eax
215	andl	$7,%ebp
216	jz	L(Lend2)
217	addl	%eax,%eax		C restore carry from eax
C Handle the remaining (size-1) mod 8 limbs, one per iteration.
218L(Loop2):
219	movl	%edx,%ebx
220	movl	(%esi),%edx
221	rcrl	%edx
222	movl	%ebx,(%edi)
223
224	leal	-4(%esi),%esi		C use leal not to clobber carry
225	leal	-4(%edi),%edi
226	decl	%ebp
227	jnz	L(Loop2)
228
229	jmp	L(L1)
230L(Lend2):
231	addl	%eax,%eax		C restore carry from eax
232L(L1):	movl	%edx,(%edi)		C store last limb
233
C Build the return value: the bit shifted out of the lowest limb is still in
C carry and is rotated into the top bit of %eax.  %eax is zeroed with movl
C rather than xorl because xorl would clear carry.
234	movl	$0,%eax
235	rcrl	%eax
236
237	popl	%ebp
238	popl	%ebx
239	popl	%esi
240	popl	%edi
241	ret
242
243EPILOGUE()
244