xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/lshift.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  Intel Pentium mpn_lshift -- mpn left shift.
2
3dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C         cycles/limb
35C P5,P54:    6.0
36C P55:       5.375
37
38
39C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40C                       unsigned shift);
41C
42C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
43C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
44
45defframe(PARAM_SHIFT,16)	C stack offset of the shift count argument
46defframe(PARAM_SIZE, 12)	C stack offset of the limb count argument
47defframe(PARAM_SRC,  8)	C stack offset of the source pointer argument
48defframe(PARAM_DST,  4)	C stack offset of the destination pointer argument
49
50	TEXT
51	ALIGN(8)
52PROLOGUE(mpn_lshift)
53
C Save the four registers used as working registers below.  Both exit paths
C pop them again in reverse order before ret.
54	pushl	%edi
55	pushl	%esi
56	pushl	%ebx
57	pushl	%ebp
58deflit(`FRAME',16)	C 16 bytes = the four pushes above; presumably used by the PARAM_ macros (defined outside this file) - confirm
59
60	movl	PARAM_DST,%edi		C edi = dst
61	movl	PARAM_SRC,%esi		C esi = src
62	movl	PARAM_SIZE,%ebp		C ebp = size, counted in 4-byte limbs
63	movl	PARAM_SHIFT,%ecx	C ecx = shift count (cl feeds the shift instructions)
64
65C We can use faster code for shift-by-1 under certain conditions.
C The conditions are the two unsigned pointer comparisons below; if neither
C holds, or if the shift count is not 1, the general code at L(normal) is used.
66	cmp	$1,%ecx
67	jne	L(normal)
68	leal	4(%esi),%eax
69	cmpl	%edi,%eax
70	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
71	leal	(%esi,%ebp,4),%eax
72	cmpl	%eax,%edi
73	jnc	L(special)		C jump if res_ptr >= s_ptr + size
74
C General case: process limbs from the most significant end downwards.
C NOTE(review): the top source limb is read unconditionally, so this appears
C to assume size >= 1 - confirm against the mpn interface documentation.
75L(normal):
76	leal	-4(%edi,%ebp,4),%edi	C edi = address of dst[size-1]
77	leal	-4(%esi,%ebp,4),%esi	C esi = address of src[size-1]
78
79	movl	(%esi),%edx		C edx = most significant source limb
80	subl	$4,%esi
81	xorl	%eax,%eax
82	shldl(	%cl, %edx, %eax)	C compute carry limb
83	pushl	%eax			C push carry limb onto stack
84
85	decl	%ebp			C ebp = size-1, the limbs still to be loaded
86	pushl	%ebp			C keep size-1 for the remainder loop at L(end)
87	shrl	$3,%ebp			C ebp = number of 8-limb passes
88	jz	L(end)			C no full pass: go straight to the remainder
89
C The loads into eax marked "fetch destination cache line" only touch the
C destination; eax is overwritten before its value is used.
90	movl	(%edi),%eax		C fetch destination cache line
91
C Main loop: each pass loads eight source limbs and stores eight result
C limbs, moving downwards in memory.  On entry edx holds the most recently
C loaded limb, whose shifted value has not been stored yet; the same holds
C on exit.
92	ALIGN(4)
93L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
94	movl	%edx,%ebx
95
96	movl	(%esi),%eax
97	movl	-4(%esi),%edx
98	shldl(	%cl, %eax, %ebx)
99	shldl(	%cl, %edx, %eax)
100	movl	%ebx,(%edi)
101	movl	%eax,-4(%edi)
102
103	movl	-8(%esi),%ebx
104	movl	-12(%esi),%eax
105	shldl(	%cl, %ebx, %edx)
106	shldl(	%cl, %eax, %ebx)
107	movl	%edx,-8(%edi)
108	movl	%ebx,-12(%edi)
109
110	movl	-16(%esi),%edx
111	movl	-20(%esi),%ebx
112	shldl(	%cl, %edx, %eax)
113	shldl(	%cl, %ebx, %edx)
114	movl	%eax,-16(%edi)
115	movl	%edx,-20(%edi)
116
117	movl	-24(%esi),%eax
118	movl	-28(%esi),%edx
119	shldl(	%cl, %eax, %ebx)
120	shldl(	%cl, %edx, %eax)
121	movl	%ebx,-24(%edi)
122	movl	%eax,-28(%edi)
123
124	subl	$32,%esi
125	subl	$32,%edi
126	decl	%ebp
127	jnz	L(oop)
128
C Remainder: (size-1) mod 8 limbs, one per pass, with edx again holding the
C limb that has been loaded but not yet stored.
129L(end):	popl	%ebp			C ebp = size-1 as pushed above
130	andl	$7,%ebp			C limbs left over after the 8-limb passes
131	jz	L(end2)
132L(oop2):
133	movl	(%esi),%eax
134	shldl(	%cl,%eax,%edx)
135	movl	%edx,(%edi)
136	movl	%eax,%edx
137	subl	$4,%esi
138	subl	$4,%edi
139	decl	%ebp
140	jnz	L(oop2)
141
142L(end2):
143	shll	%cl,%edx		C compute least significant limb
144	movl	%edx,(%edi)		C store it
145
146	popl	%eax			C pop carry limb into eax, the value returned
147
148	popl	%ebp
149	popl	%ebx
150	popl	%esi
151	popl	%edi
152	ret
153
154
155C Shift-by-1 path.  We loop from the least significant end of the arrays,
156C which is only permissible if no source limb is overwritten before it has
157C been read.  The function is documented to work for overlapping source and
C destination, so this path is entered only when one of the two pointer
C checks at the top of the function holds.  The shift itself is done by
C adding each limb to itself with carry (adcl), so the carry flag must stay
C live between limbs.
158
159L(special):
160	movl	(%esi),%edx		C edx = least significant source limb
161	addl	$4,%esi
162
163	decl	%ebp			C ebp = size-1, the limbs still to be loaded
164	pushl	%ebp			C keep size-1 for the remainder loop at L(Lend)
165	shrl	$3,%ebp			C ebp = number of 8-limb passes
166
167	addl	%edx,%edx		C shift lowest limb left by 1; carry = its top bit
168	incl	%ebp
169	decl	%ebp			C inc/dec pair tests ebp for zero, leaving the carry flag alone
170	jz	L(Lend)
171
172	movl	(%edi),%eax		C fetch destination cache line
173
C Main loop: each pass loads eight source limbs and stores eight result
C limbs, moving upwards in memory.  On entry edx holds the most recently
C shifted limb, not yet stored, and the carry flag holds the bit shifted
C out of it; the same holds on exit.
174	ALIGN(4)
175L(Loop):
176	movl	28(%edi),%eax		C fetch destination cache line
177	movl	%edx,%ebx
178
179	movl	(%esi),%eax
180	movl	4(%esi),%edx
181	adcl	%eax,%eax
182	movl	%ebx,(%edi)
183	adcl	%edx,%edx
184	movl	%eax,4(%edi)
185
186	movl	8(%esi),%ebx
187	movl	12(%esi),%eax
188	adcl	%ebx,%ebx
189	movl	%edx,8(%edi)
190	adcl	%eax,%eax
191	movl	%ebx,12(%edi)
192
193	movl	16(%esi),%edx
194	movl	20(%esi),%ebx
195	adcl	%edx,%edx
196	movl	%eax,16(%edi)
197	adcl	%ebx,%ebx
198	movl	%edx,20(%edi)
199
200	movl	24(%esi),%eax
201	movl	28(%esi),%edx
202	adcl	%eax,%eax
203	movl	%ebx,24(%edi)
204	adcl	%edx,%edx
205	movl	%eax,28(%edi)
206
207	leal	32(%esi),%esi		C use leal not to clobber carry
208	leal	32(%edi),%edi
209	decl	%ebp
210	jnz	L(Loop)
211
C Remainder: (size-1) mod 8 limbs, one per pass.  The andl below destroys
C the carry flag, so it is parked in eax (0 or -1) and regenerated by adding
C eax to itself.
212L(Lend):
213	popl	%ebp			C ebp = size-1 as pushed above
214	sbbl	%eax,%eax		C save carry in %eax
215	andl	$7,%ebp			C limbs left over after the 8-limb passes
216	jz	L(Lend2)
217	addl	%eax,%eax		C restore carry from eax
218L(Loop2):
219	movl	%edx,%ebx
220	movl	(%esi),%edx
221	adcl	%edx,%edx
222	movl	%ebx,(%edi)
223
224	leal	4(%esi),%esi		C use leal not to clobber carry
225	leal	4(%edi),%edi
226	decl	%ebp
227	jnz	L(Loop2)
228
229	jmp	L(L1)			C carry is already live; skip the restore
230L(Lend2):
231	addl	%eax,%eax		C restore carry from eax
232L(L1):	movl	%edx,(%edi)		C store last limb
233
234	sbbl	%eax,%eax		C eax = 0 or -1 according to the final carry
235	negl	%eax			C eax = 0 or 1: the bit shifted out, returned to the caller
236
237	popl	%ebp
238	popl	%ebx
239	popl	%esi
240	popl	%edi
241	ret
242
243EPILOGUE()
244