xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/mmx/lshift.asm (revision 8450a7c42673d65e3b1f6560d3b6ecd317a6cbe8)
1dnl  Intel P5 mpn_lshift -- mpn left shift.
2
3dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P5: 1.75 cycles/limb.
24
25
26C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
27C                       unsigned shift);
28C
29C Shift src,size left by shift many bits and store the result in dst,size.
30C Zeros are shifted in at the right.  Return the bits shifted out at the
31C left.
32C
33C The comments in mpn_rshift apply here too.
34
C Names for the four arguments, bound to 4, 8, 12 and 16 in prototype
C order (dst, src, size, shift).  defframe and deflit are macros defined
C outside this file; presumably the numbers are byte offsets from the
C stack pointer at entry and FRAME tracks what has been pushed since --
C confirm against the macro definitions.
35defframe(PARAM_SHIFT,16)
36defframe(PARAM_SIZE, 12)
37defframe(PARAM_SRC,  8)
38defframe(PARAM_DST,  4)
39deflit(`FRAME',0)
40
41dnl  minimum 5, because the unrolled loop can't handle less
dnl  (the unrolled setup may use up one limb aligning src and then loads
dnl  the two highest remaining qwords, so at least 4 limbs must be left)
42deflit(UNROLL_THRESHOLD, 5)
43
44	TEXT
45	ALIGN(8)
46
47PROLOGUE(mpn_lshift)
	C Entry code.  Saves ebx and edi, loads the four parameters into
	C registers and picks a code path by size:
	C
	C   size >= UNROLL_THRESHOLD	unrolled MMX code at unroll
	C   size-1 non-zero		MMX loop at simple
	C   size == 1			integer code below
	C
	C NOTE(review): size == 0 is not tested for and would fall into the
	C simple path; presumably callers guarantee size >= 1 -- confirm.
48
49	pushl	%ebx
50	pushl	%edi
	C two 4-byte pushes since entry
51deflit(`FRAME',8)
52
	C eax = size, edx = dst, ebx = src, ecx = shift
53	movl	PARAM_SIZE, %eax
54	movl	PARAM_DST, %edx
55
56	movl	PARAM_SRC, %ebx
57	movl	PARAM_SHIFT, %ecx
58
	C unsigned compare of size against the threshold
59	cmp	$UNROLL_THRESHOLD, %eax
60	jae	L(unroll)
61
62	movl	-4(%ebx,%eax,4), %edi	C src high limb
63	decl	%eax
64
	C flags are from the decl: non-zero means more than one limb
65	jnz	L(simple)
66
	C size == 1, the single limb in edi is both high and low limb.
	C shldl is a macro defined outside this file; presumably it emits
	C shld, leaving the bits shifted out of edi in eax as the return
	C value -- confirm.
67	shldl(	%cl, %edi, %eax)	C eax was decremented to zero
68
69	shll	%cl, %edi
70
71	movl	%edi, (%edx)		C dst low limb
72	popl	%edi			C risk of data cache bank clash
73
74	popl	%ebx
75
76	ret
77
78
79C -----------------------------------------------------------------------------
80L(simple):
	C Reached from the entry code when size is below UNROLL_THRESHOLD
	C and size-1 is non-zero.  The loop below produces one dst limb per
	C pass, from the high end downwards; the return value is formed
	C before the loop and the low dst limb after it.
	C
81	C eax	size-1
82	C ebx	src
83	C ecx	shift
84	C edx	dst
85	C esi
86	C edi
87	C ebp
88deflit(`FRAME',8)
89
90	movd	(%ebx,%eax,4), %mm5	C src high limb
91
92	movd	%ecx, %mm6		C lshift
93	negl	%ecx
94
	C mm5 = high limb shifted left as a 64-bit value, so the bits
	C shifted out of the limb sit in the upper 32 bits
95	psllq	%mm6, %mm5
96	addl	$32, %ecx
97
	C ecx is now 32-shift (negated, then 32 added)
98	movd	%ecx, %mm7
99	psrlq	$32, %mm5		C retval
100
101
102L(simple_top):
103	C eax	counter, limbs, positive, counts down to zero
104	C ebx	src
105	C ecx
106	C edx	dst
107	C esi
108	C edi
109	C
110	C mm0	scratch
111	C mm5	return value
112	C mm6	shift
113	C mm7	32-shift
114
	C Load src limbs eax-1 and eax as one qword.  Shifting that right
	C by 32-shift leaves dst limb eax in the low 32 bits.  The store
	C comes after the decrement of eax, hence its +4 displacement.
115	movq	-4(%ebx,%eax,4), %mm0
116	decl	%eax
117
118	psrlq	%mm7, %mm0
119
120	C
121
	C the jnz tests the flags left by the decl above
122	movd	%mm0, 4(%edx,%eax,4)
123	jnz	L(simple_top)
124
125
	C Low limb: src low limb shifted left, the low 32 bits of the
	C result go to the dst low limb.
126	movd	(%ebx), %mm0
127
128	movd	%mm5, %eax
129	psllq	%mm6, %mm0
130
131	popl	%edi
132	popl	%ebx
133
134	movd	%mm0, (%edx)
135
136	emms
137
138	ret
139
140
141C -----------------------------------------------------------------------------
142	ALIGN(8)
143L(unroll):
	C Setup for the unrolled loop, reached when size is at least
	C UNROLL_THRESHOLD.  Starts the return value in mm5, peels off one
	C high limb if the top of src is 4mod8, peels off one high dst limb
	C and adds 32 to the shift if the top of dst is 4mod8, then forms
	C the first dst qword in mm3 and falls into the loop, or jumps to
	C finish if fewer than 8 limbs are left at that point.
	C
144	C eax	size
145	C ebx	src
146	C ecx	shift
147	C edx	dst
148	C esi
149	C edi
150	C ebp
151deflit(`FRAME',8)
152
153	movd	-4(%ebx,%eax,4), %mm5	C src high limb
154	leal	(%ebx,%eax,4), %edi
155
	C edi is the address just past the src high limb, bit 2 of it
	C selects 0mod8 or 4mod8 in the diagram below
156	movd	%ecx, %mm6		C lshift
157	andl	$4, %edi
158
	C the jz tests the flags left by the andl above
159	psllq	%mm6, %mm5
160	jz	L(start_src_aligned)
161
162
163	C src isn't aligned, process high limb separately (marked xxx) to
164	C make it so.
165	C
166	C  source     -8(ebx,%eax,4)
167	C                  |
168	C  +-------+-------+-------+--
169	C  |               |
170	C  +-------+-------+-------+--
171	C        0mod8   4mod8   0mod8
172	C
173	C  dest
174	C     -4(edx,%eax,4)
175	C          |
176	C  +-------+-------+--
177	C  |  xxx  |       |
178	C  +-------+-------+--
179
	C The two highest src limbs shifted left as a qword give the high
	C dst limb in the upper 32 bits.  eax is decremented before the
	C store, so (%edx,%eax,4) there is the -4(edx,%eax,4) of the
	C diagram.
180	movq	-8(%ebx,%eax,4), %mm0	C unaligned load
181
182	psllq	%mm6, %mm0
183	decl	%eax
184
185	psrlq	$32, %mm0
186
187	C
188
189	movd	%mm0, (%edx,%eax,4)
190L(start_src_aligned):
191
192	movq	-8(%ebx,%eax,4), %mm1	C src high qword
193	leal	(%edx,%eax,4), %edi
194
	C same bit 2 test as for src, now on the address just past the
	C highest dst limb still to be written
195	andl	$4, %edi
196	psrlq	$32, %mm5		C return value
197
	C the jz tests the flags left by the andl above
198	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
199	jz	L(start_dst_aligned)
200
201	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
202	C is 32 bits extra.  High limb of dst (marked xxx) handled here
203	C separately.
204	C
205	C  source     -8(ebx,%eax,4)
206	C                  |
207	C  +-------+-------+--
208	C  |      mm1      |
209	C  +-------+-------+--
210	C                0mod8   4mod8
211	C
212	C  dest
213	C     -4(edx,%eax,4)
214	C          |
215	C  +-------+-------+-------+--
216	C  |  xxx  |               |
217	C  +-------+-------+-------+--
218	C        0mod8   4mod8   0mod8
219
	C mm0 is shifted by the original shift still in mm6, only after
	C that is mm6 replaced by shift+32 from ecx
220	movq	%mm1, %mm0
221	addl	$32, %ecx		C new shift
222
223	psllq	%mm6, %mm0
224
225	movd	%ecx, %mm6
226	psrlq	$32, %mm0
227
228	C wasted cycle here waiting for %mm0
229
230	movd	%mm0, -4(%edx,%eax,4)
231	subl	$4, %edx
232L(start_dst_aligned):
233
234
	C From here mm6 is the left shift in use (shift, or shift+32 if
	C dst was unaligned) and mm7 becomes 64 minus that.  mm2 keeps an
	C unshifted copy of the src qword in mm3 for the next step.
235	psllq	%mm6, %mm1
236	negl	%ecx			C -shift
237
238	addl	$64, %ecx		C 64-shift
239	movq	%mm3, %mm2
240
241	movd	%ecx, %mm7
242	subl	$8, %eax		C size-8
243
244	psrlq	%mm7, %mm3
245
	C the jc tests the borrow from the subl $8 above, taken when
	C eax was below 8
246	por	%mm1, %mm3		C mm3 ready to store
247	jc	L(finish)
248
249
250	C The comments in mpn_rshift apply here too.
251
252	ALIGN(8)
253L(unroll_loop):
	C Each pass stores two dst qwords (four limbs) and steps eax down
	C by 4.  Every dst qword is a src qword shifted left by mm6 or-ed
	C with the next lower src qword shifted right by mm7.  On entry
	C and at the end of each pass mm3 holds a dst qword computed but
	C not yet stored, and mm2 the unshifted src qword below it.
	C
254	C eax	counter, limbs
255	C ebx	src
256	C ecx
257	C edx	dst
258	C esi
259	C edi
260	C
261	C mm0
262	C mm1
263	C mm2	src qword from 16(%ebx,%eax,4)
264	C mm3	dst qword ready to store to 24(%edx,%eax,4)
265	C
266	C mm5	return value
267	C mm6	lshift
268	C mm7	rshift
269
270	movq	8(%ebx,%eax,4), %mm0
271	psllq	%mm6, %mm2
272
	C mm1 keeps an unshifted copy of this src qword for the second
	C half of the pass
273	movq	%mm0, %mm1
274	psrlq	%mm7, %mm0
275
276	movq	%mm3, 24(%edx,%eax,4)	C prev
277	por	%mm2, %mm0
278
279	movq	(%ebx,%eax,4), %mm3	C
280	psllq	%mm6, %mm1		C
281
282	movq	%mm0, 16(%edx,%eax,4)
283	movq	%mm3, %mm2		C
284
285	psrlq	%mm7, %mm3		C
286	subl	$4, %eax
287
	C the jnc tests the borrow from the subl $4 above, the loop
	C continues while there was none
288	por	%mm1, %mm3		C
289	jnc	L(unroll_loop)
290
291
292
293L(finish):
294	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining
295
	C Bit 1 of eax is set for -2 and -1, that is when 2 or 3 limbs
	C remain.  In that case one more src qword is processed in the same
	C way as in the loop, leaving mm2 and mm3 set up as before.
296	testb	$2, %al
297
298	jz	L(finish_no_two)
299
300	movq	8(%ebx,%eax,4), %mm0
301	psllq	%mm6, %mm2
302
303	movq	%mm0, %mm1
304	psrlq	%mm7, %mm0
305
306	movq	%mm3, 24(%edx,%eax,4)	C prev
307	por	%mm2, %mm0
308
309	movq	%mm1, %mm2
310	movq	%mm0, %mm3
311
312	subl	$2, %eax
313L(finish_no_two):
314
315
316	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
317	C
318	C mm2	src prev qword, from 16(%ebx,%eax,4)
319	C mm3	dst qword, for 24(%edx,%eax,4)
320
	C Bit 0 of eax is set only for -3, one limb remaining.  eax is
	C then overwritten with the return value; the jz below still acts
	C on the flags left by the testb.
321	testb	$1, %al
322	movd	%mm5, %eax	C retval
323
324	popl	%edi
325	jz	L(finish_zero)
326
327
328	C One extra src limb, destination was aligned.
329	C
330	C                 source                  ebx
331	C                 --+---------------+-------+
332	C                   |      mm2      |       |
333	C                 --+---------------+-------+
334	C
335	C dest         edx+12           edx+4     edx
336	C --+---------------+---------------+-------+
337	C   |      mm3      |               |       |
338	C --+---------------+---------------+-------+
339	C
340	C mm6 = shift
341	C mm7 = ecx = 64-shift
342
343
344	C One extra src limb, destination was unaligned.
345	C
346	C                 source                  ebx
347	C                 --+---------------+-------+
348	C                   |      mm2      |       |
349	C                 --+---------------+-------+
350	C
351	C         dest         edx+12           edx+4
352	C         --+---------------+---------------+
353	C           |      mm3      |               |
354	C         --+---------------+---------------+
355	C
356	C mm6 = shift+32
357	C mm7 = ecx = 64-(shift+32)
358
359
360	C In both cases there's one extra limb of src to fetch and combine
361	C with mm2 to make a qword at 4(%edx), and in the aligned case
362	C there's an extra limb of dst to be formed from that extra src limb
363	C left shifted.
364
365
	C The src low limb is moved to the upper half of mm0, with zeros
	C below it, so it can be treated as one more src qword.
366	movd	(%ebx), %mm0
367	psllq	%mm6, %mm2
368
369	movq	%mm3, 12(%edx)
370	psllq	$32, %mm0
371
372	movq	%mm0, %mm1
373	psrlq	%mm7, %mm0
374
375	por	%mm2, %mm0
376	psllq	%mm6, %mm1
377
378	movq	%mm0, 4(%edx)
379	psrlq	$32, %mm1
380
	C Bit 5 of ecx tells the two cases apart: set for 64-shift, clear
	C for 64-(shift+32).  This assumes shift is in the range 1 to 31
	C -- TODO confirm against the documented interface.  The jz below
	C tests the flags left by the andl.
381	andl	$32, %ecx
382	popl	%ebx
383
384	jz	L(finish_one_unaligned)
385
	C aligned case only, the extra low dst limb
386	movd	%mm1, (%edx)
387L(finish_one_unaligned):
388
389	emms
390
391	ret
392
393
394L(finish_zero):
395
	C Tail for no limbs remaining.  Stores the pending dst qword from
	C mm3 and then the low part of dst formed from mm2 alone, zeros
	C being shifted in at the right.  eax already holds the return
	C value and edi has been popped before the jump here.
	C
396	C No extra src limbs, destination was aligned.
397	C
398	C                 source          ebx
399	C                 --+---------------+
400	C                   |      mm2      |
401	C                 --+---------------+
402	C
403	C dest          edx+8             edx
404	C --+---------------+---------------+
405	C   |      mm3      |               |
406	C --+---------------+---------------+
407	C
408	C mm6 = shift
409	C mm7 = ecx = 64-shift
410
411
412	C No extra src limbs, destination was unaligned.
413	C
414	C               source            ebx
415	C                 --+---------------+
416	C                   |      mm2      |
417	C                 --+---------------+
418	C
419	C         dest          edx+8   edx+4
420	C         --+---------------+-------+
421	C           |      mm3      |       |
422	C         --+---------------+-------+
423	C
424	C mm6 = shift+32
425	C mm7 = ecx = 64-(shift+32)
426
427
428	C The movd for the unaligned case writes the same data to 4(%edx)
429	C that the movq does for the aligned case.
430
431
	C Bit 5 of ecx tells the two cases apart: set for 64-shift, clear
	C for 64-(shift+32).  This assumes shift is in the range 1 to 31
	C -- TODO confirm against the documented interface.  The jz below
	C tests the flags left by the andl.
432	movq	%mm3, 8(%edx)
433	andl	$32, %ecx
434
435	psllq	%mm6, %mm2
436	jz	L(finish_zero_unaligned)
437
	C aligned case only, both low dst limbs in one store
438	movq	%mm2, (%edx)
439L(finish_zero_unaligned):
440
441	psrlq	$32, %mm2
442	popl	%ebx
443
	C NOTE(review): eax was already loaded from mm5 before the only
	C jump to finish_zero visible in this file, and nothing on this
	C path writes eax in between, so this reload looks redundant.
	C Left as is since instruction order may matter for pairing.
444	movd	%mm5, %eax	C retval
445
446	movd	%mm2, 4(%edx)
447
448	emms
449
450	ret
451
452EPILOGUE()
453