dnl  Intel P5 mpn_lshift -- mpn left shift.

dnl  Copyright 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.75 cycles/limb.


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  Return the bits shifted out at the
C left.
C
C The comments in mpn_rshift apply here too.
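C
C For reference only, the operation corresponds roughly to the following
C portable C sketch (an illustration, not part of this file; "ref_lshift"
C is a made-up name, 32-bit limbs and 1 <= shift <= 31 are assumed, as the
C mpn interface requires):
C
C	mp_limb_t
C	ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C	{
C	  mp_limb_t retval = src[size-1] >> (32 - shift);  /* bits shifted out */
C	  mp_size_t i;
C	  for (i = size-1; i > 0; i--)
C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
C	  dst[0] = src[0] << shift;
C	  return retval;
C	}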

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  minimum 5, because the unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_lshift)

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmp	$UNROLL_THRESHOLD, %eax
	jae	L(unroll)

	movl	-4(%ebx,%eax,4), %edi	C src high limb
	decl	%eax

	jnz	L(simple)

	shldl(	%cl, %edi, %eax)	C eax was decremented to zero

	shll	%cl, %edi

	movl	%edi, (%edx)		C dst low limb
	popl	%edi			C risk of data cache bank clash

	popl	%ebx

	ret


C -----------------------------------------------------------------------------
L(simple):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	(%ebx,%eax,4), %mm5	C src high limb

	movd	%ecx, %mm6		C lshift
	negl	%ecx

	psllq	%mm6, %mm5
	addl	$32, %ecx

	movd	%ecx, %mm7
	psrlq	$32, %mm5		C retval


L(simple_top):
	C eax	counter, limbs, decrementing
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0	scratch
	C mm5	return value
	C mm6	shift
	C mm7	32-shift
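	C
	C Each iteration does an unaligned 8-byte load of src limbs eax-1 and
	C eax, shifts that pair right by 32-shift, and stores the low 32 bits,
	C giving dst[eax] = (src[eax]<<shift) | (src[eax-1]>>(32-shift)).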

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	C

	movd	%mm0, 4(%edx,%eax,4)
	jnz	L(simple_top)


	movd	(%ebx), %mm0

	movd	%mm5, %eax
	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)

	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(unroll):
	C eax	size
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	-4(%ebx,%eax,4), %mm5	C src high limb
	leal	(%ebx,%eax,4), %edi

	movd	%ecx, %mm6		C lshift
	andl	$4, %edi

	psllq	%mm6, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process high limb separately (marked xxx) to
	C make it so.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	movq	-8(%ebx,%eax,4), %mm0	C unaligned load

	psllq	%mm6, %mm0
	decl	%eax

	psrlq	$32, %mm0

	C

	movd	%mm0, (%edx,%eax,4)
L(start_src_aligned):

	movq	-8(%ebx,%eax,4), %mm1	C src high qword
	leal	(%edx,%eax,4), %edi

	andl	$4, %edi
	psrlq	$32, %mm5		C return value

	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
	jz	L(start_dst_aligned)

	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
	C is 32 bits extra.  High limb of dst (marked xxx) handled here
	C separately.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C                0mod8   4mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+-------+--
	C  |  xxx  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	addl	$32, %ecx		C new shift

	psllq	%mm6, %mm0

	movd	%ecx, %mm6
	psrlq	$32, %mm0

	C wasted cycle here waiting for %mm0

	movd	%mm0, -4(%edx,%eax,4)
	subl	$4, %edx
L(start_dst_aligned):


	psllq	%mm6, %mm1
	negl	%ecx			C -shift

	addl	$64, %ecx		C 64-shift
	movq	%mm3, %mm2

	movd	%ecx, %mm7
	subl	$8, %eax		C size-8

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		C mm3 ready to store
	jc	L(finish)


	C The comments in mpn_rshift apply here too.

	ALIGN(8)
L(unroll_loop):
	C eax	counter, limbs
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0
	C mm1
	C mm2	src qword from 16(%ebx,%eax,4)
	C mm3	dst qword ready to store to 24(%edx,%eax,4)
	C
	C mm5	return value
	C mm6	lshift
	C mm7	rshift
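	C
	C Each pass handles four limbs: two src qword loads are shifted and
	C combined with the qword carried over in mm2 to give two dst qword
	C stores, and eax steps down by 4.  At the 1.75 c/l quoted above that
	C is roughly 7 cycles per pass.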

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3	C
	psllq	%mm6, %mm1		C

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2		C

	psrlq	%mm7, %mm3		C
	subl	$4, %eax

	por	%mm1, %mm3		C
	jnc	L(unroll_loop)



L(finish):
	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining
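	C
	C The remaining count is peeled off in two steps: the testb $2 block
	C below handles a leftover pair of limbs, then testb $1 decides
	C whether one final src limb is left.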

	testb	$2, %al

	jz	L(finish_no_two)

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	subl	$2, %eax
L(finish_no_two):


	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
	C
	C mm2	src prev qword, from 16(%ebx,%eax,4)
	C mm3	dst qword, for 24(%edx,%eax,4)

	testb	$1, %al
	movd	%mm5, %eax	C retval

	popl	%edi
	jz	L(finish_zero)


	C One extra src limb, destination was aligned.
	C
C                 source                  ebx
C                 --+---------------+-------+
C                   |      mm2      |       |
C                 --+---------------+-------+
C
C dest         edx+12           edx+4     edx
C --+---------------+---------------+-------+
C   |      mm3      |               |       |
C --+---------------+---------------+-------+
C
C mm6 = shift
C mm7 = ecx = 64-shift


	C One extra src limb, destination was unaligned.
	C
C                 source                  ebx
C                 --+---------------+-------+
C                   |      mm2      |       |
C                 --+---------------+-------+
C
C         dest         edx+12           edx+4
C         --+---------------+---------------+
C           |      mm3      |               |
C         --+---------------+---------------+
C
C mm6 = shift+32
C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at 4(%edx), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.


	movd	(%ebx), %mm0
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)
	psllq	$32, %mm0

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	andl	$32, %ecx
	popl	%ebx

	jz	L(finish_one_unaligned)

	movd	%mm1, (%edx)
L(finish_one_unaligned):

	emms

	ret


L(finish_zero):

	C No extra src limbs, destination was aligned.
	C
C                 source          ebx
C                 --+---------------+
C                   |      mm2      |
C                 --+---------------+
C
C dest          edx+8             edx
C --+---------------+---------------+
C   |      mm3      |               |
C --+---------------+---------------+
C
C mm6 = shift
C mm7 = ecx = 64-shift


	C No extra src limbs, destination was unaligned.
	C
C               source            ebx
C                 --+---------------+
C                   |      mm2      |
C                 --+---------------+
C
C         dest          edx+8   edx+4
C         --+---------------+-------+
C           |      mm3      |       |
C         --+---------------+-------+
C
C mm6 = shift+32
C mm7 = ecx = 64-(shift+32)


	C The movd for the unaligned case writes the same data to 4(%edx)
	C that the movq does for the aligned case.


	movq	%mm3, 8(%edx)
	andl	$32, %ecx

	psllq	%mm6, %mm2
	jz	L(finish_zero_unaligned)

	movq	%mm2, (%edx)
L(finish_zero_unaligned):

	psrlq	$32, %mm2
	popl	%ebx

	movd	%mm5, %eax	C retval

	movd	%mm2, 4(%edx)

	emms

	ret

EPILOGUE()