xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/mmx/rshift.asm (revision 4391d5e9d4f291db41e3b3ba26a01b5e51364aae)
1dnl  Intel P5 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P5: 1.75 cycles/limb.
24
25
26C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
27C                       unsigned shift);
28C
29C Shift src,size right by shift many bits and store the result in dst,size.
30C Zeros are shifted in at the left.  Return the bits shifted out at the
31C right.
32C
33C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
34C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
35C
36C Full speed depends on source and destination being aligned.  Unaligned mmx
37C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
38C setups and finish-ups are done to ensure alignment for the loop.
39C
40C MMX shifts work out a bit faster even for the simple loop.
41
42defframe(PARAM_SHIFT,16)
43defframe(PARAM_SIZE, 12)
44defframe(PARAM_SRC,  8)
45defframe(PARAM_DST,  4)
46deflit(`FRAME',0)
47
48dnl  Minimum 5, because the unrolled loop can't handle less.
49deflit(UNROLL_THRESHOLD, 5)
50
51	TEXT
52	ALIGN(8)
53
54PROLOGUE(mpn_rshift)
55
56	pushl	%ebx
57	pushl	%edi
58deflit(`FRAME',8)
59
60	movl	PARAM_SIZE, %eax
61	movl	PARAM_DST, %edx
62
63	movl	PARAM_SRC, %ebx
64	movl	PARAM_SHIFT, %ecx
65
66	cmp	$UNROLL_THRESHOLD, %eax
67	jae	L(unroll)
68
69	decl	%eax
70	movl	(%ebx), %edi		C src low limb
71
72	jnz	L(simple)
73
74	shrdl(	%cl, %edi, %eax)	C eax was decremented to zero
75
76	shrl	%cl, %edi
77
78	movl	%edi, (%edx)		C dst low limb
79	popl	%edi			C risk of data cache bank clash
80
81	popl	%ebx
82
83	ret
84
85
86C -----------------------------------------------------------------------------
87	ALIGN(8)
88L(simple):
89	C eax	size-1
90	C ebx	src
91	C ecx	shift
92	C edx	dst
93	C esi
94	C edi
95	C ebp
96deflit(`FRAME',8)
97
98	movd	(%ebx), %mm5		C src[0]
99	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
100
101	movd	%ecx, %mm6		C rshift
102	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]
103
104	psllq	$32, %mm5
105	negl	%eax
106
107
108C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
109C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
110C cycles and would be 8 in a simple loop.  Using mmx helps the return value
111C and last limb calculations too.
112
113L(simple_top):
114	C eax	counter, limbs, negative
115	C ebx	&src[size-1]
116	C ecx	return value
117	C edx	&dst[size-2]
118	C
119	C mm0	scratch
120	C mm5	return value
121	C mm6	shift
122
123	movq	(%ebx,%eax,4), %mm0
124	incl	%eax
125
126	psrlq	%mm6, %mm0
127
128	movd	%mm0, (%edx,%eax,4)
129	jnz	L(simple_top)
130
131
132	movd	(%ebx), %mm0
133	psrlq	%mm6, %mm5		C return value
134
135	psrlq	%mm6, %mm0
136	popl	%edi
137
138	movd	%mm5, %eax
139	popl	%ebx
140
141	movd	%mm0, 4(%edx)
142
143	emms
144
145	ret
146
147
148C -----------------------------------------------------------------------------
149	ALIGN(8)
150L(unroll):
151	C eax	size
152	C ebx	src
153	C ecx	shift
154	C edx	dst
155	C esi
156	C edi
157	C ebp
158deflit(`FRAME',8)
159
160	movd	(%ebx), %mm5		C src[0]
161	movl	$4, %edi
162
163	movd	%ecx, %mm6		C rshift
164	testl	%edi, %ebx
165
166	psllq	$32, %mm5
167	jz	L(start_src_aligned)
168
169
170	C src isn't aligned, process low limb separately (marked xxx) and
171	C step src and dst by one limb, making src aligned.
172	C
173	C source                  ebx
174	C --+-------+-------+-------+
175	C           |          xxx  |
176	C --+-------+-------+-------+
177	C         4mod8   0mod8   4mod8
178	C
179	C         dest            edx
180	C         --+-------+-------+
181	C           |       |  xxx  |
182	C         --+-------+-------+
183
184	movq	(%ebx), %mm0		C unaligned load
185
186	psrlq	%mm6, %mm0
187	addl	$4, %ebx
188
189	decl	%eax
190
191	movd	%mm0, (%edx)
192	addl	$4, %edx
193L(start_src_aligned):
194
195
196	movq	(%ebx), %mm1
197	testl	%edi, %edx
198
199	psrlq	%mm6, %mm5		C retval
200	jz	L(start_dst_aligned)
201
202	C dst isn't aligned, add 4 to make it so, and pretend the shift is
203	C 32 bits extra.  Low limb of dst (marked xxx) handled here
204	C separately.
205	C
206	C          source          ebx
207	C          --+-------+-------+
208	C            |      mm1      |
209	C          --+-------+-------+
210	C                  4mod8   0mod8
211	C
212	C  dest                    edx
213	C  --+-------+-------+-------+
214	C                    |  xxx  |
215	C  --+-------+-------+-------+
216	C          4mod8   0mod8   4mod8
217
218	movq	%mm1, %mm0
219	addl	$32, %ecx		C new shift
220
221	psrlq	%mm6, %mm0
222
223	movd	%ecx, %mm6
224
225	movd	%mm0, (%edx)
226	addl	$4, %edx
227L(start_dst_aligned):
228
229
230	movq	8(%ebx), %mm3
231	negl	%ecx
232
233	movq	%mm3, %mm2		C mm2 src qword
234	addl	$64, %ecx
235
236	movd	%ecx, %mm7
237	psrlq	%mm6, %mm1
238
239	leal	-12(%ebx,%eax,4), %ebx
240	leal	-20(%edx,%eax,4), %edx
241
242	psllq	%mm7, %mm3
243	subl	$7, %eax		C size-7
244
245	por	%mm1, %mm3		C mm3 ready to store
246	negl	%eax			C -(size-7)
247
248	jns	L(finish)
249
250
251	C This loop is the important bit, the rest is just support.  Careful
252	C instruction scheduling achieves the claimed 1.75 c/l.  The
253	C relevant parts of the pairing rules are:
254	C
255	C - mmx loads and stores execute only in the U pipe
256	C - only one mmx shift in a pair
257	C - wait one cycle before storing an mmx register result
258	C - the usual address generation interlock
259	C
260	C Two qword calculations are slightly interleaved.  The instructions
261	C marked "C" belong to the second qword, and the "C prev" one is for
262	C the second qword from the previous iteration.
263
264	ALIGN(8)
265L(unroll_loop):
266	C eax	counter, limbs, negative
267	C ebx	&src[size-12]
268	C ecx
269	C edx	&dst[size-12]
270	C esi
271	C edi
272	C
273	C mm0
274	C mm1
275	C mm2	src qword from -8(%ebx,%eax,4)
276	C mm3	dst qword ready to store to -8(%edx,%eax,4)
277	C
278	C mm5	return value
279	C mm6	rshift
280	C mm7	lshift
281
282	movq	(%ebx,%eax,4), %mm0
283	psrlq	%mm6, %mm2
284
285	movq	%mm0, %mm1
286	psllq	%mm7, %mm0
287
288	movq	%mm3, -8(%edx,%eax,4)	C prev
289	por	%mm2, %mm0
290
291	movq	8(%ebx,%eax,4), %mm3	C
292	psrlq	%mm6, %mm1		C
293
294	movq	%mm0, (%edx,%eax,4)
295	movq	%mm3, %mm2		C
296
297	psllq	%mm7, %mm3		C
298	addl	$4, %eax
299
300	por	%mm1, %mm3		C
301	js	L(unroll_loop)
302
303
304L(finish):
305	C eax	0 to 3 representing respectively 3 to 0 limbs remaining
306
307	testb	$2, %al
308
309	jnz	L(finish_no_two)
310
311	movq	(%ebx,%eax,4), %mm0
312	psrlq	%mm6, %mm2
313
314	movq	%mm0, %mm1
315	psllq	%mm7, %mm0
316
317	movq	%mm3, -8(%edx,%eax,4)	C prev
318	por	%mm2, %mm0
319
320	movq	%mm1, %mm2
321	movq	%mm0, %mm3
322
323	addl	$2, %eax
324L(finish_no_two):
325
326
327	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
328	C
329	C mm2	src prev qword, from -8(%ebx,%eax,4)
330	C mm3	dst qword, for -8(%edx,%eax,4)
331
332	testb	$1, %al
333	popl	%edi
334
335	movd	%mm5, %eax	C retval
336	jnz	L(finish_zero)
337
338
339	C One extra limb, destination was aligned.
340	C
341	C source                ebx
342	C +-------+---------------+--
343	C |       |      mm2      |
344	C +-------+---------------+--
345	C
346	C dest                                  edx
347	C +-------+---------------+---------------+--
348	C |       |               |      mm3      |
349	C +-------+---------------+---------------+--
350	C
351	C mm6 = shift
352	C mm7 = ecx = 64-shift
353
354
355	C One extra limb, destination was unaligned.
356	C
357	C source                ebx
358	C +-------+---------------+--
359	C |       |      mm2      |
360	C +-------+---------------+--
361	C
362	C dest                          edx
363	C +---------------+---------------+--
364	C |               |      mm3      |
365	C +---------------+---------------+--
366	C
367	C mm6 = shift+32
368	C mm7 = ecx = 64-(shift+32)
369
370
371	C In both cases there's one extra limb of src to fetch and combine
372	C with mm2 to make a qword at 8(%edx), and in the aligned case
373	C there's a further extra limb of dst to be formed.
374
375
376	movd	8(%ebx), %mm0
377	psrlq	%mm6, %mm2
378
379	movq	%mm0, %mm1
380	psllq	%mm7, %mm0
381
382	movq	%mm3, (%edx)
383	por	%mm2, %mm0
384
385	psrlq	%mm6, %mm1
386	andl	$32, %ecx
387
388	popl	%ebx
389	jz	L(finish_one_unaligned)
390
391	C dst was aligned, must store one extra limb
392	movd	%mm1, 16(%edx)
393L(finish_one_unaligned):
394
395	movq	%mm0, 8(%edx)
396
397	emms
398
399	ret
400
401
402L(finish_zero):
403
404	C No extra limbs, destination was aligned.
405	C
406	C source        ebx
407	C +---------------+--
408	C |      mm2      |
409	C +---------------+--
410	C
411	C dest                        edx+4
412	C +---------------+---------------+--
413	C |               |      mm3      |
414	C +---------------+---------------+--
415	C
416	C mm6 = shift
417	C mm7 = ecx = 64-shift
418
419
420	C No extra limbs, destination was unaligned.
421	C
422	C source        ebx
423	C +---------------+--
424	C |      mm2      |
425	C +---------------+--
426	C
427	C dest                edx+4
428	C +-------+---------------+--
429	C |       |      mm3      |
430	C +-------+---------------+--
431	C
432	C mm6 = shift+32
433	C mm7 = 64-(shift+32)
434
435
436	C The movd for the unaligned case is clearly the same data as the
437	C movq for the aligned case, it's just a choice between whether one
438	C or two limbs should be written.
439
440
441	movq	%mm3, 4(%edx)
442	psrlq	%mm6, %mm2
443
444	movd	%mm2, 12(%edx)
445	andl	$32, %ecx
446
447	popl	%ebx
448	jz	L(finish_zero_unaligned)
449
450	movq	%mm2, 12(%edx)
451L(finish_zero_unaligned):
452
453	emms
454
455	ret
456
457EPILOGUE()
458