xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mmx/rshift.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  AMD K6 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: 3.0 cycles/limb
35
36
37C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38C                       unsigned shift);
39C
40C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
41C instructions.  This is despite every second fetch being unaligned.
42
43
44defframe(PARAM_SHIFT,16)
45defframe(PARAM_SIZE, 12)
46defframe(PARAM_SRC,  8)
47defframe(PARAM_DST,  4)
	C The four offsets above follow the argument order of the C prototype
	C (dst first, shift last), 4 bytes apart.  NOTE(review): defframe and
	C FRAME come from the included m4 configuration, not from this file;
	C presumably the offsets are relative to the stack pointer at entry and
	C FRAME tracks later pushes -- confirm there.
48deflit(`FRAME',0)
49
50	TEXT
51	ALIGN(32)
52
53PROLOGUE(mpn_rshift)
54deflit(`FRAME',0)
55
	C Shift the size-limb number at src right by shift bits and store the
	C result at dst.  The value returned in eax holds the bits shifted out
	C of the low limb, placed at the high end of the register, that is
	C src[0] << (32-shift).
	C
	C Register use: eax, ecx, edx, mm0 and mm6 are overwritten; ebx is
	C saved and restored.  MMX is only touched on the size >= 2 path, which
	C executes emms before returning.
	C
	C NOTE(review): nothing here checks size or shift.  The code looks
	C written for size >= 1 and 1 <= shift <= 31 (a shift of 0 would make
	C the 32-shift count wrap to 0 in the shll below) -- confirm against
	C the documented mpn_rshift contract.
	C
56	C The 1 limb case can be done without the push %ebx, but it's then
57	C still the same speed.  The push is left as a free helping hand for
58	C the two_or_more code.
59
60	movl	PARAM_SIZE, %eax
	C NOTE(review): FRAME_pushl() is defined elsewhere; presumably it
	C adjusts FRAME so the PARAM_ offsets stay valid after the push.
61	pushl	%ebx			FRAME_pushl()
62
63	movl	PARAM_SRC, %ebx
	C decl sets the zero flag when size is 1.  The movl that follows does
	C not change flags, so the jnz below still tests this decrement.
64	decl	%eax
65
66	movl	PARAM_SHIFT, %ecx
67	jnz	L(two_or_more)
68
	C Single limb.  eax is 0 here (size-1).
69	movl	(%ebx), %edx		C src limb
70	movl	PARAM_DST, %ebx
71
	C Shift eax right by cl, filling from the top with the low bits of
	C edx.  Since eax was 0 this leaves exactly the bits that fall out of
	C the src limb, positioned at the high end of eax.
72	shrdl(	%cl, %edx, %eax)	C return value
73
74	shrl	%cl, %edx
75
76	movl	%edx, (%ebx)		C dst limb
77	popl	%ebx
78
79	ret
80
81
82	ALIGN(16)	C avoid offset 0x1f
83L(two_or_more):
84	C eax	size-1
85	C ebx	src
86	C ecx	shift
87	C edx
88
89	movl	(%ebx), %edx	C src low limb
90	negl	%ecx
91
92	addl	$32, %ecx	C 32-shift
	C mm6 gets the shift count for the psrlq in the loop.
93	movd	PARAM_SHIFT, %mm6
94
95	shll	%cl, %edx	C retval
96	movl	PARAM_DST, %ecx
97
	C ebx = &src[size-1]
98	leal	(%ebx,%eax,4), %ebx
99
	C ecx = &dst[size-2].  The extra -4 is there because the store in the
	C loop indexes with eax after it has been incremented.
100	leal	-4(%ecx,%eax,4), %ecx
	C eax = -(size-1), counted up to zero by the loop.
101	negl	%eax
102
103
104L(simple):
105	C eax	counter (negative)
106	C ebx	&src[size-1]
107	C ecx	&dst[size-2]
108	C edx	retval
109	C
110	C mm0	scratch
111	C mm6	shift
112
	C Each pass loads two adjacent limbs src[i],src[i+1] as one 64-bit
	C value and shifts it right, so the low 32 bits of mm0 become dst[i]
	C with the low bits of src[i+1] already shifted in at the top.
	C NOTE(review): Zdisp is defined elsewhere; presumably it forces an
	C explicit zero displacement in the encoding -- confirm.
113Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
	C incl sets the zero flag on the last pass; psrlq and movd leave the
	C flags alone, so the jnz below tests this increment.
114	incl	%eax
115
116	psrlq	%mm6, %mm0
117
118Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
119	jnz	L(simple)
120
121
	C mm0 still holds the last shifted pair.  This 64-bit store rewrites
	C dst[size-2] with the same value and writes dst[size-1], the top src
	C limb shifted right with zeros coming in.
122	movq	%mm0, (%ecx)
123	movl	%edx, %eax
124
125	popl	%ebx
126
	C Leave MMX state before returning to the caller.
127	emms
128	ret
129
130EPILOGUE()
131