xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/k62mmx/rshift.asm (revision 33881f779a77dce6440bdc44610d94de75bebefe)
1dnl  AMD K6-2 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6-2: 1.75 cycles/limb
35
36
37C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38C                       unsigned shift);
39C
40
dnl  Names for the four arguments of the prototype above, listed last
dnl  argument first.  The numbers step by 4 per argument, dst=4 up to
dnl  shift=16.  defframe and deflit are defined outside this file;
dnl  NOTE(review): presumably the number is the byte offset of the argument
dnl  from the stack pointer at entry, adjusted by FRAME -- confirm in the m4
dnl  support files.
dnl
41defframe(PARAM_SHIFT,16)
42defframe(PARAM_SIZE, 12)
43defframe(PARAM_SRC,  8)
44defframe(PARAM_DST,  4)
45deflit(`FRAME',0)
46
47dnl  Minimum 9, because the unrolled loop can't handle less.
dnl  (When dst is not 8-byte aligned, the first pass of the unrolled loop
dnl  reads as far as src[8].)
48dnl
49deflit(UNROLL_THRESHOLD, 9)
50
dnl  TEXT and ALIGN are macros defined outside this file; NOTE(review):
dnl  presumably they select the code section and align the entry point to a
dnl  32-byte boundary -- confirm in the m4 support files.
51	TEXT
52	ALIGN(32)
53
54PROLOGUE(mpn_rshift)
55deflit(`FRAME',0)
56
	C Overview, derived from the code below:
	C
	C Shift the size-limb number at src right by shift bits and store
	C size limbs at dst.  The value returned in %eax is the low src limb
	C shifted left by 32-shift, i.e. the bits shifted out at the bottom,
	C held at the top of the limb.
	C
	C Three paths are used:
	C   size == 1			integer shrd/shr only, no MMX
	C   size < UNROLL_THRESHOLD	L(simple), one limb per pass, MMX
	C   size >= UNROLL_THRESHOLD	L(unroll), four limbs per pass, MMX
	C
	C %ebx is pushed on entry and popped on every exit path.  Both MMX
	C paths execute femms before returning.
	C
	C NOTE(review): nothing here checks the arguments.  The code appears
	C to assume size >= 1 and 1 <= shift <= 31 -- confirm against the
	C documented mpn_rshift contract.

57	C The 1 limb case can be done without the push %ebx, but it's then
58	C still the same speed.  The push is left as a free helping hand for
59	C the two_or_more code.
60
61	movl	PARAM_SIZE, %eax
62	pushl	%ebx			FRAME_pushl()
63
64	movl	PARAM_SRC, %ebx
65	decl	%eax			C size-1, ZF set when size is 1
66
67	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
68	jnz	L(two_or_more)
69
70	movl	(%ebx), %edx		C src limb
71	movl	PARAM_DST, %ebx		C ebx is reused for dst from here
72
	C eax is 0 here (size-1), so the double shift brings only the low
	C shift bits of the src limb into the top of eax.  shrdl() is a macro
	C defined outside this file.
73	shrdl(	%cl, %edx, %eax)	C return value
74
75	shrl	%cl, %edx
76
77	movl	%edx, (%ebx)		C dst limb
78	popl	%ebx
79
80	ret
81
82
83C -----------------------------------------------------------------------------
84	ALIGN(16)	C avoid offset 0x1f
85L(two_or_more):
86	C eax	size-1
87	C ebx	src
88	C ecx	shift
89	C edx
90
91	movl	(%ebx), %edx	C src low limb
92	negl	%ecx
93
94	addl	$32, %ecx	C 32-shift
95	movd	PARAM_SHIFT, %mm6
96
97	shll	%cl, %edx	C retval: low limb << (32-shift)
98	cmpl	$UNROLL_THRESHOLD-1, %eax
99
100	jae	L(unroll)	C size-1 >= UNROLL_THRESHOLD-1, unsigned
101
102
103	C eax	size-1
104	C ebx	src
105	C ecx	32-shift
106	C edx	retval
107	C
108	C mm6	shift
109
110	movl	PARAM_DST, %ecx		C 32-shift is no longer needed in ecx
111	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
112
113	leal	-4(%ecx,%eax,4), %ecx	C &dst[size-2]
114	negl	%eax			C -(size-1)
115
116	C This loop runs at about 3 cycles/limb, which is the amount of
117	C decoding, and this is despite every second access being unaligned.
	C
	C Each pass loads the qword src[i,i+1], shifts it right, and stores
	C only the low 32 bits as dst[i], for i = 0 to size-2.  The jnz tests
	C the flags from incl; psrlq and movd do not modify them.
	C
	C Zdisp() is a macro defined outside this file.  NOTE(review): it
	C presumably forces an explicit zero displacement in the encoding --
	C confirm in the m4 support files.
118
119L(simple):
120	C eax	counter, -(size-1) to -1
121	C ebx	&src[size-1]
122	C ecx	&dst[size-2]
123	C edx	retval
124	C
125	C mm0	scratch
126	C mm6	shift
127
128Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
129	incl	%eax
130
131	psrlq	%mm6, %mm0
132
133Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
134	jnz	L(simple)
135
136
	C mm0 still holds the last shifted qword: its low half is dst[size-2]
	C (stored again here) and its high half is src[size-1] >> shift, which
	C becomes dst[size-1].
137	movq	%mm0, (%ecx)
138	movl	%edx, %eax
139
140	popl	%ebx
141
142	femms
143	ret
144
145
146C -----------------------------------------------------------------------------
147	ALIGN(16)
148L(unroll):
149	C eax	size-1
150	C ebx	src
151	C ecx	32-shift
152	C edx	retval
153	C
154	C mm6	shift
155
156	addl	$32, %ecx		C 64-shift
157	subl	$7, %eax		C size-8
158
159	movd	%ecx, %mm7
160	movl	PARAM_DST, %ecx
161
162	movq	(%ebx), %mm2		C src low qword
163	leal	(%ebx,%eax,4), %ebx	C src end - 32
164
	C Bit 2 of the dst address selects the path below.  The leal and notl
	C that follow do not modify the flags, so the jz tests this testb.
165	testb	$4, %cl
166	leal	(%ecx,%eax,4), %ecx	C dst end - 32
167
168	notl	%eax			C -(size-7)
169	jz	L(dst_aligned)
170
	C Bit 2 of dst is set: store dst[0] on its own as a single limb and
	C restart from src[1], so the qword stores in the loop begin at dst[1].
171	psrlq	%mm6, %mm2
172	incl	%eax
173
174Zdisp(	movd,	%mm2, 0,(%ecx,%eax,4))	C dst low limb
175	movq	4(%ebx,%eax,4), %mm2	C new src low qword
176L(dst_aligned):
177
178	movq	12(%ebx,%eax,4), %mm0	C src second lowest qword
179	nop	C avoid bad cache line crossing
180
181
182	C This loop is the important bit, the rest is just support for it.
183	C Four src limbs are held at the start, and four more will be read.
184	C Four dst limbs will be written.  This schedule seems necessary for
185	C full speed.
186	C
187	C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
188	C leaves 0 to 3 which can be tested with test $1 and $2.
	C
	C Writing k = eax + size-7 for the value of eax at the top of a pass,
	C mm2 holds src[k,k+1] and mm0 holds src[k+2,k+3].  The pass stores
	C dst[k] to dst[k+3] and loads src[k+4] to src[k+7].
	C
	C The MMX instructions do not modify the flags, so the ja at the
	C bottom tests the result of the addl at the top.
189
190L(top):
191	C eax	counter, -(size-7) step by +4 until >=0
192	C ebx	src end - 32
193	C ecx	dst end - 32
194	C edx	retval
195	C
196	C mm0	src next qword
197	C mm1	scratch
198	C mm2	src prev qword
199	C mm6	shift
200	C mm7	64-shift
201
202	psrlq	%mm6, %mm2
203	addl	$4, %eax
204
205	movq	%mm0, %mm1
206	psllq	%mm7, %mm0
207
208	por	%mm0, %mm2
209	movq	4(%ebx,%eax,4), %mm0
210
211	psrlq	%mm6, %mm1
212	movq	%mm2, -12(%ecx,%eax,4)
213
214	movq	%mm0, %mm2
215	psllq	%mm7, %mm0
216
217	por	%mm0, %mm1
218	movq	12(%ebx,%eax,4), %mm0
219
220	movq	%mm1, -4(%ecx,%eax,4)
221	ja	L(top)		C jump if no carry and not zero
222
223
224
225	C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
226	C to 3 representing respectively 3 to 0 further limbs.
227
228	testl	$2, %eax	C testl to avoid bad cache line crossings
229	jnz	L(finish_nottwo)
230
231	C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
232	C becomes new mm2 and a new mm0 is loaded.
233
234	psrlq	%mm6, %mm2
235	movq	%mm0, %mm1
236
237	psllq	%mm7, %mm0
238	addl	$2, %eax
239
240	por	%mm0, %mm2
241	movq	12(%ebx,%eax,4), %mm0
242
243	movq	%mm2, -4(%ecx,%eax,4)
244	movq	%mm1, %mm2
245L(finish_nottwo):
246
	C eax is 2 or 3 here, meaning one or no further limbs.  The jnz
	C below tests this testb; the MMX instructions in between do not
	C modify the flags.
247
248	testb	$1, %al
249	psrlq	%mm6, %mm2
250
251	movq	%mm0, %mm1
252	psllq	%mm7, %mm0
253
254	por	%mm0, %mm2
255	psrlq	%mm6, %mm1
256
257	movq	%mm2, 4(%ecx,%eax,4)
258	jnz	L(finish_even)
259
260
261	C one further extra limb to process
262
263	movd	32-4(%ebx), %mm0	C src[size-1], most significant limb
264	popl	%ebx
265
266	movq	%mm0, %mm2
267	psllq	%mm7, %mm0
268
269	por	%mm0, %mm1
270	psrlq	%mm6, %mm2
271
272	movq	%mm1, 32-12(%ecx)	C dst[size-3,size-2]
273	movd	%mm2, 32-4(%ecx)	C dst[size-1]
274
275	movl	%edx, %eax		C retval
276
277	femms
278	ret
279
280
281	nop	C avoid bad cache line crossing
282L(finish_even):
283	C no further extra limbs
284
	C mm1 is the top src qword shifted right, with nothing above it to
	C OR in.
285	movq	%mm1, 32-8(%ecx)	C dst[size-2,size-1]
286	movl	%edx, %eax		C retval
287
288	popl	%ebx
289
290	femms
291	ret
292
293EPILOGUE()
294