xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mmx/lshift.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  AMD K6 mpn_lshift -- mpn left shift.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: 3.0 cycles/limb
35
36
37C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38C                       unsigned shift);
39C
40C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
41C instructions.  This is despite every second fetch being unaligned.
42
43
C Argument slots for mpn_lshift.  The offsets 4, 8, 12, 16 follow the
C dst, src, size, shift order of the C prototype.
C NOTE(review): defframe is not defined in this file (presumably it comes
C from the included config.m4).  The offsets look like byte displacements
C from %esp at function entry, corrected through FRAME after pushes -- confirm.
44defframe(PARAM_SHIFT,16)
45defframe(PARAM_SIZE, 12)
46defframe(PARAM_SRC,  8)
47defframe(PARAM_DST,  4)
48
49	TEXT
50	ALIGN(32)
51
C mpn_lshift: shift the size-limb number at src left by shift bits, write
C the size result limbs to dst, and return in %eax the bits shifted out of
C the most significant limb.
C
C Limbs are handled as 32-bit words (movl/movd accesses, index scale 4).
C A single limb is done with integer shld/shl.  Two or more limbs use MMX,
C working from the most significant limb down to the least significant.
C
C %ebx is pushed on entry and popped on both exit paths.  %eax, %ecx, %edx
C and the flags are overwritten, and the two_or_more path also overwrites
C %mm0, %mm6, %mm7 and finishes with emms.
C
C NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31.
C Neither is checked here -- confirm against the mpn_lshift contract.
52PROLOGUE(mpn_lshift)
53deflit(`FRAME',0)
54
55	C The 1 limb case can be done without the push %ebx, but it's then
56	C still the same speed.  The push is left as a free helping hand for
57	C the two_or_more code.
58
59	movl	PARAM_SIZE, %eax	C eax = size
	C Save ebx, which is restored by the popl on each exit path.
	C NOTE(review): FRAME_pushl is defined elsewhere -- presumably it
	C adjusts FRAME so the PARAM_ offsets stay valid after the push.
60	pushl	%ebx			FRAME_pushl()
61
62	movl	PARAM_SRC, %ebx		C ebx = src
63	decl	%eax			C eax = size-1, ZF set when size is 1
64
65	movl	PARAM_SHIFT, %ecx	C ecx = shift, movl leaves the flags alone
66	jnz	L(two_or_more)		C tests the decl result
67
	C size is 1 on this path, so eax is 0 here.
68	movl	(%ebx), %edx		C src limb
69	movl	PARAM_DST, %ebx		C ebx = dst, the src pointer is no longer needed
70
	C NOTE(review): shldl is a macro defined elsewhere, presumably emitting
	C the shld instruction.  With eax zero, the top shift bits of edx end up
	C in the low bits of eax.
71	shldl(	%cl, %edx, %eax)	C return value
72
73	shll	%cl, %edx		C edx = src limb << shift
74
75	movl	%edx, (%ebx)		C dst limb
76	popl	%ebx			C restore the saved ebx
77
78	ret
79
80
81	ALIGN(16)	C avoid offset 0x1f
82	nop		C avoid bad cache line crossing
83L(two_or_more):
84	C eax	size-1
85	C ebx	src
86	C ecx	shift
87	C edx
88
89	movl	(%ebx,%eax,4), %edx	C src high limb
90	negl	%ecx			C ecx = -shift
91
92	movd	PARAM_SHIFT, %mm6	C mm6 = shift, count for the final psllq
93	addl	$32, %ecx		C 32-shift
94
95	shrl	%cl, %edx		C edx = high limb >> (32-shift), the return value
96
97	movd	%ecx, %mm7		C mm7 = 32-shift, count for the psrlq in the loop
98	movl	PARAM_DST, %ecx		C ecx = dst from here on
99
	C Each pass loads the limb pair src[eax-1] (low half) and src[eax]
	C (high half) as one 64-bit value, shifts it right by 32-shift, and
	C stores the low 32 bits, which are
	C     (src[eax] << shift) | (src[eax-1] >> (32-shift)),
	C as dst[eax].  The decl sits between the load and the store, so the
	C store address uses eax+1 to name the same limb index.
100L(top):
101	C eax	counter, size-1 to 1
102	C ebx	src
103	C ecx	dst
104	C edx	retval
105	C
106	C mm0	scratch
107	C mm6	shift
108	C mm7	32-shift
109
110	movq	-4(%ebx,%eax,4), %mm0	C mm0 = src[eax]:src[eax-1]
111	decl	%eax			C sets ZF for the jnz below
112
113	psrlq	%mm7, %mm0		C low dword = combined result limb
114
115	movd	%mm0, 4(%ecx,%eax,4)	C dst[eax+1], eax already decremented
116	jnz	L(top)			C psrlq and movd do not alter the flags
117
118
	C Lowest limb: nothing below it, so it is just src[0] << shift.
119	movd	(%ebx), %mm0		C mm0 = src[0], zero extended
120	popl	%ebx			C restore the saved ebx
121
122	psllq	%mm6, %mm0		C low dword = src[0] << shift
123	movl	%edx, %eax		C return value
124
125	movd	%mm0, (%ecx)		C dst[0]
126
127	emms				C empty the MMX state before returning
128	ret
129
130EPILOGUE()
131