dnl  x86-64 mpn_lshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.5
C AMD K10	 ?
C Intel P4	 3.29
C Intel core2	 2.1 (fluctuates, presumably cache related)
C Intel corei	 ?
C Intel atom	14.3
C VIA nano	 ?

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')
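
C What this routine computes, as a rough C sketch.  The function name
C ref_lshift and the <stdint.h> types are illustrative assumptions, not
C GMP's actual interface; the shift count satisfies 0 < cnt < 64 and the
C return value is the bits shifted out of the top limb.
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	uint64_t
C	ref_lshift (uint64_t *rp, const uint64_t *up, ptrdiff_t n, unsigned cnt)
C	{
C	  unsigned tnc = 64 - cnt;			/* complementary count */
C	  uint64_t retval = up[n - 1] >> tnc;		/* bits shifted out */
C	  for (ptrdiff_t i = n - 1; i > 0; i--)
C	    rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
C	  rp[0] = up[0] << cnt;
C	  return retval;
C	}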

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
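C Setup: mm4 holds the left-shift count cnt; rcx is negated and masked
C with 63 so mm5 holds the complementary count 64-cnt, used to pull bits
C in from the next lower limb.  rax gets up[n-1] >> (64-cnt), the bits
C shifted out of the operand, which is the return value.  r8 = (n+1) mod 4
C decides how many limbs to peel off before the 4-way unrolled loop.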
	mov	-8(up,n,8), %rax
	movd	R32(%rcx), %mm4
	neg	R32(%rcx)		C put rsh count in cl
	and	$63, R32(%rcx)
	movd	R32(%rcx), %mm5

	lea	1(n), R32(%r8)

	shr	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
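C Peel off the top limb so the remaining count is congruent to 3 mod 4:
C rp[n-1] = (up[n-1] << cnt) | (up[n-2] >> (64-cnt)), then n--.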
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
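C Peel off the top limb as above, leaving n congruent to 1 mod 4.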
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
L(1x):
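C If a single limb is left, it needs only the plain left shift at L(ast).
C Otherwise peel off two more limbs so the count left for the unrolled
C loop is congruent to 3 mod 4.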
	cmp	$1, n
	je	L(ast)
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -8(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	sub	$2, n

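C Main-loop entry.  Here n is congruent to 3 mod 4 and n >= 3.  The loop
C is software pipelined: the left shifts of the two highest remaining
C limbs are started here (and at the bottom of L(top)) and combined with
C the matching right-shifted limbs on the next pass.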
L(rol):	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n			C				      4
	jb	L(end)			C				      2
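C Unrolled loop, 4 limbs per iteration, stored from high to low addresses.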
	ALIGN(32)
L(top):
	C finish stuff from lsh block
	movq	16(up,n,8), %mm0
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	-8(up,n,8), %mm1
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	C start two new rsh
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

	C finish stuff from rsh block
	movq	8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	por	%mm2, %mm0
	psllq	%mm4, %mm3
	movq	-8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	-16(up,n,8), %mm3
	movq	%mm0, 8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new lsh
	sub	$4, n
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3

	jae	L(top)			C				      2
L(end):
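C n has wrapped to -1 here.  Combine the two pending left-shifted limbs
C with up[1] and up[0] shifted right, and store rp[2] and rp[1].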
	movq	8(up), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	(up), %mm1
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 16(rp)
	movq	%mm3, 8(rp)

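C Lowest limb: no lower neighbour to pull bits from, just shift and store.
C This is also the target for the n = 1 case.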
L(ast):	movq	(up), %mm2
	psllq	%mm4, %mm2
	movq	%mm2, (rp)
	emms
	FUNC_EXIT()
	ret
EPILOGUE()