xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/pentium4/lshift.asm (revision 1580a27b92f58fcdcb23fdfbc04a7c2b54a0b7c8)
1dnl  x86-64 mpn_lshift optimized for Pentium 4.
2
3dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C	     cycles/limb
35C AMD K8,K9	 2.5
36C AMD K10	 ?
37C Intel P4	 3.29
38C Intel core2	 2.1 (fluctuates, presumably cache related)
39C Intel corei	 ?
40C Intel atom	14.3
41C VIA nano	 ?
42
43C INPUT PARAMETERS
C Register names used by mpn_lshift below.  The roles given here are the ones
C the function body shows: rp is only stored through, up is only loaded from,
C n is used as an index scaled by 8, and the count register feeds the shifts.
44define(`rp',`%rdi')	C destination pointer (result limbs are stored here)
45define(`up',`%rsi')	C source pointer (operand limbs are loaded from here)
46define(`n',`%rdx')	C limb count, also used as the running limb index
47define(`cnt',`%cl')	C left shift count in bits
48
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined
C outside this file; presumably they declare the supported calling conventions
C and select/align the code section -- confirm against config.m4.
49ABI_SUPPORT(DOS64)
50ABI_SUPPORT(STD64)
51
52ASM_START()
53	TEXT
54	ALIGN(32)
C mpn_lshift -- shift the n-limb operand at up left by cnt bits and store the
C n-limb result at rp.  Limbs are 8 bytes wide (all indexing is scaled by 8)
C and are processed from the highest address down to limb 0.
C
C Result:  rp[i] = (up[i] << cnt) | (up[i-1] >> rsh)  for i = n-1 ... 1
C          rp[0] =  up[0] << cnt
C          rax   =  up[n-1] >> rsh            (returned to the caller)
C          where rsh = (-cnt) & 63
C
C Register roles inside the function:
C   mm4      left shift count (cnt)
C   mm5      right shift count (rsh)
C   r8d      (n + 1) & 3, consumed by the entry dispatch on n mod 4
C   mm0-mm3  scratch; mm2/mm3 carry two left-shifted limbs between stages
C
C Structure: one or two limbs are peeled off first so that n becomes 3 mod 4,
C then L(top) produces four result limbs per iteration, L(end) finishes
C rp[2] and rp[1], and L(ast) writes rp[0].
C
C NOTE(review): up[n-1] is loaded unconditionally and L(ast) always writes
C rp[0], so n >= 1 appears to be required.  With cnt = 0 the right shift count
C would also be 0, so cnt is presumably expected in 1..63 -- confirm against
C the caller contract.
55PROLOGUE(mpn_lshift)
56	FUNC_ENTRY(4)			C NOTE(review): macro defined elsewhere; presumably ABI entry glue for 4 args -- confirm
57	mov	-8(up,n,8), %rax	C rax = up[n-1], the most significant limb
58	movd	R32(%rcx), %mm4		C mm4 = left shift count
59	neg	R32(%rcx)		C put rsh count in cl
60	and	$63, R32(%rcx)		C ecx = (-cnt) & 63
61	movd	R32(%rcx), %mm5		C mm5 = right shift count
62
63	lea	1(n), R32(%r8)		C r8d = low 32 bits of n + 1
64
65	shr	R8(%rcx), %rax		C function return value: up[n-1] >> rsh
66
67	and	$3, R32(%r8)		C r8d = (n + 1) mod 4
68	je	L(rol)			C jump for n = 3, 7, 11, ...
69
70	dec	R32(%r8)
71	jne	L(1)
72C	n = 4, 8, 12, ...
C	Peel one limb so that n becomes 3 mod 4 before entering L(rol).
73	movq	-8(up,n,8), %mm2	C mm2 = up[n-1]
74	psllq	%mm4, %mm2
75	movq	-16(up,n,8), %mm0	C mm0 = up[n-2]
76	psrlq	%mm5, %mm0
77	por	%mm0, %mm2		C mm2 = up[n-1] << cnt | up[n-2] >> rsh
78	movq	%mm2, -8(rp,n,8)	C store rp[n-1]
79	dec	n
80	jmp	L(rol)
81
82L(1):	dec	R32(%r8)
83	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
84C	n = 2, 6, 10, 14, ...
C	Peel one limb so that n becomes 1 mod 4, then fall into L(1x).
85	movq	-8(up,n,8), %mm2	C mm2 = up[n-1]
86	psllq	%mm4, %mm2
87	movq	-16(up,n,8), %mm0	C mm0 = up[n-2]
88	psrlq	%mm5, %mm0
89	por	%mm0, %mm2		C mm2 = up[n-1] << cnt | up[n-2] >> rsh
90	movq	%mm2, -8(rp,n,8)	C store rp[n-1]
91	dec	n
92L(1x):
C	Here n is 1 mod 4.  If only limb 0 remains go straight to L(ast),
C	otherwise peel two limbs so that n becomes 3 mod 4.
93	cmp	$1, n
94	je	L(ast)
95	movq	-8(up,n,8), %mm2	C mm2 = up[n-1]
96	psllq	%mm4, %mm2
97	movq	-16(up,n,8), %mm3	C mm3 = up[n-2]
98	psllq	%mm4, %mm3
99	movq	-16(up,n,8), %mm0	C mm0 = up[n-2]
100	movq	-24(up,n,8), %mm1	C mm1 = up[n-3]
101	psrlq	%mm5, %mm0
102	por	%mm0, %mm2		C mm2 = up[n-1] << cnt | up[n-2] >> rsh
103	psrlq	%mm5, %mm1
104	por	%mm1, %mm3		C mm3 = up[n-2] << cnt | up[n-3] >> rsh
105	movq	%mm2, -8(rp,n,8)	C store rp[n-1]
106	movq	%mm3, -16(rp,n,8)	C store rp[n-2]
107	sub	$2, n
108
C	Here n is 3 mod 4.  Start the left-shift halves of the top two
C	remaining limbs; their right-shift halves are merged in L(top) or L(end).
109L(rol):	movq	-8(up,n,8), %mm2	C mm2 = up[n-1]
110	psllq	%mm4, %mm2
111	movq	-16(up,n,8), %mm3	C mm3 = up[n-2]
112	psllq	%mm4, %mm3
113
114	sub	$4, n			C				      4
115	jb	L(end)			C				      2
116	ALIGN(32)
C	Loop invariant on entry, with n already reduced by 4:
C	mm2 = up[n+3] << cnt and mm3 = up[n+2] << cnt.
C	Each iteration stores rp[n+3], rp[n+2], rp[n+1] and rp[n].
117L(top):
118	C finish stuff from lsh block
119	movq	16(up,n,8), %mm0	C mm0 = up[n+2]
120	movq	8(up,n,8), %mm1		C mm1 = up[n+1]
121	psrlq	%mm5, %mm0
122	por	%mm0, %mm2		C mm2 = up[n+3] << cnt | up[n+2] >> rsh
123	psrlq	%mm5, %mm1
124	movq	(up,n,8), %mm0		C mm0 = up[n]
125	por	%mm1, %mm3		C mm3 = up[n+2] << cnt | up[n+1] >> rsh
126	movq	-8(up,n,8), %mm1	C mm1 = up[n-1]
127	movq	%mm2, 24(rp,n,8)	C store rp[n+3]
128	movq	%mm3, 16(rp,n,8)	C store rp[n+2]
129	C start two new rsh
130	psrlq	%mm5, %mm0
131	psrlq	%mm5, %mm1
132
133	C finish stuff from rsh block
134	movq	8(up,n,8), %mm2		C mm2 = up[n+1]
135	movq	(up,n,8), %mm3		C mm3 = up[n]
136	psllq	%mm4, %mm2
137	por	%mm2, %mm0		C mm0 = up[n+1] << cnt | up[n] >> rsh
138	psllq	%mm4, %mm3
139	movq	-8(up,n,8), %mm2	C mm2 = up[n-1], carried into the next stage
140	por	%mm3, %mm1		C mm1 = up[n] << cnt | up[n-1] >> rsh
141	movq	-16(up,n,8), %mm3	C mm3 = up[n-2], carried into the next stage
142	movq	%mm0, 8(rp,n,8)		C store rp[n+1]
143	movq	%mm1, (rp,n,8)		C store rp[n]
144	C start two new lsh
145	sub	$4, n			C sets the carry tested by jae below; the MMX shifts in between leave flags alone
146	psllq	%mm4, %mm2
147	psllq	%mm4, %mm3
148
149	jae	L(top)			C				      2
C	Three limbs remain: mm2 = up[2] << cnt and mm3 = up[1] << cnt are
C	pending, so absolute offsets from up and rp are used from here on.
150L(end):
151	movq	8(up), %mm0		C mm0 = up[1]
152	psrlq	%mm5, %mm0
153	por	%mm0, %mm2		C mm2 = up[2] << cnt | up[1] >> rsh
154	movq	(up), %mm1		C mm1 = up[0]
155	psrlq	%mm5, %mm1
156	por	%mm1, %mm3		C mm3 = up[1] << cnt | up[0] >> rsh
157	movq	%mm2, 16(rp)		C store rp[2]
158	movq	%mm3, 8(rp)		C store rp[1]
159
C	Least significant limb: nothing is shifted in from below.
160L(ast):	movq	(up), %mm2
161	psllq	%mm4, %mm2
162	movq	%mm2, (rp)		C rp[0] = up[0] << cnt
163	emms				C leave MMX state so later x87 code can run
164	FUNC_EXIT()			C NOTE(review): macro defined elsewhere; presumably undoes FUNC_ENTRY -- confirm
165	ret
166EPILOGUE()
167
167