xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/pentium4/rshift.asm (revision d909946ca08dceb44d7d0f22ec9488679695d976)
dnl  x86-64 mpn_rshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.5
C AMD K10	 ?
C Intel P4	 3.29
C Intel core2	 2.1 (fluctuates, presumably cache related)
C Intel corei	 ?
C Intel atom	14.3
C VIA nano	 ?

C INPUT PARAMETERS
define(`rp',`%rdi')	C destination limb pointer
define(`up',`%rsi')	C source limb pointer
define(`n',`%rdx')	C limb count
define(`cnt',`%cl')	C shift amount (1..63 per mpn convention)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mp_limb_t mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} right by cnt bits, writing the n result limbs to {rp,n}.
C Each result limb is (up[i] >> cnt) | (up[i+1] << (64-cnt)); the return
C value is the bits shifted out of up[0], left-justified in %rax.
C MMX state: mm4 = right-shift count, mm5 = complementary fill (left) count.
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	mov	(up), %rax
	movd	R32(%rcx), %mm4		C mm4 = cnt (right-shift count)
	neg	R32(%rcx)			C put lsh count in cl
	and	$63, R32(%rcx)		C rcx = 64-cnt (mod 64)
	movd	R32(%rcx), %mm5		C mm5 = fill (left-shift) count

	lea	-8(up,n,8), up		C point at last source limb
	lea	-8(rp,n,8), rp		C point at last result limb
	lea	1(n), R32(%r8)
	neg	n			C negative index, counts up toward 0

	shl	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)		C (n+1) mod 4 picks unrolling entry
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...  -- shift one limb, then enter the 4-way loop
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...  -- one limb here, one more pair at L(1x)
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
L(1x):
	cmp	$-1, n
	je	L(ast)			C one limb left: no fill bits needed
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3
	movq	16(up,n,8), %mm0
	movq	24(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 8(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	add	$2, n

L(rol):	movq	8(up,n,8), %mm2		C prime the pipeline with the rsh
	psrlq	%mm4, %mm2		C halves of the next two limbs
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3

	add	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(32)
L(top):
	C merge fill (lsh) bits into the two pending limbs, store them
	movq	-16(up,n,8), %mm0
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	8(up,n,8), %mm1
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	C start two new limbs: their fill (lsh) halves
	psllq	%mm5, %mm0
	psllq	%mm5, %mm1

	C merge rsh halves into those fills, store two more limbs
	movq	-8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psrlq	%mm4, %mm2
	por	%mm2, %mm0
	psrlq	%mm4, %mm3
	movq	8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	16(up,n,8), %mm3
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new limbs: their rsh halves
	add	$4, n
	psrlq	%mm4, %mm2
	psrlq	%mm4, %mm3

	jae	L(top)			C				      2
L(end):
	C wind down: finish the two pending limbs from the last two sources
	movq	-8(up), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	(up), %mm1
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -16(rp)
	movq	%mm3, -8(rp)

L(ast):	movq	(up), %mm2		C most significant limb: plain rsh
	psrlq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C leave MMX state clean for x87 code
	FUNC_EXIT()
	ret
EPILOGUE()
159