xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/pentium4/rshift.asm (revision 70f7362772ba52b749c976fb5e86e39a8b2c9afc)
1dnl  x86-64 mpn_rshift optimized for Pentium 4.
2
3dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C	     cycles/limb
35C AMD K8,K9	 2.5
36C AMD K10	 ?
37C Intel P4	 3.29
38C Intel core2	 2.1 (fluctuates, presumably cache related)
39C Intel corei	 ?
40C Intel atom	14.3
41C VIA nano	 ?
42
43C INPUT PARAMETERS
C Symbolic m4 names for the registers that carry the arguments of mpn_rshift.
C As used in the function body:
C   rp   base register for all stores of result limbs
C   up   base register for all loads of source limbs
C   n    limb count; negated early on and then used as a scaled index
C   cnt  right-shift count in bits
44define(`rp',`%rdi')
45define(`up',`%rsi')
46define(`n',`%rdx')
47define(`cnt',`%cl')
48
C NOTE(review): ABI_SUPPORT is not defined in this file; presumably these two
C lines declare that the code may be built for the DOS64 and STD64 ABIs --
C confirm in the shared m4 definitions.
49ABI_SUPPORT(DOS64)
50ABI_SUPPORT(STD64)
51
C Begin assembler output, select the text section and request ALIGN(32) for
C the function entry that follows.  ASM_START, TEXT and ALIGN are macros that
C are not defined in this file.
52ASM_START()
53	TEXT
54	ALIGN(32)
C mpn_rshift -- shift the n-limb operand at up right by cnt bits and store the
C n-limb result at rp.
C
C For every limb index k below the top one the code stores
C	rp[k] = (up[k] >> cnt) | (up[k+1] << lsh),	lsh = (-cnt) & 63
C and for the top limb it stores up[n-1] >> cnt.  The value returned in %rax
C is up[0] << lsh, i.e. the bits shifted out at the low end.
C
C NOTE(review): the code reads up[0] unconditionally and uses lsh as if it
C were 64-cnt, so it presumably requires n >= 1 and 1 <= cnt <= 63 -- confirm
C against the documented mpn_rshift contract.
C
C Registers written here: %rax, %rcx, %rdx, %rsi, %rdi, %r8 and %mm0-%mm5.
C %mm4 holds the right-shift count and %mm5 the left-shift count throughout.
C FUNC_ENTRY, FUNC_EXIT, R32 and R8 are macros that are not defined in this
C file.
55PROLOGUE(mpn_rshift)
C NOTE(review): FUNC_ENTRY(4) is presumably ABI-specific entry glue for a
C four-argument function -- confirm in the shared m4 definitions.
56	FUNC_ENTRY(4)
C Fetch the lowest source limb now; it becomes the return value below.
57	mov	(up), %rax
C mm4 = right-shift count.
58	movd	R32(%rcx), %mm4
59	neg	R32(%rcx)			C put lsh count in cl
60	and	$63, R32(%rcx)
C mm5 = left-shift count, (-cnt) & 63.
61	movd	R32(%rcx), %mm5
62
C Point up and rp at their top limbs and negate n.  From here on n is a
C negative index that counts up towards zero, and 8(up,n,8) / 8(rp,n,8)
C address the lowest limb that has not been processed yet.
63	lea	-8(up,n,8), up
64	lea	-8(rp,n,8), rp
C r8 = n + 1; only its low two bits are used, to dispatch on n mod 4.
65	lea	1(n), R32(%r8)
66	neg	n
67
68	shl	R8(%rcx), %rax		C function return value
69
C Dispatch on (n + 1) mod 4.  Each path below stores 0, 1, 2 or 3 limbs so
C that L(rol) is always reached with a limb count that is 3 mod 4, or L(ast)
C is reached directly when only a single limb is left.
70	and	$3, R32(%r8)
71	je	L(rol)			C jump for n = 3, 7, 11, ...
72
73	dec	R32(%r8)
74	jne	L(1)
75C	n = 4, 8, 12, ...
C Store one limb, leaving a count that is 3 mod 4.
76	movq	8(up,n,8), %mm2
77	psrlq	%mm4, %mm2
78	movq	16(up,n,8), %mm0
79	psllq	%mm5, %mm0
80	por	%mm0, %mm2
81	movq	%mm2, 8(rp,n,8)
82	inc	n
83	jmp	L(rol)
84
85L(1):	dec	R32(%r8)
86	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
87C	n = 2, 6, 10, 14, ...
C Store one limb, leaving a count that is 1 mod 4, then fall into L(1x).
88	movq	8(up,n,8), %mm2
89	psrlq	%mm4, %mm2
90	movq	16(up,n,8), %mm0
91	psllq	%mm5, %mm0
92	por	%mm0, %mm2
93	movq	%mm2, 8(rp,n,8)
94	inc	n
95L(1x):
C The remaining count is 1 mod 4 here.  n = -1 means only the top limb is
C left, which L(ast) handles on its own.
96	cmp	$-1, n
97	je	L(ast)
C Otherwise store two limbs, leaving a count that is 3 mod 4.
98	movq	8(up,n,8), %mm2
99	psrlq	%mm4, %mm2
100	movq	16(up,n,8), %mm3
101	psrlq	%mm4, %mm3
102	movq	16(up,n,8), %mm0
103	movq	24(up,n,8), %mm1
104	psllq	%mm5, %mm0
105	por	%mm0, %mm2
106	psllq	%mm5, %mm1
107	por	%mm1, %mm3
108	movq	%mm2, 8(rp,n,8)
109	movq	%mm3, 16(rp,n,8)
110	add	$2, n
111
C Loop entry: preload the right-shifted halves of the next two limbs into
C mm2 and mm3.  The add below carries exactly when only three limbs are left,
C in which case the main loop is skipped.
112L(rol):	movq	8(up,n,8), %mm2
113	psrlq	%mm4, %mm2
114	movq	16(up,n,8), %mm3
115	psrlq	%mm4, %mm3
116
117	add	$4, n			C				      4
118	jb	L(end)			C				      2
119	ALIGN(32)
C Main loop: four limbs per iteration.  n has already been advanced by 4, so
C -24(rp,n,8) is the slot that 8(rp,n,8) addressed before the add.
120L(top):
121	C finish the two limbs whose right-shifted halves are in mm2/mm3
122	movq	-16(up,n,8), %mm0
123	movq	-8(up,n,8), %mm1
124	psllq	%mm5, %mm0
125	por	%mm0, %mm2
126	psllq	%mm5, %mm1
127	movq	(up,n,8), %mm0
128	por	%mm1, %mm3
129	movq	8(up,n,8), %mm1
130	movq	%mm2, -24(rp,n,8)
131	movq	%mm3, -16(rp,n,8)
132	C start the next two limbs with their left-shifted halves
133	psllq	%mm5, %mm0
134	psllq	%mm5, %mm1
135
136	C finish those two limbs by merging in the right-shifted halves
137	movq	-8(up,n,8), %mm2
138	movq	(up,n,8), %mm3
139	psrlq	%mm4, %mm2
140	por	%mm2, %mm0
141	psrlq	%mm4, %mm3
142	movq	8(up,n,8), %mm2
143	por	%mm3, %mm1
144	movq	16(up,n,8), %mm3
145	movq	%mm0, -8(rp,n,8)
146	movq	%mm1, (rp,n,8)
147	C start two new limbs with their right-shifted halves (as at L(rol))
148	add	$4, n
149	psrlq	%mm4, %mm2
150	psrlq	%mm4, %mm3
151
C The jae tests the carry from the add above; the MMX shifts in between do
C not modify the flags.
152	jae	L(top)			C				      2
C Wind-down: mm2 and mm3 hold the right-shifted halves of the two limbs just
C below the top one.  Merge in the left-shifted halves and store them.
153L(end):
154	movq	-8(up), %mm0
155	psllq	%mm5, %mm0
156	por	%mm0, %mm2
157	movq	(up), %mm1
158	psllq	%mm5, %mm1
159	por	%mm1, %mm3
160	movq	%mm2, -16(rp)
161	movq	%mm3, -8(rp)
162
C Top limb: nothing above it to merge in, so it is just shifted right.
163L(ast):	movq	(up), %mm2
164	psrlq	%mm4, %mm2
165	movq	%mm2, (rp)
C Leave MMX state before returning.
166	emms
167	FUNC_EXIT()
168	ret
169EPILOGUE()
170