xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/pentium4/lshiftc.asm (revision 479d8f7d843cc1b22d497efdf1f27a50ee8418d4)
1dnl  x86-64 mpn_lshiftc optimized for Pentium 4.
2
3dnl  Copyright 2003, 2005, 2007, 2008, 2010, 2012 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C	     cycles/limb
25C AMD K8,K9	 ?
26C AMD K10	 ?
27C Intel P4	 4.15
28C Intel core2	 ?
29C Intel corei	 ?
30C Intel atom	 ?
31C VIA nano	 ?
32
33C INPUT PARAMETERS
34define(`rp',`%rdi')
35define(`up',`%rsi')
36define(`n',`%rdx')
37define(`cnt',`%cl')
38
39ABI_SUPPORT(DOS64)
40ABI_SUPPORT(STD64)
41
42ASM_START()
43	TEXT
44	ALIGN(32)
C mp_limb_t mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} left by cnt bits and store the ONE'S COMPLEMENT of the
C result at {rp,n}; the return value is the bits shifted out at the top.
C NOTE(review): cnt is presumably restricted to 1..63 as is the usual
C mpn shift convention (cnt = 0 would make the 64-cnt right-shift
C combining step degenerate) -- confirm against callers.
C
C Each output limb is ~((u[i] << cnt) | (u[i-1] >> (64-cnt))).  The
C complement is folded in cheaply: pxor against the all-ones %mm6
C negates the left-shifted half, and pandn (dst = ~dst & src) both
C negates the right-shifted half and ANDs the two halves together,
C using De Morgan:  ~(a | b) = ~a & ~b.
C
C Register roles: mm4 = left shift count, mm5 = right shift count
C (64-cnt), mm6 = all ones.  The code works from the top of the
C operands downward; 1 or 2 limbs are peeled off the high end until
C n+1 is a multiple of 4, then L(top) produces 4 limbs per iteration.
45PROLOGUE(mpn_lshiftc)
46	FUNC_ENTRY(4)
47	mov	-8(up,n,8), %rax
48	pcmpeqd	%mm6, %mm6		C 0xffff...fff
49	movd	R32(%rcx), %mm4
50	neg	R32(%rcx)		C put rsh count in cl
51	and	$63, R32(%rcx)		C rsh count = 64 - cnt
52	movd	R32(%rcx), %mm5
53
	C r8 = (n+1) mod 4 selects how many high limbs to peel
54	lea	1(n), R32(%r8)
55
56	shr	R8(%rcx), %rax		C function return value
57
58	and	$3, R32(%r8)
59	je	L(rol)			C jump for n = 3, 7, 11, ...
60
61	dec	R32(%r8)
62	jne	L(1)
63C	n = 4, 8, 12, ...
	C peel one limb: rp[n-1] = ~((u[n-1]<<cnt) | (u[n-2]>>rsh))
64	movq	-8(up,n,8), %mm2
65	psllq	%mm4, %mm2
66	movq	-16(up,n,8), %mm0
67	pxor	%mm6, %mm2
68	psrlq	%mm5, %mm0
69	pandn	%mm2, %mm0
70	movq	%mm0, -8(rp,n,8)
71	dec	n
72	jmp	L(rol)
73
74L(1):	dec	R32(%r8)
75	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
76C	n = 2, 6, 10, 14, ...
	C peel one limb (same pattern as the n = 4, 8, 12 case above)
77	movq	-8(up,n,8), %mm2
78	psllq	%mm4, %mm2
79	movq	-16(up,n,8), %mm0
80	pxor	%mm6, %mm2
81	psrlq	%mm5, %mm0
82	pandn	%mm2, %mm0
83	movq	%mm0, -8(rp,n,8)
84	dec	n
85L(1x):
	C here n = 1, 5, 9, 13, ...; n = 1 has no lower limb to merge
86	cmp	$1, n
87	je	L(ast)
	C peel two limbs, interleaving the two lsh/rsh/complement chains
88	movq	-8(up,n,8), %mm2
89	psllq	%mm4, %mm2
90	movq	-16(up,n,8), %mm3
91	psllq	%mm4, %mm3
92	movq	-16(up,n,8), %mm0
93	movq	-24(up,n,8), %mm1
94	pxor	%mm6, %mm2
95	psrlq	%mm5, %mm0
96	pandn	%mm2, %mm0
97	pxor	%mm6, %mm3
98	psrlq	%mm5, %mm1
99	pandn	%mm3, %mm1
100	movq	%mm0, -8(rp,n,8)
101	movq	%mm1, -16(rp,n,8)
102	sub	$2, n
103
	C loop preamble: start the left shifts of the next two high limbs;
	C the software-pipelined loop body finishes them a stage later
104L(rol):	movq	-8(up,n,8), %mm2
105	psllq	%mm4, %mm2
106	movq	-16(up,n,8), %mm3
107	psllq	%mm4, %mm3
108
109	sub	$4, n
110	jb	L(end)
111	ALIGN(32)
112L(top):
113	C finish stuff from lsh block
114	movq	16(up,n,8), %mm0
115	pxor	%mm6, %mm2
116	movq	8(up,n,8), %mm1
117	psrlq	%mm5, %mm0
118	psrlq	%mm5, %mm1
119	pandn	%mm2, %mm0
120	pxor	%mm6, %mm3
121	movq	%mm0, 24(rp,n,8)
122	movq	(up,n,8), %mm0
123	pandn	%mm3, %mm1
124	movq	%mm1, 16(rp,n,8)
125	movq	-8(up,n,8), %mm1
126	C start two new rsh
127	psrlq	%mm5, %mm0
128	psrlq	%mm5, %mm1
129
130	C finish stuff from rsh block
131	movq	8(up,n,8), %mm2
132	pxor	%mm6, %mm0
133	movq	(up,n,8), %mm3
134	psllq	%mm4, %mm2
135	psllq	%mm4, %mm3
136	pandn	%mm0, %mm2
137	pxor	%mm6, %mm1
138	movq	%mm2, 8(rp,n,8)
139	movq	-8(up,n,8), %mm2
140	pandn	%mm1, %mm3
141	movq	%mm3, (rp,n,8)
142	movq	-16(up,n,8), %mm3
143	C start two new lsh
144	sub	$4, n
145	psllq	%mm4, %mm2
146	psllq	%mm4, %mm3
147
148	jae	L(top)
149
	C wind down: finish the two pending left-shifted limbs (mm2, mm3)
	C with up[1] and up[0] as rsh sources, storing rp[2] and rp[1]
150L(end):	pxor	%mm6, %mm2
151	movq	8(up), %mm0
152	psrlq	%mm5, %mm0
153	pandn	%mm2, %mm0
154	pxor	%mm6, %mm3
155	movq	(up), %mm1
156	psrlq	%mm5, %mm1
157	pandn	%mm3, %mm1
158	movq	%mm0, 16(rp)
159	movq	%mm1, 8(rp)
160
	C last (lowest) limb: nothing shifts in from below, so just
	C shift left and complement
161L(ast):	movq	(up), %mm2
162	psllq	%mm4, %mm2
163	pxor	%mm6, %mm2
164	movq	%mm2, (rp)
165	emms
166	FUNC_EXIT()
167	ret
168EPILOGUE()
169