xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/lshift.asm (revision a24efa7dea9f1f56c3bdb15a927d3516792ace1c)
1dnl  AMD64 mpn_lshift -- mpn left shift.
2
3dnl  Copyright 2003, 2005, 2007, 2009, 2011, 2012 Free Software Foundation,
4dnl  Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C	     cycles/limb   cycles/limb cnt=1
25C AMD K8,K9	 2.375		 1.375
26C AMD K10	 2.375		 1.375
27C Intel P4	 8		10.5
28C Intel core2	 2.11		 4.28
29C Intel corei	 ?		 ?
30C Intel atom	 5.75		 3.5
31C VIA nano	 3.5		 2.25
32
33
34C INPUT PARAMETERS
C m4 aliases for the four argument registers.  The roles noted below are the
C ones the code in this file gives them: stores go through rp, loads go
C through up, n indexes 8-byte limbs, and the low byte of cnt is the shift
C count.
35define(`rp',	`%rdi')		C destination limb pointer
36define(`up',	`%rsi')		C source limb pointer
37define(`n',	`%rdx')		C number of limbs
38define(`cnt',	`%rcx')		C shift count in bits
39
C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it
C declares which calling conventions this implementation may be built for.
C Confirm in the m4 support files.
40ABI_SUPPORT(DOS64)
41ABI_SUPPORT(STD64)
42
C NOTE(review): ASM_START, TEXT and ALIGN are also defined outside this file;
C presumably they open the assembly output, select the code section and align
C the function entry that follows.  Confirm in the m4 support files.
43ASM_START()
44	TEXT
45	ALIGN(32)
C mpn_lshift -- shift the n-limb operand at up left by cnt bits, store the n
C result limbs at rp, and return in rax the bits shifted out of the most
C significant limb.
C
C Operand names are the m4 register aliases defined earlier in this file
C (rp=rdi, up=rsi, n=rdx, cnt=rcx).  r8-r11 are used as scratch.
C NOTE(review): the code reads -8(up,n,8) or (up) unconditionally, so it
C appears to assume n >= 1, and the shift arithmetic appears to assume
C 1 <= cnt <= 63.  Confirm against the documented mpn_lshift contract.
C
C Two strategies:
C  - cnt = 1 with no harmful overlap: walk from the lowest limb upwards and
C    double each limb with adc, so that the carry flag transports the bit
C    moving from one limb into the next.
C  - everything else, L(gen): walk from the highest limb downwards and build
C    each result limb as (up[i] << cnt) | (up[i-1] >> (64-cnt)).
46PROLOGUE(mpn_lshift)
47	FUNC_ENTRY(4)			C defined outside this file; presumably ABI entry glue for 4 arguments, TODO confirm
48	cmp	$1, R8(%rcx)		C is the low byte of cnt equal to 1?
49	jne	L(gen)
50
51C For cnt=1 we want to work from lowest limb towards higher limbs.
52C Check for bad overlap (up=rp is OK!) rp=up+1..up+n-1 is bad.
53C FIXME: this could surely be done more cleverly.
54
55	mov    rp, %rax
56	sub    up, %rax			C rax = rp - up, in bytes
57	je     L(fwd)			C rp = up
58	shr    $3, %rax			C unsigned distance in limbs; wraps to a very large value if rp < up
59	cmp    n, %rax
60	jb     L(gen)			C 0 < rp-up < n limbs: must go from high to low
61
C Forward (cnt = 1) path.  eax keeps the low bits of n for the tail code and
C n becomes the number of 4-limb loop iterations.
62L(fwd):	mov	R32(n), R32(%rax)
63	shr	$2, n
64	je	L(e1)			C fewer than 4 limbs: tail only
65	and	$3, R32(%rax)		C eax = n mod 4; and also clears CF for the first adc
66
C NOTE(review): the two nops after ALIGN(8) are presumably padding chosen to
C position L(t1); confirm before changing.
67	ALIGN(8)
68	nop
69	nop
C Main cnt=1 loop, 4 limbs per pass.  adc r,r computes 2*r + CF, which is a
C 1-bit left shift taking in the bit shifted out of the previous limb.
C mov, lea and dec leave CF untouched, so the carry chain survives from one
C pass to the next.
70L(t1):	mov	(up), %r8
71	mov	8(up), %r9
72	mov	16(up), %r10
73	mov	24(up), %r11
74	lea	32(up), up
75	adc	%r8, %r8
76	mov	%r8, (rp)
77	adc	%r9, %r9
78	mov	%r9, 8(rp)
79	adc	%r10, %r10
80	mov	%r10, 16(rp)
81	adc	%r11, %r11
82	mov	%r11, 24(rp)
83	lea	32(rp), rp
84	dec	n
85	jne	L(t1)
86
87	inc	R32(%rax)		C inc followed by dec sets ZF from eax (n mod 4)
88	dec	R32(%rax)		C while leaving CF and the value of eax intact
89	jne	L(n00)			C 1 to 3 limbs remain
90	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF = the bit shifted out
91	FUNC_EXIT()
92	ret
C Tail of the cnt=1 path.  eax = limbs remaining (1 to 3), CF = carry in.
C Each exit leaves the final carry in eax as the return value.
93L(e1):	test	R32(%rax), R32(%rax)	C clear cy
94L(n00):	mov	(up), %r8
95	dec	R32(%rax)
96	jne	L(n01)
C	one limb remains
97	adc	%r8, %r8
98	mov	%r8, (rp)
99L(ret):	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
100	FUNC_EXIT()
101	ret
102L(n01):	dec	R32(%rax)
103	mov	8(up), %r9
104	jne	L(n10)
C	two limbs remain
105	adc	%r8, %r8
106	adc	%r9, %r9
107	mov	%r8, (rp)
108	mov	%r9, 8(rp)
109	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
110	FUNC_EXIT()
111	ret
C	three limbs remain
112L(n10):	mov	16(up), %r10
113	adc	%r8, %r8
114	adc	%r9, %r9
115	adc	%r10, %r10
116	mov	%r8, (rp)
117	mov	%r9, 8(rp)
118	mov	%r10, 16(rp)
119	adc	$-1, R32(%rax)		C eax is 1 here (3 decremented twice): 1 - 1 + CF = CF
120	FUNC_EXIT()
121	ret
122
C General path, working from the most significant limb downwards.
C cl alternates between cnt (for shl) and -cnt (for shr); 64-bit shifts use
C only the low 6 bits of cl, so -cnt acts as 64-cnt.
C The return value is computed first.  Then 0 to 3 limbs are peeled from the
C top so that n = 3 (mod 4) on reaching L(rlx); n = 1 and n = 2 finish
C directly through L(ast).  The unrolled loop then does 4 limbs per pass and
C L(end) followed by L(ast) stores the lowest 3 limbs.
123L(gen):	neg	R32(%rcx)		C put rsh count in cl
124	mov	-8(up,n,8), %rax
125	shr	R8(%rcx), %rax		C function return value
126
127	neg	R32(%rcx)		C put lsh count in cl
128	lea	1(n), R32(%r8)
129	and	$3, R32(%r8)		C r8d = (n+1) mod 4 selects how many limbs to peel
130	je	L(rlx)			C jump for n = 3, 7, 11, ...
131
132	dec	R32(%r8)
133	jne	L(1)
134C	n = 4, 8, 12, ...
C	peel one limb: rp[n-1] = (up[n-1] << cnt) | (up[n-2] >> (64-cnt))
135	mov	-8(up,n,8), %r10
136	shl	R8(%rcx), %r10
137	neg	R32(%rcx)		C put rsh count in cl
138	mov	-16(up,n,8), %r8
139	shr	R8(%rcx), %r8
140	or	%r8, %r10
141	mov	%r10, -8(rp,n,8)
142	dec	n
143	jmp	L(rll)
144
145L(1):	dec	R32(%r8)
146	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
147C	n = 2, 6, 10, 14, ...
C	peel one limb, then fall into L(1x) with n = 1 (mod 4)
148	mov	-8(up,n,8), %r10
149	shl	R8(%rcx), %r10
150	neg	R32(%rcx)		C put rsh count in cl
151	mov	-16(up,n,8), %r8
152	shr	R8(%rcx), %r8
153	or	%r8, %r10
154	mov	%r10, -8(rp,n,8)
155	dec	n
156	neg	R32(%rcx)		C put lsh count in cl
157L(1x):
158	cmp	$1, n
159	je	L(ast)			C only the lowest limb is left
C	peel two limbs, rp[n-1] and rp[n-2], leaving n = 3 (mod 4)
160	mov	-8(up,n,8), %r10
161	shl	R8(%rcx), %r10
162	mov	-16(up,n,8), %r11
163	shl	R8(%rcx), %r11
164	neg	R32(%rcx)		C put rsh count in cl
165	mov	-16(up,n,8), %r8
166	mov	-24(up,n,8), %r9
167	shr	R8(%rcx), %r8
168	or	%r8, %r10
169	shr	R8(%rcx), %r9
170	or	%r9, %r11
171	mov	%r10, -8(rp,n,8)
172	mov	%r11, -16(rp,n,8)
173	sub	$2, n
174
C Start the high halves of the next two result limbs before entering the loop.
175L(rll):	neg	R32(%rcx)		C put lsh count in cl
176L(rlx):	mov	-8(up,n,8), %r10
177	shl	R8(%rcx), %r10
178	mov	-16(up,n,8), %r11
179	shl	R8(%rcx), %r11
180
181	sub	$4, n			C				      4
182	jb	L(end)			C				      2
183	ALIGN(16)
C Loop invariant at L(top), with n already reduced by 4:
C r10 = up[n+3] << cnt and r11 = up[n+2] << cnt.
C Each pass stores rp[n+3], rp[n+2], rp[n+1] and rp[n], then prepares r10 and
C r11 for the next pass.  n is 3 (mod 4) here, so the -8 and -16 offsets stay
C inside the operand.
184L(top):
185	C finish stuff from lsh block
186	neg	R32(%rcx)		C put rsh count in cl
187	mov	16(up,n,8), %r8
188	mov	8(up,n,8), %r9
189	shr	R8(%rcx), %r8
190	or	%r8, %r10
191	shr	R8(%rcx), %r9
192	or	%r9, %r11
193	mov	%r10, 24(rp,n,8)
194	mov	%r11, 16(rp,n,8)
195	C start two new rsh
196	mov	0(up,n,8), %r8
197	mov	-8(up,n,8), %r9
198	shr	R8(%rcx), %r8
199	shr	R8(%rcx), %r9
200
201	C finish stuff from rsh block
202	neg	R32(%rcx)		C put lsh count in cl
203	mov	8(up,n,8), %r10
204	mov	0(up,n,8), %r11
205	shl	R8(%rcx), %r10
206	or	%r10, %r8
207	shl	R8(%rcx), %r11
208	or	%r11, %r9
209	mov	%r8, 8(rp,n,8)
210	mov	%r9, 0(rp,n,8)
211	C start two new lsh
212	mov	-8(up,n,8), %r10
213	mov	-16(up,n,8), %r11
214	shl	R8(%rcx), %r10
215	shl	R8(%rcx), %r11
216
217	sub	$4, n
218	jae	L(top)			C				      2
C Wind down: r10 = up[2] << cnt and r11 = up[1] << cnt.  n has gone negative,
C so the last three limbs are addressed with fixed offsets.
219L(end):
220	neg	R32(%rcx)		C put rsh count in cl
221	mov	8(up), %r8
222	shr	R8(%rcx), %r8
223	or	%r8, %r10
224	mov	(up), %r9
225	shr	R8(%rcx), %r9
226	or	%r9, %r11
227	mov	%r10, 16(rp)
228	mov	%r11, 8(rp)
229
230	neg	R32(%rcx)		C put lsh count in cl
C Lowest limb: nothing is shifted in from below.  rax still holds the return
C value computed at L(gen).
231L(ast):	mov	(up), %r10
232	shl	R8(%rcx), %r10
233	mov	%r10, (rp)
234	FUNC_EXIT()
235	ret
236EPILOGUE()
237