dnl  AMD64 mpn_lshift -- mpn left shift.

dnl  Copyright 2003, 2005, 2007, 2009, 2011, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb   cycles/limb cnt=1
C AMD K8,K9	 2.375		 1.375
C AMD K10	 2.375		 1.375
C Intel P4	 8		10.5
C Intel core2	 2.11		 4.28
C Intel corei	 ?		 ?
C Intel atom	 5.75		 3.5
C VIA nano	 3.5		 2.25


C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')
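
C mpn_lshift has the GMP-documented prototype
C   mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
C with 1 <= cnt < 64 on this target.  As a hedged reference only (identifiers
C below are illustrative, not taken from this file), the operation amounts to
C
C   retval = up[n-1] >> (64 - cnt);        C bits shifted out of the top limb
C   for (i = n - 1; i > 0; i--)
C     rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
C   rp[0] = up[0] << cnt;
C   return retval;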

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	cmp	$1, R8(%rcx)
	jne	L(gen)

C For cnt=1 we want to work from lowest limb towards higher limbs.
C Check for bad overlap (up=rp is OK!) up=rp+1..rp+n-1 is bad.
C FIXME: this could surely be done more cleverly.
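C
C A hedged C-style sketch of the test below (names illustrative): the byte
C difference rp - up is treated as unsigned, so the forward pass is rejected
C when
C
C   rp != up && ((mp_limb_t) ((char *) rp - (char *) up) >> 3) < n
C
C i.e. rp lands inside up+1 .. up+n-1 and would clobber unread source limbs;
C in that case we take the L(gen) path.  A negative difference (rp below up)
C becomes a huge unsigned value and falls through to L(fwd), which is safe
C because the destination then trails the source.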

	mov    rp, %rax
	sub    up, %rax
	je     L(fwd)			C rp = up
	shr    $3, %rax
	cmp    n, %rax
	jb     L(gen)

L(fwd):	mov	R32(n), R32(%rax)
	shr	$2, n
	je	L(e1)
	and	$3, R32(%rax)

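C The unrolled loop at L(t1) doubles four limbs per pass with adc reg,reg
C (reg = 2*reg + CF), so the carry out of each limb feeds the next one.
C The and instruction just above clears CF, so the first adc sees carry-in
C zero; within the loop, mov and lea leave the flags alone and dec writes
C ZF but not CF, which keeps the carry chain intact across iterations.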
	ALIGN(8)
	nop
	nop
L(t1):	mov	(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	adc	%r8, %r8
	mov	%r8, (rp)
	adc	%r9, %r9
	mov	%r9, 8(rp)
	adc	%r10, %r10
	mov	%r10, 16(rp)
	adc	%r11, %r11
	mov	%r11, 24(rp)
	lea	32(rp), rp
	dec	n
	jne	L(t1)

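C Handle the n mod 4 leftover limbs.  CF still holds the pending carry, so
C %rax is tested and decremented only with inc/dec, which preserve CF
C (test and cmp would overwrite it); the final adc folds CF into %eax to
C form the 0/1 return value.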
	inc	R32(%rax)
	dec	R32(%rax)
	jne	L(n00)
	adc	R32(%rax), R32(%rax)
	FUNC_EXIT()
	ret
L(e1):	test	R32(%rax), R32(%rax)	C clear cy
L(n00):	mov	(up), %r8
	dec	R32(%rax)
	jne	L(n01)
	adc	%r8, %r8
	mov	%r8, (rp)
L(ret):	adc	R32(%rax), R32(%rax)
	FUNC_EXIT()
	ret
L(n01):	dec	R32(%rax)
	mov	8(up), %r9
	jne	L(n10)
	adc	%r8, %r8
	adc	%r9, %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	R32(%rax), R32(%rax)
	FUNC_EXIT()
	ret
L(n10):	mov	16(up), %r10
	adc	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	adc	$-1, R32(%rax)
	FUNC_EXIT()
	ret

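C General count: process the limbs from the most significant end downwards,
C so overlapping operands with rp >= up stay safe.  Shift instructions use
C only the low 6 bits of %cl, so negating cnt there yields 64-cnt; the code
C below toggles %cl between the left and right shift counts with neg.  The
C return value is up[n-1] >> (64-cnt), the bits shifted out at the top, and
C (n+1) mod 4 selects how many limbs are peeled off before the 4-way
C unrolled loop at L(top).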
L(gen):	neg	R32(%rcx)		C put rsh count in cl
	mov	-8(up,n,8), %rax
	shr	R8(%rcx), %rax		C function return value

	neg	R32(%rcx)		C put lsh count in cl
	lea	1(n), R32(%r8)
	and	$3, R32(%r8)
	je	L(rlx)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	jmp	L(rll)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	neg	R32(%rcx)		C put lsh count in cl
L(1x):
	cmp	$1, n
	je	L(ast)
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	mov	-24(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, -8(rp,n,8)
	mov	%r11, -16(rp,n,8)
	sub	$2, n

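C L(rlx) is entered with the left-shift count already in %cl (the n = 3, 7,
C 11, ... case above); L(rll) is reached with the right-shift count there
C and negates it back first.  Both preload the top two remaining limbs,
C shifted left, before dropping into the main loop.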
L(rll):	neg	R32(%rcx)		C put lsh count in cl
L(rlx):	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11

	sub	$4, n			C				      4
	jb	L(end)			C				      2
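C Main loop: four limbs per iteration, software pipelined.  %r10 and %r11
C enter each pass already holding the next two limbs shifted left; the bits
C shifted up from the limbs below are ORed in and stored, two new right
C shifts are started, then the same is done with the shl/shr roles swapped,
C toggling %cl with neg each time.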
	ALIGN(16)
L(top):
	C finish stuff from lsh block
	neg	R32(%rcx)		C put rsh count in cl
	mov	16(up,n,8), %r8
	mov	8(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 24(rp,n,8)
	mov	%r11, 16(rp,n,8)
	C start two new rsh
	mov	0(up,n,8), %r8
	mov	-8(up,n,8), %r9
	shr	R8(%rcx), %r8
	shr	R8(%rcx), %r9

	C finish stuff from rsh block
	neg	R32(%rcx)		C put lsh count in cl
	mov	8(up,n,8), %r10
	mov	0(up,n,8), %r11
	shl	R8(%rcx), %r10
	or	%r10, %r8
	shl	R8(%rcx), %r11
	or	%r11, %r9
	mov	%r8, 8(rp,n,8)
	mov	%r9, 0(rp,n,8)
	C start two new lsh
	mov	-8(up,n,8), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r10
	shl	R8(%rcx), %r11

	sub	$4, n
	jae	L(top)			C				      2
L(end):
	neg	R32(%rcx)		C put rsh count in cl
	mov	8(up), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	(up), %r9
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)

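C The lowest limb gets nothing shifted in from below, so rp[0] is simply
C up[0] << cnt.  The n = 1 case jumps straight to L(ast).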
	neg	R32(%rcx)		C put lsh count in cl
L(ast):	mov	(up), %r10
	shl	R8(%rcx), %r10
	mov	%r10, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()
