xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/lshift.asm (revision b757af438b42b93f8c6571f026d8b8ef3eaf5fc9)
1dnl  AMD64 mpn_lshift -- mpn left shift.
2
3dnl  Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb   cycles/limb cnt=1
24C K8,K9:	 2.375		 1.375
25C K10:		 2.375		 1.375
26C P4:		 8		10.5
27C P6-15 (Core2): 2.11		 4.28
28C P6-28 (Atom):	 5.75		 3.5
29
30
31C INPUT PARAMETERS
C Symbolic names for the four argument registers used by mpn_lshift below.
C %rdi, %rsi, %rdx, %rcx are the first four integer argument registers of
C the System V AMD64 calling convention, in that order.
32define(`rp',	`%rdi')	C destination pointer: every result limb is stored through it
33define(`up',	`%rsi')	C source pointer: every input limb is loaded through it
34define(`n',	`%rdx')	C limb count: used as an index scaled by 8 and as a loop counter
35define(`cnt',	`%rcx')	C shift count: the code shifts by the low byte of this register
36
C NOTE(review): ASM_START, TEXT and ALIGN are macros defined outside this
C file; presumably they open the assembly unit, select the code section and
C align the function entry to 32 bytes -- confirm against the m4 definitions.
37ASM_START()
38	TEXT
39	ALIGN(32)
40PROLOGUE(mpn_lshift)
C mpn_lshift: shift the n-limb number at up left by cnt bits, store the
C n-limb result at rp, and return in %rax the bits shifted out of the most
C significant limb.
C
C   rp  = %rdi   destination limbs
C   up  = %rsi   source limbs
C   n   = %rdx   number of 64-bit limbs
C   cnt = %rcx   shift count
C
C NOTE(review): the code appears to assume n >= 1 and a shift count in the
C range 1..63 -- neither is checked here; confirm against the documented
C mpn_lshift contract.
C NOTE(review): R8() and R32() are macros defined outside this file;
C presumably they name the 8-bit and 32-bit sub-register of their argument
C (the existing comments below speak of "cl").
C
C Two strategies:
C   * cnt = 1 and no harmful overlap: an add-with-carry chain that walks
C     from the lowest limb to the highest (labels fwd, t1, e1, n00..n10).
C   * everything else: shl/shr/or, walking from the highest limb down to
C     the lowest (labels gen .. ast).
C
C Registers written by the visible code: %rax, %rcx, %rdx, %rsi, %rdi,
C %r8-%r11 and the flags.  The visible code makes no stack accesses.
41	cmp	$1, R8(%rcx)		C shift count = 1 ?
42	jne	L(gen)			C no: take the general path
43
44C For cnt=1 we want to work from lowest limb towards higher limbs.
45C Check for bad overlap (up=rp is OK!); rp=up+1..up+n-1 (in limbs) is bad.
46C FIXME: this could surely be done more cleverly.
47
48	mov    rp, %rax
49	sub    up, %rax			C rax = rp - up, in bytes
50	je     L(fwd)			C rp = up
51	shr    $3, %rax			C byte distance -> limb distance, treated as unsigned
52	cmp    n, %rax
53	jb     L(gen)			C 0 < rp-up < n: a low-to-high pass would overwrite unread source limbs
54
C Forward path.  n is split into n/4 unrolled iterations plus n mod 4
C leftover limbs; %eax holds the leftover count.
55L(fwd):	mov	R32(n), R32(%rax)
56	shr	$2, n			C n = number of 4-limb iterations
57	je	L(e1)			C fewer than 4 limbs: %eax is still the full count
58	and	$3, R32(%rax)		C eax = n mod 4; and also clears the carry flag for the first adc
59
C NOTE(review): the ALIGN plus two nops presumably place L(t1) at a chosen
C code offset for performance -- not verifiable from this file.
60	ALIGN(8)
61	nop
62	nop
C Main cnt=1 loop, 4 limbs per iteration.  adc r,r computes 2*r + carry,
C i.e. a one-bit left shift taking its low bit from the limb below.  The
C mov, lea and dec instructions in between leave the carry flag untouched,
C so the carry chains across limbs and across iterations.
63L(t1):	mov	(up), %r8
64	mov	8(up), %r9
65	mov	16(up), %r10
66	mov	24(up), %r11
67	lea	32(up), up
68	adc	%r8, %r8
69	mov	%r8, (rp)
70	adc	%r9, %r9
71	mov	%r9, 8(rp)
72	adc	%r10, %r10
73	mov	%r10, 16(rp)
74	adc	%r11, %r11
75	mov	%r11, 24(rp)
76	lea	32(rp), rp
77	dec	n			C sets the zero flag for jne, keeps the carry flag
78	jne	L(t1)
79
C inc followed by dec restores %eax and sets the zero flag from it without
C touching the carry flag, which still holds the bit shifted out so far.
80	inc	R32(%rax)
81	dec	R32(%rax)
82	jne	L(n00)			C leftover limbs remain
83	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = carry = return value
84	ret
85L(e1):	test	R32(%rax), R32(%rax)	C clear cy
C Tail: %eax = number of leftover limbs (1, 2 or 3 for valid input).  Each
C dec peels one off the count while keeping the carry flag intact.
86L(n00):	mov	(up), %r8
87	dec	R32(%rax)
88	jne	L(n01)			C more than one limb left
89	adc	%r8, %r8		C exactly one limb left
90	mov	%r8, (rp)
91L(ret):	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = carry = return value
92	ret
93L(n01):	dec	R32(%rax)
94	mov	8(up), %r9
95	jne	L(n10)			C three limbs left
96	adc	%r8, %r8		C exactly two limbs left
97	adc	%r9, %r9
98	mov	%r8, (rp)
99	mov	%r9, 8(rp)
100	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = carry = return value
101	ret
102L(n10):	mov	16(up), %r10
103	adc	%r8, %r8
104	adc	%r9, %r9
105	adc	%r10, %r10
106	mov	%r8, (rp)
107	mov	%r9, 8(rp)
108	mov	%r10, 16(rp)
109	adc	$-1, R32(%rax)		C eax is 1 after the two decs, so 1 + (-1) + carry = carry = return value
110	ret
111
C General path.  Each result limb is
C     rp[i] = (up[i] << cnt) | (up[i-1] >> (64-cnt))
C produced from the most significant limb downward, and the lowest limb is
C just up[0] << cnt.  A 64-bit shift by %cl uses only the low 6 bits of the
C count, so negating %ecx switches cl between cnt (lsh) and 64-cnt (rsh).
112L(gen):	neg	R32(%rcx)		C put rsh count in cl
113	mov	-8(up,n,8), %rax	C top limb up[n-1]
114	shr	R8(%rcx), %rax		C function return value
115
116	neg	R32(%rcx)		C put lsh count in cl
C Dispatch on (n+1) mod 4.  The cases below peel off 0, 1, 2 or 1+2 top
C limbs so that n is congruent to 3 mod 4 on arrival at L(rlx), or jump
C straight to L(ast) when only the lowest limb is left.
117	lea	1(n), R32(%r8)
118	and	$3, R32(%r8)
119	je	L(rlx)			C jump for n = 3, 7, 11, ...
120
121	dec	R32(%r8)
122	jne	L(1)
123C	n = 4, 8, 12, ...
C Peel one top limb: rp[n-1] = (up[n-1] << cnt) | (up[n-2] >> (64-cnt)).
124	mov	-8(up,n,8), %r10
125	shl	R8(%rcx), %r10
126	neg	R32(%rcx)		C put rsh count in cl
127	mov	-16(up,n,8), %r8
128	shr	R8(%rcx), %r8
129	or	%r8, %r10
130	mov	%r10, -8(rp,n,8)
131	dec	n
132	jmp	L(rll)			C L(rll) flips cl back to the lsh count
133
134L(1):	dec	R32(%r8)
135	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
136C	n = 2, 6, 10, 14, ...
C Peel one top limb, then fall into L(1x) with n congruent to 1 mod 4.
137	mov	-8(up,n,8), %r10
138	shl	R8(%rcx), %r10
139	neg	R32(%rcx)		C put rsh count in cl
140	mov	-16(up,n,8), %r8
141	shr	R8(%rcx), %r8
142	or	%r8, %r10
143	mov	%r10, -8(rp,n,8)
144	dec	n
145	neg	R32(%rcx)		C put lsh count in cl
146L(1x):
147	cmp	$1, n
148	je	L(ast)			C only the lowest limb remains
C Peel two top limbs at once: rp[n-1] and rp[n-2].
149	mov	-8(up,n,8), %r10
150	shl	R8(%rcx), %r10
151	mov	-16(up,n,8), %r11
152	shl	R8(%rcx), %r11
153	neg	R32(%rcx)		C put rsh count in cl
154	mov	-16(up,n,8), %r8
155	mov	-24(up,n,8), %r9
156	shr	R8(%rcx), %r8
157	or	%r8, %r10
158	shr	R8(%rcx), %r9
159	or	%r9, %r11
160	mov	%r10, -8(rp,n,8)
161	mov	%r11, -16(rp,n,8)
162	sub	$2, n
163
164L(rll):	neg	R32(%rcx)		C put lsh count in cl
C Loop prologue: start the lsh halves of the two topmost remaining limbs
C in %r10 and %r11; the matching rsh halves are or-ed in later.
165L(rlx):	mov	-8(up,n,8), %r10
166	shl	R8(%rcx), %r10
167	mov	-16(up,n,8), %r11
168	shl	R8(%rcx), %r11
169
170	sub	$4, n			C				      4
171	jb	L(end)			C				      2
172	ALIGN(16)
C Main loop, 4 result limbs per iteration, software pipelined: %r10/%r11
C enter holding the lsh halves of the two highest pending limbs, and leave
C holding the lsh halves for the next two.  n has already been reduced by
C 4, hence the positive displacements.
173L(top):
174	C finish stuff from lsh block
175	neg	R32(%rcx)		C put rsh count in cl
176	mov	16(up,n,8), %r8
177	mov	8(up,n,8), %r9
178	shr	R8(%rcx), %r8
179	or	%r8, %r10
180	shr	R8(%rcx), %r9
181	or	%r9, %r11
182	mov	%r10, 24(rp,n,8)
183	mov	%r11, 16(rp,n,8)
184	C start two new rsh
185	mov	0(up,n,8), %r8
186	mov	-8(up,n,8), %r9
187	shr	R8(%rcx), %r8
188	shr	R8(%rcx), %r9
189
190	C finish stuff from rsh block
191	neg	R32(%rcx)		C put lsh count in cl
192	mov	8(up,n,8), %r10
193	mov	0(up,n,8), %r11
194	shl	R8(%rcx), %r10
195	or	%r10, %r8
196	shl	R8(%rcx), %r11
197	or	%r11, %r9
198	mov	%r8, 8(rp,n,8)
199	mov	%r9, 0(rp,n,8)
200	C start two new lsh
201	mov	-8(up,n,8), %r10
202	mov	-16(up,n,8), %r11
203	shl	R8(%rcx), %r10
204	shl	R8(%rcx), %r11
205
206	sub	$4, n
207	jae	L(top)			C				      2
C Wind-down: exactly three limbs remain.  %r10/%r11 hold the lsh halves of
C limbs 2 and 1; complete them with the rsh halves of limbs 1 and 0, using
C fixed offsets since n is no longer a valid index.
208L(end):
209	neg	R32(%rcx)		C put rsh count in cl
210	mov	8(up), %r8
211	shr	R8(%rcx), %r8
212	or	%r8, %r10
213	mov	(up), %r9
214	shr	R8(%rcx), %r9
215	or	%r9, %r11
216	mov	%r10, 16(rp)
217	mov	%r11, 8(rp)
218
219	neg	R32(%rcx)		C put lsh count in cl
C Lowest limb: nothing below it to or in, so rp[0] = up[0] << cnt.
C %rax still holds the return value computed at L(gen).
220L(ast):	mov	(up), %r10
221	shl	R8(%rcx), %r10
222	mov	%r10, (rp)
223	ret
224EPILOGUE()
225