xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/atom/rshift.asm (revision 92e958de60c71aa0f2452bd7074cbb006fe6546b)
1dnl  AMD64 mpn_rshift -- mpn right shift, optimised for Atom.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C	     cycles/limb
25C AMD K8,K9	 ?
26C AMD K10	 ?
27C Intel P4	 ?
28C Intel core2	 ?
29C Intel NHM	 ?
30C Intel SBR	 ?
31C Intel atom	 4.5
32C VIA nano	 ?
33
34C TODO
35C  * Consider using 4-way unrolling.  We reach 4 c/l, but the code is 2.5 times
36C    larger.
37
38C INPUT PARAMETERS
C Register aliases for the four arguments, as they are used by the code below.
39define(`rp',	`%rdi')	C destination; result limbs are stored through it
40define(`up',	`%rsi')	C source; limbs are loaded through it
41define(`n',	`%rdx')	C limb count; halved, then counted down as loop counter
42define(`cnt',	`%rcx')	C bit shift count; only its 8-bit view R8(cnt) is used
43
C ABI_SUPPORT is a macro defined elsewhere.  Presumably these two lines declare
C the calling conventions this file may be built for -- TODO confirm against
C the m4 support files.
44ABI_SUPPORT(DOS64)
45ABI_SUPPORT(STD64)
46
C mpn_rshift (rp, up, n, cnt)
C
C Shift the n-limb number at up right by cnt bits and store the n result limbs
C at rp.  Limbs are 64 bits wide; a higher index means a more significant limb.
C Each result limb k is (up[k] >> cnt) | (up[k+1] << (64-cnt)); the top result
C limb is just up[n-1] >> cnt, so its vacated high bits are zero.
C
C Return value in %rax: up[0] << (64-cnt), i.e. the bits shifted out at the low
C end, left-justified in the limb.
C
C NOTE(review): up[0] is loaded unconditionally and the left-shift count is
C formed by negating cnt, so the code appears to rely on n >= 1 and
C 0 < cnt < 64 -- confirm against the documented mpn_rshift contract.
C
C Structure: n is halved up front and the main loop emits two result limbs per
C pass.  Odd n enters the loop at L(lo1) (or returns at once when n = 1); even
C n enters at L(top) (or goes straight to L(end) when n = 2).  The shift count
C register is flipped between +cnt and -cnt with neg; a 64-bit shift uses the
C count modulo 64, so -cnt acts as 64-cnt.
C
C %r8-%r11 hold limbs in flight.  FUNC_ENTRY and FUNC_EXIT are macros defined
C elsewhere; presumably they provide ABI-specific entry/exit glue -- confirm.
47ASM_START()
48	TEXT
49	ALIGN(16)
50PROLOGUE(mpn_rshift)
51	FUNC_ENTRY(4)
52	shr	R32(n)			C n = n/2 (32-bit operation); CF = old low bit of n
53	mov	(up), %rax		C rax = up[0]; mov leaves CF untouched
54	jnc	L(evn)			C CF clear: n was even
55
C Odd n.  Prepare the part of result limb 0 that comes from up[0].
56	mov	%rax, %r11
57	shr	R8(cnt), %r11		C r11 = up[0] >> cnt
58	neg	R8(cnt)			C count = -cnt, acts as 64-cnt
59	shl	R8(cnt), %rax		C rax = up[0] << (64-cnt), the return value
60	test	n, n
61	jnz	L(gt1)			C halved count nonzero: more than one limb
62	mov	%r11, (rp)		C n = 1: the only result limb
63	FUNC_EXIT()
64	ret
65
66L(gt1):	mov	8(up), %r8
67	mov	%r8, %r10		C r10 = up[1], kept unshifted for the next limb
68	shl	R8(cnt), %r8		C r8 = up[1] << (64-cnt)
69	jmp	L(lo1)			C enter mid-loop; count is still -cnt as L(lo1) expects
70
C Even n.  Set up the register state that L(top) and L(end) expect.
71L(evn):	mov	%rax, %r10		C r10 = up[0], unshifted
72	neg	R8(cnt)			C count = -cnt
73	shl	R8(cnt), %rax		C rax = up[0] << (64-cnt), the return value
74	mov	8(up), %r9
75	mov	%r9, %r11		C r11 = up[1], unshifted
76	shl	R8(cnt), %r9		C r9 = up[1] << (64-cnt)
77	neg	R8(cnt)			C count = +cnt again
78	dec	n			C ZF set when the halved count was 1, i.e. n = 2
79	lea	-8(rp), rp		C bias rp so that 8(rp) is result limb 0; lea keeps flags
80	lea	8(up), up		C up now points at up[1]; flags still from dec
81	jz	L(end)			C n = 2: no loop pass needed
82
C Main loop, two result limbs per pass.  State on entry to L(top):
C   count = +cnt
C   r10 = source limb k, unshifted
C   r11 = source limb k+1, unshifted
C   r9  = source limb k+1 << (64-cnt)
C   up points at source limb k+1; 8(rp) is the slot for result limb k
83	ALIGN(8)
84L(top):	shr	R8(cnt), %r10
85	or	%r10, %r9		C r9 = result limb k
86	shr	R8(cnt), %r11		C r11 = limb k+1 >> cnt
87	neg	R8(cnt)			C count = -cnt
88	mov	8(up), %r8
89	mov	%r8, %r10		C r10 = limb k+2, unshifted
90	mov	%r9, 8(rp)		C store result limb k
91	shl	R8(cnt), %r8		C r8 = limb k+2 << (64-cnt)
92	lea	16(rp), rp
C State on entry to L(lo1), writing j for the limb being completed:
C   count = -cnt
C   r11 = limb j >> cnt, r8 = limb j+1 << (64-cnt), r10 = limb j+1 unshifted
C   up points at source limb j; (rp) is the slot for result limb j
93L(lo1):	mov	16(up), %r9
94	or	%r11, %r8		C r8 = result limb j
95	mov	%r9, %r11		C r11 = limb j+2, unshifted
96	shl	R8(cnt), %r9		C r9 = limb j+2 << (64-cnt)
97	lea	16(up), up
98	neg	R8(cnt)			C count = +cnt again
99	mov	%r8, (rp)		C store result limb j
100	dec	n
101	jg	L(top)			C another pass while the pair counter is still > 0
102
C Tail.  Register state is the same as on entry to L(top); emit the last two
C result limbs.
103L(end):	shr	R8(cnt), %r10
104	or	%r10, %r9		C r9 = second-highest result limb
105	shr	R8(cnt), %r11		C r11 = highest result limb, zero-filled from the top
106	mov	%r9, 8(rp)
107	mov	%r11, 16(rp)
108	FUNC_EXIT()
109	ret
110EPILOGUE()
111