xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/rsh1aors_n.asm (revision 6d322f2f4598f0d8a138f10ea648ec4fabe41f8b)
1dnl  AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
2dnl  AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
3
4dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C	     cycles/limb
24C AMD K8,K9	 2.14	(mpn_add_n + mpn_rshift need 4.125)
25C AMD K10	 2.14	(mpn_add_n + mpn_rshift need 4.125)
26C Intel P4	12.75
27C Intel core2	 3.75
28C Intel NMH	 4.4
29C Intel SBR	 ?
30C Intel atom	 ?
31C VIA nano	 3.25
32
33C TODO
34C  * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
35
C INPUT PARAMETERS  (SysV AMD64 argument registers; DOS64 handled by FUNC_ENTRY)
define(`rp', `%rdi')		C result pointer (arg 1)
define(`up', `%rsi')		C first source operand pointer (arg 2)
define(`vp', `%rdx')		C second source operand pointer (arg 3)
define(`n',`  %rcx')		C limb count (arg 4)

C Select the add or subtract flavour at m4 expansion time.  The build
C system defines exactly one of the two OPERATION_* symbols; ADDSUB and
C ADCSBB become the matching plain and carry-propagating instructions.
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func_n,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func_n,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')

C Declare all four entry points this source can provide, so the build
C machinery knows which objects to generate from this one file.
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
57
ASM_START()
	TEXT
	ALIGN(16)
C func_nc: same as func_n below but with an explicit carry-in as a 5th
C parameter (%r8 under STD64; on the stack under DOS64).  It performs the
C first-limb operation with the carry-in applied, then joins the common
C code path at L(ent) inside func_n.
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx			C rbx is callee-saved; used as carry limb below

	xor	R32(%rax), R32(%rax)	C rax = return value accumulator (bit shifted out)
	neg	%r8			C set C flag from parameter
	mov	(up), %rbx
	ADCSBB	(vp), %rbx		C limb 0 with carry-in; CF = chain carry
	jmp	L(ent)
EPILOGUE()
72
	ALIGN(16)
C func_n: rp[] = (up[] OP vp[]) >> 1 over n limbs, OP being add or sub
C per the m4 selection above.  Returns (in %rax) the bit shifted out at
C the bottom, i.e. the low bit of the full n-limb sum/difference.
C
C Carry protocol: after each ADCSBB chain, `rcr' on the last limb parks
C the chain carry in that limb's top bit while shifting it right; the
C `add %rbx, %rbx' at the head of the next chain shifts the pending limb
C back left, restoring the parked carry into CF for the next ADCSBB.
C The pending (not-yet-stored) result limb always lives in %rbx.
PROLOGUE(func_n)
	FUNC_ENTRY(4)
	push	%rbx			C callee-saved; rbx = carry/pending limb

	xor	R32(%rax), R32(%rax)	C rax = return value accumulator
	mov	(up), %rbx
	ADDSUB	(vp), %rbx		C limb 0; CF = chain carry
L(ent):
	rcr	%rbx			C rotate, save acy
	adc	R32(%rax), R32(%rax)	C return value
C Here rbx holds limb 0 shifted right once, with the chain carry in its
C top bit; rax holds the bit rotated out (the function's return value).

	mov	R32(n), R32(%r11)
	and	$3, R32(%r11)		C r11 = n mod 4, selects pre-loop fixup

	cmp	$1, R32(%r11)
	je	L(do)			C jump if n = 1 5 9 ...

L(n1):	cmp	$2, R32(%r11)
	jne	L(n2)			C jump unless n = 2 6 10 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r10
	ADCSBB	8(vp), %r10
	lea	8(up), up		C advance pointers one limb
	lea	8(vp), vp
	lea	8(rp), rp
	rcr	%r10			C shift limb 1; new chain carry into its top bit
	rcr	%rbx			C finish shifting limb 0 (bit from limb 1 enters)
	mov	%rbx, -8(rp)
	jmp	L(cj1)

L(n2):	cmp	$3, R32(%r11)
	jne	L(n3)			C jump unless n = 3 7 11 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r9
	mov	16(up), %r10
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	lea	16(up), up		C advance pointers two limbs
	lea	16(vp), vp
	lea	16(rp), rp
	rcr	%r10			C rcr cascade: shift limbs 2,1,0 right, each
	rcr	%r9			C  passing its low bit down to the next
	rcr	%rbx
	mov	%rbx, -16(rp)
	jmp	L(cj2)

L(n3):	dec	n			C come here for n = 4 8 12 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	ADCSBB	24(vp), %r10
	lea	24(up), up		C advance pointers three limbs
	lea	24(vp), vp
	lea	24(rp), rp
	rcr	%r10			C rcr cascade over limbs 3,2,1,0
	rcr	%r9
	rcr	%r8
	rcr	%rbx
	mov	%rbx, -24(rp)
	mov	%r8, -16(rp)
L(cj2):	mov	%r9, -8(rp)
L(cj1):	mov	%r10, %rbx		C last limb becomes the pending carry limb

L(do):
C Main loop: 4 limbs per iteration.  n is now 1 mod 4; shr discards the
C pending limb already held in rbx and leaves the iteration count.
	shr	$2, n			C				4
	je	L(end)			C				2
	ALIGN(16)
L(top):	add	%rbx, %rbx		C rotate carry limb, restore acy

	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	mov	32(up), %r11
	ADCSBB	24(vp), %r10
	ADCSBB	32(vp), %r11

	lea	32(up), up
	lea	32(vp), vp

	rcr	%r11			C rotate, save acy
	rcr	%r10			C each rcr shifts one limb right, feeding its
	rcr	%r9			C  low bit into the limb below
	rcr	%r8

	rcr	%rbx
	mov	%rbx, (rp)		C store the four completed limbs
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	mov	%r10, 24(rp)
	mov	%r11, %rbx		C highest limb stays pending (top bit = parked acy)

	lea	32(rp), rp
	dec	n
	jne	L(top)

L(end):	mov	%rbx, (rp)		C store final limb; its top bit is the last acy
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
179