xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/aorrlsh_n.asm (revision 15a984a0d95c8f96abe9717ee6241762c55dc106)
1dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
2
3dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C	     cycles/limb
35C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
36C AMD K10	 3.1	< 3.85 for lshift + add_n
37C Intel P4	14.6	> 7.33 for lshift + add_n
38C Intel core2	 3.87	> 3.27 for lshift + add_n
39C Intel NHM	 4	> 3.75 for lshift + add_n
40C Intel SBR	(5.8)	> 3.46 for lshift + add_n
41C Intel atom	(7.75)	< 8.75 for lshift + add_n
42C VIA nano	 4.7	< 6.25 for lshift + add_n
43
44C This was written quickly and not optimized at all.  Surely one could get
45C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
46C   1) Use indexing to save the 3 LEA
47C   2) Write reasonable feed-in code
48C   3) Be more clever about register usage
49C   4) Unroll more; handling CL negation and carry save/restore costs a lot now
50C   5) Reschedule
51
52C INPUT PARAMETERS
C Names for the five argument registers.  rp is the destination that the
C result limbs are stored through, up is the operand that is added or
C subtracted, vp is the operand that is shifted left, n is the limb count and
C cnt is the shift count.  n arrives in rcx, which the entry code copies to
C rax so that cl can hold the shift count.
53define(`rp',	`%rdi')
54define(`up',	`%rsi')
55define(`vp',	`%rdx')
56define(`n',	`%rcx')
57define(`cnt',	`%r8')
58
C Pick the carry-propagating instruction and the entry point name from the
C OPERATION_* symbol: adc gives mpn_addlsh_n, sbb gives mpn_rsblsh_n.
59ifdef(`OPERATION_addlsh_n',`
60  define(ADCSBB,       `adc')
61  define(func, mpn_addlsh_n)
62')
63ifdef(`OPERATION_rsblsh_n',`
64  define(ADCSBB,       `sbb')
65  define(func, mpn_rsblsh_n)
66')
67
C Both function names this file can be built as (macro defined elsewhere).
68MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
69
C The code below is written for both the DOS64 and the STD64 ABI; the only
C DOS64-specific instruction is the IFDOS load of cnt at function entry.
70ABI_SUPPORT(DOS64)
71ABI_SUPPORT(STD64)
72
C -----------------------------------------------------------------------------
C func(rp, up, vp, n, cnt) -- expands to mpn_addlsh_n or mpn_rsblsh_n.
C
C For each of the n limbs, from the lowest upwards:
C   rp[i] = ((vp[i] << cnt) | (vp[i-1] >> (64-cnt)))  ADCSBB  up[i]
C that is {vp,n}*2^cnt + {up,n} when ADCSBB is adc, and {vp,n}*2^cnt - {up,n}
C when ADCSBB is sbb.  The value returned in rax is the bits shifted out of
C the top limb of vp, plus the final carry (adc) or minus the final borrow
C (sbb).
C
C The right shift is formed as shr by the negated count, so this presumably
C requires 1 <= cnt <= 63 -- confirm against the callers.
C
C Register usage after the entry code:
C   rax      limb counter, stepped down by 4 in the main loop
C   rbx      saved carry flag, 0 or -1 (written by sbb ebx,ebx)
C   cl       shift count; negated temporarily to form the right shift
C   rbp      bits shifted out of the previous limb, OR-ed into the next one
C   r8-r11   limbs being shifted and combined (r11 is first the feed-in count)
C   r12-r14  copies of the unshifted limbs, source of the shifted-out bits
C -----------------------------------------------------------------------------
73ASM_START()
74	TEXT
75	ALIGN(16)
76PROLOGUE(func)
77	FUNC_ENTRY(4)
78IFDOS(`	mov	56(%rsp), %r8d	')	C DOS64 only: load cnt, the 5th argument, from the stack
79	push	%r12	C save the callee-saved registers used below
80	push	%r13
81	push	%r14
82	push	%rbp
83	push	%rbx
84
85	mov	n, %rax	C keep the limb count in rax; rcx is needed for cl
86	xor	R32(%rbx), R32(%rbx)	C clear carry save register
87	mov	R32(%r8), R32(%rcx)	C cl = shift count
88	xor	R32(%rbp), R32(%rbp)	C no bits carried in from a lower limb yet
89
90	mov	R32(%rax), R32(%r11)
91	and	$3, R32(%r11)	C r11 = n mod 4
92	je	L(4)	C nothing to feed in, go to the 4-way loop
93	sub	$1, R32(%r11)	C bias so the jnc below loops n mod 4 times
94
C Feed-in loop: handle n mod 4 limbs, one limb per pass.
95L(012):	mov	(vp), %r8
96	mov	%r8, %r12	C keep an unshifted copy
97	shl	R8(%rcx), %r8	C low part: v << cnt
98	or	%rbp, %r8	C merge bits shifted out of the previous limb
99	neg	R8(%rcx)	C cl = -cnt, so shr shifts by 64-cnt
100	mov	%r12, %rbp
101	shr	R8(%rcx), %rbp	C rbp = bits of v shifted out at the top
102	neg	R8(%rcx)	C cl = cnt again
103	add	R32(%rbx), R32(%rbx)	C restore carry flag from rbx
104	ADCSBB	(up), %r8
105	mov	%r8, (rp)
106	sbb	R32(%rbx), R32(%rbx)	C save carry flag as 0 or -1
107	lea	8(up), up	C step all three pointers to the next limb
108	lea	8(vp), vp
109	lea	8(rp), rp
110	sub	$1, R32(%r11)
111	jnc	L(012)	C loop until the counter borrows
112
C rax still holds the full n.  Its low two bits do not change how many times
C 4 can be subtracted before a borrow, so the main loop runs n/4 (rounded
C down) times.
113L(4):	sub	$4, %rax
114	jc	L(end)	C fewer than 4 limbs, skip the main loop
115
116	ALIGN(16)
C Main loop: four limbs per pass.
117L(top):	mov	(vp), %r8	C load four limbs of vp, keeping unshifted
118	mov	%r8, %r12	C copies of the first three in r12-r14
119	mov	8(vp), %r9
120	mov	%r9, %r13
121	mov	16(vp), %r10
122	mov	%r10, %r14
123	mov	24(vp), %r11
124
125	shl	R8(%rcx), %r8	C low parts: each limb << cnt
126	shl	R8(%rcx), %r9
127	shl	R8(%rcx), %r10
128	or	%rbp, %r8	C bits carried in from the previous pass
129	mov	%r11, %rbp	C unshifted top limb, shifted right below
130	shl	R8(%rcx), %r11
131
132	neg	R8(%rcx)	C cl = -cnt for the right shifts
133
134	shr	R8(%rcx), %r12	C bits shifted out of each limb
135	shr	R8(%rcx), %r13
136	shr	R8(%rcx), %r14
137	shr	R8(%rcx), %rbp		C carry-out bits, used next iteration
138
139	or	%r12, %r9	C merge each limb with the bits from the limb below
140	or	%r13, %r10
141	or	%r14, %r11
142
143	neg	R8(%rcx)	C cl = cnt again
144
145	add	R32(%rbx), R32(%rbx)	C restore carry flag
146
147	ADCSBB	(up), %r8	C carry chain across the four limbs
148	ADCSBB	8(up), %r9
149	ADCSBB	16(up), %r10
150	ADCSBB	24(up), %r11
151
152	mov	%r8, (rp)
153	mov	%r9, 8(rp)
154	mov	%r10, 16(rp)
155	mov	%r11, 24(rp)
156
157	sbb	R32(%rbx), R32(%rbx)	C save carry flag
158
159	lea	32(up), up	C step all three pointers past the four limbs
160	lea	32(vp), vp
161	lea	32(rp), rp
162
163	sub	$4, %rax
164	jnc	L(top)	C loop until the counter borrows
165
166L(end):	add	R32(%rbx), R32(%rbx)	C restore carry flag one last time
167	ADCSBB	$0, %rbp	C fold the final carry or borrow into the shifted-out bits
168	mov	%rbp, %rax	C return value
169	pop	%rbx	C restore saved registers in reverse push order
170	pop	%rbp
171	pop	%r14
172	pop	%r13
173	pop	%r12
174	FUNC_EXIT()
175	ret
176EPILOGUE()
177