xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/aorrlsh_n.asm (revision 75f6d617e282811cb173c2ccfbf5df0dd71f7045)
dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V*2^k +- U.

dnl  Copyright 2006, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
C AMD K10	 3.1	< 3.85 for lshift + add_n
C Intel P4	14.6	> 7.33 for lshift + add_n
C Intel core2	 3.87	> 3.27 for lshift + add_n
C Intel NHM	 4	> 3.75 for lshift + add_n
C Intel SBR	(5.8)	> 3.46 for lshift + add_n
C Intel atom	(7.75)	< 8.75 for lshift + add_n
C VIA nano	 4.7	< 6.25 for lshift + add_n

C This was written quickly and is not optimized at all.  One could surely get
C closer to 3 c/l, or perhaps even under 3 c/l.  Ideas:
C   1) Use indexed addressing to save the 3 LEA instructions
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more; the CL negation and carry save/restore cost a lot now
C   5) Reschedule

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

ifdef(`OPERATION_addlsh_n',`
  define(ADCSBB,       `adc')
  define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
  define(ADCSBB,       `sbb')
  define(func, mpn_rsblsh_n)
')

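C As a rough reference (not part of the build), mpn_addlsh_n computes
C {rp,n} = {up,n} + ({vp,n} << cnt) and returns the high limb of the
C (n+1)-limb sum; mpn_rsblsh_n computes ({vp,n} << cnt) - {up,n} and
C returns that high limb minus the final borrow.  A minimal C sketch of
C the add case (hypothetical helper name, assuming 64-bit limbs,
C <stdint.h>/<stddef.h>, and 1 <= cnt <= 63):
C
C	uint64_t ref_addlsh_n (uint64_t *rp, const uint64_t *up,
C			       const uint64_t *vp, size_t n, unsigned cnt)
C	{
C	  uint64_t out = 0, cy = 0;	/* bits shifted out of vp, carry in */
C	  for (size_t i = 0; i < n; i++)
C	    {
C	      uint64_t w = (vp[i] << cnt) | out;  /* next limb of V*2^cnt */
C	      out = vp[i] >> (64 - cnt);
C	      uint64_t s = up[i] + w;
C	      uint64_t c = s < w;		  /* carry from up[i] + w */
C	      rp[i] = s + cy;
C	      cy = c + (rp[i] < s);		  /* carry out of this limb */
C	    }
C	  return out + cy;	/* high limb of the (n+1)-limb result */
C	}
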
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%r12
	push	%r13
	push	%r14
	push	%rbp
	push	%rbx

	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%rbp), R32(%rbp)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)
	je	L(4)
	sub	$1, R32(%r11)

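C Handle the n mod 4 leading limbs one at a time, so that the remaining
C count is a multiple of 4 for the unrolled loop below.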
L(012):	mov	(vp), %r8
	mov	%r8, %r12
	shl	R8(%rcx), %r8
	or	%rbp, %r8
	neg	R8(%rcx)
	mov	%r12, %rbp
	shr	R8(%rcx), %rbp
	neg	R8(%rcx)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	sbb	R32(%rbx), R32(%rbx)
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(012)

L(4):	sub	$4, %rax
	jc	L(end)

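C Main loop: 4 limbs per iteration.  %rbp carries the bits shifted out of
C the previous v limb into the next iteration, and %rbx (0 or -1) saves the
C add/subtract carry across the shift work.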
	ALIGN(16)
L(top):	mov	(vp), %r8
	mov	%r8, %r12
	mov	8(vp), %r9
	mov	%r9, %r13
	mov	16(vp), %r10
	mov	%r10, %r14
	mov	24(vp), %r11

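C Form the limbs of V*2^cnt: shift each v limb left by cnt, then negate
C %cl to right-shift the saved copies by 64-cnt and OR the two halves
C together.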
	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%rbp, %r8
	mov	%r11, %rbp
	shl	R8(%rcx), %r11

	neg	R8(%rcx)

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %rbp		C used next iteration

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)

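C %rbx is 0 or -1; adding it to itself regenerates the saved carry/borrow
C in CF before the ADCSBB chain, and the trailing SBB saves it again.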
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	ADCSBB	(up), %r8
	ADCSBB	8(up), %r9
	ADCSBB	16(up), %r10
	ADCSBB	24(up), %r11

	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	lea	32(up), up
	lea	32(vp), vp
	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(top)

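C Return value: the bits shifted out of the top v limb (in %rbp), adjusted
C by the final carry (addlsh_n) or borrow (rsblsh_n).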
L(end):	add	R32(%rbx), R32(%rbx)
	ADCSBB	$0, %rbp
	mov	%rbp, %rax
	pop	%rbx
	pop	%rbp
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()