xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/aorrlsh1_n.asm (revision b45fa494daa2ba02187711d31a4144faf0993066)
1dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
2dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
3
4dnl  Copyright 2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C	     cycles/limb
25C K8,K9:	 2
26C K10:		 2
27C P4:		 13
28C P6 core2: 	 3.45
29C P6 corei7:	 3.45
30C P6 atom:	 ?
31
32
33C Sometimes the speed degenerates, supposedly because some operand
34C alignments cause cache conflicts.
35
36C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
37C in the loop (not counting the nop), which corresponds to
C ceil(22/3)/4 = 8/4 = 2 c/l, matching the K8/K10 figures above.
38
39C INPUT PARAMETERS
C These are the first four integer argument registers of the System V
C AMD64 calling convention.
40define(`rp',`%rdi')	C result limbs, written
41define(`up',`%rsi')	C limbs that are added to or subtracted from the shifted vp
42define(`vp',`%rdx')	C limbs that are shifted left by one bit
43define(`n', `%rcx')	C number of limbs in each of rp, up and vp
44
C Exactly one of the OPERATION_ symbols is expected to be defined by the
C build.  It selects the first instruction of the add/subtract carry chain
C (ADDSUB), the instruction that continues that chain (ADCSBB), and the
C name of the entry point (func).
C addlsh1_n:  rp[] = up[] + (vp[] << 1)
45ifdef(`OPERATION_addlsh1_n', `
46	define(ADDSUB,	      add)
47	define(ADCSBB,	      adc)
48	define(func,	      mpn_addlsh1_n)')
C rsblsh1_n:  rp[] = (vp[] << 1) - up[]
49ifdef(`OPERATION_rsblsh1_n', `
50	define(ADDSUB,	      sub)
51	define(ADCSBB,	      sbb)
52	define(func,	      mpn_rsblsh1_n)')
53
C MULFUNC_PROLOGUE is defined outside this file; presumably it announces
C which entry points this source can be built as -- TODO confirm.
54MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
55
56ASM_START()
57	TEXT
58	ALIGN(16)
C func (rp, up, vp, n)
C
C   mpn_addlsh1_n:  rp[] = up[] + (vp[] << 1)
C   mpn_rsblsh1_n:  rp[] = (vp[] << 1) - up[]
C
C In:     rp = %rdi, up = %rsi, vp = %rdx, n = %rcx (limb count)
C Out:    %rax = carry out of the most significant limb
C           addlsh1_n:  scy + acy, in the range 0..2
C           rsblsh1_n:  scy - acy, in the range -1..1, sign-extended
C Clobb:  %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.  %rbp is saved and restored.
C
C n = 0 is not handled: vp[0] is loaded unconditionally and n mod 4 == 0
C enters the loop at L(b00), which always processes four limbs.
C
C Two carry chains share the single carry flag:
C   scy  carry out of the one-bit left shift of vp, kept in eax as 0 or -1
C   acy  carry or borrow of the add/subtract,       kept in ebp as 0 or -1
C Each is parked with sbb reg,reg (reg = -CF) and brought back with
C add reg,reg (CF = 1 exactly when reg was -1).
C
C The pointers are advanced past the last limb and n is negated, so that
C (ptr,n,8) addresses the current limb while n counts up towards zero.
C
C R32(reg) is defined outside this file; presumably it names the 32-bit
C form of reg -- TODO confirm.
59PROLOGUE(func)
60	push	%rbp			C rbp is callee-saved; it will hold acy
61
62	mov	(vp), %r8		C r8 = vp[0], fetched before vp is advanced
63	mov	R32(n), R32(%rax)	C copy of n for the n mod 4 dispatch
64	lea	(rp,n,8), rp		C rp, up, vp = one past the last limb
65	lea	(up,n,8), up
66	lea	(vp,n,8), vp
67	neg	n			C n = negative index of the first limb
68	xor	R32(%rbp), R32(%rbp)	C acy = 0
69	and	$3, R32(%rax)		C eax = n mod 4; and also leaves CF = 0
70	je	L(b00)			C multiple of 4: enter loop with CF = 0, r8 = vp[0]
71	cmp	$2, R32(%rax)
72	jc	L(b01)			C n mod 4 == 1
73	je	L(b10)			C n mod 4 == 2
74
C n mod 4 == 3: process three limbs ahead of the loop.
75L(b11):	add	%r8, %r8		C r8 = vp[0] << 1, CF = bit shifted out
76	mov	8(vp,n,8), %r9
77	adc	%r9, %r9		C shift vp[1], pulling in the bit from vp[0]
78	mov	16(vp,n,8), %r10
79	adc	%r10, %r10
80	sbb	R32(%rax), R32(%rax)	C save scy: eax = -CF
81	ADDSUB	(up,n,8), %r8		C start the add/subtract chain, no carry in
82	ADCSBB	8(up,n,8), %r9
83	mov	%r8, (rp,n,8)		C mov leaves the flags alone
84	mov	%r9, 8(rp,n,8)
85	ADCSBB	16(up,n,8), %r10
86	mov	%r10, 16(rp,n,8)
87	sbb	R32(%rbp), R32(%rbp)	C save acy: ebp = -CF
88	add	$3, n			C three limbs done; sets SF for the test at L(ent)
89	jmp	L(ent)
90
C n mod 4 == 2: process two limbs ahead of the loop.
91L(b10):	add	%r8, %r8		C r8 = vp[0] << 1, CF = bit shifted out
92	mov	8(vp,n,8), %r9
93	adc	%r9, %r9
94	sbb	R32(%rax), R32(%rax)	C save scy: eax = -CF
95	ADDSUB	(up,n,8), %r8		C start the add/subtract chain, no carry in
96	ADCSBB	8(up,n,8), %r9
97	mov	%r8, (rp,n,8)
98	mov	%r9, 8(rp,n,8)
99	sbb	R32(%rbp), R32(%rbp)	C save acy: ebp = -CF
100	add	$2, n			C two limbs done; sets SF for the test at L(ent)
101	jmp	L(ent)
102
C n mod 4 == 1: process one limb ahead of the loop.
103L(b01):	add	%r8, %r8		C r8 = vp[0] << 1, CF = bit shifted out
104	sbb	R32(%rax), R32(%rax)	C save scy: eax = -CF
105	ADDSUB	(up,n,8), %r8		C single limb, no carry in
106	mov	%r8, (rp,n,8)
107	sbb	R32(%rbp), R32(%rbp)	C save acy: ebp = -CF
108	inc	n			C one limb done; inc sets SF as well
109L(ent):	jns	L(end)			C n reached zero: no limbs left for the loop
110
C Main loop, four limbs per iteration.  n is negative here and a multiple
C of 4 limbs remains.
111	ALIGN(16)
112L(top):	add	R32(%rax), R32(%rax)	C restore scy: CF = 1 iff eax was -1
113
114	mov	(vp,n,8), %r8
C L(b00) is also the entry point for n mod 4 == 0; it arrives with
C r8 = vp[0], CF = 0 and ebp = 0.
115L(b00):	adc	%r8, %r8		C shift four limbs left by one bit through CF
116	mov	8(vp,n,8), %r9
117	adc	%r9, %r9
118	mov	16(vp,n,8), %r10
119	adc	%r10, %r10
120	mov	24(vp,n,8), %r11
121	adc	%r11, %r11
122
123	sbb	R32(%rax), R32(%rax)	C save scy: eax = -CF
124	add	R32(%rbp), R32(%rbp)	C restore acy: CF = 1 iff ebp was -1
125
126	ADCSBB	(up,n,8), %r8		C add/subtract four limbs through CF
127	nop				C Hammer speedup!
128	ADCSBB	8(up,n,8), %r9
129	mov	%r8, (rp,n,8)
130	mov	%r9, 8(rp,n,8)
131	ADCSBB	16(up,n,8), %r10
132	ADCSBB	24(up,n,8), %r11
133	mov	%r10, 16(rp,n,8)
134	mov	%r11, 24(rp,n,8)
135
136	sbb	R32(%rbp), R32(%rbp)	C save acy before the add below clobbers CF
137	add	$4, n
138	js	L(top)			C loop while the index is still negative
139
C Form the return value.  Here eax = -scy and ebp = -acy.
C   addlsh1_n:  eax = -(eax + ebp) = scy + acy; the 32-bit write clears the
C               upper half of rax
C   rsblsh1_n:  ebp = ebp - eax = scy - acy, then sign-extended into rax
140L(end):
141ifdef(`OPERATION_addlsh1_n',`
142	add	R32(%rbp), R32(%rax)
143	neg	R32(%rax)')
144ifdef(`OPERATION_rsblsh1_n',`
145	sub	R32(%rax), R32(%rbp)
146	movslq	R32(%rbp), %rax')
147
148	pop	%rbp			C restore the callee-saved register
149	ret
150EPILOGUE()
151