xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/sublsh1_n.asm (revision 4fee23f98c45552038ad6b5bd05124a41302fb01)
1dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
2
3dnl  Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 2.2
25C K10:		 2.2
26C P4:		12.75
27C P6 core2: 	 3.45
28C P6 corei7:	 3.45
29C P6 atom:	 ?
30
31
32C Sometimes speed degrades, supposedly because some operand alignments cause
33C cache conflicts.
34
35C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
36C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
37
38C INPUT PARAMETERS
C rp: destination limb pointer, receives up[] - (vp[] << 1)
39define(`rp',`%rdi')
C up: minuend limb pointer
40define(`up',`%rsi')
C vp: limb pointer of the operand that is shifted left by one bit and subtracted
41define(`vp',`%rdx')
C n: limb count; used below as an index scaled by 8 bytes per limb
42define(`n', `%rcx')
43
44ASM_START()
45	TEXT
46	ALIGN(16)
C mpn_sublsh1_n: rp[] = up[] - (vp[] << 1) over n limbs.
C
C Returns in %rax the sum of two carries, each 0 or 1:
C   scy = bit shifted out of the top of the doubled vp[] (shift carry)
C   acy = borrow out of the subtraction
C so the result is 0, 1 or 2.
C
C Two independent carry chains are interleaved.  Since there is only one CF,
C each chain parks its carry in a register as 0 or -1 (via sbb reg,reg) and
C restores it with add reg,reg (doubling -1 sets CF, doubling 0 clears it):
C   %eax holds the saved scy, %ebp holds the saved acy.
C %r8-%r11 hold doubled vp limbs; %rbp and %rbx hold up limbs and results.
C %rbx and %rbp are pushed on entry and popped before returning.
C
C The main loop handles 4 limbs per iteration; the n mod 4 leading limbs are
C handled first by L(b01), L(b10) or L(b11).  vp[0] is read unconditionally,
C so this code presumably requires n >= 1 (TODO confirm against callers).
47PROLOGUE(mpn_sublsh1_n)
48	push	%rbx
49	push	%rbp
50
C Preload vp[0] while vp still points at the first limb.
51	mov	(vp), %r8
C Low 32 bits of n are enough for the n mod 4 dispatch below.
52	mov	R32(n), R32(%rax)
C Point rp, up and vp just past the end of their operands and negate n, so
C that (ptr,n,8) addresses limb 0 and n counts up towards zero.
53	lea	(rp,n,8), rp
54	lea	(up,n,8), up
55	lea	(vp,n,8), vp
56	neg	n
C acy = 0 for the direct entry at L(b00).
57	xor	R32(%rbp), R32(%rbp)
C eax = n mod 4.  The and also clears CF, and when the result is zero eax
C already holds scy = 0, so L(b00) is entered with both carries clear.
58	and	$3, R32(%rax)
59	je	L(b00)
C n mod 4 == 1 goes to b01, == 2 goes to b10, == 3 falls through to b11.
60	cmp	$2, R32(%rax)
61	jc	L(b01)
62	je	L(b10)
63
C Three leading limbs: double vp[0..2], then subtract them from up[0..2].
64L(b11):	add	%r8, %r8
65	mov	8(vp,n,8), %r9
66	adc	%r9, %r9
67	mov	16(vp,n,8), %r10
68	adc	%r10, %r10
69	sbb	R32(%rax), R32(%rax)	C save scy
70	mov	(up,n,8), %rbp
71	mov	8(up,n,8), %rbx
72	sub	%r8, %rbp
73	sbb	%r9, %rbx
74	mov	%rbp, (rp,n,8)
75	mov	%rbx, 8(rp,n,8)
76	mov	16(up,n,8), %rbp
77	sbb	%r10, %rbp
78	mov	%rbp, 16(rp,n,8)
79	sbb	R32(%rbp), R32(%rbp)	C save acy
C The sign flag set here is the one tested by jns at L(ent); jmp leaves flags
C untouched.
80	add	$3, n
81	jmp	L(ent)
82
C Two leading limbs: double vp[0..1], then subtract them from up[0..1].
83L(b10):	add	%r8, %r8
84	mov	8(vp,n,8), %r9
85	adc	%r9, %r9
86	sbb	R32(%rax), R32(%rax)	C save scy
87	mov	(up,n,8), %rbp
88	mov	8(up,n,8), %rbx
89	sub	%r8, %rbp
90	sbb	%r9, %rbx
91	mov	%rbp, (rp,n,8)
92	mov	%rbx, 8(rp,n,8)
93	sbb	R32(%rbp), R32(%rbp)	C save acy
94	add	$2, n
95	jmp	L(ent)
96
C One leading limb: double vp[0], then subtract it from up[0].
97L(b01):	add	%r8, %r8
98	sbb	R32(%rax), R32(%rax)	C save scy
99	mov	(up,n,8), %rbp
100	sub	%r8, %rbp
101	mov	%rbp, (rp,n,8)
102	sbb	R32(%rbp), R32(%rbp)	C save acy
C inc sets the sign flag tested just below; both carries are already saved.
103	inc	n
C If n is no longer negative every limb has been processed: skip the loop.
104L(ent):	jns	L(end)
105
106	ALIGN(16)
C Main loop, 4 limbs per iteration.
107L(top):	add	R32(%rax), R32(%rax)	C restore scy
108
109	mov	(vp,n,8), %r8
C Entry point for n mod 4 == 0: %r8 already holds vp[0] and CF is clear.
C Shift chain: double four vp limbs, propagating the shifted-out bit.
110L(b00):	adc	%r8, %r8
111	mov	8(vp,n,8), %r9
112	adc	%r9, %r9
113	mov	16(vp,n,8), %r10
114	adc	%r10, %r10
115	mov	24(vp,n,8), %r11
116	adc	%r11, %r11
117
C Switch carry chains: park the shift carry, bring back the subtract borrow.
118	sbb	R32(%rax), R32(%rax)	C save scy
119	add	R32(%rbp), R32(%rbp)	C restore acy
120
C Subtract chain.  %rbp is reused for data from here on; mov does not modify
C flags, so the restored borrow stays live in CF.
121	mov	(up,n,8), %rbp
122	mov	8(up,n,8), %rbx
123	sbb	%r8, %rbp
124	sbb	%r9, %rbx
125	mov	%rbp, (rp,n,8)
126	mov	%rbx, 8(rp,n,8)
127	mov	16(up,n,8), %rbp
128	mov	24(up,n,8), %rbx
129	sbb	%r10, %rbp
130	sbb	%r11, %rbx
131	mov	%rbp, 16(rp,n,8)
132	mov	%rbx, 24(rp,n,8)
133
134	sbb	R32(%rbp), R32(%rbp)	C save acy
C Advance by four limbs; loop while the negative index has not reached zero.
135	add	$4, n
136	js	L(top)
137
C eax and ebp are each 0 or -1, so this computes eax = -(scy + acy) and the
C neg turns it into the return value scy + acy.
138L(end):	add	R32(%rbp), R32(%rax)
139	neg	R32(%rax)
140
141	pop	%rbp
142	pop	%rbx
143	ret
144EPILOGUE()
145