xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/sublsh1_n.asm (revision 80d9064ac03cbb6a4174695f0d5b237c8766d3d0)
1dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
2
3dnl  Copyright 2003, 2005, 2006, 2007, 2011, 2012 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C	     cycles/limb
25C AMD K8,K9	 2.2
26C AMD K10	 2.2
27C Intel P4	12.75
28C Intel core2	 3.45
29C Intel corei	 ?
30C Intel atom	 ?
31C VIA nano	 3.25
32
33C Speed sometimes degrades, supposedly because some operand alignments cause
34C cache conflicts.
35
36C The speed is limited by decoding/issue bandwidth.  The loop has 26
37C instructions for 4 limbs; at 3 instructions/cycle that is 26/3/4 = 2.167 c/l.
38
39C INPUT PARAMETERS
40define(`rp',`%rdi')	C destination limbs: receives up[] - (vp[] << 1)
41define(`up',`%rsi')	C source limbs that are subtracted from
42define(`vp',`%rdx')	C source limbs that are doubled and then subtracted
43define(`n',	`%rcx')	C limb count; negated and used as the index in the code
44
C ABIs this file is declared to support.  NOTE(review): ABI_SUPPORT, ASM_START,
C TEXT and ALIGN are macros defined outside this file (config.m4 is included
C above); the remarks here are read off their names -- confirm there.
45ABI_SUPPORT(DOS64)
46ABI_SUPPORT(STD64)
47
48ASM_START()
49	TEXT
50	ALIGN(16)	C presumably aligns the function entry to 16 bytes
C mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) over n limbs.
C
C Register roles as used below:
C   rp, up, vp	moved to point just past their n limbs; limbs are then
C		addressed as (ptr,n,8) with n running from -n up to 0
C   n		negated limb count, used as the (negative) index
C   %r8-%r11	limbs of vp, doubled in place by the add/adc chain
C   %rbp, %rbx	limbs of up, turned into the difference stored to rp
C   %rax	between chains: shift carry (scy) kept as a 0 or -1 mask
C   %rbp	between chains: subtract borrow (acy) kept as a 0 or -1 mask
C
C The doubling chain and the subtract chain both need the one carry flag.
C Each chain therefore parks its carry in a register with sbb reg,reg
C (giving 0 or -1) and reloads it later with add reg,reg (which sets the
C carry flag exactly when the mask was -1).  The mov and lea instructions
C placed between the adc/sbb instructions leave the flags untouched.
C
C The first n mod 4 limbs are handled by b11/b10/b01, the rest by the
C 4-limb loop at top.
C
C Return value in %eax: -(scy + acy), that is 0, 1 or 2.
C
C The code reads (vp) unconditionally and with n = 0 the b00 path would
C still process four limbs, so the caller must pass n >= 1.
51PROLOGUE(mpn_sublsh1_n)
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they adapt the 4 DOS64 arguments to the STD64 registers -- confirm.
52	FUNC_ENTRY(4)
53	push	%rbx		C rbx and rbp are used as scratch below
54	push	%rbp
55
56	mov	(vp), %r8		C r8 = vp[0], fetched before vp is moved
57	mov	R32(n), R32(%rax)
58	lea	(rp,n,8), rp		C point each vector just past its last limb
59	lea	(up,n,8), up
60	lea	(vp,n,8), vp
61	neg	n			C index now runs from -n up to 0
62	xor	R32(%rbp), R32(%rbp)	C acy = 0, needed by the b00 entry
63	and	$3, R32(%rax)		C rax = n mod 4; also clears the carry flag
64	je	L(b00)			C n mod 4 = 0: straight into the loop
65	cmp	$2, R32(%rax)
66	jc	L(b01)			C n mod 4 = 1
67	je	L(b10)			C n mod 4 = 2
68
C n mod 4 = 3: handle three limbs ahead of the loop.
69L(b11):	add	%r8, %r8	C lowest limb: plain add, no carry in
70	mov	8(vp,n,8), %r9
71	adc	%r9, %r9
72	mov	16(vp,n,8), %r10
73	adc	%r10, %r10
74	sbb	R32(%rax), R32(%rax)	C save scy
75	mov	(up,n,8), %rbp
76	mov	8(up,n,8), %rbx
77	sub	%r8, %rbp		C lowest limb: plain sub, no borrow in
78	sbb	%r9, %rbx
79	mov	%rbp, (rp,n,8)
80	mov	%rbx, 8(rp,n,8)
81	mov	16(up,n,8), %rbp
82	sbb	%r10, %rbp
83	mov	%rbp, 16(rp,n,8)
84	sbb	R32(%rbp), R32(%rbp)	C save acy
85	add	$3, n			C sets the sign flag tested at ent
86	jmp	L(ent)
87
C n mod 4 = 2: handle two limbs ahead of the loop.
88L(b10):	add	%r8, %r8
89	mov	8(vp,n,8), %r9
90	adc	%r9, %r9
91	sbb	R32(%rax), R32(%rax)	C save scy
92	mov	(up,n,8), %rbp
93	mov	8(up,n,8), %rbx
94	sub	%r8, %rbp
95	sbb	%r9, %rbx
96	mov	%rbp, (rp,n,8)
97	mov	%rbx, 8(rp,n,8)
98	sbb	R32(%rbp), R32(%rbp)	C save acy
99	add	$2, n			C sets the sign flag tested at ent
100	jmp	L(ent)
101
C n mod 4 = 1: handle one limb ahead of the loop.
102L(b01):	add	%r8, %r8
103	sbb	R32(%rax), R32(%rax)	C save scy
104	mov	(up,n,8), %rbp
105	sub	%r8, %rbp
106	mov	%rbp, (rp,n,8)
107	sbb	R32(%rbp), R32(%rbp)	C save acy
108	inc	n			C sets the sign flag tested at ent
109L(ent):	jns	L(end)		C n reached 0: no limbs left for the loop
110
C Main loop, four limbs per pass.
111	ALIGN(16)
112L(top):	add	R32(%rax), R32(%rax)	C restore scy
113
114	mov	(vp,n,8), %r8
C The n mod 4 = 0 case enters here with r8 = vp[0] already loaded, the carry
C flag cleared by the and above, and rax = rbp = 0.
115L(b00):	adc	%r8, %r8
116	mov	8(vp,n,8), %r9
117	adc	%r9, %r9
118	mov	16(vp,n,8), %r10
119	adc	%r10, %r10
120	mov	24(vp,n,8), %r11
121	adc	%r11, %r11
122
123	sbb	R32(%rax), R32(%rax)	C save scy
124	add	R32(%rbp), R32(%rbp)	C restore acy
125
126	mov	(up,n,8), %rbp
127	mov	8(up,n,8), %rbx
128	sbb	%r8, %rbp
129	sbb	%r9, %rbx
130	mov	%rbp, (rp,n,8)
131	mov	%rbx, 8(rp,n,8)
132	mov	16(up,n,8), %rbp
133	mov	24(up,n,8), %rbx
134	sbb	%r10, %rbp
135	sbb	%r11, %rbx
136	mov	%rbp, 16(rp,n,8)
137	mov	%rbx, 24(rp,n,8)
138
139	sbb	R32(%rbp), R32(%rbp)	C save acy
140	add	$4, n
141	js	L(top)			C index still negative: more limbs to do
142
143L(end):	add	R32(%rbp), R32(%rax)	C eax = scy + acy, each 0 or -1
144	neg	R32(%rax)		C return value 0, 1 or 2
145
146	pop	%rbp
147	pop	%rbx
148	FUNC_EXIT()
149	ret
150EPILOGUE()
151