xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/aorsmul_1.asm (revision 8585484ef87f5a04d32332313cdb799625f4faf8)
1dnl  AMD64 mpn_addmul_1 and mpn_submul_1.
2
3dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C	     cycles/limb
24C AMD K8,K9	 2.5
25C AMD K10	 2.5
26C AMD bd1	 5.0
27C AMD bobcat	 6.17
28C Intel P4	14.9
29C Intel core2	 5.09
30C Intel NHM	 4.9
31C Intel SBR	 4.0
32C Intel atom	21.3
33C VIA nano	 5.0
34
35C The loop of this code is the result of running a code generation and
36C optimization tool suite written by David Harvey and Torbjorn Granlund.
37
38C TODO
39C  * The loop is great, but the prologue and epilogue code was quickly written.
40C    Tune it!
41
C Incoming arguments.  The register defined on each line is the one used for
C the STD64 build; the trailing comment names the register that carries the
C same argument under DOS64, matching the IFDOS redefinitions further down.
42define(`rp',      `%rdi')   C rcx
43define(`up',      `%rsi')   C rdx
44define(`n_param', `%rdx')   C r8
45define(`vl',      `%rcx')   C r9
46
C Working register for the limb count.  For STD64 the count arrives in rdx,
C which mul overwrites, so the code copies it out of n_param into n.
47define(`n',       `%r11')
48
C Exactly one of the two ifdef bodies is expected to take effect, presumably
C selected by the build system.  It fixes the instruction used to combine a
C product limb with memory at rp and the name of the generated entry point.
49ifdef(`OPERATION_addmul_1',`
50      define(`ADDSUB',        `add')
51      define(`func',  `mpn_addmul_1')
52')
53ifdef(`OPERATION_submul_1',`
54      define(`ADDSUB',        `sub')
55      define(`func',  `mpn_submul_1')
56')
57
C NOTE(review): ABI_SUPPORT and MULFUNC_PROLOGUE are defined outside this
C file.  They appear to declare the supported ABIs and the function names
C this file can provide -- confirm against the shared m4 definitions.
58ABI_SUPPORT(DOS64)
59ABI_SUPPORT(STD64)
60
61MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
62
C DOS64 register remapping.
C   up  is kept in rsi; the function body copies it there from rdx.
C   rp, n and vl stay in their incoming registers rcx, r8 and r9.
C   Because r8 and r9 are then occupied by n and vl, the scratch registers
C   that the code spells as r8 and r9 are renamed to r11 and rdi.
C The doubled quotes presumably stop the r8 and r9 inside the n and vl
C definitions from being renamed in turn -- confirm against m4 quoting rules.
C NOTE(review): IFDOS is defined outside this file; it is assumed to expand
C its argument only for DOS64 builds.
63IFDOS(`	define(`up', ``%rsi'')	') dnl
64IFDOS(`	define(`rp', ``%rcx'')	') dnl
65IFDOS(`	define(`vl', ``%r9'')	') dnl
66IFDOS(`	define(`r9', ``rdi'')	') dnl
67IFDOS(`	define(`n',  ``%r8'')	') dnl
68IFDOS(`	define(`r8', ``r11'')	') dnl
69
70ASM_START()
71	TEXT
72	ALIGN(16)
C func -- mpn_addmul_1 or mpn_submul_1, depending on how ADDSUB is defined.
C
C Every one of the n limbs at up is multiplied by vl, and the product is
C added to or subtracted from the limbs at rp in place, with carry or borrow
C propagated from limb to limb.  The value returned in rax is the last high
C product limb plus the final carry or borrow.
C
C The first limb at up is loaded unconditionally, so n is presumably required
C to be at least 1 -- confirm against the callers.
C
C Register use:
C   rax, rdx          low and high half of the current product, from mul
C   r10, r9, r8, rbx  rotating accumulators for product limbs not yet applied
C   n                 negative limb index, stepped by 4 up towards zero
C rbx is pushed on entry and popped before ret.  DOS64 builds additionally
C push and pop rsi and rdi.
73PROLOGUE(func)
74
C DOS64 only: save rsi and rdi, then move the up argument from rdx into rsi.
75IFDOS(``push	%rsi		'')
76IFDOS(``push	%rdi		'')
77IFDOS(``mov	%rdx, %rsi	'')
78
79	mov	(up), %rax		C read first u limb early
80	push	%rbx
81IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
82IFDOS(`	mov	n, %rbx         ')
83	mul	vl			C rdx:rax = first limb times vl
84IFSTD(`	mov	%rbx, n         ')
85
C Dispatch on n mod 4 to the matching entry point of the 4-way unrolled loop:
C   0 goes to L0, 1 to L1 or the single limb case, 2 to L2, 3 to L3.
86	and	$3, R32(%rbx)
87	jz	L(b0)
88	cmp	$2, R32(%rbx)
89	jz	L(b2)
90	jg	L(b3)
91
C n mod 4 = 1.  With n = 1 the single product is applied right here and the
C shared return code adds the carry or borrow to the high limb.
92L(b1):	dec	n
93	jne	L(gt1)
94	ADDSUB	%rax, (rp)
95	jmp	L(ret)
C More than one limb.  As in every entry path below, up and rp are biased
C towards the operand ends and n is negated, so that the loop can index with
C a negative n that counts up.  The first product is parked in the pair of
C accumulators that the chosen entry point consumes.
96L(gt1):	lea	8(up,n,8), up
97	lea	-8(rp,n,8), rp
98	neg	n
99	xor	%r10, %r10
100	xor	R32(%rbx), R32(%rbx)
101	mov	%rax, %r9
102	mov	(up,n,8), %rax		C next u limb, multiplied at L1
103	mov	%rdx, %r8
104	jmp	L(L1)
105
C n mod 4 = 0: first product goes to r8 and rbx, enter at L0.
106L(b0):	lea	(up,n,8), up
107	lea	-16(rp,n,8), rp
108	neg	n
109	xor	%r10, %r10
110	mov	%rax, %r8
111	mov	%rdx, %rbx
112	jmp	 L(L0)
113
C n mod 4 = 3: first product goes to rbx and r10, enter at L3.
114L(b3):	lea	-8(up,n,8), up
115	lea	-24(rp,n,8), rp
116	neg	n
117	mov	%rax, %rbx
118	mov	%rdx, %r10
119	jmp	L(L3)
120
C n mod 4 = 2: first product goes to r10 and r9, enter at L2.
121L(b2):	lea	-16(up,n,8), up
122	lea	-32(rp,n,8), rp
123	neg	n
124	xor	%r8, %r8
125	xor	R32(%rbx), R32(%rbx)
126	mov	%rax, %r10
127	mov	24(up,n,8), %rax	C next u limb, multiplied at L2
128	mov	%rdx, %r9
129	jmp	L(L2)
130
C Main loop, four limbs per iteration.  Each ADDSUB updates one limb of rp in
C memory.  The adc after it folds the resulting carry or borrow, together
C with the low half of the next product, into the next accumulator, and the
C second adc carries on into the accumulator after that.
131	ALIGN(16)
132L(top):	ADDSUB	%r10, (rp,n,8)
133	adc	%rax, %r9
134	mov	(up,n,8), %rax
135	adc	%rdx, %r8
136	mov	$0, R32(%r10)
137L(L1):	mul	vl
138	ADDSUB	%r9, 8(rp,n,8)
139	adc	%rax, %r8
140	adc	%rdx, %rbx
141L(L0):	mov	8(up,n,8), %rax
142	mul	vl
143	ADDSUB	%r8, 16(rp,n,8)
144	adc	%rax, %rbx
145	adc	%rdx, %r10
146L(L3):	mov	16(up,n,8), %rax
147	mul	vl
148	ADDSUB	%rbx, 24(rp,n,8)
149	mov	$0, R32(%r8)		C zero; mov keeps the ADDSUB carry for the adc below
150	mov	%r8, %rbx		C zero
151	adc	%rax, %r10
152	mov	24(up,n,8), %rax
153	mov	%r8, %r9		C zero
154	adc	%rdx, %r9
155L(L2):	mul	vl
156	add	$4, n			C advance four limbs
157	js	 L(top)			C loop while the index is still negative
158
C Wind down: apply the last two pending limbs.  r8 is zero on every path that
C reaches this point, so adc r8 only adds the carry flag to rdx.
159	ADDSUB	%r10, (rp,n,8)
160	adc	%rax, %r9
161	adc	%r8, %rdx
162	ADDSUB	%r9, 8(rp,n,8)
163L(ret):	adc	$0, %rdx		C add the final carry or borrow to the high limb
164	mov	%rdx, %rax		C return value
165
166	pop	%rbx
167IFDOS(``pop	%rdi		'')
168IFDOS(``pop	%rsi		'')
169	ret
170EPILOGUE()
171