xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/aors_n.asm (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1dnl  AMD64 mpn_add_n, mpn_sub_n
2
3dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C	     cycles/limb
24C AMD K8,K9	 1.5
25C AMD K10	 1.5
26C Intel P4	 ?
27C Intel core2	 4.9
28C Intel NHM	 5.5
29C Intel SBR	 1.59
30C Intel atom	 4
31C VIA nano	 3.25
32
33C The loop of this code is the result of running a code generation and
34C optimization tool suite written by David Harvey and Torbjorn Granlund.
35
36C INPUT PARAMETERS
C Symbolic names for the argument registers used throughout this file.
C   rp  destination limb pointer (only ever stored through)
C   up  first source limb pointer (only ever loaded from)
C   vp  second source limb pointer (memory operand of every ADCSBB)
C   n   number of limbs to process
C   cy  incoming carry/borrow; only bit 0 is examined
C The register in each define is the one used by the code below.  The
C trailing comment appears to give where the same argument arrives under
C the DOS64 ABI before FUNC_ENTRY runs -- confirm against FUNC_ENTRY.
37define(`rp',	`%rdi')	C rcx
38define(`up',	`%rsi')	C rdx
39define(`vp',	`%rdx')	C r8
40define(`n',	`%rcx')	C r9
41define(`cy',	`%r8')	C rsp+40    (only for func_nc, i.e. mpn_add_nc and mpn_sub_nc)
42
C Operation selection.  Exactly one of OPERATION_add_n / OPERATION_sub_n is
C expected to be defined from outside this file; it fixes three names:
C   ADCSBB   the carry-propagating instruction applied to every limb
C            (adc for addition, sbb for subtraction)
C   func     entry point without a carry-in argument
C   func_nc  entry point with a carry-in argument
C If neither symbol is defined these three names are left undefined.
43ifdef(`OPERATION_add_n', `
44	define(ADCSBB,	      adc)
45	define(func,	      mpn_add_n)
46	define(func_nc,	      mpn_add_nc)')
47ifdef(`OPERATION_sub_n', `
48	define(ADCSBB,	      sbb)
49	define(func,	      mpn_sub_n)
50	define(func_nc,	      mpn_sub_nc)')
51
C Lists the four entry points this single source can produce, two for each
C OPERATION_* setting.  The macro is defined outside this file; presumably
C the build reads this list to decide how to compile it -- TODO confirm.
52MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
53
C Both ABIs are declared as supported.  The DOS64-specific handling visible
C in this file is the IFDOS load of cy in func_nc plus whatever FUNC_ENTRY
C and FUNC_EXIT expand to.
54ABI_SUPPORT(DOS64)
55ABI_SUPPORT(STD64)
56
57ASM_START()
58	TEXT
C func_nc: add or subtract n limbs with an incoming carry/borrow.
C Expands to mpn_add_nc or mpn_sub_nc according to the OPERATION_* selection.
C
C In:   rp, up, vp, n as for func, plus cy (bit 0 is the carry in)
C Out:  produced by the shared code in func; see the header there
C
C This entry only establishes the start state and then jumps into labels
C that live inside func below:
C   n (rcx) = limb count / 4
C   eax     = limb count mod 4
C   CF      = bit 0 of cy
C It must stay in the same file as func because L(lt4) and L(mid) are
C defined there.
59	ALIGN(16)
60PROLOGUE(func_nc)
61	FUNC_ENTRY(4)
C DOS64 only: fetch cy from the stack into %r8.  The parameter comment says
C rsp+40 while this reads 56(%rsp); presumably FUNC_ENTRY has moved rsp by
C 16 bytes by this point -- confirm against its definition.
62IFDOS(`	mov	56(%rsp), %r8	')
63	mov	R32(n), R32(%rax)	C eax = low 32 bits of the limb count
64	shr	$2, n			C n = number of whole 4-limb groups
65	and	$3, R32(%rax)		C eax = leftover limbs, 0 to 3
66	bt	$0, %r8			C cy flag <- carry parameter
67	jrcxz	L(lt4)			C no whole group; jrcxz leaves CF intact
68
C From here %r8 is reused as a data register; cy is no longer needed.
69	mov	(up), %r8		C preload limbs 0 and 1 for L(mid)
70	mov	8(up), %r9
71	dec	n			C sets ZF for the jnz after L(mid), keeps CF
72	jmp	L(mid)
73
74EPILOGUE()
C func: add or subtract n limbs with no incoming carry.
C Expands to mpn_add_n or mpn_sub_n according to the OPERATION_* selection.
C
C In:    rp = destination, up = first source, vp = second source,
C        n = limb count
C Out:   eax = carry (add) or borrow (sub) out of the last limb, 0 or 1
C Uses:  rax, r8, r9, r10, r11 and the flags; rp, up, vp and n are modified
C
C The carry chain is kept in CF from the first ADCSBB to the final
C adc/setc.  Every instruction placed between two ADCSBB is one that does
C not write CF: mov, lea, inc, dec, jnz.  Do not replace dec/inc by
C sub/add/cmp/test or reorder across an ADCSBB without rechecking this.
C
C func_nc jumps into L(lt4) and L(mid) with the same register state but
C with CF preloaded from its cy argument.
75	ALIGN(16)
76PROLOGUE(func)
77	FUNC_ENTRY(4)
78	mov	R32(n), R32(%rax)	C eax = low 32 bits of the limb count
79	shr	$2, n			C n = number of whole 4-limb groups
80	and	$3, R32(%rax)		C eax = leftover limbs; and also clears CF
81	jrcxz	L(lt4)			C fewer than 4 limbs in total
82
83	mov	(up), %r8		C preload limbs 0 and 1 for L(mid)
84	mov	8(up), %r9
85	dec	n			C sets ZF for the jnz after L(mid), keeps CF
86	jmp	L(mid)
87
C L(lt4): handle the last 1 to 3 limbs.  eax = how many, CF = carry in.
C Reached directly for short operands and from L(end) for the tail of
C longer ones, after rp/up/vp have been advanced past the full groups.
C dec is used to count down because it sets ZF while leaving CF alone.
C NOTE(review): a limb count of 0 also arrives here with eax = 0, fails
C both jnz tests and is processed as 3 limbs; presumably callers never
C pass n = 0 -- confirm against the documented contract.
88L(lt4):	dec	R32(%rax)
89	mov	(up), %r8
90	jnz	L(2)			C more than one limb left
C Exactly one limb left; eax is 0 here.
91	ADCSBB	(vp), %r8
92	mov	%r8, (rp)
93	adc	R32(%rax), R32(%rax)	C eax = 0 + 0 + CF, the carry out
94	FUNC_EXIT()
95	ret
96
97L(2):	dec	R32(%rax)
98	mov	8(up), %r9
99	jnz	L(3)			C three limbs left
C Exactly two limbs left; eax is 0 here.
100	ADCSBB	(vp), %r8
101	ADCSBB	8(vp), %r9
102	mov	%r8, (rp)
103	mov	%r9, 8(rp)
104	adc	R32(%rax), R32(%rax)	C eax = 0 + 0 + CF, the carry out
105	FUNC_EXIT()
106	ret
107
C Three limbs left.  For a starting count of 3, eax is 1 here, so bits 8
C and up are already zero and setc into the low byte yields a clean 0 or 1.
108L(3):	mov	16(up), %r10
109	ADCSBB	(vp), %r8
110	ADCSBB	8(vp), %r9
111	ADCSBB	16(vp), %r10
112	mov	%r8, (rp)
113	mov	%r9, 8(rp)
114	mov	%r10, 16(rp)
115	setc	R8(%rax)		C low byte of eax = carry out
116	FUNC_EXIT()
117	ret
118
C Main loop, 4 limbs per iteration.  The four up limbs consumed at L(top)
C were loaded into r8..r11 by the previous iteration (or by the prologue
C plus L(mid) on first entry), so loads for the next group are interleaved
C with the stores of the current one.  n counts the remaining iterations.
119	ALIGN(16)
120L(top):	ADCSBB	(vp), %r8
121	ADCSBB	8(vp), %r9
122	ADCSBB	16(vp), %r10
123	ADCSBB	24(vp), %r11
124	mov	%r8, (rp)
125	lea	32(up), up		C lea advances pointers without touching flags
126	mov	%r9, 8(rp)
127	mov	%r10, 16(rp)
128	dec	n			C ZF for the jnz below; CF is preserved
129	mov	%r11, 24(rp)
130	lea	32(vp), vp
131	mov	(up), %r8		C start loading the next group
132	mov	8(up), %r9
133	lea	32(rp), rp
C L(mid): loop entry point used by both prologues.  ZF still reflects the
C most recent dec n, since only mov and lea have executed after it.
134L(mid):	mov	16(up), %r10
135	mov	24(up), %r11
136	jnz	L(top)
137
C L(end): process the final preloaded group and step all three pointers
C past it so that L(lt4) can address the leftover limbs from offset 0.
138L(end):	lea	32(up), up
139	ADCSBB	(vp), %r8
140	ADCSBB	8(vp), %r9
141	ADCSBB	16(vp), %r10
142	ADCSBB	24(vp), %r11
143	lea	32(vp), vp
144	mov	%r8, (rp)
145	mov	%r9, 8(rp)
146	mov	%r10, 16(rp)
147	mov	%r11, 24(rp)
148	lea	32(rp), rp
149
C Test eax for zero without disturbing CF: inc then dec leaves eax
C unchanged, sets ZF from its value, and neither instruction writes CF.
150	inc	R32(%rax)
151	dec	R32(%rax)
152	jnz	L(lt4)			C 1 to 3 leftover limbs remain
153	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF, the carry out
154	FUNC_EXIT()
155	ret
156EPILOGUE()
157