xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/aors_n.asm (revision 2dd295436a0082eb4f8d294f4aa73c223413d0f2)
1dnl  AMD64 mpn_add_n, mpn_sub_n
2
3dnl  Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9	 1.5
35C AMD K10	 1.5
36C AMD bd1	 1.8
37C AMD bd2	 1.74
38C AMD bd3	 ?
39C AMD bd4	 1.78
40C AMD zen	 1.5
41C AMD bt1	 2.54
42C AMD bt2	 2.15
43C Intel P4	11.5
44C Intel core2	 4.9
45C Intel NHM	 5.53
46C Intel SBR	 1.59
47C Intel IBR	 1.55
48C Intel HWL	 1.44
49C Intel BWL	 1.14
50C Intel SKL	 1.21
51C Intel atom	 4
52C Intel SLM	 3
53C VIA nano	 3.25
54
55C The loop of this code is the result of running a code generation and
56C optimization tool suite written by David Harvey and Torbjorn Granlund.
57
58C INPUT PARAMETERS
C The names below are bound to the first five System V AMD64 integer
C argument registers.  The trailing comment on each line names the
C Microsoft x64 location of the same argument.
C   rp  pointer the result limbs are stored through
C   up  pointer the first source operand is loaded from
C   vp  pointer to the second source operand (memory operand of ADCSBB)
C   n   limb count; limbs are 8 bytes wide
C   cy  carry-in for the func_nc entry point; only bit 0 is examined, and
C       the code reads %r8 directly rather than through this name
59define(`rp',	`%rdi')	C rcx
60define(`up',	`%rsi')	C rdx
61define(`vp',	`%rdx')	C r8
62define(`n',	`%rcx')	C r9
63define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
64
C Operation selection.  When OPERATION_add_n is defined, ADCSBB becomes adc
C and the two entry points are named mpn_add_n and mpn_add_nc.  When
C OPERATION_sub_n is defined, ADCSBB becomes sbb and the entry points are
C named mpn_sub_n and mpn_sub_nc.  Presumably the build defines exactly one
C of the two symbols for each object it assembles -- TODO confirm.
65ifdef(`OPERATION_add_n', `
66	define(ADCSBB,	      adc)
67	define(func,	      mpn_add_n)
68	define(func_nc,	      mpn_add_nc)')
69ifdef(`OPERATION_sub_n', `
70	define(ADCSBB,	      sbb)
71	define(func,	      mpn_sub_n)
72	define(func_nc,	      mpn_sub_nc)')
73
C NOTE(review): the macros below are defined outside this file (config.m4
C is included near the top); the notes here are inferred from their names
C and arguments and should be confirmed against the macro definitions.
C
C MULFUNC_PROLOGUE lists the four function names this one source can
C provide, matching the names chosen by the OPERATION_* blocks.
74MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
75
C The code is declared usable with both the DOS64 and the STD64 ABI.
76ABI_SUPPORT(DOS64)
77ABI_SUPPORT(STD64)
78
C Start of assembler output; presumably TEXT switches to the code section.
79ASM_START()
80	TEXT
81	ALIGN(16)
C func_nc -- mpn_add_nc or mpn_sub_nc, per the OPERATION_* selection.
C
C Same operation as func, but with a carry-in (borrow-in for sub) taken
C from bit 0 of the fifth argument.  This entry point only does the set-up
C (split n, load CF, preload two limbs) and then continues in the body of
C func: at L(mid) when at least one four-limb group exists, or at L(lt4)
C when n < 4.
C
C NOTE(review): FUNC_ENTRY, IFDOS, R32 and L are macros defined outside
C this file.  Presumably FUNC_ENTRY(4) moves the first four DOS64 arguments
C into the STD64 registers used here, IFDOS emits its argument only for
C DOS64 builds, and R32(x) is the 32-bit name of register x -- confirm.
82PROLOGUE(func_nc)
83	FUNC_ENTRY(4)
C DOS64 only: fetch the fifth argument from the stack into %r8.
C NOTE(review): the offset is 56 rather than the rsp+40 noted at the cy
C define, presumably because FUNC_ENTRY has pushed two registers -- confirm.
84IFDOS(`	mov	56(%rsp), %r8	')
85	mov	R32(n), R32(%rax)	C eax = low 32 bits of n
86	shr	$2, n			C n = number of four-limb groups
87	and	$3, R32(%rax)		C eax = n mod 4, the leftover limbs
88	bt	$0, %r8			C cy flag <- carry parameter
89	jrcxz	L(lt4)			C no full group; jrcxz leaves CF intact
90
C Preload the first two limbs of the first group; %r8 no longer needs to
C hold the carry parameter because CF already has it.
91	mov	(up), %r8
92	mov	8(up), %r9
93	dec	n			C groups left after this one; sets ZF, keeps CF
94	jmp	L(mid)			C join the loop in func with CF and ZF live
95
96EPILOGUE()
97	ALIGN(16)
C func -- mpn_add_n or mpn_sub_n, per the OPERATION_* selection.
C
C Stores up[i] ADCSBB vp[i] to rp[i] for the n limbs, propagating the
C carry (add) or borrow (sub) from limb to limb with an initial value of 0,
C and returns the final carry or borrow as 0 or 1 in eax.
C
C Structure:
C   - n is split into n/4 four-limb groups (kept in n) and n mod 4
C     leftover limbs (kept in eax).
C   - L(top), L(mid) and L(end) process the four-limb groups; r8-r11 hold
C     the up limbs of the group being processed.
C   - L(lt4), L(2) and L(3) process 1, 2 or 3 leftover limbs and return.
C
C CF is live across the whole routine, so between ADCSBB steps only
C instructions that leave CF unchanged are used (mov, lea, inc, dec, jumps).
C
C n = 0 is not handled: eax would be 0 at L(lt4), both dec/jnz tests would
C branch, and L(3) would process three limbs.
C
C func_nc shares this body: it jumps to L(mid) or L(lt4) with CF already
C loaded from its carry parameter.
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT, R32, R8 and L are macros defined
C outside this file; presumably R32(x) and R8(x) are the 32-bit and 8-bit
C names of register x -- confirm.
98PROLOGUE(func)
99	FUNC_ENTRY(4)
100	mov	R32(n), R32(%rax)	C eax = low 32 bits of n
101	shr	$2, n			C n = number of four-limb groups
102	and	$3, R32(%rax)		C eax = n mod 4; and also clears CF (carry-in 0)
103	jrcxz	L(lt4)			C no full group: only leftover limbs
104
C Preload the first two limbs of the first group, then enter the loop in
C the middle.  ZF from dec tells L(mid) whether this is the last group.
105	mov	(up), %r8
106	mov	8(up), %r9
107	dec	n			C groups left after this one; keeps CF
108	jmp	L(mid)
109
C Leftover limbs.  eax holds the leftover count (1, 2 or 3 when n >= 1),
C CF holds the incoming carry, and up/vp/rp point at the first leftover limb.
110L(lt4):	dec	R32(%rax)		C ZF set iff exactly one limb is left
111	mov	(up), %r8		C mov does not disturb ZF or CF
112	jnz	L(2)
113	ADCSBB	(vp), %r8
114	mov	%r8, (rp)
115	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
116	FUNC_EXIT()
117	ret
118
119L(2):	dec	R32(%rax)		C ZF set iff exactly two limbs are left
120	mov	8(up), %r9
121	jnz	L(3)
122	ADCSBB	(vp), %r8
123	ADCSBB	8(vp), %r9
124	mov	%r8, (rp)
125	mov	%r9, 8(rp)
126	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
127	FUNC_EXIT()
128	ret
129
130L(3):	mov	16(up), %r10
131	ADCSBB	(vp), %r8
132	ADCSBB	8(vp), %r9
133	ADCSBB	16(vp), %r10
134	mov	%r8, (rp)
135	mov	%r9, 8(rp)
136	mov	%r10, 16(rp)
137	setc	R8(%rax)		C eax is 1 for three leftovers; low byte <- CF, so eax = CF
138	FUNC_EXIT()
139	ret
140
C Main loop, four limbs per iteration.  On entry to L(top), r8-r11 hold the
C four up limbs of the current group and CF holds the carry from the
C previous group.  The loads for the next group are interleaved with the
C stores of the current one.
141	ALIGN(16)
142L(top):	ADCSBB	(vp), %r8
143	ADCSBB	8(vp), %r9
144	ADCSBB	16(vp), %r10
145	ADCSBB	24(vp), %r11
146	mov	%r8, (rp)
147	lea	32(up), up		C advance pointers with lea so CF survives
148	mov	%r9, 8(rp)
149	mov	%r10, 16(rp)
150	dec	n			C sets ZF for the jnz below; keeps CF
151	mov	%r11, 24(rp)
152	lea	32(vp), vp
153	mov	(up), %r8		C start loading the next group
154	mov	8(up), %r9
155	lea	32(rp), rp
C Loop entry point for both function prologues: r8 and r9 are already
C loaded and ZF comes from the most recent dec n.
156L(mid):	mov	16(up), %r10
157	mov	24(up), %r11
158	jnz	L(top)			C more groups follow the one just loaded
159
C Last four-limb group: same arithmetic as the loop body, without loading
C a following group.
160L(end):	lea	32(up), up
161	ADCSBB	(vp), %r8
162	ADCSBB	8(vp), %r9
163	ADCSBB	16(vp), %r10
164	ADCSBB	24(vp), %r11
165	lea	32(vp), vp
166	mov	%r8, (rp)
167	mov	%r9, 8(rp)
168	mov	%r10, 16(rp)
169	mov	%r11, 24(rp)
170	lea	32(rp), rp
171
C Test the leftover count for zero without touching CF: inc followed by dec
C leaves eax unchanged and sets ZF from its value.
172	inc	R32(%rax)
173	dec	R32(%rax)
174	jnz	L(lt4)			C 1 to 3 limbs remain
175	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
176	FUNC_EXIT()
177	ret
178EPILOGUE()
179