xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/aors_n.asm (revision ae87de8892f277bece3527c15b186ebcfa188227)
1dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
2
3dnl  Copyright 1999-2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K7: 1.64 cycles/limb (at 16 limbs/loop).
35
36
37
38dnl  K7: UNROLL_COUNT cycles/limb
39dnl           8           1.9
40dnl          16           1.64
41dnl          32           1.7
42dnl          64           2.0
43dnl  Maximum possible with the current code is 64.
44
dnl  Limbs handled per pass of the unrolled loop; the loop body further down
dnl  is generated as UNROLL_COUNT/CHUNK_COUNT two-limb chunks.  UNROLL_LOG2,
dnl  UNROLL_MASK and UNROLL_BYTES are used below but not defined in this file
dnl  (presumably derived from UNROLL_COUNT by the included m4 -- confirm).
45deflit(UNROLL_COUNT, 16)
46
47
dnl  Select the operation this build of the file provides.  Exactly one of
dnl  OPERATION_add_n or OPERATION_sub_n is expected to be defined:
dnl    M4_inst         the carry-propagating instruction (adcl or sbbl)
dnl    M4_function_n   name of the entry point without a carry-in argument
dnl    M4_function_nc  name of the entry point taking a carry-in argument
dnl    M4_description  word substituted into the interface comment
dnl  If neither symbol is defined, m4_error is invoked with a message.
48ifdef(`OPERATION_add_n', `
49	define(M4_inst,        adcl)
50	define(M4_function_n,  mpn_add_n)
51	define(M4_function_nc, mpn_add_nc)
52	define(M4_description, add)
53',`ifdef(`OPERATION_sub_n', `
54	define(M4_inst,        sbbl)
55	define(M4_function_n,  mpn_sub_n)
56	define(M4_function_nc, mpn_sub_nc)
57	define(M4_description, subtract)
58',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
59')')')
60
dnl  Names the four entry points this one source file can provide; which pair
dnl  is actually built depends on the OPERATION_ symbol tested above.
dnl  NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; presumably it is
dnl  scanned by the build system -- confirm.
61MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
62
63
64C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
65C                         mp_size_t size);
66C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
67C	                   mp_size_t size, mp_limb_t carry);
68C
69C Calculate src1,size M4_description src2,size, and store the result in
70C dst,size.  The return value is the carry bit from the top of the result (1
71C or 0).
72C
73C The _nc version accepts 1 or 0 for an initial carry into the low limb of
74C the calculation.  Note values other than 1 or 0 here will lead to garbage
75C results.
76C
77C This code runs at 1.64 cycles/limb, which might be the best possible with
78C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
79C which can be done each cycle, leading to 1.5 c/l.
80
81dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Sizes below UNROLL_THRESHOLD use the simple one-limb loop; sizes at or
dnl  above it use the unrolled loop (unsigned compare after L(start)).  The
dnl  PIC and non-PIC values are set separately, but both are currently 8.
82ifdef(`PIC',`
83deflit(UNROLL_THRESHOLD, 8)
84',`
85deflit(UNROLL_THRESHOLD, 8)
86')
87
dnl  Stack arguments, in the order of the C prototypes above: dst, src1, src2,
dnl  size, and carry (carry is only read by the _nc entry point).
dnl  NOTE(review): defframe is defined elsewhere; the offsets are presumably
dnl  relative to esp at function entry and adjusted by the current FRAME
dnl  value -- confirm.
88defframe(PARAM_CARRY,20)
89defframe(PARAM_SIZE, 16)
90defframe(PARAM_SRC2, 12)
91defframe(PARAM_SRC1, 8)
92defframe(PARAM_DST,  4)
93
dnl  Register save slots: four 4-byte slots, matching STACK_SPACE of 16 bytes
dnl  that the code subtracts from esp after L(start).
94defframe(SAVE_EBP, -4)
95defframe(SAVE_ESI, -8)
96defframe(SAVE_EBX, -12)
97defframe(SAVE_EDI, -16)
98deflit(STACK_SPACE, 16)
99
C Code section, 32-byte aligned.  FRAME is 0 here: nothing has been pushed
C or reserved on the stack yet when the PARAM_ slots are first read.
100	TEXT
101	ALIGN(32)
102deflit(`FRAME',0)
103
C M4_function_nc: entry point with an explicit carry-in.  It only loads the
C carry argument into eax and then joins M4_function_n at L(start), which
C expects the carry-in (0 or 1) in eax.
104PROLOGUE(M4_function_nc)
105	movl	PARAM_CARRY, %eax
106	jmp	L(start)
107EPILOGUE()
108
C M4_function_n: plain entry point, carry-in is zero.  M4_function_nc jumps
C to L(start) with its carry argument already in eax.
C
C Sizes below UNROLL_THRESHOLD are handled by the one-limb-per-pass loop at
C L(simple); larger sizes branch to L(unroll).  On this path the return value
C in eax is the final carry flag, as 0 or 1.
109PROLOGUE(M4_function_n)
110
111	xorl	%eax, %eax	C carry
112L(start):
	C eax = carry-in.  Reserve the register save slots.  Only edi and ebx
	C are saved here; ebp and esi are saved on the unrolled path, which is
	C the only path that modifies them.
113	movl	PARAM_SIZE, %ecx
114	subl	$STACK_SPACE, %esp
115deflit(`FRAME',STACK_SPACE)
116
117	movl	%edi, SAVE_EDI
118	movl	%ebx, SAVE_EBX
119	cmpl	$UNROLL_THRESHOLD, %ecx
120
	C The two loads below do not modify the flags, so jae still tests
	C size >= UNROLL_THRESHOLD (unsigned) from the cmpl above.
121	movl	PARAM_SRC2, %edx
122	movl	PARAM_SRC1, %ebx
123	jae	L(unroll)
124
	C Simple path: point ebx, edx and edi just past the end of src1, src2
	C and dst, and index them with a negative counter that runs up to zero.
125	movl	PARAM_DST, %edi
126	leal	(%ebx,%ecx,4), %ebx
127	leal	(%edx,%ecx,4), %edx
128
129	leal	(%edi,%ecx,4), %edi
130	negl	%ecx
	C negl writes CF, so the carry-in is moved from eax bit 0 into CF only
	C after it.
131	shrl	%eax
132
133	C This loop is in a single 16 byte code block already, so no
134	C alignment necessary.
135L(simple):
136	C eax	scratch
137	C ebx	src1
138	C ecx	counter
139	C edx	src2
140	C esi
141	C edi	dst
142	C ebp
143
	C One limb per pass.  incl leaves CF untouched, so the carry or borrow
	C chains from one limb to the next.
	C NOTE(review): the body runs once before the counter is tested, so
	C size 0 is not handled on this path -- presumably callers never pass
	C it; confirm.
144	movl	(%ebx,%ecx,4), %eax
145	M4_inst	(%edx,%ecx,4), %eax
146	movl	%eax, (%edi,%ecx,4)
147	incl	%ecx
148	jnz	L(simple)
149
	C movl (rather than xorl) clears eax without disturbing CF, which setc
	C then captures as the return value before addl changes the flags.
150	movl	$0, %eax
151	movl	SAVE_EDI, %edi
152
153	movl	SAVE_EBX, %ebx
154	setc	%al
155	addl	$STACK_SPACE, %esp
156
157	ret
158
159
160C -----------------------------------------------------------------------------
161	C This is at 0x55, close enough to aligned.
C Setup for the unrolled loop.  On arrival: eax = carry-in, ebx = src1,
C ecx = size, edx = src2; edi and ebx are already saved.
C
C The even part of the size is done by the unrolled loop, entered part way
C through its first pass by a computed jump; an odd final limb is done
C separately after the loop.
162L(unroll):
163deflit(`FRAME',STACK_SPACE)
164	movl	%ebp, SAVE_EBP
	C The PARAM_SIZE stack slot is overwritten with size&1 and read back
	C after the loop to decide whether one limb remains.
165	andl	$-2, %ecx		C size low bit masked out
166	andl	$1, PARAM_SIZE		C size low bit kept
167
	C ecx becomes (evensize-1) >> UNROLL_LOG2.  The loop ends when decl
	C makes it negative, so it runs ecx+1 passes.
168	movl	%ecx, %edi
169	decl	%ecx
170	movl	PARAM_DST, %ebp
171
172	shrl	$UNROLL_LOG2, %ecx
173	negl	%edi
174	movl	%esi, SAVE_ESI
175
	C edi = (-evensize) & UNROLL_MASK: presumably the number of limbs to
	C skip at the start of the first pass (assumes UNROLL_MASK is
	C UNROLL_COUNT-1 -- confirm).
176	andl	$UNROLL_MASK, %edi
177
	C esi = L(entry) + 9*edi, the address to enter the loop at.  The PIC
	C variant obtains the run-time address of L(entry) through L(pic_calc).
178ifdef(`PIC',`
179	call	L(pic_calc)
180L(here):
181',`
182	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
183')
	C negl writes CF, so the carry-in is shifted from eax bit 0 into CF only
	C after it.  The leal and jmp that follow leave the flags alone.
184	negl	%edi
185	shrl	%eax
186
	C Back each pointer up by the skipped limbs (edi is now negative).  When
	C UNROLL_BYTES is 256 a further +128 is added, matching the -128 applied
	C to the displacements in the loop body.
187	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
188	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
189	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
190
191	jmp	*%esi
192
193
C PIC helper, called from L(unroll) with edi = number of limbs to skip.
C The call instruction immediately precedes L(here), so the return address
C at (%esp) is the run-time address of L(here).  Adding the link-time
C difference L(entry)-L(here) to it gives the run-time address of L(entry),
C hence esi = L(entry) + 9*edi on return.
C NOTE(review): ret_internal is defined elsewhere; presumably a plain return
C -- confirm.
194ifdef(`PIC',`
195L(pic_calc):
196	C See mpn/x86/README about old gas bugs
197	leal	(%edi,%edi,8), %esi
198	addl	$L(entry)-L(here), %esi
199	addl	(%esp), %esi
200	ret_internal
201')
202
203
204C -----------------------------------------------------------------------------
205	ALIGN(32)
C Unrolled loop.  Each pass handles UNROLL_COUNT limbs as two-limb chunks;
C the first pass is entered part way through, at the address computed in
C L(unroll), so that the even part of the size comes out exactly.  CF holds
C the running carry or borrow across the whole loop: leal, movl and decl do
C not modify it.
C
C src2 (edx) is advanced at the top of a pass, while src1 (ebx) and dst (edi)
C are advanced at the bottom.  The computed jump enters at or after L(entry),
C skipping the first advance of edx, so after the loop edx is UNROLL_BYTES
C behind the other two pointers.
206L(top):
207	C eax	zero
208	C ebx	src1
209	C ecx	counter
210	C edx	src2
211	C esi	scratch (was computed jump)
212	C edi	dst
213	C ebp	scratch
214
215	leal	UNROLL_BYTES(%edx), %edx
216
217L(entry):
218deflit(CHUNK_COUNT, 2)
219forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
220	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
221	deflit(`disp1', eval(disp0 + 4))
222
223Zdisp(	movl,	disp0,(%ebx), %esi)
224	movl	disp1(%ebx), %ebp
225Zdisp(	M4_inst,disp0,(%edx), %esi)
226Zdisp(	movl,	%esi, disp0,(%edi))
227	M4_inst	disp1(%edx), %ebp
228	movl	%ebp, disp1(%edi)
229')
230
	C decl sets SF for the jns below without touching CF; the loop repeats
	C while the counter has not gone negative.
231	decl	%ecx
232	leal	UNROLL_BYTES(%ebx), %ebx
233	leal	UNROLL_BYTES(%edi), %edi
234	jns	L(top)
235
236
	C PARAM_SIZE was reduced to size&1 in L(unroll): 1 means one limb is
	C still to be done.  movl (rather than xorl) clears eax and decl tests
	C esi, both without disturbing CF.
237	mov	PARAM_SIZE, %esi
238	movl	SAVE_EBP, %ebp
239	movl	$0, %eax
240
241	decl	%esi
242	js	L(even)
243
	C Final odd limb.  The displacements undo the +128 pointer bias applied
	C in L(unroll) when UNROLL_BYTES is 256, which the unbiased forms did not
	C do, leaving the odd limb 128 bytes off in that configuration.  For any
	C other UNROLL_BYTES they expand to (%ebx), UNROLL_BYTES(%edx) and (%edi)
	C as before.
244	movl	ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
245	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
246	movl	%ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
247L(even):
248
	C Restore the saved registers and return the final carry or borrow.
	C setc comes before addl, which would overwrite CF.
249	movl	SAVE_EDI, %edi
250	movl	SAVE_EBX, %ebx
251	setc	%al
252
253	movl	SAVE_ESI, %esi
254	addl	$STACK_SPACE, %esp
255
256	ret
257
258EPILOGUE()
259