xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/aors_n.asm (revision 7d62b00eb9ad855ffcd7da46b41e23feb5476fac)
1dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
2
3dnl  Copyright 1999-2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
35
36
dnl  Choose the instruction and the public names for this build of the file.
dnl  OPERATION_add_n is tested first, then OPERATION_sub_n; if neither is
dnl  defined the m4_error branch is taken.
dnl
dnl    M4_inst         carry-propagating instruction applied to every limb
dnl    M4_function_n   name of the entry point without a carry-in argument
dnl    M4_function_nc  name of the entry point taking an initial carry
dnl    M4_description  word naming the operation
37ifdef(`OPERATION_add_n', `
38	define(M4_inst,        adcl)
39	define(M4_function_n,  mpn_add_n)
40	define(M4_function_nc, mpn_add_nc)
41	define(M4_description, add)
42',`ifdef(`OPERATION_sub_n', `
43	define(M4_inst,        sbbl)
44	define(M4_function_n,  mpn_sub_n)
45	define(M4_function_nc, mpn_sub_nc)
46	define(M4_description, subtract)
47',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
48')')')
49
dnl  The four names listed here are exactly the values M4_function_n and
dnl  M4_function_nc can take across the add and subtract settings above.
dnl  NOTE(review): the macro on the next line is defined outside this file;
dnl  presumably the build reads this list to learn which entry points the
dnl  file can provide -- confirm before reformatting or commenting that line.
50MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
51
52
53C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
54C                          mp_size_t size);
55C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
56C	                      mp_size_t size, mp_limb_t carry);
57C
58C Calculate src1,size M4_description src2,size, and store the result in
59C dst,size.  The return value is the carry bit from the top of the result
60C (1 or 0).
61C
62C The _nc version accepts 1 or 0 for an initial carry into the low limb of
63C the calculation.  Note values other than 1 or 0 here will lead to garbage
64C results.
65C
66C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
67C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
68C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
69
dnl  Argument locations on the stack, as offsets from %esp.  With FRAME at its
dnl  initial value of 0 (function entry) dst is at 4(%esp) and each following
dnl  argument is 4 bytes higher.  PARAM_CARRY is only read by the _nc entry
dnl  point.  The entry code invokes FRAME_pushl() after each pushl that comes
dnl  before a parameter load.  NOTE(review): presumably that adds 4 to FRAME so
dnl  these names keep addressing the same slots once %esp has moved -- confirm
dnl  in the shared m4 macros.
70define(PARAM_CARRY, `FRAME+20(%esp)')
71define(PARAM_SIZE,  `FRAME+16(%esp)')
72define(PARAM_SRC2,  `FRAME+12(%esp)')
73define(PARAM_SRC1,  `FRAME+8(%esp)')
74define(PARAM_DST,   `FRAME+4(%esp)')
75deflit(`FRAME',0)
76
77dnl  minimum 5 because the unrolled code can't handle less
dnl  (the in-place loop takes its pass count from size-1 rounded down to a
dnl  multiple of 4, which is zero for sizes up to 4; the loop instruction
dnl  decrements before testing, so a zero count would wrap around instead of
dnl  ending the loop)
78deflit(UNROLL_THRESHOLD, 5)
79
dnl  NOTE(review): TEXT and ALIGN are defined outside this file; presumably
dnl  they select the code section and align the first entry point to a
dnl  32-byte boundary -- confirm in the shared m4 macros.
80	TEXT
81	ALIGN(32)
82
C Entry point with a carry-in argument.  Fetches the caller-supplied carry
C into %eax and joins the shared code at L(start) inside M4_function_n.
C Nothing has been pushed yet, so PARAM_CARRY is read with FRAME still 0.
83PROLOGUE(M4_function_nc)
84	movl	PARAM_CARRY, %eax	C eax = initial carry (0 or 1)
85	jmp	L(start)
86EPILOGUE()
87
88
C Entry point without a carry-in: clears %eax so the shared code sees an
C initial carry of 0, then falls through into L(start).
C
C Shared code from L(start): loads the four arguments, saving %ebx and %edi
C on the way, and sends sizes of UNROLL_THRESHOLD or more to L(unroll).
C Smaller sizes use the one-limb-per-pass loop at L(simple).  The return
C value in %eax is the final carry flag, 0 or 1.
89PROLOGUE(M4_function_n)
90	xorl	%eax, %eax		C carry-in = 0
91L(start):
	C eax = carry-in, set by whichever entry point was used
92	movl	PARAM_SIZE, %ecx
93	pushl	%ebx
94FRAME_pushl()
95
96	movl	PARAM_SRC1, %ebx
97	pushl	%edi
98FRAME_pushl()
99
100	movl	PARAM_SRC2, %edx
101	cmpl	$UNROLL_THRESHOLD, %ecx
102
103	movl	PARAM_DST, %edi		C movl leaves the cmpl flags intact
104	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD
105
106
107	shrl	%eax		C initial carry flag: CF = bit 0 of carry-in
108
109	C offset 0x21 here, close enough to aligned
110L(simple):
111	C eax	scratch
112	C ebx	src1
113	C ecx	counter
114	C edx	src2
115	C esi
116	C edi	dst
117	C ebp
118	C
119	C The store to (%edi) could be done with a stosl; it'd be smaller
120	C code, but there's no speed gain and a cld would have to be added
121	C (per mpn/x86/README).
	C
	C Each pass handles one limb and steps all three pointers by 4 bytes.
	C movl, leal and loop do not modify the flags, so the CF produced by
	C M4_inst is still intact when the next pass consumes it.  loop
	C decrements %ecx before testing it, so a size of 0 is not handled
	C here.
122
123	movl	(%ebx), %eax
124	leal	4(%ebx), %ebx
125
126	M4_inst	(%edx), %eax
127
128	movl	%eax, (%edi)
129	leal	4(%edi), %edi
130
131	leal	4(%edx), %edx
132	loop	L(simple)
133
134
	C %eax is zeroed with movl rather than xorl because xorl would clear
	C CF before setc reads it.
135	movl	$0, %eax
136	popl	%edi
137
138	setc	%al			C return value = final carry flag
139
140	popl	%ebx
141	ret
142
143
144C -----------------------------------------------------------------------------
145L(unroll):
146	C eax	carry
147	C ebx	src1
148	C ecx	size, at least UNROLL_THRESHOLD
149	C edx	src2
150	C esi
151	C edi	dst
152	C ebp
	C
	C Saves %esi, then picks a loop: dst==src1 goes to L(inplace), and
	C for addition only dst==src2 goes to L(inplace_reverse).  Everything
	C else, including a subtract with dst==src2, uses L(normal_top).
153
154	cmpl	%edi, %ebx
155	pushl	%esi			C pushl leaves the cmpl flags intact
156
157	je	L(inplace)
158
159ifdef(`OPERATION_add_n',`
160	cmpl	%edi, %edx
161
162	je	L(inplace_reverse)
163')
164
	C Split size into ecx = size rounded down to a multiple of 4 (limbs
	C done by the unrolled loop) and esi = size mod 4 (limbs left for the
	C tail).  The pointers are moved past the unrolled part and ecx is
	C negated, so (pointer,%ecx,4) walks upwards as ecx climbs to zero.
165	movl	%ecx, %esi
166
167	andl	$-4, %ecx
168	andl	$3, %esi
169
170	leal	(%ebx,%ecx,4), %ebx
171	leal	(%edx,%ecx,4), %edx
172	leal	(%edi,%ecx,4), %edi
173
174	negl	%ecx
175	shrl	%eax			C CF = bit 0 of carry-in; after negl, which also writes flags
176
177	ALIGN(32)
178L(normal_top):
179	C eax	scratch
180	C ebx	src1
181	C ecx	counter, limbs, negative
182	C edx	src2
183	C esi
184	C edi	dst
185	C ebp
	C
	C Four limbs per pass.  leal advances ecx by 5 right after the first
	C load, so the remaining operands carry a -20 byte bias (5 limbs of
	C 4 bytes); loop then takes 1 back, a net step of 4 limbs, and exits
	C once ecx reaches zero.  Only M4_inst writes the flags inside the
	C loop, so CF chains from limb to limb and from pass to pass.
186
187	movl	(%ebx,%ecx,4), %eax
188	leal	5(%ecx), %ecx
189	M4_inst	-20(%edx,%ecx,4), %eax
190	movl	%eax, -20(%edi,%ecx,4)
191
192	movl	4-20(%ebx,%ecx,4), %eax
193	M4_inst	4-20(%edx,%ecx,4), %eax
194	movl	%eax, 4-20(%edi,%ecx,4)
195
196	movl	8-20(%ebx,%ecx,4), %eax
197	M4_inst	8-20(%edx,%ecx,4), %eax
198	movl	%eax, 8-20(%edi,%ecx,4)
199
200	movl	12-20(%ebx,%ecx,4), %eax
201	M4_inst	12-20(%edx,%ecx,4), %eax
202	movl	%eax, 12-20(%edi,%ecx,4)
203
204	loop	L(normal_top)
205
206
	C Tail: esi = 0 to 3 limbs remain and ecx is zero.  decl sets ZF and
	C SF but leaves CF alone, so the carry chain is not disturbed.
	C   esi was 1: ZF set, one limb at index 0
	C   esi was 0: SF set, nothing left
207	decl	%esi
208	jz	L(normal_finish_one)
209	js	L(normal_done)
210
211	C two or three more limbs
212
213	movl	(%ebx), %eax
214	M4_inst	(%edx), %eax
215	movl	%eax, (%edi)
216
217	movl	4(%ebx), %eax
218	M4_inst	4(%edx), %eax
219	decl	%esi			C ZF set if exactly two limbs were left
220	movl	%eax, 4(%edi)
221
222	jz	L(normal_done)
223	movl	$2, %ecx		C third limb is at index 2
224
225L(normal_finish_one):
	C ecx = index of the last limb: 0 when reached by the jz above,
	C 2 when reached by falling through.
226	movl	(%ebx,%ecx,4), %eax
227	M4_inst	(%edx,%ecx,4), %eax
228	movl	%eax, (%edi,%ecx,4)
229
230L(normal_done):
	C Restore the saved registers and return CF as 0 or 1.  movl and popl
	C do not touch the flags, so setc still sees the CF left by the last
	C M4_inst.  The in-place tail also jumps here.
231	popl	%esi
232	popl	%edi
233
234	movl	$0, %eax
235	popl	%ebx
236
237	setc	%al
238
239	ret
240
241
242C -----------------------------------------------------------------------------
243
C In-place loops: dst is also one of the sources, so each limb is handled by
C applying M4_inst directly to the dst memory operand.  L(inplace_reverse)
C exists only in the addition build.
244ifdef(`OPERATION_add_n',`
245L(inplace_reverse):
246	C dst==src2: take src1 (still in ebx) as the source operand
247
248	movl	%ebx, %edx
249')
250
251L(inplace):
252	C eax	initial carry
253	C ebx
254	C ecx	size
255	C edx	src
256	C esi
257	C edi	dst
258	C ebp
	C
	C Split size-1 into ecx = size-1 rounded down to a multiple of 4
	C (limbs done by the unrolled loop) and esi = (size-1) mod 4.  The
	C first src limb is loaded into ebx ahead of the loop, both pointers
	C are moved past the unrolled part, and ecx is negated so it climbs
	C to zero.
259
260	leal	-1(%ecx), %esi
261	decl	%ecx
262
263	andl	$-4, %ecx
264	andl	$3, %esi
265
266	movl	(%edx), %ebx		C src low limb
267	leal	(%edx,%ecx,4), %edx
268
269	leal	(%edi,%ecx,4), %edi
270	negl	%ecx
271
272	shrl	%eax			C CF = bit 0 of carry-in; after negl, which also writes flags
273
274
275	ALIGN(32)
276L(inplace_top):
277	C eax	scratch (src limb)
278	C ebx	next src limb
279	C ecx	counter, limbs, negative
280	C edx	src
281	C esi
282	C edi	dst
283	C ebp
	C
	C Four limbs per pass.  ebx arrives holding the src limb for index
	C ecx.  leal advances ecx by 5, hence the -20 byte bias on later
	C operands; loop takes 1 back for a net step of 4 limbs.  The final
	C movl loads the first src limb of the next group into ebx; that
	C limb always exists because the count was taken from size-1.  Only
	C M4_inst writes the flags here, so CF chains through every limb.
284
285	M4_inst	%ebx, (%edi,%ecx,4)
286
287	movl	4(%edx,%ecx,4), %eax
288	leal	5(%ecx), %ecx
289
290	M4_inst	%eax, 4-20(%edi,%ecx,4)
291
292	movl	8-20(%edx,%ecx,4), %eax
293	movl	12-20(%edx,%ecx,4), %ebx
294
295	M4_inst	%eax, 8-20(%edi,%ecx,4)
296	M4_inst	%ebx, 12-20(%edi,%ecx,4)
297
298	movl	16-20(%edx,%ecx,4), %ebx
299	loop	L(inplace_top)
300
301
302	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
	C (the first of them is already in ebx, and ecx is zero)
303
304	M4_inst	%ebx, (%edi)
305
	C decl sets ZF and SF but leaves CF alone.
	C   esi was 1: ZF set, one more limb at offset 4
	C   esi was 0: SF set, nothing left
306	decl	%esi
307	jz	L(inplace_finish_one)
308	js	L(inplace_done)
309
310	C two or three more limbs
311
312	movl	4(%edx), %eax
313	movl	8(%edx), %ebx
314	M4_inst	%eax, 4(%edi)
315	M4_inst	%ebx, 8(%edi)
316
317	decl	%esi			C ZF set if exactly two limbs were left
318	movl	$2, %ecx		C movl keeps the flags; index for a last limb
319
	C L(normal_done) is the exit sequence of the normal loop; it is the
	C same instruction sequence as L(inplace_done) below.
320	jz	L(normal_done)
321
322L(inplace_finish_one):
	C ecx = 0 when reached by the jz above (limb at offset 4), or 2 when
	C reached by falling through (limb at offset 12).
323	movl	4(%edx,%ecx,4), %eax
324	M4_inst	%eax, 4(%edi,%ecx,4)
325
326L(inplace_done):
	C Restore the saved registers and return CF as 0 or 1.  movl and popl
	C do not touch the flags, so setc still sees the CF left by the last
	C M4_inst.
327	popl	%esi
328	popl	%edi
329
330	movl	$0, %eax
331	popl	%ebx
332
333	setc	%al
334
335	ret
336
337EPILOGUE()
338