xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bt1/redc_1.asm (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 ?
37C AMD K10	 ?
38C AMD bull	 ?
39C AMD pile	 ?
40C AMD steam	 ?
41C AMD bobcat	 5.0
42C AMD jaguar	 ?
43C Intel P4	 ?
44C Intel core	 ?
45C Intel NHM	 ?
46C Intel SBR	 ?
47C Intel IBR	 ?
48C Intel HWL	 ?
49C Intel BWL	 ?
50C Intel atom	 ?
51C VIA nano	 ?
52
53C TODO
54C  * Micro-optimise, none performed thus far.
55C  * Consider inlining mpn_add_n.
56C  * Single basecases out before the pushes.
57
58C When playing with pointers, set this to $2 to fall back to conservative
59C indexing in wind-down code.
C As defined here, I(a,b) expands to its first operand.  The wind-down code
C passes an operand based on up alone as a, and the same operand additionally
C indexed by i as b.
60define(`I',`$1')
61
C Incoming arguments, named after the registers they are read from below.
C NOTE(review): the trailing comments appear to give the DOS64 location of
C each argument -- confirm against FUNC_ENTRY.
62define(`rp',          `%rdi')   C rcx
63define(`up',          `%rsi')   C rdx
64define(`mp_param',    `%rdx')   C r8
65define(`n',           `%rcx')   C r9
66define(`u0inv',       `%r8')    C stack
67
C Working registers:
C   i      inner loop index, stepped by 4 while it is negative
C   j      outer loop counter, counts down to zero
C   mp     mp_param advanced by n limbs
C   q0     q limb (multiplier) of the current outer iteration
C   w0-w3  low and high halves of the products, plus carries
C rbx is used as well, without a name, for the limb that gives the next q0.
68define(`i',           `%r14')
69define(`j',           `%r15')
70define(`mp',          `%r12')
71define(`q0',          `%r13')
72define(`w0',          `%rbp')
73define(`w1',          `%r9')
74define(`w2',          `%r10')
75define(`w3',          `%r11')
76
77C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
78
79ABI_SUPPORT(DOS64)
80ABI_SUPPORT(STD64)
81
C ALIGNx is placed in front of the inner loop heads and of L(n3) below.
82define(`ALIGNx', `ALIGN(16)')
83
84ASM_START()
85	TEXT
86	ALIGN(32)
C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C What the code below does, limb by limb:
C   q0 = up[0] * u0inv  (low 64 bits of the product)
C   repeat n times:
C     add q0 * mp[0..n-1] into up[0..n-1]; the low limb sum is never stored,
C     the new up[1] times u0inv becomes the next q0,
C     the carry-out limb is written to up[0], and up advances one limb.
C   Finally the n carry limbs left in the low half of the original up are
C   added to the n limbs above them, the sum goes to rp, and the carry of
C   that addition is left in rax.
C Sizes 1, 2 and 3 branch to L(n1), L(n2), L(n3); every other size uses one
C of four outer loops selected by n mod 4 and finishes through L(cj).
C NOTE(review): n == 0 is not checked for here; presumably callers never
C pass it -- confirm.
87PROLOGUE(mpn_redc_1)
88	FUNC_ENTRY(4)
C DOS64 only: u0inv is loaded from the stack into r8.
89IFDOS(`	mov	56(%rsp), %r8	')
C rbx, rbp and r12-r15 are all written below, so they are saved first and
C restored at L(ret).
90	push	%rbx
91	push	%rbp
92	push	%r12
93	push	%r13
94	push	%r14
95	push	%r15
96
C q0 = up[0] * u0inv.  up and mp are moved n limbs forward and n is negated,
C so that (up,n,8) and (mp,n,8) address limb 0 of the current window.
97	mov	(up), q0
98	mov	n, j			C outer loop induction var
99	lea	(mp_param,n,8), mp
100	lea	(up,n,8), up
101	neg	n
102	imul	u0inv, q0		C first iteration q0
103
C Dispatch on the low two bits of the negated n:
C   bit 0 clear             -> L(bx0), even sizes
C   bit 0 set, bit 1 clear  -> L(b3), sizes 3 mod 4
C   bit 0 set, bit 1 set    -> L(b1), sizes 1 mod 4
104	test	$1, R8(n)
105	jz	L(bx0)
106
107L(bx1):	test	$2, R8(n)
108	jz	L(b3)
109
C Size 1 has dedicated code; the compare looks at the low 32 bits of n only.
110L(b1):	cmp	$-1, R32(n)
111	jz	L(n1)
112
C Outer loop for sizes 1 mod 4 (size 1 itself was sent to L(n1)).
C i starts at n+1 with n negated, which is a negative multiple of 4, so the
C inner loop leaves with i equal to zero.
113L(otp1):lea	1(n), i
C Peeled start: products of q0 with mp[0], mp[1] and mp[2].  The mp[0] sum
C contributes only its carry.  The mp[1] sum is stored as the new up[1] and
C multiplied by u0inv into rbx, the q limb for the next outer iteration.
C The mp[2] product is left in w3:w2 for the loop.
114	mov	(mp,n,8), %rax
115	mul	q0
116	mov	%rax, w2
117	mov	%rdx, w3
118	mov	8(mp,n,8), %rax
119	mul	q0
120	mov	%rax, %rbx
121	mov	%rdx, w1
122	add	(up,n,8), w2
123	adc	w3, %rbx
124	adc	$0, w1
125	mov	16(mp,n,8), %rax
126	mul	q0
127	mov	%rax, w2
128	mov	%rdx, w3
129	add	8(up,n,8), %rbx
130	mov	%rbx, 8(up,n,8)
131	adc	w1, w2
132	adc	$0, w3
133	imul	u0inv, %rbx		C next q limb
C Enter the unrolled loop at the point that continues with mp[3].
134	jmp	L(e1)
135
C Inner loop, four limbs of mp per pass.  Each add stores a finished sum
C limb into up; the adc pair after it folds the carry and the previous high
C limb into the next product.  w1:w0 and w3:w2 alternate as that product.
136	ALIGNx
137L(tp1):	add	w0, -16(up,i,8)
138	adc	w1, w2
139	adc	$0, w3
140	mov	(mp,i,8), %rax
141	mul	q0
142	mov	%rax, w0
143	mov	%rdx, w1
144	add	w2, -8(up,i,8)
145	adc	w3, w0
146	adc	$0, w1
147	mov	8(mp,i,8), %rax
148	mul	q0
149	mov	%rax, w2
150	mov	%rdx, w3
151	add	w0, (up,i,8)
152	adc	w1, w2
153	adc	$0, w3
154L(e1):	mov	16(mp,i,8), %rax
155	mul	q0
156	mov	%rax, w0
157	mov	%rdx, w1
158	add	w2, 8(up,i,8)
159	adc	w3, w0
160	adc	$0, w1
161	mov	24(mp,i,8), %rax
162	mul	q0
163	mov	%rax, w2
164	mov	%rdx, w3
165	add	$4, i
166	js	L(tp1)
167
C Wind-down: store the last two sum limbs of this window, then write the
C final carry limb w3 to the lowest limb of the window.  rbx becomes q0 and
C up moves one limb forward for the next outer iteration.
168L(ed1):	add	w0, I(-16(up),-16(up,i,8))
169	adc	w1, w2
170	adc	$0, w3
171	add	w2, I(-8(up),-8(up,i,8))
172	adc	$0, w3
173	mov	w3, (up,n,8)		C up[0]
174	mov	%rbx, q0		C previously computed q limb -> q0
175	lea	8(up), up		C up++
176	dec	j
177	jnz	L(otp1)
178	jmp	L(cj)
179
C Sizes 3 mod 4.  Size 3 has dedicated code; the compare looks at the low
C 32 bits of n only.
180L(b3):	cmp	$-3, R32(n)
181	jz	L(n3)
182
C Outer loop.  i starts at n+3 with n negated, which is a negative multiple
C of 4, so the inner loop leaves with i equal to zero.
183L(otp3):lea	3(n), i
C Peeled start: products of q0 with mp[0], mp[1] and mp[2].  The mp[0] sum
C contributes only its carry.  The mp[1] sum is stored as the new up[1] and
C multiplied by u0inv into rbx, the q limb for the next outer iteration.
C The mp[2] product is left in w3:w2 for the loop.
184	mov	(mp,n,8), %rax
185	mul	q0
186	mov	%rax, w2
187	mov	%rdx, w3
188	mov	8(mp,n,8), %rax
189	mul	q0
190	mov	%rax, %rbx
191	mov	%rdx, w1
192	add	(up,n,8), w2
193	adc	w3, %rbx
194	adc	$0, w1
195	mov	16(mp,n,8), %rax
196	mul	q0
197	mov	%rax, w2
198	mov	%rdx, w3
199	add	8(up,n,8), %rbx
200	mov	%rbx, 8(up,n,8)
201	adc	w1, w2
202	adc	$0, w3
203	imul	u0inv, %rbx		C next q limb
C Enter the unrolled loop at the point that continues with mp[3].
204	jmp	L(e3)
205
C Inner loop, four limbs of mp per pass.  Each add stores a finished sum
C limb into up; the adc pair after it folds the carry and the previous high
C limb into the next product.  w1:w0 and w3:w2 alternate as that product.
206	ALIGNx
207L(tp3):	add	w0, -16(up,i,8)
208	adc	w1, w2
209	adc	$0, w3
210L(e3):	mov	(mp,i,8), %rax
211	mul	q0
212	mov	%rax, w0
213	mov	%rdx, w1
214	add	w2, -8(up,i,8)
215	adc	w3, w0
216	adc	$0, w1
217	mov	8(mp,i,8), %rax
218	mul	q0
219	mov	%rax, w2
220	mov	%rdx, w3
221	add	w0, (up,i,8)
222	adc	w1, w2
223	adc	$0, w3
224	mov	16(mp,i,8), %rax
225	mul	q0
226	mov	%rax, w0
227	mov	%rdx, w1
228	add	w2, 8(up,i,8)
229	adc	w3, w0
230	adc	$0, w1
231	mov	24(mp,i,8), %rax
232	mul	q0
233	mov	%rax, w2
234	mov	%rdx, w3
235	add	$4, i
236	js	L(tp3)
237
C Wind-down: store the last two sum limbs of this window, then write the
C final carry limb w3 to the lowest limb of the window.  rbx becomes q0 and
C up moves one limb forward for the next outer iteration.
238L(ed3):	add	w0, I(-16(up),-16(up,i,8))
239	adc	w1, w2
240	adc	$0, w3
241	add	w2, I(-8(up),-8(up,i,8))
242	adc	$0, w3
243	mov	w3, (up,n,8)		C up[0]
244	mov	%rbx, q0		C previously computed q limb -> q0
245	lea	8(up), up		C up++
246	dec	j
247	jnz	L(otp3)
C The jump is commented out because execution falls through into L(cj).
248C	jmp	L(cj)
249
C Common tail of the four general outer loops.  up has advanced n limbs at
C entry and n more in the outer loop, and n is still negated.  Adding n
C limbs to up once therefore gives the original up plus n limbs, and adding
C n limbs again gives the original up.  In terms of the original up the
C call is mpn_add_n (rp, up + n, up, n).  rp is unchanged since entry.
C n is made positive with a 32-bit neg.
250L(cj):
251IFSTD(`	lea	(up,n,8), up		C param 2: up
252	lea	(up,n,8), %rdx		C param 3: up - n
253	neg	R32(n)		')	C param 4: n
254
255IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
256	lea	(%rdx,n,8), %r8		C param 3: up - n
257	neg	R32(n)
258	mov	n, %r9			C param 4: n
259	mov	rp, %rcx	')	C param 1: rp
260
C rsp is lowered around the call: by 8 bytes for STD64, by 40 for DOS64.
C NOTE(review): presumably alignment padding, plus 32 bytes of shadow space
C in the DOS64 case -- confirm.
261IFSTD(`	sub	$8, %rsp	')
262IFDOS(`	sub	$40, %rsp	')
263	ASSERT(nz, `test $15, %rsp')
264	CALL(	mpn_add_n)
265IFSTD(`	add	$8, %rsp	')
266IFDOS(`	add	$40, %rsp	')
267
C Shared exit, also reached from L(n1), L(n2) and L(n3).  Registers are
C restored in reverse push order.  No instruction visible between here and
C ret writes rax, so the value set before arriving here is what is returned.
C NOTE(review): FUNC_EXIT is defined elsewhere -- confirm it keeps rax.
268L(ret):	pop	%r15
269	pop	%r14
270	pop	%r13
271	pop	%r12
272	pop	%rbp
273	pop	%rbx
274	FUNC_EXIT()
275	ret
276
C Even sizes: bit 1 of the negated n set means 2 mod 4, clear means 0 mod 4.
277L(bx0):	test	$2, R8(n)
278	jnz	L(b2)
279
C Outer loop for sizes 0 mod 4; there is no dedicated small-size code on
C this path.  i starts at n with n negated, which is a negative multiple of
C 4, so the inner loop leaves with i equal to zero.
280L(b0):
281L(otp0):lea	(n), i
C Peeled start: products of q0 with mp[0], mp[1] and mp[2].  The mp[0] sum
C contributes only its carry.  The mp[1] sum is stored as the new up[1] and
C multiplied by u0inv into rbx, the q limb for the next outer iteration.
C The mp[2] product is left in w1:w0 for the loop.
282	mov	(mp,n,8), %rax
283	mul	q0
284	mov	%rax, w0
285	mov	%rdx, w1
286	mov	8(mp,n,8), %rax
287	mul	q0
288	mov	%rax, %rbx
289	mov	%rdx, w3
290	add	(up,n,8), w0
291	adc	w1, %rbx
292	adc	$0, w3
293	mov	16(mp,n,8), %rax
294	mul	q0
295	mov	%rax, w0
296	mov	%rdx, w1
297	add	8(up,n,8), %rbx
298	mov	%rbx, 8(up,n,8)
299	adc	w3, w0
300	adc	$0, w1
301	imul	u0inv, %rbx		C next q limb
C Enter the unrolled loop at the point that continues with mp[3].
302	jmp	L(e0)
303
C Inner loop, four limbs of mp per pass.  Each add stores a finished sum
C limb into up; the adc pair after it folds the carry and the previous high
C limb into the next product.  w1:w0 and w3:w2 alternate as that product.
304	ALIGNx
305L(tp0):	add	w0, -16(up,i,8)
306	adc	w1, w2
307	adc	$0, w3
308	mov	(mp,i,8), %rax
309	mul	q0
310	mov	%rax, w0
311	mov	%rdx, w1
312	add	w2, -8(up,i,8)
313	adc	w3, w0
314	adc	$0, w1
315	mov	8(mp,i,8), %rax
316	mul	q0
317	mov	%rax, w2
318	mov	%rdx, w3
319	add	w0, (up,i,8)
320	adc	w1, w2
321	adc	$0, w3
322	mov	16(mp,i,8), %rax
323	mul	q0
324	mov	%rax, w0
325	mov	%rdx, w1
326	add	w2, 8(up,i,8)
327	adc	w3, w0
328	adc	$0, w1
329L(e0):	mov	24(mp,i,8), %rax
330	mul	q0
331	mov	%rax, w2
332	mov	%rdx, w3
333	add	$4, i
334	js	L(tp0)
335
C Wind-down: store the last two sum limbs of this window, then write the
C final carry limb w3 to the lowest limb of the window.  rbx becomes q0 and
C up moves one limb forward for the next outer iteration.
336L(ed0):	add	w0, I(-16(up),-16(up,i,8))
337	adc	w1, w2
338	adc	$0, w3
339	add	w2, I(-8(up),-8(up,i,8))
340	adc	$0, w3
341	mov	w3, (up,n,8)		C up[0]
342	mov	%rbx, q0		C previously computed q limb -> q0
343	lea	8(up), up		C up++
344	dec	j
345	jnz	L(otp0)
346	jmp	L(cj)
347
C Sizes 2 mod 4.  Size 2 has dedicated code; the compare looks at the low
C 32 bits of n only.
348L(b2):	cmp	$-2, R32(n)
349	jz	L(n2)
350
C Outer loop.  i starts at n+2 with n negated, which is a negative multiple
C of 4, so the inner loop leaves with i equal to zero.
351L(otp2):lea	2(n), i
C Peeled start: products of q0 with mp[0], mp[1] and mp[2].  The mp[0] sum
C contributes only its carry.  The mp[1] sum is stored as the new up[1] and
C multiplied by u0inv into rbx, the q limb for the next outer iteration.
C The mp[2] product is left in w1:w0 for the loop.
352	mov	(mp,n,8), %rax
353	mul	q0
354	mov	%rax, w0
355	mov	%rdx, w1
356	mov	8(mp,n,8), %rax
357	mul	q0
358	mov	%rax, %rbx
359	mov	%rdx, w3
360	add	(up,n,8), w0
361	adc	w1, %rbx
362	adc	$0, w3
363	mov	16(mp,n,8), %rax
364	mul	q0
365	mov	%rax, w0
366	mov	%rdx, w1
367	add	8(up,n,8), %rbx
368	mov	%rbx, 8(up,n,8)
369	adc	w3, w0
370	adc	$0, w1
371	imul	u0inv, %rbx		C next q limb
C Enter the unrolled loop at the point that continues with mp[3].
372	jmp	L(e2)
373
C Inner loop, four limbs of mp per pass.  Each add stores a finished sum
C limb into up; the adc pair after it folds the carry and the previous high
C limb into the next product.  w1:w0 and w3:w2 alternate as that product.
374	ALIGNx
375L(tp2):	add	w0, -16(up,i,8)
376	adc	w1, w2
377	adc	$0, w3
378	mov	(mp,i,8), %rax
379	mul	q0
380	mov	%rax, w0
381	mov	%rdx, w1
382	add	w2, -8(up,i,8)
383	adc	w3, w0
384	adc	$0, w1
385L(e2):	mov	8(mp,i,8), %rax
386	mul	q0
387	mov	%rax, w2
388	mov	%rdx, w3
389	add	w0, (up,i,8)
390	adc	w1, w2
391	adc	$0, w3
392	mov	16(mp,i,8), %rax
393	mul	q0
394	mov	%rax, w0
395	mov	%rdx, w1
396	add	w2, 8(up,i,8)
397	adc	w3, w0
398	adc	$0, w1
399	mov	24(mp,i,8), %rax
400	mul	q0
401	mov	%rax, w2
402	mov	%rdx, w3
403	add	$4, i
404	js	L(tp2)
405
C Wind-down: store the last two sum limbs of this window, then write the
C final carry limb w3 to the lowest limb of the window.  rbx becomes q0 and
C up moves one limb forward for the next outer iteration.
406L(ed2):	add	w0, I(-16(up),-16(up,i,8))
407	adc	w1, w2
408	adc	$0, w3
409	add	w2, I(-8(up),-8(up,i,8))
410	adc	$0, w3
411	mov	w3, (up,n,8)		C up[0]
412	mov	%rbx, q0		C previously computed q limb -> q0
413	lea	8(up), up		C up++
414	dec	j
415	jnz	L(otp2)
416	jmp	L(cj)
417
C Size 1.  up was advanced one limb at entry, so -8(up) is the caller's
C up[0] and (up) is its up[1].  mp_param (rdx) is read before mul
C overwrites rdx.
C The low product limb is added to up[0] only for its carry; rp[0] receives
C up[1] plus the high product limb plus that carry.
C rax is cleared with mov rather than xor so that the carry flag survives
C until the final adc, which turns it into the return value.
418L(n1):	mov	(mp_param), %rax
419	mul	q0
420	add	-8(up), %rax
421	adc	(up), %rdx
422	mov	%rdx, (rp)
423	mov	$0, R32(%rax)
424	adc	R32(%rax), R32(%rax)
425	jmp	L(ret)
426
C Size 2, fully unrolled: two rounds of adding q0 times the two limbs of mp.
C up and mp were advanced two limbs at entry, so -16(up) and -8(up) are the
C caller's up[0] and up[1], (up) and 8(up) are up[2] and up[3], and -16(mp)
C and -8(mp) are mp[0] and mp[1].  mp_param (rdx) is used for the first load
C only, because mul overwrites rdx.
C Round 1 uses the q0 computed at entry: the up[0] sum contributes only its
C carry, r10 receives the new up[1] and r11 the carry limb of the round.
C Round 2 uses q0 = r10 * u0inv: the r10 sum contributes only its carry, and
C the remaining sums are combined with r11, up[2] and up[3] and stored
C straight to rp[0] and rp[1].
C rax is zeroed with xor ahead of the last add/adc pair, and only flag
C preserving mov instructions follow, so the final adc captures the carry as
C the return value.  r14 serves as plain scratch here.
427L(n2):	mov	(mp_param), %rax
428	mov	-16(up), %rbp
429	mul	q0
430	add	%rax, %rbp
431	mov	%rdx, %r9
432	adc	$0, %r9
433	mov	-8(mp), %rax
434	mov	-8(up), %r10
435	mul	q0
436	add	%rax, %r10
437	mov	%rdx, %r11
438	adc	$0, %r11
439	add	%r9, %r10
440	adc	$0, %r11
441	mov	%r10, q0
442	imul	u0inv, q0		C next q0
443	mov	-16(mp), %rax
444	mul	q0
445	add	%rax, %r10
446	mov	%rdx, %r9
447	adc	$0, %r9
448	mov	-8(mp), %rax
449	mov	(up), %r14
450	mul	q0
451	add	%rax, %r14
452	adc	$0, %rdx
453	add	%r9, %r14
454	adc	$0, %rdx
455	xor	R32(%rax), R32(%rax)
456	add	%r11, %r14
457	adc	8(up), %rdx
458	mov	%r14, (rp)
459	mov	%rdx, 8(rp)
460	adc	R32(%rax), R32(%rax)
461	jmp	L(ret)
462
C Size 3.  The loop body is one full round, adding q0 times the three limbs
C of mp into the current three-limb window -24(up), -16(up), -8(up).  It
C runs j times, and j was set to the size at entry.
C Per round: the lowest limb sum contributes only its carry; the new middle
C limb (rbp) and top limb (r10) are stored at -16(up) and -8(up); the carry
C limb of the round (r11) is stored at -24(up).
C The next q0 is the new middle limb times u0inv.  That imul sits between
C the third mul and the use of its result; it writes only q0 and the flags,
C so rax and rdx from the mul are still intact.
463	ALIGNx
464L(n3):	mov	-24(mp), %rax
465	mov	-24(up), %r10
466	mul	q0
467	add	%rax, %r10
468	mov	-16(mp), %rax
469	mov	%rdx, %r11
470	adc	$0, %r11
471	mov	-16(up), %rbp
472	mul	q0
473	add	%rax, %rbp
474	mov	%rdx, %r9
475	adc	$0, %r9
476	mov	-8(mp), %rax
477	add	%r11, %rbp
478	mov	-8(up), %r10
479	adc	$0, %r9
480	mul	q0
481	mov	%rbp, q0
482	imul	u0inv, q0		C next q0
483	add	%rax, %r10
484	mov	%rdx, %r11
485	adc	$0, %r11
486	mov	%rbp, -16(up)
487	add	%r9, %r10
488	adc	$0, %r11
489	mov	%r10, -8(up)
490	mov	%r11, -24(up)		C up[0]
491	lea	8(up), up		C up++
492	dec	j
493	jnz	L(n3)
494
C After the three rounds up has moved three limbs.  -48(up) and -40(up) are
C the carry limbs stored by rounds 1 and 2; the carry limb of round 3 is
C still in r11.  rbp and r10 still hold the two limbs stored by round 3, and
C -8(up) is the limb above them.  The three pairs are added with carry
C propagation and stored to rp[0..2].  rax is zeroed before the chain, so
C the final adc leaves the carry in rax as the return value.
495	mov	-48(up), %rdx
496	mov	-40(up), %rbx
497	xor	R32(%rax), R32(%rax)
498	add	%rbp, %rdx
499	adc	%r10, %rbx
500	adc	-8(up), %r11
501	mov	%rdx, (rp)
502	mov	%rbx, 8(rp)
503	mov	%r11, 16(rp)
504	adc	R32(%rax), R32(%rax)
505	jmp	L(ret)
506EPILOGUE()
507ASM_END()
508