dnl  AMD64 mpn_mod_1s_4p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 3.0
C K10:		 3.0
C P4:		14.5
C P6 core2:	 5.0
C P6 corei7:	 4.3
C P6 atom:	25.0

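C INPUT PARAMETERS (SysV ABI; annotations follow mpn/generic/mod_1_4.c)
C ap	rdi
C n	rsi
C b	rdx	the divisor, passed in already normalized, i.e. shifted
C		left by cnt = cps[1] (callers shift it, cf. mpn/generic/mod_1.c)
C cps	rcx	table from mpn_mod_1s_4p_cps below: cps[0] = inverse of the
C		normalized b, cps[1] = cnt, cps[2]..cps[6] = B^1..B^5 mod b,
C		each stored shifted right by cnt (B = 2^64)
C
C The mod_1s_4p method requires the unshifted divisor to be below B/4.
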
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

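C Save b in the red zone at -16(%rsp) and cps in r14; load the
C multipliers r11 = B1modb, rbx = B2modb, rbp = B3modb, r13 = B4modb,
C r12 = B5modb, then dispatch on n mod 4.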
	mov	%rdx, -16(%rsp)
	mov	%rcx, %r14
	mov	16(%rcx), %r11
	mov	24(%rcx), %rbx
	mov	32(%rcx), %rbp
	mov	40(%rcx), %r13
	mov	48(%rcx), %r12
	xor	R32(%r8), R32(%r8)
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)
	je	L(b0)
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)

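C L(b0)..L(b3) point rdi at the lowest limb of the top group of
C 1..4 limbs (according to n mod 4) and fold that group into the
C accumulator r9 (low), r8 (high) before entering the main loop.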
L(b3):	lea	-24(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	jmp	L(m0)

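C Main loop, four limbs per iteration:
C   r9:r8 <- ap[i] + ap[i+1]*B1modb + ap[i+2]*B2modb + ap[i+3]*B3modb
C            + r9*B4modb + r8*B5modb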
	ALIGN(16)
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	%rdx, %rcx
	adc	$0, %rcx
	mul	%rbx
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi
	mul	%rbp
	add	%rax, %r10
	mov	%r9, %rax
	adc	%rdx, %rcx
	mul	%r13
	add	%rax, %r10
	mov	%r8, %rax
	adc	%rdx, %rcx
	mul	%r12
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)

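C Final reduction: fold the high half as rdx:r8 = r8*B1modb + r9, shift
C the two-limb value left by cnt, then do one 2/1 division by the
C normalized b via the precomputed inverse (udiv_qrnnd_preinv pattern);
C shifting the remainder back right by cnt gives the return value.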
L(end):	mov	8(%r14), R32(%rsi)
	mov	%r8, %rax
	mul	%r11
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi
	mov	%rdi, %rax
	mulq	(%r14)
	mov	-16(%rsp), %rbx
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi
	add	%r8, %r9
	adc	%rdi, %rdx
	imul	%rbx, %rdx
	sub	%rdx, %r8
	lea	(%r8,%rbx), %rax
	cmp	%r8, %r9
	cmovb	%rax, %r8
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovb	%r8, %rax
	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	ret
EPILOGUE()

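C mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
C
C Precompute the table used by mpn_mod_1s_4p from the raw divisor b
C (b < B/4): cnt = count_leading_zeros(b), cps[0] = mpn_invert_limb(b<<cnt),
C cps[1] = cnt, then B^1..B^5 mod (b<<cnt) by repeated multiplication by B
C reduced with the inverse; reference code in mpn/generic/mod_1_4.c.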
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	push	%r12
	bsr	%rsi, %rcx
	push	%rbp
	xor	$63, R32(%rcx)
	mov	%rsi, %rbp
	mov	R32(%rcx), R32(%r12)
	sal	R8(%rcx), %rbp
	push	%rbx
	mov	%rdi, %rbx
	mov	%rbp, %rdi
	CALL(	mpn_invert_limb)
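C rax = floor((B^2-1)/(b<<cnt)) - B, the reciprocal word used for
C 2/1 division by the normalized divisor; it is stored as cps[0].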
	mov	R32(%r12), R32(%rcx)
	mov	$1, R32(%r10)
	sal	R8(%rcx), %r10
	mov	$64, R32(%rcx)
	mov	%rax, %r9
	sub	R32(%r12), R32(%rcx)
	mov	%r9, (%rbx)
	shr	R8(%rcx), %rax
	mov	R32(%r12), R32(%rcx)
	or	%rax, %r10
	mov	%rbp, %rax
	neg	%rax
	imul	%rax, %r10
	mov	%r10, %rax
	mul	%r9
	lea	1(%r10,%rdx), %r8
	neg	%r8
	imul	%rbp, %r8
	cmp	%r8, %rax
	lea	(%r8,%rbp), %rdx
	cmovb	%rdx, %r8
	mov	%r8, %rax
	mul	%r9
	lea	1(%r8,%rdx), %rdi
	neg	%rdi
	imul	%rbp, %rdi
	cmp	%rdi, %rax
	lea	(%rdi,%rbp), %rdx
	cmovb	%rdx, %rdi
	mov	%rdi, %rax
	mul	%r9
	lea	1(%rdi,%rdx), %rsi
	neg	%rsi
	imul	%rbp, %rsi
	cmp	%rsi, %rax
	lea	(%rsi,%rbp), %rdx
	cmovb	%rdx, %rsi
	mov	%rsi, %rax
	mul	%r9
	lea	1(%rsi,%rdx), %rdx
	neg	%rdx
	imul	%rbp, %rdx
	cmp	%rdx, %rax
	lea	(%rdx,%rbp), %rbp
	movslq	R32(%r12), %rax
	cmovae	%rdx, %rbp
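C B1modb..B5modb now live in r10, r8, rdi, rsi, rbp; shift each right
C by cnt and store them, with cnt itself, into the cps table.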
	shr	R8(%rcx), %r10
	shr	R8(%rcx), %r8
	shr	R8(%rcx), %rbp
	shr	R8(%rcx), %rdi
	shr	R8(%rcx), %rsi
	mov	%rbp, 48(%rbx)
	mov	%rax, 8(%rbx)
	mov	%r10, 16(%rbx)
	mov	%r8, 24(%rbx)
	mov	%rdi, 32(%rbx)
	mov	%rsi, 40(%rbx)
	pop	%rbx
	pop	%rbp
	pop	%r12
	ret
EPILOGUE()