dnl  AMD64 mpn_mod_1s_4p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3
C AMD K10	 3
C Intel P4	15.5
C Intel core2	 5
C Intel corei	 4
C Intel atom	23
C VIA nano	 4.75

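C A sketch of the interface, inferred from the register usage below and from
C the generic mpn/generic/mod_1_4.c implementation:
C
C   mp_limb_t mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b,
C                            const mp_limb_t cps[7])
C
C   ap   rdi   limb vector, least significant limb first
C   n    rsi   number of limbs, presumably n >= 1
C   b    rdx   divisor, apparently passed already shifted left by cnt
C   cps  rcx   table prepared by mpn_mod_1s_4p_cps below:
C              cps[0] = bi (inverse of the shifted divisor), cps[1] = cnt,
C              cps[2..6] = B1modb .. B5modb (powers of B = 2^64 reduced
C              modulo the shifted divisor, each stored shifted right by cnt)
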
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	FUNC_ENTRY(4)
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	%rdx, %r15
	mov	%rcx, %r14
	mov	16(%rcx), %r11		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	mov	40(%rcx), %r13		C B4modb
	mov	48(%rcx), %r12		C B5modb
	xor	R32(%r8), R32(%r8)
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)
	je	L(b0)
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)

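C The entry paths below dispatch on n mod 4: each L(bX) block folds the top
C 1..4 limbs into the two-limb accumulator <%r8,%r9> (using B1modb..B3modb as
C needed) and leaves %rdi pointing just above the next group of four limbs,
C so that the main loop can consume exactly four limbs per pass.
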
L(b3):	lea	-24(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %r8
	mov	(%rdi), %r9
	jmp	L(m1)

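C Main loop, a sketch of what each iteration computes (cf. the generic
C mod_1_4.c code): with <rh,rl> = <%r8,%r9> the current two-limb residue and
C up[0..3] the next lower group of four limbs,
C
C   <rh,rl> := up[0] + up[1]*B1modb + up[2]*B2modb + up[3]*B3modb
C              + rl*B4modb + rh*B5modb
C
C Partial sums are kept in <%rcx,%r10>; the last product is folded in at L(m0).
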
	ALIGN(16)
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11			C up[1] * B1modb
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	$0, R32(%rcx)
	adc	%rdx, %rcx
	mul	%rbx			C up[2] * B2modb
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi
	mul	%rbp			C up[3] * B3modb
	add	%rax, %r10
	mov	%r13, %rax
	adc	%rdx, %rcx
	mul	%r9			C rl * B4modb
	add	%rax, %r10
	mov	%r12, %rax
	adc	%rdx, %rcx
	mul	%r8			C rh * B5modb
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)

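C Final reduction, roughly following the generic code: fold the high residue
C limb once more via B1modb, shift the remaining two-limb value left by cnt
C (cps[1], loaded into %rsi), and reduce it modulo the shifted divisor (%r15)
C with a udiv_qrnnd_preinv-style step using bi = cps[0]; the result is
C returned shifted right by cnt.
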
L(end):	mov	8(%r14), R32(%rsi)
	mov	%r8, %rax
	mul	%r11
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi
	mov	%rdi, %rax
	mulq	(%r14)
	mov	%r15, %rbx
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi
	add	%r8, %r9
	adc	%rdi, %rdx
	imul	%rbx, %rdx
	sub	%rdx, %r8
	lea	(%r8,%rbx), %rax
	cmp	%r8, %r9
	cmovc	%rax, %r8
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovc	%r8, %rax
	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret
EPILOGUE()

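C mpn_mod_1s_4p_cps (cps, b) precomputes the seven-entry table used above.
C Roughly, following the generic mod_1_4.c code: cnt = count_leading_zeros(b),
C bi = mpn_invert_limb (b << cnt); bi and cnt are stored first, and the
C B1modb .. B5modb constants (successive powers of B = 2^64 reduced modulo
C b << cnt, stored shifted right by cnt) are then derived one from the next
C using bi.
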
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 32(%rbx)		C store B3modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 40(%rbx)		C store B4modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 48(%rbx)		C store B5modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()