xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/mod_1_2.asm (revision 2f62cc9c12bc202c40224f32c879f81443fee079)
1dnl  AMD64 mpn_mod_1s_2p
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 4
37C AMD K10	 4
38C Intel P4	19
39C Intel core2	 8
40C Intel NHM	 6.5
41C Intel SBR	 4.5
42C Intel atom	28
43C VIA nano	 8
44
45ABI_SUPPORT(DOS64)
46ABI_SUPPORT(STD64)
47
48ASM_START()
49	TEXT
50	ALIGN(16)
C mpn_mod_1s_2p -- remainder of {ap, n} divided by b, using the 5-entry
C table of constants built by mpn_mod_1s_2p_cps below.
C
C   mp_limb_t mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b,
C                            const mp_limb_t cps[5])
C
C Entry registers (SysV; DOS64 args are remapped by FUNC_ENTRY):
C   rdi = ap,  rsi = n,  rdx = b (saved in r14),  rcx = cps (saved in r13)
C Table layout (as stored by the _cps routine below):
C   cps[0] = bi (inverse for the normalized divisor), cps[1] = cnt,
C   cps[2] = B1modb, cps[3] = B2modb, cps[4] = B3modb, where B = 2^64.
C NOTE(review): the final division step shifts the accumulator left by cnt
C and multiplies by r14 directly, so callers apparently pass b already
C normalized (b << cnt) together with matching bi -- confirm against mod_1.c.
C
C The main loop folds limbs into a two-limb accumulator; the tail reduces
C that to one limb with B1modb and then divides using the inverse bi.
51PROLOGUE(mpn_mod_1s_2p)
52	FUNC_ENTRY(4)
53	push	%r14
54	test	$1, R8(%rsi)			C n odd or even?  (flags used by je below)
55	mov	%rdx, %r14			C r14 = b
56	push	%r13
57	mov	%rcx, %r13			C r13 = cps
58	push	%r12
59	push	%rbp
60	push	%rbx
61	mov	16(%rcx), %r10			C r10 = B1modb
62	mov	24(%rcx), %rbx			C rbx = B2modb
63	mov	32(%rcx), %rbp			C rbp = B3modb
64	je	L(b0)				C even n: seed accumulator from top two limbs
C Odd n: fold the top three limbs into the accumulator r8:r9 (r9 low):
C   r8:r9 = ap[n-3] + ap[n-2]*B1modb + ap[n-1]*B2modb
65	dec	%rsi
66	je	L(one)				C n == 1: single-limb special case
67	mov	-8(%rdi,%rsi,8), %rax		C ap[n-2]
68	mul	%r10				C * B1modb
69	mov	%rax, %r9
70	mov	%rdx, %r8
71	mov	(%rdi,%rsi,8), %rax		C ap[n-1]
72	add	-16(%rdi,%rsi,8), %r9		C + ap[n-3]
73	adc	$0, %r8
74	mul	%rbx				C * B2modb
75	add	%rax, %r9
76	adc	%rdx, %r8
77	jmp	L(11)
78
C Even n: the accumulator starts as the top two limbs unchanged.
79L(b0):	mov	-8(%rdi,%rsi,8), %r8		C r8 = ap[n-1] (high)
80	mov	-16(%rdi,%rsi,8), %r9		C r9 = ap[n-2] (low)
81
82L(11):	sub	$4, %rsi
83	jb	L(ed2)				C no full loop iterations left: finish up
84	lea	40(%rdi,%rsi,8), %rdi		C rdi = read pointer for remaining limbs
85	mov	-40(%rdi), %r11
86	mov	-32(%rdi), %rax
87	jmp	L(m0)				C enter the software-pipelined loop midway
88
89	ALIGN(16)
C Main loop, two-way unrolled (four limbs per trip).  Each half folds two
C more (lower) limbs into a fresh accumulator pair:
C   new = lo + mid*B1modb + acc_lo*B2modb + acc_hi*B3modb
C alternating between the r9:r8 and r11:r12 pairs so the three multiplies
C of one half overlap with the adds of the other.
90L(top):	mov	-24(%rdi), %r9
91	add	%rax, %r11
92	mov	-16(%rdi), %rax
93	adc	%rdx, %r12
94	mul	%r10				C mid * B1modb
95	add	%rax, %r9
96	mov	%r11, %rax
97	mov	%rdx, %r8
98	adc	$0, %r8
99	mul	%rbx				C acc_lo * B2modb
100	add	%rax, %r9
101	mov	%r12, %rax
102	adc	%rdx, %r8
103	mul	%rbp				C acc_hi * B3modb
104	sub	$2, %rsi
105	jb	L(ed1)				C odd tail: accumulator already in r9:r8
106	mov	-40(%rdi), %r11
107	add	%rax, %r9
108	mov	-32(%rdi), %rax
109	adc	%rdx, %r8
110L(m0):	mul	%r10				C mid * B1modb
111	add	%rax, %r11
112	mov	%r9, %rax
113	mov	%rdx, %r12
114	adc	$0, %r12
115	mul	%rbx				C acc_lo * B2modb
116	add	%rax, %r11
117	lea	-32(%rdi), %rdi		C ap -= 4
118	mov	%r8, %rax
119	adc	%rdx, %r12
120	mul	%rbp				C acc_hi * B3modb
121	sub	$2, %rsi
122	jae	L(top)
123
C Loop fell out after the second half: move the live accumulator pair
C (r11:r12) into the canonical r9:r8 before the final add.
124L(ed0):	mov	%r11, %r9
125	mov	%r12, %r8
126L(ed1):	add	%rax, %r9
127	adc	%rdx, %r8
C Reduce the two-limb accumulator to one limb: rdx:r8 = r9 + r8*B1modb.
128L(ed2):	mov	8(%r13), R32(%rdi)		C cnt
129	mov	%r8, %rax
130	mov	%r9, %r8
131	mul	%r10				C high limb * B1modb
132	add	%rax, %r8
133	adc	$0, %rdx
C Normalize: shift rdx:r8 left by cnt, then perform the standard
C udiv_qrnnd_preinv sequence with bi = cps[0] to get the remainder.
134L(1):	xor	R32(%rcx), R32(%rcx)
135	mov	%r8, %r9
136	sub	R32(%rdi), R32(%rcx)		C rcx = 64 - cnt (mod 64)
137	shr	R8(%rcx), %r9
138	mov	R32(%rdi), R32(%rcx)
139	sal	R8(%rcx), %rdx
140	or	%rdx, %r9			C r9 = high limb of (acc << cnt)
141	sal	R8(%rcx), %r8			C r8 = low limb of (acc << cnt)
142	mov	%r9, %rax
143	mulq	(%r13)				C rdx:rax = high * bi
144	mov	%rax, %rsi
145	inc	%r9
146	add	%r8, %rsi
147	adc	%r9, %rdx			C rdx = candidate quotient q
148	imul	%r14, %rdx			C q * b
149	sub	%rdx, %r8			C candidate remainder (mod 2^64)
150	lea	(%r8,%r14), %rax		C remainder + b, for the add-back case
151	cmp	%r8, %rsi
152	cmovc	%rax, %r8			C first adjustment: add b back if q was too big
153	mov	%r8, %rax
154	sub	%r14, %rax
155	cmovc	%r8, %rax			C second adjustment: subtract b if r8 >= b
156	mov	R32(%rdi), R32(%rcx)
157	shr	R8(%rcx), %rax			C de-normalize: remainder >>= cnt
158	pop	%rbx
159	pop	%rbp
160	pop	%r12
161	pop	%r13
162	pop	%r14
163	FUNC_EXIT()
164	ret
C n == 1: no folding needed, go straight to the final division step
C with the single limb in r8 and a zero high word.
165L(one):
166	mov	(%rdi), %r8
167	mov	8(%rcx), R32(%rdi)		C cnt
168	xor	%rdx, %rdx
169	jmp	L(1)
170EPILOGUE()
171
172	ALIGN(16)
C mpn_mod_1s_2p_cps -- precompute the constant table used by mpn_mod_1s_2p:
C
C   void mpn_mod_1s_2p_cps (mp_limb_t cps[5], mp_limb_t b)
C
C Entry: rdi = cps, rsi = b (b != 0; bsr is undefined for 0).  Stores
C   cps[0] = bi  = mpn_invert_limb (b << cnt)
C   cps[1] = cnt = count_leading_zeros (b)
C   cps[2..4]   = B1modb, B2modb, B3modb (B = 2^64), each shifted right
C                 by cnt before storing (see the shr before each store).
C Register roles after the call: rbx = cps, r12 = b << cnt, rbp/rcx = cnt,
C r11 = bi, r8 = -(b << cnt).
173PROLOGUE(mpn_mod_1s_2p_cps)
174	FUNC_ENTRY(2)
175	push	%rbp
176	bsr	%rsi, %rcx			C rcx = index of b's top set bit
177	push	%rbx
178	mov	%rdi, %rbx			C rbx = cps (callee-saved across call)
179	push	%r12
180	xor	$63, R32(%rcx)			C rcx = cnt = leading zeros of b
181	mov	%rsi, %r12
182	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
183	sal	R8(%rcx), %r12		C b << cnt
184IFSTD(`	mov	%r12, %rdi	')	C pass parameter
185IFDOS(`	mov	%r12, %rcx	')	C pass parameter
186IFDOS(`	sub	$32, %rsp	')	C Win64 shadow space for the call
187	ASSERT(nz, `test $15, %rsp')
188	CALL(	mpn_invert_limb)
189IFDOS(`	add	$32, %rsp	')
190	mov	%r12, %r8
191	mov	%rax, %r11			C r11 = bi (kept for the B2modb step)
192	mov	%rax, (%rbx)		C store bi
193	mov	%rbp, 8(%rbx)		C store cnt
194	neg	%r8				C r8 = -(b << cnt)
195	mov	R32(%rbp), R32(%rcx)
196	mov	$1, R32(%rsi)
C Build rsi = (1 << cnt) | (bi >> (64 - cnt)) -- the shld form in one
C instruction, or an explicit shift/or sequence where shld is slow.
197ifdef(`SHLD_SLOW',`
198	shl	R8(%rcx), %rsi
199	neg	R32(%rcx)
200	mov	%rax, %rbp
201	shr	R8(%rcx), %rax
202	or	%rax, %rsi
203	mov	%rbp, %rax
204	neg	R32(%rcx)
205',`
206	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
208	imul	%r8, %rsi			C rsi = B1modb (still normalized by cnt)
209	mul	%rsi				C rdx:rax = bi * B1modb
210
211	add	%rsi, %rdx			C rdx ~= high part of B * B1modb / (b<<cnt)
212	shr	R8(%rcx), %rsi
213	mov	%rsi, 16(%rbx)		C store B1modb
214
C udiv_qrnnd_preinv-style step: B2modb = B*B1modb mod (b<<cnt), formed as
C -(q+1)*(b<<cnt) with a conditional add-back (the lea/cmp/cmovnc triple).
215	not	%rdx
216	imul	%r12, %rdx
217	lea	(%rdx,%r12), %rsi		C candidate + b<<cnt, for add-back case
218	cmp	%rdx, %rax
219	cmovnc	%rdx, %rsi
220	mov	%r11, %rax
221	mul	%rsi				C rdx:rax = bi * B2modb
222
223	add	%rsi, %rdx
224	shr	R8(%rcx), %rsi
225	mov	%rsi, 24(%rbx)		C store B2modb
226
C Same step once more to get B3modb = B*B2modb mod (b<<cnt).
227	not	%rdx
228	imul	%r12, %rdx
229	add	%rdx, %r12
230	cmp	%rdx, %rax
231	cmovnc	%rdx, %r12
232
233	shr	R8(%rcx), %r12
234	mov	%r12, 32(%rbx)		C store B3modb
235
236	pop	%r12
237	pop	%rbx
238	pop	%rbp
239	FUNC_EXIT()
240	ret
241EPILOGUE()
242