dnl  AMD64 mpn_mod_1s_2p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4
C AMD K10	 4
C Intel P4	19
C Intel core2	 8
C Intel NHM	 6.5
C Intel SBR	 4.5
C Intel atom	28
C VIA nano	 8

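C Interface sketch (assumed, not authoritative): the prototype below is
C inferred from the generic counterpart mpn/generic/mod_1_2.c, and the
C cps[] layout from the stores in mpn_mod_1s_2p_cps at the end of this
C file:
C
C   mp_limb_t
C   mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[5])
C
C with, for B = 2^64 and cnt = count of leading zeros of the divisor:
C   cps[0] = bi      (mpn_invert_limb of the divisor shifted left by cnt)
C   cps[1] = cnt
C   cps[2] = B1modb  (B   mod (b << cnt), stored shifted right by cnt)
C   cps[3] = B2modb  (B^2 mod (b << cnt), likewise shifted)
C   cps[4] = B3modb  (B^3 mod (b << cnt), likewise shifted)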
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
	FUNC_ENTRY(4)
	push	%r14
	test	$1, R8(%rsi)		C n odd?
	mov	%rdx, %r14		C b
	push	%r13
	mov	%rcx, %r13		C cps
	push	%r12
	push	%rbp
	push	%rbx
	mov	16(%rcx), %r10		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	je	L(b0)			C even n
	dec	%rsi
	je	L(one)			C n = 1
	mov	-8(%rdi,%rsi,8), %rax	C ap[n-2]
	mul	%r10			C ap[n-2] * B1modb
	mov	%rax, %r9
	mov	%rdx, %r8
	mov	(%rdi,%rsi,8), %rax	C ap[n-1]
	add	-16(%rdi,%rsi,8), %r9	C + ap[n-3]
	adc	$0, %r8
	mul	%rbx			C ap[n-1] * B2modb
	add	%rax, %r9
	adc	%rdx, %r8
	jmp	L(11)
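C Odd n: the two-limb residue (r8:r9) now holds
C ap[n-3] + ap[n-2] * B1modb + ap[n-1] * B2modb.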

L(b0):	mov	-8(%rdi,%rsi,8), %r8	C rh = ap[n-1]
	mov	-16(%rdi,%rsi,8), %r9	C rl = ap[n-2]

L(11):	sub	$4, %rsi
	jb	L(ed2)			C nothing left for the loop to fold
	lea	40(%rdi,%rsi,8), %rdi
	mov	-40(%rdi), %r11		C ap[i]
	mov	-32(%rdi), %rax		C ap[i+1]
	jmp	L(m0)

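C Main loop, unrolled 2x: each half folds two more limbs of ap into a
C two-limb residue r using the recurrence from mpn/generic/mod_1_2.c,
C   r' = ap[i] + ap[i+1]*B1modb + LO(r)*B2modb + HI(r)*B3modb,
C with the residue alternating between the pairs (r8:r9) and (r12:r11).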
	ALIGN(16)
L(top):	mov	-24(%rdi), %r9		C ap[i]
	add	%rax, %r11		C fold in pending HI(r)*B3modb
	mov	-16(%rdi), %rax		C ap[i+1]
	adc	%rdx, %r12
	mul	%r10			C ap[i+1] * B1modb
	add	%rax, %r9
	mov	%r11, %rax		C LO(r)
	mov	%rdx, %r8
	adc	$0, %r8
	mul	%rbx			C LO(r) * B2modb
	add	%rax, %r9
	mov	%r12, %rax		C HI(r)
	adc	%rdx, %r8
	mul	%rbp			C HI(r) * B3modb
	sub	$2, %rsi
	jb	L(ed1)
	mov	-40(%rdi), %r11		C ap[i]
	add	%rax, %r9		C fold in pending HI(r)*B3modb
	mov	-32(%rdi), %rax		C ap[i+1]
	adc	%rdx, %r8
L(m0):	mul	%r10			C ap[i+1] * B1modb
	add	%rax, %r11
	mov	%r9, %rax		C LO(r)
	mov	%rdx, %r12
	adc	$0, %r12
	mul	%rbx			C LO(r) * B2modb
	add	%rax, %r11
	lea	-32(%rdi), %rdi		C ap -= 4
	mov	%r8, %rax		C HI(r)
	adc	%rdx, %r12
	mul	%rbp			C HI(r) * B3modb
	sub	$2, %rsi
	jae	L(top)

L(ed0):	mov	%r11, %r9		C residue ended up in (r12:r11)
	mov	%r12, %r8
L(ed1):	add	%rax, %r9		C fold in pending B3modb product
	adc	%rdx, %r8
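C Tail: fold HI(r) into the residue with one more B1modb multiply, shift
C the two-limb result left by cnt, and reduce it by the normalized
C divisor via its precomputed inverse bi, udiv_rnnd_preinv style; the
C remainder shifted back right by cnt is the return value a mod b.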
L(ed2):	mov	8(%r13), R32(%rdi)	C cnt
	mov	%r8, %rax
	mov	%r9, %r8
	mul	%r10			C HI(r) * B1modb
	add	%rax, %r8
	adc	$0, %rdx
L(1):	xor	R32(%rcx), R32(%rcx)
	mov	%r8, %r9
	sub	R32(%rdi), R32(%rcx)	C 64 - cnt (mod 64)
	shr	R8(%rcx), %r9
	mov	R32(%rdi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %r9		C high word of residue << cnt
	sal	R8(%rcx), %r8		C low word of residue << cnt
	mov	%r9, %rax
	mulq	(%r13)			C multiply by bi
	mov	%rax, %rsi
	inc	%r9
	add	%r8, %rsi
	adc	%r9, %rdx		C quotient estimate q1
	imul	%r14, %rdx		C q1 * b
	sub	%rdx, %r8
	lea	(%r8,%r14), %rax
	cmp	%r8, %rsi
	cmovc	%rax, %r8		C add b back if q1 overshot
	mov	%r8, %rax
	sub	%r14, %rax
	cmovc	%r8, %rax		C final conditional subtract of b
	mov	R32(%rdi), R32(%rcx)
	shr	R8(%rcx), %rax		C undo the normalization shift
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	FUNC_EXIT()
	ret
L(one):
	mov	(%rdi), %r8		C ap[0]
	mov	8(%rcx), R32(%rdi)	C cnt
	xor	%rdx, %rdx
	jmp	L(1)
EPILOGUE()

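C mpn_mod_1s_2p_cps fills in the cps[5] block consumed above:
C cnt = count of leading zeros of b, bi = mpn_invert_limb(b << cnt),
C and B^k mod (b << cnt) for k = 1, 2, 3, each stored shifted right by
C cnt (a reading of the stores below, cf. mpn/generic/mod_1_2.c).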
	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)		C cnt = count of leading zeros of b
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11		C bi
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8			C -(b << cnt)
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
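C Form rsi = (1 << cnt) | (bi >> 64-cnt), either with shld or, where
C that is slow, with explicit shifts; multiplied by -(b << cnt) below
C it yields B1modb, possibly not fully reduced (the generic code
C tolerates this).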
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi		C B1modb (not fully reduced)
	mul	%rsi			C bi * B1modb

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

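C Two inlined udiv_rnnd_preinv-style steps follow: each divides
C B * <previous Bkmodb> by b << cnt with the inverse bi and keeps only
C the remainder, giving B2modb and then B3modb.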
	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi		C B2modb
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12		C B3modb

	shr	R8(%rcx), %r12
	mov	%r12, 32(%rbx)		C store B3modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()