xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/mod_34lsub1.asm (revision 9573673d78c64ea1eac42d7f2e9521be89932ae5)
dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011, 2012 Free
dnl  Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
include(`../config.m4')


C	    cycles/limb
C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10	 0.67	   this seems hard to beat
C AMD bd1	 1
C AMD bobcat	 1.07
C Intel P4	 7.35	   terrible, use old code
C Intel core2	 1.25	   1+epsilon with huge unrolling
C Intel NHM	 1.15	   this seems hard to beat
C Intel SBR	 0.93
C Intel atom	 2.5
C VIA nano	 1.25	   this seems hard to beat

C INPUT PARAMETERS
C  ap  pointer to the source limbs, the limb at offset 0 has weight 2^0
C  n   number of limbs at ap
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.

C ABI_SUPPORT is defined outside this file, presumably it declares which
C calling conventions this source can be built for.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C
C Returns a limb congruent to {up,n} modulo 2^48-1.  The value is folded but
C not fully reduced: for n = 1 the limb is returned unchanged, otherwise the
C result is a sum of partial fields and may exceed 2^48-1.
C
C Limb 0 is loaded unconditionally, so n is assumed to be at least 1.
C
C Since 2^48 = 1 (mod 2^48-1), a limb at index i carries the weight
C 2^(64*i mod 48):
C	i = 0 mod 3	weight 2^0
C	i = 1 mod 3	weight 2^16
C	i = 2 mod 3	weight 2^32
C Limbs are therefore summed into three accumulators by index mod 3, and a
C carry out of the third accumulator has weight 2^192 = 2^0 again.
C
C Register use:
C	ap (%rdi)	source pointer, reused as scratch in the final fold
C	%rsi		limb count, later a negative wind-down index
C	%rax		accumulator for limbs 0 mod 3, and the return value
C	%rcx		accumulator for limbs 1 mod 3
C	%rdx		accumulator for limbs 2 mod 3
C	%r9		number of carries out of %rdx
C	%r11		the mask 2^48-1
C	%r8, %r10	scratch
C
C FUNC_ENTRY and FUNC_EXIT are defined outside this file, presumably they
C adapt the DOS64 argument registers to the ones used here.
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C mask 2^48-1

	mov	(ap), %rax		C limb 0, returned as is when n = 1

	cmp	$2, %rsi
	ja	L(gt2)			C n > 2, general code

	jb	L(one)			C n < 2, result is limb 0

C n = 2: fold src[0] and src[1] directly, no accumulators needed.
	mov	8(ap), %rsi		C src[1], the count is no longer needed
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high

	and	%r11, %rdx		C src[0] low
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)	C low 32 bits of src[1], zero extended

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax		C weight 2^96 = 1

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax		C weight 2^64 = 2^16
L(one):	FUNC_EXIT()
	ret


C Do not change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C n >= 3: seed the accumulators with limbs 0..2.  After the subtraction below
C %rsi is the number of limbs still to add minus 3*UNROLL, so a borrow means
C there is less than one full loop pass left.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)
	ALIGN(16)
C Main loop, 3*UNROLL limbs per pass.  Each group of three limbs is one
C add/adc/adc chain, and the carry out of %rdx is counted in %r9 right away
C because the next add starts a fresh carry chain.
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Wind-down.  Here %rsi is in -9..-1 and %rsi + 9 limbs remain, so entry
C number %rsi + 9 of the table is taken.  The displacements 36 = 9*4 and
C 72 = 9*8 bias the negative index to entry 0.  The PIC code reads a 4 byte
C entry and adds the table address to it, the non PIC code jumps through an
C 8 byte entry.
C JUMPTABSECT is defined outside this file, presumably it selects the section
C that holds the table.
L(end):
	lea	L(tab)(%rip), %r8
ifdef(`PIC',
`	movslq	36(%r8,%rsi,4), %r10
	add	%r10, %r8
	jmp	*%r8
',`
	jmp	*72(%r8,%rsi,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(4), L(tab))
	JMPENT(	L(5), L(tab))
	JMPENT(	L(6), L(tab))
	JMPENT(	L(7), L(tab))
	JMPENT(	L(8), L(tab))
	TEXT

C L(k) adds the last k limbs.  Entries with the same k mod 3 share a tail and
C fall through from the larger count to the smaller one.

C 6 or 3 limbs left, the last group is complete.
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)			C jmp keeps CF, the carry is added at cj1

C 7, 4 or 1 limbs left, the last group has only its first limb.
L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

C 8, 5 or 2 limbs left, the last group has its first two limbs.
L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

C Finish the pending carry chain, then add the counted carries in %r9 to the
C weight 1 accumulator.  A carry out of %rdx at the end wraps around to %rax.
L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Fold the three accumulators into %rax.  Each one is split at the bit where
C its weight reaches 2^48: the part above the split is added as is, the part
C below is shifted left by the weight of the accumulator.  ap is no longer
C needed as a pointer and %rdi serves as scratch.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()

195