xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/mod_34lsub1.asm (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1 (the returned value is congruent to the operand but not fully reduced).
2
3dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C	    cycles/limb
36C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
37C AMD K10	 0.67	   this seems hard to beat
38C AMD bd1	 1
39C AMD bd2	 1
40C AMD bd3	 ?
41C AMD bd4	 ?
42C AMD zen	 0.62
43C AMD bobcat	 1.07
44C AMD jaguar	 1
45C Intel P4	 7.35	   terrible, use old code
46C Intel core2	 1.25	   1+epsilon with huge unrolling
47C Intel NHM	 1.15	   this seems hard to beat
48C Intel SBR	 0.93
49C Intel IBR	 0.93
50C Intel HWL	 0.82
51C Intel BWL	 0.64
52C Intel SKY	 0.60
53C Intel atom	 2.5
54C Intel SLM      1.59
55C VIA nano	 1.25	   this seems hard to beat
56
57C INPUT PARAMETERS
58define(`ap',	%rdi)		C source pointer (up in the prototype below); limb 0 is least significant
59define(`n',	%rsi)		C number of limbs; the code below names %rsi directly
60C Returns a limb congruent to the n-limb input modulo 2^48-1, not fully reduced.
61C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
62
63C TODO
64C  * Review feed-in and wind-down code.
65
66ABI_SUPPORT(DOS64)		C ABI names declared as supported; the macro is defined outside this file
67ABI_SUPPORT(STD64)
68
69ASM_START()
70	TEXT
71	ALIGN(32)			C alignment requested for the function entry
72PROLOGUE(mpn_mod_34lsub1)
73	FUNC_ENTRY(2)			C entry macro defined elsewhere; 2 is presumably the argument count
74
75	mov	$0x0000FFFFFFFFFFFF, %r11	C r11 = 2^48-1, mask for the low 48 bits
76
77	mov	(ap), %rax		C rax = src[0]
78
79	cmp	$2, %rsi
80	ja	L(gt2)			C n > 2: general summation code
81
82	jb	L(one)			C n < 2: return the limb already in rax
83C n == 2: fold src[0] and src[1] directly, using 2^48 == 1 mod 2^48-1.
84	mov	8(ap), %rsi		C rsi = src[1]; the count is no longer needed
85	mov	%rax, %rdx
86	shr	$48, %rax		C src[0] high 16 bits, weight 2^48 == 1
87
88	and	%r11, %rdx		C src[0] low 48 bits
89	add	%rdx, %rax
90	mov	R32(%rsi), R32(%rdx)	C src[1] low 32 bits (R32 presumably selects the 32-bit register name)
91
92	shr	$32, %rsi		C src[1] high, weight 2^96 == 1
93	add	%rsi, %rax
94
95	shl	$16, %rdx		C src[1] low, weight 2^64 == 2^16
96	add	%rdx, %rax
97L(one):	FUNC_EXIT()
98	ret
99C General case, n >= 3.  rdx:rcx:rax is a 192-bit accumulator to which each group
100C of three limbs is added with an add/adc chain; r9 counts the carries out of rdx.
101C Don't change this, the wind-down code is not able to handle greater values
102define(UNROLL,3)
103
104L(gt2):	mov	8(ap), %rcx		C rcx = src[1]
105	mov	16(ap), %rdx		C rdx = src[2]
106	xor	%r9, %r9		C r9 = 0
107	add	$24, ap			C step past the three limbs just loaded
108	sub	$eval(UNROLL*3+3), %rsi	C n -= 12, so rsi = limbs left - 9
109	jc	L(end)			C fewer than 9 limbs left: no full loop pass
110	ALIGN(16)
111L(top):				C one pass adds 9 limbs as three groups of three
112	add	(ap), %rax
113	adc	8(ap), %rcx
114	adc	16(ap), %rdx
115	adc	$0, %r9			C count the carry out of rdx
116forloop(i,1,UNROLL-1,`dnl
117	add	eval(i*24)(ap), %rax
118	adc	eval(i*24+8)(ap), %rcx
119	adc	eval(i*24+16)(ap), %rdx
120	adc	$0, %r9
121')dnl
122	add	$eval(UNROLL*24), ap
123	sub	$eval(UNROLL*3), %rsi
124	jnc	L(top)			C repeat while at least 9 more limbs remain
125C rsi is now in -9..-1 and rsi+9 limbs remain; the 36 and 72 below bias the table index by 9.
126L(end):
127	lea	L(tab)(%rip), %r8	C r8 = address of the jump table
128ifdef(`PIC',
129`	movslq	36(%r8,%rsi,4), %r10
130	add	%r10, %r8
131	jmp	*%r8
132',`
133	jmp	*72(%r8,%rsi,8)
134')
135	JUMPTABSECT
136	ALIGN(8)
137L(tab):	JMPENT(	L(0), L(tab))	C entry k is the code for k leftover limbs
138	JMPENT(	L(1), L(tab))
139	JMPENT(	L(2), L(tab))
140	JMPENT(	L(3), L(tab))
141	JMPENT(	L(4), L(tab))
142	JMPENT(	L(5), L(tab))
143	JMPENT(	L(6), L(tab))
144	JMPENT(	L(7), L(tab))
145	JMPENT(	L(8), L(tab))
146	TEXT
147C Wind-down.  L(k) adds the last k limbs; the longer entries fall through into shorter ones.
148L(6):	add	(ap), %rax
149	adc	8(ap), %rcx
150	adc	16(ap), %rdx
151	adc	$0, %r9
152	add	$24, ap
153L(3):	add	(ap), %rax
154	adc	8(ap), %rcx
155	adc	16(ap), %rdx
156	jmp	L(cj1)			C the carry out of rdx is added to r9 at L(cj1)
157
158L(7):	add	(ap), %rax
159	adc	8(ap), %rcx
160	adc	16(ap), %rdx
161	adc	$0, %r9
162	add	$24, ap
163L(4):	add	(ap), %rax
164	adc	8(ap), %rcx
165	adc	16(ap), %rdx
166	adc	$0, %r9
167	add	$24, ap
168L(1):	add	(ap), %rax
169	adc	$0, %rcx
170	jmp	L(cj2)			C the carry continues into rdx and then r9
171
172L(8):	add	(ap), %rax
173	adc	8(ap), %rcx
174	adc	16(ap), %rdx
175	adc	$0, %r9
176	add	$24, ap
177L(5):	add	(ap), %rax
178	adc	8(ap), %rcx
179	adc	16(ap), %rdx
180	adc	$0, %r9
181	add	$24, ap
182L(2):	add	(ap), %rax
183	adc	8(ap), %rcx
184
185L(cj2):	adc	$0, %rdx
186L(cj1):	adc	$0, %r9
187L(0):	add	%r9, %rax		C carries out of rdx have weight 2^192 == 1
188	adc	$0, %rcx
189	adc	$0, %rdx
190	adc	$0, %rax		C end-around carry, same weight
191C Fold the three sums into one limb; weights are reduced with 2^48 == 1 mod 2^48-1.
192	mov	%rax, %rdi		C 0mod3; ap is dead, rdi becomes scratch
193	shr	$48, %rax		C 0mod3 high
194
195	and	%r11, %rdi		C 0mod3 low
196	mov	R32(%rcx), R32(%r10)	C 1mod3
197
198	shr	$32, %rcx		C 1mod3 high
199
200	add	%rdi, %rax		C apply 0mod3 low
201	movzwl	%dx, R32(%rdi)		C 2mod3 low 16 bits
202	shl	$16, %r10		C 1mod3 low, weight 2^64 == 2^16
203
204	add	%rcx, %rax		C apply 1mod3 high
205	shr	$16, %rdx		C 2mod3 high
206
207	add	%r10, %rax		C apply 1mod3 low
208	shl	$32, %rdi		C 2mod3 low, weight 2^128 == 2^32
209
210	add	%rdx, %rax		C apply 2mod3 high
211	add	%rdi, %rax		C apply 2mod3 low
212
213	FUNC_EXIT()
214	ret
215EPILOGUE()
216