xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/pentium4/mod_34lsub1.asm (revision 230b95665bbd3a9d1a53658a36b1053f8382a519)
1dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
2
3dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2010, 2011, 2012 Free
4dnl  Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C	     cycles/limb
25C AMD K8,K9	 1.0
26C AMD K10	 1.12
27C Intel P4	 3.25
28C Intel core2	 1.5
29C Intel corei	 1.5
30C Intel atom	 2.5
31C VIA nano	 1.75
32
33
34C INPUT PARAMETERS
C  ap  first argument register; base address of the source limbs (the
C      prototype comment further down calls this argument up).
C  n   second argument register; the limb count.  The code below refers
C      to %rsi directly instead of going through this alias.
35define(`ap',	%rdi)
36define(`n',	%rsi)
37
38C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
39
40C TODO
41C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
42C    sbb to placate Pentium4.
43C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
44C    without the dual loop exits.
45
46ABI_SUPPORT(DOS64)
47ABI_SUPPORT(STD64)
48
49ASM_START()
50	TEXT
51	ALIGN(32)
C mpn_mod_34lsub1 -- fold the limbs at ap into one limb modulo 2^48-1.
C
C In:   ap (%rdi) = address of the limbs, %rsi = limb count.
C Out:  %rax = a value congruent to the input modulo 2^48-1.  It is a plain
C       sum of partial fields and is NOT reduced below 2^48-1.  When the
C       count is below 2 the limb at (ap) is returned unchanged; a count of
C       0 takes that same path and therefore still reads one limb.
C
C Method: 2^48 = 1 (mod 2^48-1), hence 2^64 = 2^16, 2^128 = 2^32 and
C 2^192 = 1.  Limbs are summed into three accumulators by index mod 3 and
C the accumulators are then split and shifted to their weights.
C
C Register roles in the count > 2 path:
C   %r11            mask 2^48-1
C   %rax %rcx %rdx  sums of the limbs with index 0, 1, 2 mod 3
C   %r10 %r8  %r9   number of carries out of %rax, %rcx, %rdx respectively
C   %rsi            remaining-limb counter, biased by -2
C   %rdi            source pointer; reused as scratch from L(combine) on
C
C FUNC_ENTRY, FUNC_EXIT and R32 are defined outside this file.
C NOTE(review): presumably FUNC_ENTRY/FUNC_EXIT are the DOS64 argument glue
C and R32(reg) names the 32-bit half of reg -- confirm in the m4 includes.
52PROLOGUE(mpn_mod_34lsub1)
53	FUNC_ENTRY(2)
54
55	mov	$0x0000FFFFFFFFFFFF, %r11	C mask of the low 48 bits
56
57	sub	$2, %rsi		C rsi = count - 2; flags feed ja and jb
58	ja	L(gt2)			C count > 2 (unsigned)
59
60	mov	(ap), %rax		C first limb; mov leaves the flags alone
61	nop				C flags still those of the sub above
62	jb	L(1)			C borrow: count < 2, return first limb
63
C Exactly two limbs: src[0] has weights 1 and 2^48 = 1, src[1] has weights
C 2^64 = 2^16 and 2^96 = 1.
64	mov	8(ap), %rsi
65	mov	%rax, %rdx
66	shr	$48, %rax		C src[0] high (bits 48-63)
67
68	and	%r11, %rdx		C src[0] low (bits 0-47)
69	add	%rdx, %rax
70	mov	R32(%rsi), R32(%rdx)
71
72	shr	$32, %rsi		C src[1] high
73	add	%rsi, %rax
74
75	shl	$16, %rdx		C src[1] low
76	add	%rdx, %rax
77
78L(1):	FUNC_EXIT()
79	ret
80
81
82	ALIGN(16)
C More than two limbs: clear the three sums and the three carry counters.
83L(gt2):	xor	R32(%rax), R32(%rax)
84	xor	R32(%rcx), R32(%rcx)
85	xor	R32(%rdx), R32(%rdx)
86	xor	%r8, %r8
87	xor	%r9, %r9
88	xor	%r10, %r10
89
C Main loop, two groups of three limbs per iteration.  Each add is followed
C by adc $0 so that a carry is counted instead of being propagated.
90L(top):	add	(ap), %rax
91	adc	$0, %r10
92	add	8(ap), %rcx
93	adc	$0, %r8
94	add	16(ap), %rdx
95	adc	$0, %r9
96
97	sub	$3, %rsi
98	jng	L(end)			C counter <= 0: exit with ap not advanced
99
100	add	24(ap), %rax
101	adc	$0, %r10
102	add	32(ap), %rcx
103	adc	$0, %r8
104	add	40(ap), %rdx
105	lea	48(ap), ap		C lea does not disturb CF for the adc below
106	adc	$0, %r9
107
108	sub	$3, %rsi
109	jg	L(top)
110
111
C Second exit: ap already moved past both groups; step back one group so
C the 24(ap) and 32(ap) offsets below are valid for both exits.
112	add	$-24, ap
C Here %rsi is -2, -1 or 0, i.e. %rsi + 2 limbs remain unread at 24(ap).
C Hand each carry count to the accumulator of the next weight.
113L(end):	add	%r9, %rax		C carries out of 2mod3 go to 0mod3
114	adc	%r10, %rcx		C carries out of 0mod3 go to 1mod3
115	adc	%r8, %rdx		C carries out of 1mod3 go to 2mod3
116
C From here to the sbb in L(combine) CF must survive: inc, dec and mov do
C not modify CF, so each adc continues the chain and %r10 tracks the weight
C of whichever carry is still pending.
117	inc	%rsi
118	mov	$0x1, R32(%r10)		C pending carry is out of 2mod3: weight 1
119	js	L(combine)		C no limb left
120
121	mov	$0x10000, R32(%r10)	C pending carry will be out of 0mod3: 2^16
122	adc	24(ap), %rax
123	dec	%rsi
124	js	L(combine)		C exactly one limb was left
125
126	adc	32(ap), %rcx
127	mov	$0x100000000, %r10	C pending carry is out of 1mod3: 2^32
128
C Split each accumulator at its 48-bit wrap point and add all the pieces,
C together with the pending carry at the weight held in %r10.
129L(combine):
130	sbb	%rsi, %rsi		C carry: 0 or all ones
131	mov	%rax, %rdi		C 0mod3
132	shr	$48, %rax		C 0mod3 high
133
134	and	%r10, %rsi		C carry masked
135	and	%r11, %rdi		C 0mod3 low
136	mov	R32(%rcx), R32(%r10)	C 1mod3
137
138	add	%rsi, %rax		C apply carry
139	shr	$32, %rcx		C 1mod3 high
140
141	add	%rdi, %rax		C apply 0mod3 low
142	movzwl	%dx, R32(%rdi)		C 2mod3
143	shl	$16, %r10		C 1mod3 low
144
145	add	%rcx, %rax		C apply 1mod3 high
146	shr	$16, %rdx		C 2mod3 high
147
148	add	%r10, %rax		C apply 1mod3 low
149	shl	$32, %rdi		C 2mod3 low
150
151	add	%rdx, %rax		C apply 2mod3 high
152	add	%rdi, %rax		C apply 2mod3 low
153
154	FUNC_EXIT()
155	ret
156EPILOGUE()
157