dnl  AMD64 mpn_modexact_1_odd -- exact division style remainder.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software
dnl  Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	10
C K10:		10
C P4:		33
C P6 core2:	13
C P6 corei7:	14.5
C P6 Atom:	35


C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
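C
C As a rough guide to what these routines compute, here is a C sketch that
C mirrors the loop below.  It is illustrative only: uint64_t stands in for
C mp_limb_t, unsigned __int128 supplies the high product, the function name
C is made up, size is assumed >= 1, and binvert (sketched further down,
C next to the inverse computation) builds an inverse with d*inv == 1 mod
C 2^64.
C
C	#include <stdint.h>
C
C	uint64_t binvert (uint64_t d);		/* sketched further down */
C
C	uint64_t
C	modexact_1c_odd_ref (const uint64_t *src, long size,
C	                     uint64_t d, uint64_t carry)
C	{
C	  uint64_t  inv = binvert (d);		/* d odd */
C	  uint64_t  climb = carry, cbit = 0;
C	  long      i;
C	  for (i = 0; i < size; i++)
C	    {
C	      uint64_t  s = src[i];
C	      uint64_t  l = s - cbit;
C	      cbit = (s < cbit) + (l < climb);	/* borrow out, 0 or 1 */
C	      l -= climb;
C	      uint64_t  q = l * inv;		/* q*d == l mod 2^64 */
C	      climb = (uint64_t) (((unsigned __int128) q * d) >> 64);
C	    }
C	  return climb + cbit;
C	}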
C
C
C The dependent chain in the main loop is
C
C                            cycles
C	subq	%rdx, %rax	1
C	imulq	%r9, %rax	4
C	mulq	%r8		5
C			      ----
C       total		       10
C
C The movq load from src seems to need to be scheduled back before the jz to
C achieve this speed; out-of-order execution apparently can't completely
C hide the latency otherwise.
C
C The l=src[i]-cbit step is rotated back too, since that allows us to avoid
C it for the first iteration (where there's no cbit).
C
C The code alignment used (32-byte) for the loop also seems necessary.
C Without that the non-PIC case has adcq crossing the 0x60 offset,
C apparently making it run at 11 cycles instead of 10.
C
C Not done:
C
C divq for size==1 was measured at about 79 cycles, compared to the inverse
C at about 25 cycles (both including function call overheads), so that's not
C used.
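C
C In terms of the reference sketch above, that rejected special case would
C have been roughly the following (hypothetical, and only valid on the
C zero-carry entry path):
C
C	if (size == 1 && carry == 0)
C	  return src[0] % d;		/* compiles to a divq */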
C
C Enhancements:
C
C For PIC, we shouldn't really need the GOT fetch for binvert_limb_table;
C it'll be in rodata or text in libgmp.so and can be accessed directly
C %rip relative.  This would be for small model only (something we don't
C presently detect, but which is all that gcc 3.3.3 supports), since 8-byte
C PC-relative relocations are apparently not available.  Some rough
C experiments with binutils 2.13 looked worryingly like it might come out
C with an unwanted text segment relocation though, even with ".protected".
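C
C (Such a direct access would presumably be a single %rip-relative lea of
C binvert_limb_table in place of the GOT load below, but that is untested
C here, per the reservations above.)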


ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_modexact_1_odd)

	movl	$0, %ecx

PROLOGUE(mpn_modexact_1c_odd)

	C rdi	src
	C rsi	size
	C rdx	divisor
	C rcx	carry

	movq	%rdx, %r8		C d
	shrl	%edx			C d/2
ifdef(`PIC',`
	movq	binvert_limb_table@GOTPCREL(%rip), %r9
',`
	movabsq	$binvert_limb_table, %r9
')

	andl	$127, %edx
	movq	%rcx, %r10		C initial carry

	movzbl	(%r9,%rdx), %edx	C inv 8 bits

	movq	(%rdi), %rax		C src[0]
	leaq	(%rdi,%rsi,8), %r11	C src end
	movq	%r8, %rdi		C d, made available to imull

	leal	(%rdx,%rdx), %ecx	C 2*inv
	imull	%edx, %edx		C inv*inv

	negq	%rsi			C -size

	imull	%edi, %edx		C inv*inv*d

	subl	%edx, %ecx		C inv = 2*inv - inv*inv*d, 16 bits

	leal	(%rcx,%rcx), %edx	C 2*inv
	imull	%ecx, %ecx		C inv*inv

	imull	%edi, %ecx		C inv*inv*d

	subl	%ecx, %edx		C inv = 2*inv - inv*inv*d, 32 bits
	xorl	%ecx, %ecx		C initial cbit

	leaq	(%rdx,%rdx), %r9	C 2*inv
	imulq	%rdx, %rdx		C inv*inv

	imulq	%r8, %rdx		C inv*inv*d

	subq	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
	movq	%r10, %rdx		C initial climb

	ASSERT(e,`	C d*inv == 1 mod 2^64
	movq	%r8, %r10
	imulq	%r9, %r10
	cmpq	$1, %r10')
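
C The sequence above is the classic binary-inverse construction: the table
C lookup gives 8 correct low bits, then each Newton step
C inv = 2*inv - inv*inv*d doubles the number of correct bits
C (8 -> 16 -> 32 -> 64).  Under the same illustrative assumptions as the
C sketch in the header comments:
C
C	uint64_t
C	binvert (uint64_t d)	/* d must be odd */
C	{
C	  uint64_t  inv = binvert_limb_table[(d >> 1) & 127];	/* 8 bits */
C	  inv = 2*inv - inv*inv*d;	/* 16 bits */
C	  inv = 2*inv - inv*inv*d;	/* 32 bits */
C	  inv = 2*inv - inv*inv*d;	/* 64 bits */
C	  return inv;
C	}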

	incq	%rsi
	jz	L(one)


	ALIGN(16)
L(top):
	C rax	l = src[i]-cbit
	C rcx	new cbit, 0 or 1
	C rdx	climb, high of last product
	C rsi	counter, limbs, negative
	C rdi
	C r8	divisor
	C r9	inverse
	C r11	src end ptr

	subq	%rdx, %rax		C l = src[i]-cbit - climb

	adcq	$0, %rcx		C more cbit
	imulq	%r9, %rax		C q = l * inverse

	mulq	%r8			C climb = high (q * d)

	movq	(%r11,%rsi,8), %rax	C src[i+1]
	subq	%rcx, %rax		C next l = src[i+1] - cbit
	setc	%cl			C new cbit

	incq	%rsi
	jnz	L(top)


L(one):
	subq	%rdx, %rax		C l = src[i]-cbit - climb

	adcq	$0, %rcx		C more cbit
	imulq	%r9, %rax		C q = l * inverse

	mulq	%r8			C climb = high (q * d)

	leaq	(%rcx,%rdx), %rax	C climb+cbit
	ret

EPILOGUE(mpn_modexact_1c_odd)
EPILOGUE(mpn_modexact_1_odd)