dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.

dnl  Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13
C Intel corei	14.5
C Intel atom	35
C VIA nano	 ?


C The dependent chain in the main loop is
C
C                            cycles
C	sub	%rdx, %rax	1
C	imul	%r9, %rax	4
C	mul	%r8		5
C			      ----
C       total		       10
C
C The mov load from src seems to need to be scheduled back before the jz to
C achieve this speed; out-of-order execution apparently can't completely hide
C the latency otherwise.
C
C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
C for the first iteration (where there's no cbit).
C
C The 32-byte code alignment used for the loop also seems necessary.  Without
C it the non-PIC case has an adc crossing the 0x60 offset, apparently making
C it run at 11 cycles instead of 10.

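C For reference, a minimal C sketch of the per-limb recurrence the loop below
C implements.  Illustrative only, not the GMP source: the function name is
C made up, <stdint.h> is assumed for uint64_t, the 64x64->128 bit product
C uses the compiler extension unsigned __int128, and the assembler's separate
C cbit and climb are folded into a single carry c (valid since climb <= d-1,
C so climb+cbit cannot overflow a limb).
C
C	uint64_t
C	modexact_1c_odd_sketch (const uint64_t *src, long size,
C				uint64_t d, uint64_t inverse, uint64_t c)
C	{
C	  for (long i = 0; i < size; i++)
C	    {
C	      uint64_t s = src[i];
C	      uint64_t l = s - c;		/* src[i] - cbit - climb */
C	      uint64_t cbit = (s < c);		/* borrow out            */
C	      uint64_t q = l * inverse;		/* q = l * inverse       */
C	      c = (uint64_t) (((unsigned __int128) q * d) >> 64) + cbit;
C	    }
C	  return c;				/* climb + cbit          */
C	}
C
C The sub/imul/mul latencies above are the l, q and high-product steps of this
C recurrence; each iteration's carry feeds the next subtraction.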

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_modexact_1_odd)
	FUNC_ENTRY(3)
	mov	$0, R32(%rcx)
IFDOS(`	jmp	L(ent)		')

PROLOGUE(mpn_modexact_1c_odd)
	FUNC_ENTRY(4)
L(ent):
	C rdi	src
	C rsi	size
	C rdx	divisor
	C rcx	carry

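C The limb inverse of d mod 2^64 is built next from an 8-bit table lookup
C followed by three Newton steps of the form inv = 2*inv - inv*inv*d, each
C doubling the number of correct low bits: if d*inv = 1 + k*2^n then
C d*(2*inv - inv*inv*d) = 1 - k^2*2^(2n), so 8 -> 16 -> 32 -> 64 bits.
C A hedged C rendering of the same computation (illustrative only; it assumes,
C as the lookup below does, that binvert_limb_table[(d>>1) & 127] is an 8-bit
C inverse of the odd divisor d; the code does the first two steps in 32-bit
C registers, which leaves the final 64-bit inverse unchanged):
C
C	uint64_t inv = binvert_limb_table[(d >> 1) & 127];	/*  8 bits */
C	inv = 2*inv - inv*inv*d;				/* 16 bits */
C	inv = 2*inv - inv*inv*d;				/* 32 bits */
C	inv = 2*inv - inv*inv*d;				/* 64 bits */
C
C For example d = 3 gives inv = 0xAAAAAAAAAAAAAAAB, since
C 3 * 0xAAAAAAAAAAAAAAAB = 2^65 + 1 == 1 mod 2^64.
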
	mov	%rdx, %r8		C d
	shr	R32(%rdx)		C d/2

	LEA(	binvert_limb_table, %r9)

	and	$127, R32(%rdx)
	mov	%rcx, %r10		C initial carry

	movzbl	(%r9,%rdx), R32(%rdx)	C inv 8 bits

	mov	(%rdi), %rax		C src[0]
	lea	(%rdi,%rsi,8), %r11	C src end
	mov	%r8, %rdi		C d, made available to imull

	lea	(%rdx,%rdx), R32(%rcx)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv

	neg	%rsi			C -size

	imul	R32(%rdi), R32(%rdx)	C inv*inv*d

	sub	R32(%rdx), R32(%rcx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rcx,%rcx), R32(%rdx)	C 2*inv
	imul	R32(%rcx), R32(%rcx)	C inv*inv

	imul	R32(%rdi), R32(%rcx)	C inv*inv*d

	sub	R32(%rcx), R32(%rdx)	C inv = 2*inv - inv*inv*d, 32 bits
	xor	R32(%rcx), R32(%rcx)	C initial cbit

	lea	(%rdx,%rdx), %r9	C 2*inv
	imul	%rdx, %rdx		C inv*inv

	imul	%r8, %rdx		C inv*inv*d

	sub	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
	mov	%r10, %rdx		C initial climb

	ASSERT(e,`	C d*inv == 1 mod 2^64
	mov	%r8, %r10
	imul	%r9, %r10
	cmp	$1, %r10')

	inc	%rsi
	jz	L(one)


	ALIGN(16)
L(top):
	C rax	l = src[i]-cbit
	C rcx	new cbit, 0 or 1
	C rdx	climb, high of last product
	C rsi	counter, limbs, negative
	C rdi
	C r8	divisor
	C r9	inverse
	C r11	src end ptr

	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	mov	(%r11,%rsi,8), %rax	C src[i+1]
	sub	%rcx, %rax		C next l = src[i+1] - cbit
	setc	R8(%rcx)		C new cbit

	inc	%rsi
	jnz	L(top)


L(one):
	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	lea	(%rcx,%rdx), %rax	C climb+cbit
	FUNC_EXIT()
	ret

EPILOGUE(mpn_modexact_1c_odd)
EPILOGUE(mpn_modexact_1_odd)
