xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mode1o.asm (revision 87d689fb734c654d2486f87f7be32f1b53ecdbec)
1dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C          cycles/limb
35C Athlon:     11.0
36C Hammer:      7.0
37
38
39C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
40C                               mp_limb_t divisor);
41C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
42C                                mp_limb_t divisor, mp_limb_t carry);
43C
44C With the loop running at just 11 cycles it doesn't seem worth bothering to
45C check for high<divisor to save one step.
46C
47C Using a divl for size==1 measures slower than the modexact method, which
48C is not too surprising since for the latter it's only about 24 cycles to
49C calculate the modular inverse.
50
C Stack frame layout.  The numbers are the offsets handed to defframe, whose
C definition is not in this file.  NOTE(review): they look like byte offsets
C from %esp at function entry, with FRAME compensating for later %esp
C changes -- confirm against the x86 m4 definitions pulled in by config.m4.
C
C Incoming arguments, in the same order as the C prototypes.  PARAM_CARRY is
C read only by the mpn_modexact_1c_odd entry point.
51defframe(PARAM_CARRY,  16)
52defframe(PARAM_DIVISOR,12)
53defframe(PARAM_SIZE,   8)
54defframe(PARAM_SRC,    4)
55
C Save slots, 4 bytes apart, for the four registers that the code stores on
C entry and reloads before returning.
56defframe(SAVE_EBX,     -4)
57defframe(SAVE_ESI,     -8)
58defframe(SAVE_EDI,    -12)
59defframe(SAVE_EBP,    -16)
60
C Number of bytes subtracted from %esp on entry and added back before ret:
C room for the four 4-byte save slots.
61deflit(STACK_SPACE, 16)
62
C TEXT is defined outside this file; presumably it selects the code section
C for the two functions that follow.
63	TEXT
64
C mp_limb_t mpn_modexact_1c_odd (src, size, divisor, carry)
C
C Entry point that takes an initial carry.  It only loads the carry argument
C into %ecx and then joins mpn_modexact_1_odd at L(start_1c), which does all
C of the work and executes the ret back to this function's caller.  No stack
C adjustment is made before the jump, so the PARAM_ offsets are the same on
C both paths.
65	ALIGN(16)
66PROLOGUE(mpn_modexact_1c_odd)
67deflit(`FRAME',0)
68
69	movl	PARAM_CARRY, %ecx	C ecx = initial carry, used by the first loop pass
70	jmp	L(start_1c)		C continue in mpn_modexact_1_odd, skipping its xorl
71
72EPILOGUE()
73
74
C mp_limb_t mpn_modexact_1_odd (src, size, divisor)
C
C Processes the size limbs at src from the lowest address upwards against an
C odd divisor, and returns the final carry: the last carry limb plus the last
C borrow count.  mpn_modexact_1c_odd joins at L(start_1c) with its carry
C argument already in %ecx; this entry point starts with %ecx = 0.
C
C Each pass of the loop computes, modulo 2^32,
C	x          = src[i] - carry_bit - carry_limb
C	q          = x * inverse
C	carry_limb = high 32 bits of q * divisor
C with carry_bit collecting the borrows from the two subtractions.
C
C %ebx, %esi, %edi and %ebp are saved on entry and restored before ret.
C %eax, %ecx, %edx and the flags are clobbered.
C
C NOTE(review): the loop body runs before the counter is tested, so size is
C assumed to be at least 1 -- confirm that callers guarantee this.
75	ALIGN(16)
76PROLOGUE(mpn_modexact_1_odd)
77deflit(`FRAME',0)
78
79	xorl	%ecx, %ecx		C no initial carry on this entry point
80L(start_1c):
C Shared path for both entry points.  FRAME_subl_esp is defined outside this
C file; presumably it records the %esp change so that the PARAM_ and SAVE_
C references below still resolve to the right stack slots.
81	movl	PARAM_DIVISOR, %eax
82	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
83
84	movl	%esi, SAVE_ESI
85	movl	PARAM_DIVISOR, %esi	C esi = divisor d, kept for the whole function
86
87	movl	%edi, SAVE_EDI
88
C Table index is bits 1 to 7 of d.  Bit 0 is dropped by the shift; d is
C expected to be odd, so that bit carries no information.
89	shrl	%eax			C d/2
90
91	andl	$127, %eax
92
C Fetch the 8-bit starting inverse from binvert_limb_table.  The PIC variant
C obtains the table address through the LEA macro instead of using an
C absolute address.
93ifdef(`PIC',`
94	LEA(	binvert_limb_table, %edi)
95	movzbl	(%eax,%edi), %edi		C inv 8 bits
96',`
97	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
98')
99
C Two Newton steps, inv = 2*inv - inv*inv*d, each doubling the number of
C correct low bits: 8 -> 16 -> 32.  The remaining register saves and the
C argument loads are interleaved between the multiplies.
100	xorl	%edx, %edx		C initial extra carry
101	leal	(%edi,%edi), %eax	C 2*inv
102
103	imull	%edi, %edi		C inv*inv
104
105	movl	%ebp, SAVE_EBP
106	movl	PARAM_SIZE, %ebp
107
108	movl	%ebx, SAVE_EBX
109	movl	PARAM_SRC, %ebx
110
111	imull	%esi, %edi		C inv*inv*d
112
113	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
114	leal	(%eax,%eax), %edi	C 2*inv
115
116	imull	%eax, %eax		C inv*inv
117
118	imull	%esi, %eax		C inv*inv*d
119
C Point ebx one limb past the end of src and count ebp from -size up to
C zero, so that one incl both advances the index and sets the loop flag.
120	leal	(%ebx,%ebp,4), %ebx	C src end
121	negl	%ebp			C -size
122
123	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
124
C ASSERT is defined outside this file; presumably it emits this check only
C in assertion-enabled builds.  eax is free to clobber here because the loop
C reloads it first thing.
125	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
126	movl	%esi, %eax
127	imull	%edi, %eax
128	cmpl	$1, %eax')
129
130
131C The dependent chain here is
132C
133C                            cycles
134C	subl	%edx, %eax	1
135C	imull	%edi, %eax	4
136C	mull	%esi		6  (high limb)
137C			      ----
138C       total		       11
139C
140C Out of order execution hides the load latency for the source data, so no
141C special scheduling is required.
142
143L(top):
144	C eax	src limb
145	C ebx	src end ptr
146	C ecx	next carry bit, 0 or 1 (or initial carry param)
147	C edx	carry limb, high of last product
148	C esi	divisor
149	C edi	inverse
150	C ebp	counter, limbs, negative
151
152	movl	(%ebx,%ebp,4), %eax	C eax = next source limb
153
154	subl	%ecx, %eax		C apply carry bit
155	movl	$0, %ecx		C clear ecx; movl leaves the carry flag from subl intact
156
157	setc	%cl			C new carry bit
158
159	subl	%edx, %eax		C apply carry limb
160	adcl	$0, %ecx		C add the borrow from this second subtraction
161
162	imull	%edi, %eax		C q = x * inverse, low 32 bits
163
164	mull	%esi			C edx:eax = q * d; edx is the next carry limb, eax is unused
165
166	incl	%ebp			C counter rises towards zero
167	jnz	L(top)			C loop until every limb has been processed
168
169
C Restore the saved registers and form the return value.  leal adds the
C last borrow count to the last carry limb without needing a scratch
C register.
170	movl	SAVE_ESI, %esi
171	movl	SAVE_EDI, %edi
172	leal	(%ecx,%edx), %eax	C return value = carry bit + carry limb
173
174	movl	SAVE_EBX, %ebx
175	movl	SAVE_EBP, %ebp
176	addl	$STACK_SPACE, %esp	C release the save area
177
178	ret
179
180EPILOGUE()
181ASM_END()
182