xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mode1o.asm (revision 7788a0781fe6ff2cce37368b4578a7ade0850cb1)
1dnl  AMD K6 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2000, 2001, 2002, 2003, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K6: 10.0 cycles/limb
24
25
26C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
27C                               mp_limb_t divisor);
28C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29C                                mp_limb_t divisor, mp_limb_t carry);
30C
31C A special case for high<divisor at the end measured only about 4 cycles
32C faster, and so isn't used.
33C
34C A special case for size==1 using a divl rather than the inverse measured
35C only about 5 cycles faster, and so isn't used.  When size==1 and
36C high<divisor it can skip a division and be a full 24 cycles faster, but
37C this isn't an important case.
38
39defframe(PARAM_CARRY,  16)
40defframe(PARAM_DIVISOR,12)
41defframe(PARAM_SIZE,   8)
42defframe(PARAM_SRC,    4)
43
44	TEXT
45
46	ALIGN(32)
47PROLOGUE(mpn_modexact_1c_odd)
48deflit(`FRAME',0)
49
50	movl	PARAM_DIVISOR, %ecx
51	pushl	%esi		FRAME_pushl()
52
53	movl	PARAM_CARRY, %edx
54	jmp	L(start_1c)
55
56EPILOGUE()
57
58
59	ALIGN(16)
60PROLOGUE(mpn_modexact_1_odd)
61deflit(`FRAME',0)
62
63	movl	PARAM_DIVISOR, %ecx
64	pushl	%esi		FRAME_pushl()
65
66	xorl	%edx, %edx
67L(start_1c):
68	pushl	%edi		FRAME_pushl()
69
70	shrl	%ecx			C d/2
71	movl	PARAM_DIVISOR, %esi
72
73	andl	$127, %ecx		C d/2, 7 bits
74	pushl	%ebp		FRAME_pushl()
75
76ifdef(`PIC',`
77	LEA(	binvert_limb_table, %edi)
78Zdisp(	movzbl,	0,(%ecx,%edi), %edi)		C inv 8 bits
79',`
80	movzbl	binvert_limb_table(%ecx), %edi	C inv 8 bits
81')
82	leal	(%edi,%edi), %ecx	C 2*inv
83
84	imull	%edi, %edi		C inv*inv
85
86	movl	PARAM_SRC, %eax
87	movl	PARAM_SIZE, %ebp
88
89	imull	%esi, %edi		C inv*inv*d
90
91	pushl	%ebx		FRAME_pushl()
92	leal	(%eax,%ebp,4), %ebx	C src end
93
94	subl	%edi, %ecx		C inv = 2*inv - inv*inv*d
95	leal	(%ecx,%ecx), %edi	C 2*inv
96
97	imull	%ecx, %ecx		C inv*inv
98
99	movl	(%eax), %eax		C src low limb
100	negl	%ebp			C -size
101
102	imull	%esi, %ecx		C inv*inv*d
103
104	subl	%ecx, %edi		C inv = 2*inv - inv*inv*d
105
106	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
107	pushl	%eax
108	movl	%esi, %eax
109	imull	%edi, %eax
110	cmpl	$1, %eax
111	popl	%eax')
112
113	jmp	L(entry)
114
115
116C Rotating the mul to the top of the loop saves 1 cycle, presumably by
117C hiding the loop control under the imul latency.
118C
119C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
120C only 8).  It's not clear how to get down to 9 cycles.
121C
122C The xor and rcl to handle the carry bit could be an sbb instead, with the
123C the carry bit add becoming a sub, but that doesn't save anything.
124
125L(top):
126	C eax	(low product)
127	C ebx	src end
128	C ecx	carry bit, 0 or 1
129	C edx	(high product, being carry limb)
130	C esi	divisor
131	C edi	inverse
132	C ebp	counter, limbs, negative
133
134	mull	%esi
135
136	movl	(%ebx,%ebp,4), %eax
137	addl	%ecx, %edx		C apply carry bit to carry limb
138
139L(entry):
140	xorl	%ecx, %ecx
141	subl	%edx, %eax		C apply carry limb
142
143	rcll	%ecx
144
145	imull	%edi, %eax
146
147	incl	%ebp
148	jnz	L(top)
149
150
151
152	popl	%ebx
153	popl	%ebp
154
155	mull	%esi
156
157	popl	%edi
158	popl	%esi
159
160	leal	(%ecx,%edx), %eax
161
162	ret
163
164EPILOGUE()
165