xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mod_34lsub1.asm (revision ae082add65442546470c0ba499a860ee89eed305)
1dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
2
3dnl  Copyright 2000-2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: 2.66 cycles/limb
35
36
37C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
38C
39C An attempt was made to use a loop like
40C
41C L(top):
42C	adcl	(%edx), %eax
43C	adcl	4(%edx), %ebx
44C	adcl	8(%edx), %esi
45C	leal	12(%edx), %edx
46C	loop	L(top)
47C
48C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
49C The form used instead can save about 6 cycles by not dividing by 3.
50C
51C In the code used, putting the "leal"s at the top of the loop is necessary
52C for the claimed speed, anywhere else costs an extra cycle per loop.
53C Perhaps a tight loop like this needs short decode instructions at the
54C branch target, which would explain the leal/loop form above taking 8
55C cycles instead of 7 too.
56
dnl  Stack arguments of mpn_mod_34lsub1 (src, size).
dnl  NOTE(review): the second defframe argument looks like the byte offset of
dnl  the argument slot above the return address; defframe itself is defined
dnl  outside this file, so confirm against its definition.
57defframe(PARAM_SIZE, 8)
58defframe(PARAM_SRC,  4)
59
60dnl  re-use parameter space
dnl  SAVE_EBX and SAVE_ESI are aliases for the two argument slots.  The
dnl  function copies both arguments into registers (eax, edx) before it
dnl  stores anything to these names, so the slots are free to hold the
dnl  incoming ebx and esi, which are reloaded from them before returning.
61define(SAVE_EBX, `PARAM_SIZE')
62define(SAVE_ESI, `PARAM_SRC')
63
64	TEXT
65	ALIGN(16)
66PROLOGUE(mpn_mod_34lsub1)
67deflit(`FRAME',0)
	C Returns in eax a value congruent to the size-limb number at src
	C modulo 2^24-1.  The value is not reduced to the range 0..2^24-2: the
	C one-limb path, for instance, returns src[0] unchanged.
	C
	C Method: 2^24 is 1 mod 2^24-1, so a 32-bit limb at index i has weight
	C 2^(32*i), which is 2^0, 2^8 or 2^16 according to i mod 3.  Limbs are
	C summed into three accumulators by index mod 3, and the accumulators
	C are then folded together in 24-bit pieces.
	C
	C NOTE(review): size == 0 would take the same path as size == 1 and read
	C src[0]; presumably callers guarantee size >= 1 -- confirm.
68
69	movl	PARAM_SIZE, %eax
70	movl	PARAM_SRC, %edx
71
72	subl	$2, %eax		C flags: above means size >= 3, equal means size == 2
73	ja	L(three_or_more)
74
75Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
76	jne	L(one)			C subl flags survive the movl; size != 2 returns src[0]
77
	C size == 2: eax = src[0], fold src[0] and src[1] directly
78	movl	%eax, %ecx
79	movl	4(%edx), %edx
80
81	shrl	$24, %eax		C src[0] high
82	andl	$0x00FFFFFF, %ecx	C src[0] low
83
84	addl	%ecx, %eax
85	movl	%edx, %ecx
86
87	shll	$8, %edx
88	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, at weight 2^8
89
90	shrl	$16, %ecx		C src[1] high 16 bits, weight 2^48 = 1 mod 2^24-1
91	addl	%ecx, %eax
92
93	addl	%edx, %eax
94
95L(one):
96	ret
97
98
99L(three_or_more):
100	C eax	size-2
101	C ebx
102	C ecx
103	C edx	src
104
105	movl	%ebx, SAVE_EBX
106	xorl	%ebx, %ebx
107
108	movl	%esi, SAVE_ESI
109	pushl	%edi	FRAME_pushl()
110
111	xorl	%esi, %esi
112	xorl	%edi, %edi		C and clear carry flag
113
114L(top):
115	C eax	counter, limbs
116	C ebx	acc 0mod3
117	C ecx
118	C edx	src, incrementing
119	C esi	acc 1mod3
120	C edi	acc 2mod3
121	C ebp
	C
	C Each pass adds three limbs into the three accumulators as a single
	C adcl chain.  The two leal and the decl leave the carry flag alone, so
	C the carry out of the edi add feeds the ebx add of the next pass.  The
	C counter drops by 3 per pass (2 in the leal, 1 in the decl).
122
123	leal	-2(%eax), %eax
124	leal	12(%edx), %edx
125
126	adcl	-12(%edx), %ebx
127	adcl	-8(%edx), %esi
128	adcl	-4(%edx), %edi
129
130	decl	%eax
131	jg	L(top)
132
133
134	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C cl is set to the shift (0, 8 or 16) that the final carry will need;
	C movb, incl and decl all leave the carry flag as the last adcl set it.
135
136	movb	$0, %cl
137	incl	%eax
138
139	js	L(combine)		C 0 more
140
141Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings
142
143	movb	$8, %cl
144	decl	%eax
145
146	js	L(combine)		C 1 more
147
148	adcl	4(%edx), %esi
149
150	movb	$16, %cl
151
152
153L(combine):
	C The carry flag holds the carry out of the last adcl, and cl is 0, 8 or
	C 16 for 0, 1 or 2 extra limbs.  That carry has weight 2^96, 2^32 or
	C 2^64 respectively, which is 2^0, 2^8 or 2^16 mod 2^24-1, hence cl.
154	sbbl	%edx, %edx		C edx = -carry (0 or -1)
155
156	shll	%cl, %edx		C edx = -(carry << cl)
157	movl	%ebx, %eax		C 0mod3
158
159	shrl	$24, %eax		C 0mod3 high
160	andl	$0x00FFFFFF, %ebx	C 0mod3 low
161
162	subl	%edx, %eax		C apply carry: subtracting -(carry << cl) adds it
163	movl	%esi, %ecx		C 1mod3
164
165	shrl	$16, %esi		C 1mod3 high
166	addl	%ebx, %eax		C apply 0mod3 low
167
168	andl	$0x0000FFFF, %ecx
169	addl	%esi, %eax		C apply 1mod3 high
170
171	shll	$8, %ecx		C 1mod3 low
172	movl	%edi, %edx		C 2mod3
173
174	shrl	$8, %edx		C 2mod3 high
175	addl	%ecx, %eax		C apply 1mod3 low
176
177	addl	%edx, %eax		C apply 2mod3 high
178	andl	$0x000000FF, %edi
179
180	shll	$16, %edi		C 2mod3 low
181	movl	SAVE_EBX, %ebx
182
183	addl	%edi, %eax		C apply 2mod3 low
184	movl	SAVE_ESI, %esi
185
186	popl	%edi
187
188	ret
189
190EPILOGUE()
191