xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mod_34lsub1.asm (revision 2718af68c3efc72c9769069b5c7f9ed36f6b9def)
1dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
2
3dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C         cycles/limb
35C Athlon:     1
36C Hammer:     1
37
38
39C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
40C
41C The loop form below and the 64 byte code alignment seem necessary for the
42C claimed speed.  This is a bit strange, since normally k7 isn't very
43C sensitive to such things.  Perhaps there has to be 6 instructions in the
44C first 16 bytes for the BTB entry or something.
45
C Stack parameter slots.  The offsets 4 and 8 line up with the first (src)
C and second (size) arguments of the prototype given above, counted from the
C stack pointer at entry, where offset 0 would be the return address.
C NOTE(review): defframe is defined outside this file (presumably reached
C through config.m4) and presumably compensates for the FRAME count that the
C FRAME_pushl/FRAME_popl markers maintain -- confirm against the x86 m4 defs.
46defframe(PARAM_SIZE, 8)
47defframe(PARAM_SRC,  4)
48
49dnl  re-use parameter space: size is copied into ecx by the first
dnl  instruction of the function, before anything is stored to SAVE_EDI, so
dnl  the size slot is free to hold the incoming edi for the rest of the
dnl  function.
50define(SAVE_EDI, `PARAM_SIZE')
51
52	TEXT
53	ALIGN(64)
C mpn_mod_34lsub1 (src, size)
C
C Returns in eax a value congruent to the size-limb number at src, modulo
C 2^24-1.  The result is not necessarily below 2^24-1: the one-limb path
C hands back src[0] unchanged.
C
C Method: 2^24 == 1 mod 2^24-1, so a 32-bit limb at index i carries weight
C 2^(32*i) == 2^(8*(i mod 3)).  Limbs are summed into three accumulators
C according to i mod 3, and each accumulator is then split at the bit where
C its weight crosses a multiple of 24, the pieces being added at weights
C 2^0, 2^8 or 2^16.
C
C Registers: eax, ecx and edx are overwritten.  ebx and esi are pushed and
C popped, edi is kept in SAVE_EDI and reloaded, all on the size >= 3 path
C only; the short paths do not touch them.
54PROLOGUE(mpn_mod_34lsub1)
55deflit(`FRAME',0)
56
57	movl	PARAM_SIZE, %ecx
58	movl	PARAM_SRC, %edx
59
	C A single subtract classifies size: CF set means size 1, ZF set
	C means size 2, neither (unsigned above) means size >= 3.  The movl
	C in between does not write flags, so the jb still tests this subl.
	C NOTE(review): size 0 also sets CF here and so takes the one-limb
	C path, reading src[0].  Presumably callers guarantee size >= 1 --
	C confirm.
60	subl	$2, %ecx
61	ja	L(three_or_more)
62
63	movl	(%edx), %eax
64	jb	L(one)
65
	C Two limbs.  src[0] has weight 2^0: its low 24 bits count as they
	C are and its high 8 bits sit at 2^24 == 1.  src[1] has weight
	C 2^32 == 2^8: its low 16 bits are added shifted left by 8 and its
	C high 16 bits sit at 2^48 == 1.  The four pieces sum to less than
	C 2^26, so the 32-bit adds cannot overflow.
66	movl	4(%edx), %ecx
67	movl	%eax, %edx
68	shrl	$24, %eax		C src[0] high 8 bits
69
70	andl	$0xFFFFFF, %edx		C src[0] low 24 bits
71	addl	%edx, %eax
72	movl	%ecx, %edx
73
74	andl	$0xFFFF, %ecx
75	shrl	$16, %edx		C src[1] high
76	addl	%edx, %eax
77
78	shll	$8, %ecx		C src[1] low
79	addl	%ecx, %eax
80
	C Size 1 arrives here with eax = src[0], returned as is.
81L(one):
82	ret
83
84
85L(three_or_more):
86	C eax
87	C ebx
88	C ecx	size-2
89	C edx	src
90	C esi
91	C edi
92
	C Save ebx and esi on the stack and edi in the spent size slot, and
	C zero the three accumulators.  The last xorl also leaves CF clear
	C for the first adcl of the loop.
	C NOTE(review): FRAME_pushl presumably bumps the FRAME count so that
	C SAVE_EDI still resolves to the size slot after the push -- confirm.
93	pushl	%ebx	FRAME_pushl()
94	xorl	%eax, %eax
95	xorl	%ebx, %ebx
96
97	movl	%edi, SAVE_EDI
98	pushl	%esi	FRAME_pushl()
99	xorl	%esi, %esi		C and clear carry flag
100
101
102	C code offset 0x40 at this point
103L(top):
104	C eax	acc 0mod3
105	C ebx	acc 1mod3
106	C ecx	counter, limbs
107	C edx	src
108	C esi	acc 2mod3
109	C edi
110
	C Each pass adds up to six limbs, three per half.  CF is carried
	C from every adcl into the next one, including across the two halves
	C and across iterations.  A carry out of one accumulator is worth
	C one unit of the next accumulator, and a carry out of esi is worth
	C 2^96 == 1, one unit of eax, so the chain loses nothing modulo
	C 2^24-1.  The pointer and counter are stepped with leal, which
	C writes no flags, and decl, which leaves CF alone while setting the
	C ZF/SF/OF that jng and jg test.
	C
	C The counter drops by 3 per half.  The first half is left when the
	C counter reaches 0 or below, with edx already 24 bytes past the
	C group just added.
111	leal	24(%edx), %edx
112	leal	-2(%ecx), %ecx
113	adcl	-24(%edx), %eax
114	adcl	-20(%edx), %ebx
115	adcl	-16(%edx), %esi
116
117	decl	%ecx
118	jng	L(done_loop)
119
120	leal	-2(%ecx), %ecx
121	adcl	-12(%edx), %eax
122	adcl	-8(%edx), %ebx
123	adcl	-4(%edx), %esi
124
125	decl	%ecx
126	jg	L(top)
127
128
	C Leaving after the second half: advance edx by three limbs so that
	C -12(%edx) and -8(%edx) below name the next unread limbs, the same
	C as when leaving after the first half.  leal keeps CF intact.
129	leal	12(%edx), %edx
130
131
132L(done_loop):
133	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
134
	C Add the 0, 1 or 2 leftover limbs, continuing the same carry chain
	C (incl and decl leave CF alone, and the movl between each of them
	C and its js writes no flags).  edi is set to minus the weight of
	C whatever carry is still pending on arrival at L(combine):
	C   0xFFFFFFFF = -1     carry out of esi, worth 2^96 == 1
	C   0xFFFFFF00 = -2^8   carry out of eax, worth 2^32 == 2^8
	C   0xFFFF0000 = -2^16  carry out of ebx, worth 2^64 == 2^16
135	incl	%ecx
136	movl	$0xFFFFFFFF, %edi
137	js	L(combine)
138
139	adcl	-12(%edx), %eax
140	decl	%ecx
141	movl	$0xFFFFFF00, %edi
142	js	L(combine)
143
144	adcl	-8(%edx), %ebx
145	movl	$0xFFFF0000, %edi
146
147
148L(combine):
149	C eax	acc 0mod3
150	C ebx	acc 1mod3
151	C ecx
152	C edx
153	C esi	acc 2mod3
154	C edi	mask
155
	C sbbl turns the pending carry into 0 or -1.  Masking with edi gives
	C 0 or minus the carry weight, and subtracting that from eax adds
	C the weight.  The accumulators are then folded the same way as the
	C limbs in the two-limb case:
	C   eax (weight 2^0):   low 24 bits as is,       high 8 bits at 2^0
	C   ebx (weight 2^8):   low 16 bits shifted 8,   high 16 bits at 2^0
	C   esi (weight 2^16):  low 8 bits shifted 16,   high 24 bits at 2^0
	C The register restores are interleaved with the final adds.
156	sbbl	%ecx, %ecx		C carry
157	movl	%eax, %edx		C 0mod3
158	shrl	$24, %eax		C 0mod3 high
159
160	andl	%edi, %ecx		C carry masked
161	andl	$0x00FFFFFF, %edx	C 0mod3 low
162	movl	%ebx, %edi		C 1mod3
163
164	subl	%ecx, %eax		C apply carry
165	shrl	$16, %ebx		C 1mod3 high
166	andl	$0xFFFF, %edi
167
168	addl	%edx, %eax		C apply 0mod3 low
169	movl	%esi, %edx		C 2mod3
170	shll	$8, %edi		C 1mod3 low
171
172	addl	%ebx, %eax		C apply 1mod3 high
173	shrl	$8, %esi		C 2mod3 high
174	movzbl	%dl, %edx		C 2mod3 low
175
176	addl	%edi, %eax		C apply 1mod3 low
177	shll	$16, %edx		C 2mod3 low
178
179	addl	%esi, %eax		C apply 2mod3 high
180	popl	%esi	FRAME_popl()
181
182	movl	SAVE_EDI, %edi
183	addl	%edx, %eax		C apply 2mod3 low
184	popl	%ebx	FRAME_popl()
185
186	ret
187
188EPILOGUE()
189