xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mod_34lsub1.asm (revision 413d532bcc3f62d122e56d92e13ac64825a40baf)
1dnl  AMD K7 mpn_mod_34lsub1 -- residue modulo 2^24-1 (congruent, not fully reduced).
2
3dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
4dnl  Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C         cycles/limb
25C Athlon:     1
26C Hammer:     1
27
28
29C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
30C
C Return a value congruent to the size-limb number at src modulo 2^24-1.
C The result is a plain sum of pieces of at most 24 bits and is not reduced
C any further; for size==1 it is src[0] unchanged.
C
C Limbs are 32 bits and 2^24 == 1 mod 2^24-1, so limb i has weight
C 2^(8*(i mod 3)).  For size>=3 the limbs are summed, with the carry flag
C chained from one add to the next, into three accumulators selected by
C index mod 3.  Each accumulator is then split at the 24 bit boundary of its
C weight and the pieces are added together.
C
C size==0 takes the same path as size==1 and reads src[0], so callers are
C presumably required to pass size>=1.
C
31C The loop form below and the 64 byte code alignment seem necessary for the
32C claimed speed.  This is a bit strange, since normally k7 isn't very
33C sensitive to such things.  Perhaps there have to be 6 instructions in the
34C first 16 bytes for the BTB entry or something.
35
dnl  Stack parameter slots.  The numbers are presumably byte offsets from the
dnl  stack pointer at function entry (defframe is defined in the included m4
dnl  files, not here).
36defframe(PARAM_SIZE, 8)
37defframe(PARAM_SRC,  4)
38
39dnl  re-use parameter space
dnl  size has already been copied to ecx by the time edi is stored in this
dnl  slot, so the parameter value is no longer needed.
40define(SAVE_EDI, `PARAM_SIZE')
41
42	TEXT
43	ALIGN(64)
44PROLOGUE(mpn_mod_34lsub1)
45deflit(`FRAME',0)
46
47	movl	PARAM_SIZE, %ecx
48	movl	PARAM_SRC, %edx
49
	C ecx = size-2.  The flags from this subtract select all three cases:
	C above (size>2), equal (size==2) and borrow (size<2).
50	subl	$2, %ecx
51	ja	L(three_or_more)
52
	C movl leaves the flags alone, so jb still tests the subtract above.
	C With a borrow the result is src[0] as loaded.
53	movl	(%edx), %eax
54	jb	L(one)
55
	C size==2: src[0] has weight 1 and src[1] has weight 2^32 == 2^8.
56	movl	4(%edx), %ecx
57	movl	%eax, %edx
58	shrl	$24, %eax		C src[0] high (top 8 bits)
59
60	andl	$0xFFFFFF, %edx		C src[0] low (bottom 24 bits)
61	addl	%edx, %eax
62	movl	%ecx, %edx
63
64	andl	$0xFFFF, %ecx
65	shrl	$16, %edx		C src[1] high
66	addl	%edx, %eax
67
68	shll	$8, %ecx		C src[1] low
69	addl	%ecx, %eax
70
71L(one):
72	ret
73
74
75L(three_or_more):
76	C eax
77	C ebx
78	C ecx	size-2
79	C edx	src
80	C esi
81	C edi
82
	C ebx and esi are saved on the stack and edi in the re-used parameter
	C slot.  All three are restored before the final ret.  eax, ebx and esi
	C are zeroed to become the three accumulators.
83	pushl	%ebx	FRAME_pushl()
84	xorl	%eax, %eax
85	xorl	%ebx, %ebx
86
87	movl	%edi, SAVE_EDI
88	pushl	%esi	FRAME_pushl()
89	xorl	%esi, %esi		C and clear carry flag
90
91
92	C code offset 0x40 at this point
93L(top):
94	C eax	acc 0mod3
95	C ebx	acc 1mod3
96	C ecx	counter, limbs
97	C edx	src
98	C esi	acc 2mod3
99	C edi
100
	C ecx is the number of unprocessed limbs minus 2, and is at least 1
	C here, so at least 3 limbs remain.  Each pass handles 3 limbs in the
	C first half and, if 3 or more are still left, 3 more in the second.
	C
	C The pointer and counter are updated with leal, which does not touch
	C the flags, and decl, which does not touch the carry flag, so the
	C carry out of each adcl reaches the next one.  The carry out of the
	C 2mod3 accumulator has weight 2^96 == 1 and so feeds the 0mod3 add.
101	leal	24(%edx), %edx
102	leal	-2(%ecx), %ecx
103	adcl	-24(%edx), %eax
104	adcl	-20(%edx), %ebx
105	adcl	-16(%edx), %esi
106
	C Leave if 2 or fewer limbs remain.  edx is then 12 bytes past the
	C next unprocessed limb.
107	decl	%ecx
108	jng	L(done_loop)
109
110	leal	-2(%ecx), %ecx
111	adcl	-12(%edx), %eax
112	adcl	-8(%edx), %ebx
113	adcl	-4(%edx), %esi
114
115	decl	%ecx
116	jg	L(top)
117
118
	C Leaving after the second half, edx points at the next unprocessed
	C limb.  Advance it 12 bytes to match the exit from the first half.
119	leal	12(%edx), %edx
120
121
122L(done_loop):
123	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
124
	C incl and decl set the sign flag tested by js without changing the
	C carry flag, and movl changes no flags, so the carry chain stays
	C intact up to the sbbl at L(combine).
	C
	C edi is minus the weight of the carry still pending on each exit:
	C   after a 2mod3 add  2^96 == 1     mask 0xFFFFFFFF = -1
	C   after a 0mod3 add  2^32 == 2^8   mask 0xFFFFFF00 = -2^8
	C   after a 1mod3 add  2^64 == 2^16  mask 0xFFFF0000 = -2^16
125	incl	%ecx
126	movl	$0xFFFFFFFF, %edi
127	js	L(combine)
128
129	adcl	-12(%edx), %eax
130	decl	%ecx
131	movl	$0xFFFFFF00, %edi
132	js	L(combine)
133
134	adcl	-8(%edx), %ebx
135	movl	$0xFFFF0000, %edi
136
137
138L(combine):
139	C eax	acc 0mod3
140	C ebx	acc 1mod3
141	C ecx
142	C edx
143	C esi	acc 2mod3
144	C edi	mask
145
	C sbbl turns the pending carry into 0 or -1 in ecx.  ANDed with the
	C mask it becomes 0 or minus the carry weight, and subtracting that
	C from eax adds the carry weight to the result.
	C
	C Each accumulator is split where its weight reaches 2^24:
	C   0mod3 (weight 1)     top 8 bits   + low 24 bits
	C   1mod3 (weight 2^8)   top 16 bits  + low 16 bits shifted left 8
	C   2mod3 (weight 2^16)  top 24 bits  + low 8 bits shifted left 16
146	sbbl	%ecx, %ecx		C carry
147	movl	%eax, %edx		C 0mod3
148	shrl	$24, %eax		C 0mod3 high
149
150	andl	%edi, %ecx		C carry masked
151	andl	$0x00FFFFFF, %edx	C 0mod3 low
152	movl	%ebx, %edi		C 1mod3
153
154	subl	%ecx, %eax		C apply carry
155	shrl	$16, %ebx		C 1mod3 high
156	andl	$0xFFFF, %edi
157
158	addl	%edx, %eax		C apply 0mod3 low
159	movl	%esi, %edx		C 2mod3
160	shll	$8, %edi		C 1mod3 low
161
162	addl	%ebx, %eax		C apply 1mod3 high
163	shrl	$8, %esi		C 2mod3 high
164	movzbl	%dl, %edx		C 2mod3 low
165
166	addl	%edi, %eax		C apply 1mod3 low
167	shll	$16, %edx		C 2mod3 low
168
169	addl	%esi, %eax		C apply 2mod3 high
170	popl	%esi	FRAME_popl()
171
172	movl	SAVE_EDI, %edi
173	addl	%edx, %eax		C apply 2mod3 low
174	popl	%ebx	FRAME_popl()
175
176	ret
177
178EPILOGUE()
179