xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/mod_34lsub1.asm (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
2
3dnl  Copyright 2000-2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C Pentium4: 1.0 cycles/limb
35
36
37C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
38C
39C Enhancements:
40C
41C There might be a couple of cycles to save by using plain integer code for
42C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
43C about 46 (inclusive of some function call overheads).
44
C Stack-frame names for the two arguments: src at offset 4 and size at
C offset 8, as passed to defframe.
C NOTE(review): defframe comes from the included m4 support files and is
C presumably relative to the stack pointer at FRAME 0 -- confirm.
45defframe(PARAM_SIZE, 8)
46defframe(PARAM_SRC,  4)
47
C Aliases that map register save slots onto the argument slots.
C NOTE(review): SAVE_EBX and SAVE_ESI are not referenced by any instruction
C visible in this file, and neither ebx nor esi is used -- confirm whether
C they are still needed.
48dnl  re-use parameter space
49define(SAVE_EBX, `PARAM_SRC')
50define(SAVE_ESI, `PARAM_SIZE')
51
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Returns in eax a value congruent to {src,size} modulo 2^24-1.  The result
C is not fully reduced: for size 1 it is src[0] unchanged.
C
C Method: 2^24 == 1 mod 2^24-1, so a 32-bit limb at index i carries weight
C 2^(32*i) == 2^(8*(i mod 3)).  Limbs are summed into three accumulators
C selected by index mod 3, and the accumulators are then folded together at
C bit offsets 0, 8 and 16.
C
C src[0] is loaded before size is tested, so one limb is always read.
C NOTE(review): assumes callers always pass size >= 1 -- confirm.
C
C ecx and edx are overwritten.  The path for three or more limbs also
C overwrites mm0-mm7 and executes emms before returning.
52	TEXT
53	ALIGN(16)
54PROLOGUE(mpn_mod_34lsub1)
55deflit(`FRAME',0)
56
57	movl	PARAM_SIZE, %ecx
58	movl	PARAM_SRC, %edx
59	movl	(%edx), %eax
60
	C Three-way dispatch on the flags of size-2: above (size >= 3) goes
	C to the MMX loop, not-equal (size below 2) returns src[0] as loaded,
	C and equal (size 2) falls through to the integer code.
61	subl	$2, %ecx
62	ja	L(three_or_more)
63	jne	L(one)
64
	C Two limbs: eax = src[0], edx = src[1].  Four pieces are added, each
	C moved to the bit position matching its weight mod 2^24-1.  The
	C pieces are at most 8, 24, 16 and 24 bits wide, so the 32-bit sum
	C cannot wrap.
65	movl	4(%edx), %edx
66	movl	%eax, %ecx
67	shrl	$24, %eax		C src[0] bits 24-31, weight 2^24 == 1
68
69	andl	$0x00FFFFFF, %ecx	C src[0] low
70	addl	%ecx, %eax
71
72	movl	%edx, %ecx
73	shll	$8, %edx
74
75	shrl	$16, %ecx		C src[1] bits 16-31, weight 2^48 == 1
76	addl	%ecx, %eax
77
78	andl	$0x00FFFF00, %edx	C src[1] bits 0-15, weight 2^32 == 2^8
79	addl	%edx, %eax
80
81L(one):
82	ret
83
84
85L(three_or_more):
	C Zero the three 64-bit accumulators.
86	pxor	%mm0, %mm0
87	pxor	%mm1, %mm1
88	pxor	%mm2, %mm2
89
	C Build both masks in registers: pcmpeqd of a register with itself
	C gives all ones, which is then shifted right logically.
90	pcmpeqd	%mm7, %mm7
91	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits
92
93	pcmpeqd	%mm6, %mm6
94	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits
95
96L(top):
97	C eax
98	C ebx
99	C ecx	counter, size-2 to 0, -1 or -2
100	C edx	src, incrementing
101	C
102	C mm0	sum 0mod3
103	C mm1	sum 1mod3
104	C mm2	sum 2mod3
105	C mm3
106	C mm4
107	C mm5
108	C mm6	0x0000000000FFFFFF
109	C mm7	0x00000000FFFFFFFF
110
	C movd zero-extends each 32-bit limb into the 64-bit register, so
	C the paddq sums keep every carry out of bit 31.
111	movd	(%edx), %mm3
112	paddq	%mm3, %mm0
113
114	movd	4(%edx), %mm3
115	paddq	%mm3, %mm1
116
117	movd	8(%edx), %mm3
118	paddq	%mm3, %mm2
119
	C Three limbs consumed.  ja repeats only while the count was above 3,
	C that is when the subtract neither borrows nor reaches zero.
120	addl	$12, %edx
121	subl	$3, %ecx
122	ja	L(top)
123
124
125	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
126
127	addl	$1, %ecx
128	js	L(combine)		C 0 more
129
130	movd	(%edx), %mm3
131	paddq	%mm3, %mm0
132
	C The movd and paddq above do not modify EFLAGS, so this jz still
	C tests the result of the addl.
133	jz	L(combine)		C 1 more
134
135	movd	4(%edx), %mm3
136	paddq	%mm3, %mm1
137
138L(combine):
	C Split each accumulator into 32-bit halves.  A high half has 2^32,
	C which is 2^8 mod 2^24-1, times the weight of its low half, so it is
	C added to the low half of the next accumulator in the cycle
	C 0 -> 1 -> 2 -> 0.
	C mm3, mm4, mm5 take the low halves with weights 1, 2^8 and 2^16.
139	movq	%mm7, %mm3		C low halves
140	pand	%mm0, %mm3
141
142	movq	%mm7, %mm4
143	pand	%mm1, %mm4
144
145	movq	%mm7, %mm5
146	pand	%mm2, %mm5
147
148	psrlq	$32, %mm0		C high halves
149	psrlq	$32, %mm1
150	psrlq	$32, %mm2
151
152	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
153	paddq	%mm1, %mm5
154	paddq	%mm2, %mm3
155
156	psllq	$8, %mm4		C combine at respective offsets
157	psllq	$16, %mm5
158	paddq	%mm4, %mm3
159	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits
160
	C Final fold: mm6 keeps the low 24 bits, mm3 the bits above them,
	C which have weight 2^24 == 1.  Their sum is the return value.
161	pand	%mm3, %mm6		C fold at 24 bits
162	psrlq	$24, %mm3
163
164	paddq	%mm6, %mm3
165	movd	%mm3, %eax
166
	C The check below overwrites mm3 and ecx, after the result has
	C already been copied to eax.
	C NOTE(review): ASSERT is defined in the included m4 files and
	C presumably emits code only in assertion-enabled builds -- confirm.
167	ASSERT(z,	C nothing left in high dword
168	`psrlq	$32, %mm3
169	movd	%mm3, %ecx
170	orl	%ecx, %ecx')
171
	C Clear the MMX state before returning to the caller.
172	emms
173	ret
174
175EPILOGUE()
176