dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit                     popcount        hamdist
C                           cycles/limb     cycles/limb
C P5:                           -
C P6 model 0-8,10-12:           -
C P6 model 9  (Banias)          ?
C P6 model 13 (Dothan)          4
C P4 model 0  (Willamette)      ?
C P4 model 1  (?)               ?
C P4 model 2  (Northwood)       3.9
C P4 model 3  (Prescott)        ?
C P4 model 4  (Nocona)          ?
C K6:                           -
C K7:                           -
C K8:                           ?

C 64-bit                     popcount        hamdist
C                           cycles/limb     cycles/limb
C P4 model 4 (Nocona):          8
C K8:                           7.5
C K10:                          3.5
C P6 core2:                     3.68
C P6 corei7:                    3.15

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer-based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp register and then a register-based pand
C    is faster.
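
C The per-xmm reduction below is the classic parallel (SWAR) popcount.
C For each 64-bit word it computes, in C notation (illustration only):
C
C   x -= (x >> 1) & 0x5555555555555555;
C   x  = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
C   x  = (x & 0x0f0f0f0f0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f0f0f0f0f);
C
C after which `psadbw' against an all-zero register sums the bytes of
C each 64-bit half, and the result is accumulated into %xmm3.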

ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`	$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`	$1')
')
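
C `LIMB32' lines assemble only in the 32-bit build and `LIMB64' lines only
C in the 64-bit build; the unused macro expands to `dnl', dropping its line.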

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEA(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

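C Load the three bit-twiddling masks; they sit just below the cnsts label
C (see the mask table at the end of the file).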
	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0
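					C zap limbs below the passed `up'
					C (low-end masks at cnsts+64)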
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'

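C One xmm word (already loaded and masked) goes straight to `L(sum)',
C exactly two words take the `L(lsum)' path, and longer operands enter
C the main loop at `L(ent)'.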
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

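C 1st round: count bits in 2-bit fields, x - ((x >> 1) & 0x55..55)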
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

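C 2nd round: 4-bit fields, (x & 0x33..33) + ((x >> 2) & 0x33..33)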
	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

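C 3rd round: add the two streams, then sum nibbles into bytes,
C (x & 0x0f..0f) + ((x >> 4) & 0x0f..0f)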
	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

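C sum the byte counts of each 64-bit half via psadbw against zero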
	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0
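					C zap limbs past the end of the operand
					C (high-end masks at cnsts); then redo
					C the three reduction rounds on the
					C final one or two xmm words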

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)