dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007, 2011, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit		     popcount	     hamdist
C			    cycles/limb	    cycles/limb
C P5				-
C P6 model 0-8,10-12		-
C P6 model 9  (Banias)		?
C P6 model 13 (Dothan)		4
C P4 model 0  (Willamette)	?
C P4 model 1  (?)		?
C P4 model 2  (Northwood)	3.9
C P4 model 3  (Prescott)	?
C P4 model 4  (Nocona)		?
C AMD K6			-
C AMD K7			-
C AMD K8			?

C 64-bit		     popcount	     hamdist
C			    cycles/limb	    cycles/limb
C P4 model 4 (Nocona):		8
C AMD K8,K9			7.5
C AMD K10			3.5
C Intel core2			3.68
C Intel corei			3.15
C Intel atom		       10.8
C VIA nano			6.5

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This would allow a
C    parallel integer-based popcount.  Such a combined loop would handle 6
C    limbs in about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp register and then a register-based pand
C    is faster.

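C For reference, the bit-twiddling below performs, within each xmm register,
C the classic SWAR popcount reduction; a scalar sketch (not part of the
C build, 32-bit words assumed):
C
C	x -= (x >> 1) & 0x55555555;                      (2-bit counts)
C	x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  (4-bit counts)
C	x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);  (byte counts)
C
C psadbw against zero then sums the bytes of each xmm half into a 64-bit
C count, which is accumulated in xmm3.
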
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`	$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`	$1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

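C For example, with 32-bit limbs these evaluate to GMP_LIMB_BYTES = 4,
C LIMBS_PER_XMM = 4 and LIMBS_PER_2XMM = 8; with 64-bit limbs, to 8, 2 and 4.
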
undefine(`psadbw')			C override inherited m4 version

C This file is shared between 32-bit and 64-bit builds.  Only the former has
C LEAL.  Default LEAL as an alias of LEA.
ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEAL(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'

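C E.g. with 32-bit limbs and up == 8 (mod 16): areg = 8, so the low-end mask
C at 64+32(breg) clears the two limbs below the original up, the shr leaves
C areg = 2 (limbs skipped), and n grows by 2 to cover them.
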
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

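C Count bits within 2-bit fields: x -= (x >> 1) & 0x55..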
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

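C Fold into 4-bit fields: x = (x & 0x33..) + ((x >> 2) & 0x33..)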
	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

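C Combine the two vectors and fold into per-byte counts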
	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

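C psadbw against zero sums the 16 byte counts into the two 64-bit halves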
	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0

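C The shl/and pair turns the number of remaining limbs into a byte offset
C into the high-end masks, so the pand clears any limbs past the end (e.g.
C 32-bit limbs with 3 limbs left: n = -1 becomes 12, keeping 12 bytes).
C What follows is the same 1-2-4 bit reduction as in the main loop, for the
C final one or two vectors.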
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
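C The magic constants above are loaded from -48, -32 and -16 off cnsts; the
C masks below are indexed by byte offset within a 16-byte block: the code
C uses 64(breg,areg,4) to pick a low-end mask for the first partial block
C and (breg,n,4) to pick a high-end mask for the last.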
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
ASM_END()