xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/mmx/popham.asm (revision 5dd36a3bc8bf2a9dec29ceb6349550414570c447)
1dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
2dnl  hamming distance.
3
4dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C			     popcount	     hamdist
36C P3 model 9  (Banias)		?		?
37C P3 model 13 (Dothan)		6		6
38C P4 model 0  (Willamette)
39C P4 model 1  (?)
40C P4 model 2  (Northwood)	8		9
41C P4 model 3  (Prescott)	8		9
42C P4 model 4  (Nocona)
43
44C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
45C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
46C
47C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
48C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
49C and using them saves fiddling about with alignment testing on entry.
50C
51C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
52C might be possible, but 8 c/l relying on out-of-order execution is already
53C quite reasonable.
54
dnl  Guard: call m4_error with the message below unless OPERATION_popcount
dnl  or OPERATION_hamdist is defined.  When either symbol is defined the
dnl  corresponding ifdef selects its empty second argument, so nothing is
dnl  emitted.
55ifdef(`OPERATION_popcount',,
56`ifdef(`OPERATION_hamdist',,
57`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
58')')')
59
dnl  Usage: HAM(text)
dnl
dnl  Expands to text when OPERATION_hamdist is defined, and to nothing
dnl  otherwise.  Used in this file to wrap the parts needed only by
dnl  mpn_hamdist.  m4_assert_numargs is defined elsewhere; presumably it
dnl  checks that exactly one argument is supplied -- confirm.
60define(HAM,
61m4_assert_numargs(1)
62`ifdef(`OPERATION_hamdist',`$1')')
63
dnl  Usage: POP(text)
dnl
dnl  Expands to text when OPERATION_popcount is defined, and to nothing
dnl  otherwise.  Used in this file to wrap the parts needed only by
dnl  mpn_popcount.  m4_assert_numargs is defined elsewhere; presumably it
dnl  checks that exactly one argument is supplied -- confirm.
64define(POP,
65m4_assert_numargs(1)
66`ifdef(`OPERATION_popcount',`$1')')
67
dnl  Per-operation setup: the parameter frame macros and the function name.
dnl  The offsets 4, 8 and 12 follow the argument order of the C prototypes
dnl  given earlier in this file.  NOTE(review): presumably these are byte
dnl  offsets above the stack pointer at entry, past the return address --
dnl  confirm against the definition of defframe.  M4_function is the name
dnl  later passed to PROLOGUE.
68HAM(`
69defframe(PARAM_SIZE, 12)
70defframe(PARAM_SRC2,  8)
71defframe(PARAM_SRC,   4)
72define(M4_function,mpn_hamdist)
73')
74POP(`
75defframe(PARAM_SIZE,  8)
76defframe(PARAM_SRC,   4)
77define(M4_function,mpn_popcount)
78')
79
dnl  Names both entry points this source can provide.  NOTE(review):
dnl  presumably consumed by the build system to map functions to this
dnl  file -- confirm.
80MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
81
82
C Constant table for non-PIC builds only; the PIC branch of this ifdef is
C empty.  Each label names one 64-bit mask stored as two identical .long
C values, placed in RODATA under ALIGN(8).  The non-PIC entry code of the
C function loads them with movq into mm7, mm6 and mm5.
83ifdef(`PIC',,`
84	dnl  non-PIC
85	RODATA
86	ALIGN(8)
	C 0xAA.. selects the high bit of every 2-bit pair
87L(rodata_AAAAAAAAAAAAAAAA):
88	.long	0xAAAAAAAA
89	.long	0xAAAAAAAA
	C 0x33.. selects the low 2 bits of every nibble
90L(rodata_3333333333333333):
91	.long	0x33333333
92	.long	0x33333333
	C 0x0F.. selects the low nibble of every byte
93L(rodata_0F0F0F0F0F0F0F0F):
94	.long	0x0F0F0F0F
95	.long	0x0F0F0F0F
96')
97
C ---------------------------------------------------------------------------
C M4_function is mpn_popcount or mpn_hamdist, selected by OPERATION_popcount
C or OPERATION_hamdist.
C
C Inputs, read through the PARAM_ frame macros:
C   PARAM_SRC	pointer to the source limbs
C   PARAM_SRC2	[hamdist] pointer to the second source, xored with the first
C   PARAM_SIZE	number of limbs
C Output:
C   eax		low dword of the bit count accumulated in mm0
C
C Registers written: eax, ecx, mm0, mm1, mm2, mm4-mm7 and the flags in all
C builds; edx in PIC builds and for hamdist; mm3 for hamdist only.  emms is
C executed before the return.
C
C Limbs are handled as 32-bit quantities (movd loads, index scale 4).  L(top)
C counts two limbs per pass; L(last) counts one limb, either the only limb
C when size is 1 or the odd limb left over after the loop.
C
C NOTE(review): size == 0 is not checked.  In that case the first subl gives
C ecx = -1 with carry set, ja is not taken, and L(last) loads from src-4.
C Callers are presumably required to pass size >= 1 -- confirm.
C ---------------------------------------------------------------------------
98	TEXT
99	ALIGN(16)
100
101PROLOGUE(M4_function)
102deflit(`FRAME',0)
103
104	movl	PARAM_SIZE, %ecx	C ecx = size
105	movl	PARAM_SRC, %eax		C eax = src
106
107ifdef(`PIC',`
	C PIC: build each 64-bit mask from a 32-bit immediate, copied into
	C both halves by punpckldq, instead of loading it from the rodata
	C table.  edx is the scratch register for the immediates.
108	movl	$0xAAAAAAAA, %edx
109	movd	%edx, %mm7
110	punpckldq %mm7, %mm7
111
112	movl	$0x33333333, %edx
113	movd	%edx, %mm6
114	punpckldq %mm6, %mm6
115
116	movl	$0x0F0F0F0F, %edx
117	movd	%edx, %mm5
118	punpckldq %mm5, %mm5
119
	C [hamdist] edx = src2, loaded only once edx is finished as scratch
120HAM(`	movl	PARAM_SRC2, %edx')
121
122',`
123	dnl non-PIC
	C non-PIC: [hamdist] edx = src2, then load the masks from RODATA
124HAM(`	movl	PARAM_SRC2, %edx')
125	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
126	movq	L(rodata_3333333333333333), %mm6
127	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
128')
129
130	pxor	%mm4, %mm4		C zero
131	pxor	%mm0, %mm0		C total
132
133	subl	$1, %ecx		C ecx = size-1
134	ja	L(top)			C two or more limbs: enter the pair loop
135
136L(last):
	C One limb at eax + 4*ecx.  ecx is 0 here both on entry with size 1
	C and when reached from the jz after the loop.  movd clears the high
	C dword of the destination, so that half adds nothing to the count.
137	movd	(%eax,%ecx,4), %mm1		C src high limb
138HAM(`	movd	(%edx,%ecx,4), %mm2
139	pxor	%mm2, %mm1
140')
141	jmp	L(loaded)		C join the common counting code
142
143
144L(top):
145	C eax	src
146	C ebx	(not used)
147	C ecx	counter, size-1 to 2 or 1, inclusive
148	C edx	[hamdist] src2
149	C
150	C mm0	total (low dword)
151	C mm1	(scratch)
152	C mm2	(scratch)
153	C mm3	[hamdist] (scratch)
154	C mm4	0x0000000000000000
155	C mm5	0x0F0F0F0F0F0F0F0F
156	C mm6	0x3333333333333333
157	C mm7	0xAAAAAAAAAAAAAAAA
158
	C mm1 = two consecutive limbs from src, low limb in the low dword;
	C src advances by 8 bytes
159	movd	(%eax), %mm1
160	movd	4(%eax), %mm2
161	punpckldq %mm2, %mm1
162	addl	$8, %eax
163
	C [hamdist] the same for src2, then mm1 = src limbs xor src2 limbs
164HAM(`	movd	(%edx), %mm2
165	movd	4(%edx), %mm3
166	punpckldq %mm3, %mm2
167	pxor	%mm2, %mm1
168	addl	$8, %edx
169')
170
171L(loaded):
	C mm1 holds the 64 bits to count.
	C
	C Pairs: subtract the high bit of each 2-bit pair, shifted down one
	C place, from the pair itself.  Each pair then holds its own bit
	C count, 0 to 2.
172	movq	%mm7, %mm2
173	pand	%mm1, %mm2
174	psrlq	$1, %mm2
175	psubd	%mm2, %mm1	C bit pairs
176
	C Nibbles: add the low pair and the high pair of each nibble, giving
	C a count of 0 to 4 per nibble.
177	movq	%mm6, %mm2
178	pand	%mm1, %mm2
179	psrlq	$2, %mm1
180	pand	%mm6, %mm1
181	paddd	%mm2, %mm1	C nibbles
182
	C Bytes: add the low nibble and the high nibble of each byte, giving
	C a count of 0 to 8 per byte.
183	movq	%mm5, %mm2
184	pand	%mm1, %mm2
185	psrlq	$4, %mm1
186	pand	%mm5, %mm1
187	paddd	%mm2, %mm1	C bytes
188
	C Sum of absolute differences against the zero in mm4 adds up the
	C eight byte counts of mm1.
189	psadbw(	%mm4, %mm1)
190	paddd	%mm1, %mm0	C to total
191
	C Taken while ecx stays above zero as a signed value, that is while
	C at least two limbs remain.
192	subl	$2, %ecx
193	jg	L(top)
194
195	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
	C (or -2 after the L(last) pass, which also means no further limbs)
196	jz	L(last)
197
198
199	movd	%mm0, %eax	C return value: low dword of the total
200	emms			C leave MMX state before returning
201	ret
202
203EPILOGUE()
204