xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mmx/popham.asm (revision 63aea4bd5b445e491ff0389fe27ec78b3099dba3)
1dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
2dnl  hamming distance.
3
4dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C        popcount  hamdist
25C K6-2:    9.0       11.5   cycles/limb
26C K6:      12.5      13.0
27
28
29C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
30C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
31C
32C The code here isn't optimal, but it's already a 2x speedup over the plain
33C integer mpn/generic/popcount.c,hamdist.c.
34
35
dnl  Build-time guard.  This one source file serves two functions, selected
dnl  by which of OPERATION_popcount or OPERATION_hamdist is defined.  When
dnl  neither is defined, m4_error is called with the message below and
dnl  m4exit(1) is invoked after it.
36ifdef(`OPERATION_popcount',,
37`ifdef(`OPERATION_hamdist',,
38`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
39')m4exit(1)')')
40
dnl  Usage: HAM(text)
dnl
dnl  Expands to text when OPERATION_hamdist is defined and to nothing
dnl  otherwise, so hamdist-only lines can sit inline in the shared code.
dnl  m4_assert_numargs is defined elsewhere; presumably it checks that
dnl  exactly one argument was supplied (TODO confirm).
41define(HAM,
42m4_assert_numargs(1)
43`ifdef(`OPERATION_hamdist',`$1')')
44
dnl  Usage: POP(text)
dnl
dnl  Expands to text when OPERATION_popcount is defined and to nothing
dnl  otherwise; the popcount counterpart of HAM.  m4_assert_numargs is
dnl  defined elsewhere; presumably it checks that exactly one argument was
dnl  supplied (TODO confirm).
45define(POP,
46m4_assert_numargs(1)
47`ifdef(`OPERATION_popcount',`$1')')
48
dnl  Parameter offsets and exported function name for the selected
dnl  operation.  The offsets follow the argument order of the C prototypes
dnl  given earlier in this file: hamdist takes (src, src2, size) and
dnl  popcount takes (src, size), with consecutive arguments 4 bytes apart.
dnl  defframe is defined elsewhere; the offsets are presumably relative to
dnl  the stack pointer at function entry, with the return address at
dnl  offset 0 (TODO confirm).
dnl
dnl  M4_function is the name later handed to PROLOGUE.
49HAM(`
50defframe(PARAM_SIZE,   12)
51defframe(PARAM_SRC2,   8)
52defframe(PARAM_SRC,    4)
53define(M4_function,mpn_hamdist)
54')
55POP(`
56defframe(PARAM_SIZE,   8)
57defframe(PARAM_SRC,    4)
58define(M4_function,mpn_popcount)
59')
60
dnl  Names the two functions this source file can provide.  MULFUNC_PROLOGUE
dnl  is defined elsewhere; presumably the build system reads this list to
dnl  decide which OPERATION_xxx variants to generate (TODO confirm).
61MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
62
63
dnl  Bit masks used by the counting loop.  Each is stored as two identical
dnl  32-bit halves, so a single movq fills a whole 64-bit MMX register.
dnl  The table is emitted only when PIC is not defined; the PIC build
dnl  constructs the same four values in registers from immediates instead.
64ifdef(`PIC',,`
65	dnl  non-PIC
66
67	RODATA
68	ALIGN(8)
69
	C upper bit of every 2-bit pair
70L(rodata_AAAAAAAAAAAAAAAA):
71	.long	0xAAAAAAAA
72	.long	0xAAAAAAAA
73
	C low two bits of every nibble
74L(rodata_3333333333333333):
75	.long	0x33333333
76	.long	0x33333333
77
	C low nibble of every byte
78L(rodata_0F0F0F0F0F0F0F0F):
79	.long	0x0F0F0F0F
80	.long	0x0F0F0F0F
81
	C low byte of every dword
82L(rodata_000000FF000000FF):
83	.long	0x000000FF
84	.long	0x000000FF
85')
86
87	TEXT
88	ALIGN(32)
89
90POP(`ifdef(`PIC', `
91	C avoid shrl crossing a 32-byte boundary
92	nop')')
93
C -----------------------------------------------------------------------------
C M4_function is mpn_popcount or mpn_hamdist, per the operation selected
C earlier in this file.
C
C In:   PARAM_SRC   pointer to the source limbs
C       PARAM_SRC2  [hamdist] pointer to the second source
C       PARAM_SIZE  number of 32-bit limbs
C Out:  eax         count of set bits in src (popcount), or in src XOR src2
C                   (hamdist); the total is kept in a 32-bit dword
C Uses: eax, ecx, mm0-mm2, mm4-mm7, plus edx in hamdist and PIC builds.
C       Nothing is pushed and no register is saved.
C
C Limbs are consumed two at a time as 64-bit quantities, working from the
C high end of the operands downwards.  An odd size is handled by running the
C first pass on the single top limb.
C
C NOTE(review): size == 0 is not handled.  ecx would enter the loop as 0 and
C the loop instruction would wrap it round; callers presumably guarantee
C size >= 1 (TODO confirm).
C -----------------------------------------------------------------------------
94PROLOGUE(M4_function)
95deflit(`FRAME',0)
96
97	movl	PARAM_SIZE, %ecx
98
99ifdef(`PIC',`
	C PIC: build the four masks from immediates so that no data reference
	C is needed.  Each value goes through eax or edx into the low dword of
	C an MMX register, then punpckldq copies that dword into the high dword.
100	movl	$0xAAAAAAAA, %eax
101	movl	$0x33333333, %edx
102
103	movd	%eax, %mm7
104	movd	%edx, %mm6
105
106	movl	$0x0F0F0F0F, %eax
107	movl	$0x000000FF, %edx
108
109	punpckldq %mm7, %mm7
110	punpckldq %mm6, %mm6
111
112	movd	%eax, %mm5
113	movd	%edx, %mm4
114
115	punpckldq %mm5, %mm5
116	punpckldq %mm4, %mm4
117',`
118
	C non-PIC: load the masks directly from the read-only constants
119	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
120	movq	L(rodata_3333333333333333), %mm6
121	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
122	movq	L(rodata_000000FF000000FF), %mm4
123')
124
C Symbolic names for the mask registers, used in the loop body below.
125define(REG_AAAAAAAAAAAAAAAA, %mm7)
126define(REG_3333333333333333, %mm6)
127define(REG_0F0F0F0F0F0F0F0F, %mm5)
128define(REG_000000FF000000FF, %mm4)
129
130
	C eax and edx are free for the operand pointers from here on (the PIC
	C path above used them only as temporaries).
131	movl	PARAM_SRC, %eax
132HAM(`	movl	PARAM_SRC2, %edx')
133
134	pxor	%mm2, %mm2	C total
135
	C ecx = size/2 = number of 64-bit pairs, carry = low bit of size.
	C Even size: go straight to the loop.  Odd size: fall through and
	C process the top limb on its own first.
136	shrl	%ecx
137	jnc	L(top)
138
	C Odd size: load the single 32-bit top limb (limb index 2*ecx).  movd
	C clears the high half of the MMX register, so that half adds nothing
	C to the count.  Zdisp is defined elsewhere; presumably it forces an
	C explicit zero displacement byte for code alignment (TODO confirm).
139Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
140
141HAM(`
142Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
143	pxor	%mm0, %mm1
144')
145
	C account for this extra pass, since loop decrements ecx once per pass
146	incl	%ecx
147	jmp	L(loaded)
148
149
150	ALIGN(16)
151POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
152
153L(top):
154	C eax	src
155	C ebx
156	C ecx	counter, qwords, decrementing
157	C edx	[hamdist] src2
158	C
159	C mm0	(scratch)
160	C mm1	(scratch)
161	C mm2	total (low dword)
162	C mm3
163	C mm4	\
164	C mm5	| special constants
165	C mm6	|
166	C mm7	/
167
	C fetch 64 bits (two limbs): pair number ecx-1, counting from the low
	C end; for hamdist, XOR with the matching pair from src2
168	movq	-8(%eax,%ecx,8), %mm1
169HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
170
171L(loaded):
	C mm1 = the 64 bits to count.
	C Step 1: x - ((x & 0xAA..) >> 1) leaves every 2-bit field holding the
	C number of bits that were set in it (0, 1 or 2).
172	movq	%mm1, %mm0
173	pand	REG_AAAAAAAAAAAAAAAA, %mm1
174
175	psrlq	$1, %mm1
176HAM(`	nop			C code alignment')
177
178	psubd	%mm1, %mm0	C bit pairs
179HAM(`	nop			C code alignment')
180
181
	C Step 2: add neighbouring 2-bit fields; every nibble then holds 0..4.
182	movq	%mm0, %mm1
183	psrlq	$2, %mm0
184
185	pand	REG_3333333333333333, %mm0
186	pand	REG_3333333333333333, %mm1
187
188	paddd	%mm1, %mm0	C nibbles
189
190
	C Step 3: add neighbouring nibbles; every byte then holds 0..8.
191	movq	%mm0, %mm1
192	psrlq	$4, %mm0
193
194	pand	REG_0F0F0F0F0F0F0F0F, %mm0
195	pand	REG_0F0F0F0F0F0F0F0F, %mm1
196
197	paddd	%mm1, %mm0	C bytes
198
	C Step 4: add each byte to the byte above it.  No mask is applied; the
	C add is bytewise and a sum is at most 16, so nothing carries between
	C bytes.  The even-numbered bytes now hold the per-word counts.
199	movq	%mm0, %mm1
200	psrlq	$8, %mm0
201
202
203	paddb	%mm1, %mm0	C words
204
205
	C Step 5: add the upper word count onto the lower one.  The low byte of
	C each dword then holds that limb's count (at most 32); the other bytes
	C hold leftovers, which the 0xFF mask below removes.
206	movq	%mm0, %mm1
207	psrlq	$16, %mm0
208
209	paddd	%mm1, %mm0	C dwords
210
211	pand	REG_000000FF000000FF, %mm0
212
	C Accumulate both limb counts into the low dword of mm2.  The high
	C dword of mm2 also changes, but it is never read.
213	paddd	%mm0, %mm2	C low to total
214	psrlq	$32, %mm0
215
216	paddd	%mm0, %mm2	C high to total
	C loop decrements ecx and branches back while it is nonzero
217	loop	L(top)
218
219
220
	C Return the low dword of the total.  emms_or_femms is defined
	C elsewhere; presumably it emits emms or femms so the FPU is usable
	C again after the MMX code (TODO confirm).
221	movd	%mm2, %eax
222	emms_or_femms
223	ret
224
225EPILOGUE()
226