xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/popham.asm (revision ead2c0eee3abe6bcf08c63bfc78eb8a93a579b2b)
1dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
2dnl  distance.
3
4dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C			     popcount	     hamdist
25C P3 generic			6.5		7
26C P3 model 9  (Banias)          ?		?
27C P3 model 13 (Dothan)		5.75		6
28C K7				5		6
29
30C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
31C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
32C
33C The code here is almost certainly not optimal, but is already a 3x speedup
34C over the generic C code.  The main improvement would be to interleave
35C processing of two qwords in the loop so as to fully exploit the available
36C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
37C
38C The loop is based on the example "Efficient 64-bit population count using
39C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
40C page 158 of rev E (reference in mpn/x86/k7/README).
41
C At least one of OPERATION_popcount or OPERATION_hamdist must be defined
C when this file is processed; if neither is, m4_error is invoked with the
C message below.
42ifdef(`OPERATION_popcount',,
43`ifdef(`OPERATION_hamdist',,
44`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
45')')')
46
C HAM(text) expands to its single argument when OPERATION_hamdist is
C defined and to nothing otherwise.  It marks hamdist-only code.
47define(HAM,
48m4_assert_numargs(1)
49`ifdef(`OPERATION_hamdist',`$1')')
50
C POP(text) is the popcount counterpart: it expands to its single argument
C only when OPERATION_popcount is defined.
51define(POP,
52m4_assert_numargs(1)
53`ifdef(`OPERATION_popcount',`$1')')
54
C Per-operation parameter offsets and function name.  hamdist takes
C (src, src2, size) at 4, 8, 12; popcount takes (src, size) at 4, 8.
C NOTE(review): defframe is defined outside this file; the numbers are
C presumably byte offsets from esp at function entry -- confirm there.
55HAM(`
56defframe(PARAM_SIZE,   12)
57defframe(PARAM_SRC2,   8)
58defframe(PARAM_SRC,    4)
59define(M4_function,mpn_hamdist)
60')
61POP(`
62defframe(PARAM_SIZE,   8)
63defframe(PARAM_SRC,    4)
64define(M4_function,mpn_popcount)
65')
66
C Lists both entry points this source can be built as.
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
C it is what the build scans to learn which functions are provided here.
67MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
68
69
C Non-PIC builds only: the three 64-bit bit masks used by the counting
C loop, stored as pairs of identical 32-bit words on an 8-byte boundary so
C that each can be fetched with a single movq.  PIC builds emit nothing
C here and construct the same masks in registers in the function prologue.
70ifdef(`PIC',,`
71	dnl  non-PIC
72
73	RODATA
74	ALIGN(8)
75
76L(rodata_AAAAAAAAAAAAAAAA):
77	.long	0xAAAAAAAA	C upper bit of every 2-bit pair
78	.long	0xAAAAAAAA
79
80L(rodata_3333333333333333):
81	.long	0x33333333	C low 2 bits of every nibble
82	.long	0x33333333
83
84L(rodata_0F0F0F0F0F0F0F0F):
85	.long	0x0F0F0F0F	C low nibble of every byte
86	.long	0x0F0F0F0F
87')
88
89	TEXT
90	ALIGN(32)
91
C M4_function: mpn_popcount (src, size) or mpn_hamdist (src, src2, size).
C
C Counts the set bits in the size limbs at src (popcount), or in the XOR of
C the limbs at src and src2 (hamdist), and returns the count in eax.
C
C In:      PARAM_SRC, PARAM_SIZE and, for hamdist, PARAM_SRC2 on the stack.
C Out:     eax = bit count (low 32 bits of the mm2 accumulator).
C Writes:  eax, ecx, edx (PIC and/or hamdist builds), mm0-mm2, mm4-mm7,
C          flags.  emms is executed before returning.
C
C Limbs are handled two at a time as one 64-bit qword, from the highest
C address downwards.  An odd size is handled by first processing the top
C limb alone, zero extended to 64 bits.
C
C size must be at least 1.  A zero size is not handled: the qword counter
C would be decremented past zero before the loop exit test is reached.
92PROLOGUE(M4_function)
93deflit(`FRAME',0)
94
95	movl	PARAM_SIZE, %ecx	C ecx = size in limbs
96
C Load the three mask constants into mm7, mm6 and mm5.  The PIC variant
C builds them from immediates instead of loading the rodata constants by
C address as the non-PIC variant does.
97ifdef(`PIC',`
98	movl	$0xAAAAAAAA, %eax
99	movl	$0x33333333, %edx
100
101	movd	%eax, %mm7
102	movd	%edx, %mm6
103
104	movl	$0x0F0F0F0F, %eax
105
106	punpckldq %mm7, %mm7	C copy low dword into high dword
107	punpckldq %mm6, %mm6
108
109	movd	%eax, %mm5
111
112	punpckldq %mm5, %mm5
113
114',`
115	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
116	movq	L(rodata_3333333333333333), %mm6
117	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
118')
119	pxor	%mm4, %mm4	C mm4 = 0 on both paths, operand for psadbw
120
121define(REG_AAAAAAAAAAAAAAAA,%mm7)
122define(REG_3333333333333333,%mm6)
123define(REG_0F0F0F0F0F0F0F0F,%mm5)
124define(REG_0000000000000000,%mm4)
125
126
127	movl	PARAM_SRC, %eax
128HAM(`	movl	PARAM_SRC2, %edx')
129
130	pxor	%mm2, %mm2	C total
131
132	shrl	%ecx		C ecx = whole qwords, CF = 1 if size is odd
133	jnc	L(top)		C even size: straight into the qword loop
134
C Odd size: the unpaired top limb sits just above the ecx whole qwords.
C movd zero extends it to 64 bits so it can share the counting code.
135	movd	(%eax,%ecx,8), %mm1
136
137HAM(`	movd	(%edx,%ecx,8), %mm0
138	pxor	%mm0, %mm1
139')
C Set ZF from the remaining qword count, standing in for the decl that is
C skipped on this entry path, so the jnz at the loop end tests it.
140	orl	%ecx, %ecx
141	jmp	L(loaded)
142
143
144	ALIGN(16)
145L(top):
146	C eax	src
147	C ebx
148	C ecx	counter, qwords, decrementing
149	C edx	[hamdist] src2
150	C
151	C mm0	(scratch)
152	C mm1	(scratch)
153	C mm2	total (low dword)
154	C mm3
155	C mm4	\
156	C mm5	| special constants
157	C mm6	|
158	C mm7	/
159
160	movq	-8(%eax,%ecx,8), %mm1	C highest qword not yet counted
161
162HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
C The flags set by this decl are consumed by the jnz at the loop end; the
C MMX instructions in between do not modify EFLAGS.
163	decl	%ecx
164
165L(loaded):
C Each 2-bit field: x - (x >> 1 & 1) leaves the number of set bits, 0 to 2.
166	movq	%mm1, %mm0
167	pand	REG_AAAAAAAAAAAAAAAA, %mm1
168
169	psrlq	$1, %mm1
170
171	psubd	%mm1, %mm0	C bit pairs
172
173
C Each nibble: add the two adjacent 2-bit counts, giving 0 to 4.
174	movq	%mm0, %mm1
175	psrlq	$2, %mm0
176
177	pand	REG_3333333333333333, %mm0
178	pand	REG_3333333333333333, %mm1
179
180	paddd	%mm1, %mm0	C nibbles
181
182
C Each byte: add the two adjacent nibble counts, giving 0 to 8.
183	movq	%mm0, %mm1
184	psrlq	$4, %mm0
185
186	pand	REG_0F0F0F0F0F0F0F0F, %mm0
187	pand	REG_0F0F0F0F0F0F0F0F, %mm1
188
189	paddd	%mm1, %mm0	C bytes
190
191
C Sum of absolute differences against zero (mm4) adds the eight byte
C counts together into the low word of mm0.
192	psadbw(	%mm4, %mm0)
193
194	paddd	%mm0, %mm2	C add to total
195	jnz	L(top)		C ZF still from decl, or orl on odd entry
196
197
198	movd	%mm2, %eax	C return value
199	emms
200	ret
201
202EPILOGUE()
203