xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/popham.asm (revision 70f7362772ba52b749c976fb5e86e39a8b2c9afc)
1dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
2dnl  distance.
3
4dnl  Copyright 2000-2002 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C			     popcount	     hamdist
36C P3 generic			6.5		7
37C P3 model 9  (Banias)          5.7		6.1
38C P3 model 13 (Dothan)		5.75		6
39C K7				5		6
40
41C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
42C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
43C
44C The code here is almost certainly not optimal, but is already a 3x speedup
45C over the generic C code.  The main improvement would be to interleave
46C processing of two qwords in the loop so as to fully exploit the available
47C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
48C
49C The loop is based on the example "Efficient 64-bit population count using
50C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
51C page 158 of rev E (reference in mpn/x86/k7/README).
52
53ifdef(`OPERATION_popcount',,
54`ifdef(`OPERATION_hamdist',,
55`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
56')')')
57
58define(HAM,
59m4_assert_numargs(1)
60`ifdef(`OPERATION_hamdist',`$1')')
61
62define(POP,
63m4_assert_numargs(1)
64`ifdef(`OPERATION_popcount',`$1')')
65
66HAM(`
67defframe(PARAM_SIZE,   12)
68defframe(PARAM_SRC2,   8)
69defframe(PARAM_SRC,    4)
70define(M4_function,mpn_hamdist)
71')
72POP(`
73defframe(PARAM_SIZE,   8)
74defframe(PARAM_SRC,    4)
75define(M4_function,mpn_popcount)
76')
77
78MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
79
80
81ifdef(`PIC',,`
82	dnl  non-PIC
83
84	RODATA
85	ALIGN(8)
86
87L(rodata_AAAAAAAAAAAAAAAA):
88	.long	0xAAAAAAAA
89	.long	0xAAAAAAAA
90
91L(rodata_3333333333333333):
92	.long	0x33333333
93	.long	0x33333333
94
95L(rodata_0F0F0F0F0F0F0F0F):
96	.long	0x0F0F0F0F
97	.long	0x0F0F0F0F
98')
99
100	TEXT
101	ALIGN(32)
102
103PROLOGUE(M4_function)
104deflit(`FRAME',0)
105
106	movl	PARAM_SIZE, %ecx
107
108ifdef(`PIC',`
109	movl	$0xAAAAAAAA, %eax
110	movl	$0x33333333, %edx
111
112	movd	%eax, %mm7
113	movd	%edx, %mm6
114
115	movl	$0x0F0F0F0F, %eax
116
117	punpckldq %mm7, %mm7
118	punpckldq %mm6, %mm6
119
120	movd	%eax, %mm5
121	movd	%edx, %mm4
122
123	punpckldq %mm5, %mm5
124
125',`
126	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
127	movq	L(rodata_3333333333333333), %mm6
128	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
129')
130	pxor	%mm4, %mm4
131
132define(REG_AAAAAAAAAAAAAAAA,%mm7)
133define(REG_3333333333333333,%mm6)
134define(REG_0F0F0F0F0F0F0F0F,%mm5)
135define(REG_0000000000000000,%mm4)
136
137
138	movl	PARAM_SRC, %eax
139HAM(`	movl	PARAM_SRC2, %edx')
140
141	pxor	%mm2, %mm2	C total
142
143	shrl	%ecx
144	jnc	L(top)
145
146	movd	(%eax,%ecx,8), %mm1
147
148HAM(`	movd	(%edx,%ecx,8), %mm0
149	pxor	%mm0, %mm1
150')
151	orl	%ecx, %ecx
152	jmp	L(loaded)
153
154
155	ALIGN(16)
156L(top):
157	C eax	src
158	C ebx
159	C ecx	counter, qwords, decrementing
160	C edx	[hamdist] src2
161	C
162	C mm0	(scratch)
163	C mm1	(scratch)
164	C mm2	total (low dword)
165	C mm3
166	C mm4	\
167	C mm5	| special constants
168	C mm6	|
169	C mm7	/
170
171	movq	-8(%eax,%ecx,8), %mm1
172
173HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
174	decl	%ecx
175
176L(loaded):
177	movq	%mm1, %mm0
178	pand	REG_AAAAAAAAAAAAAAAA, %mm1
179
180	psrlq	$1, %mm1
181
182	psubd	%mm1, %mm0	C bit pairs
183
184
185	movq	%mm0, %mm1
186	psrlq	$2, %mm0
187
188	pand	REG_3333333333333333, %mm0
189	pand	REG_3333333333333333, %mm1
190
191	paddd	%mm1, %mm0	C nibbles
192
193
194	movq	%mm0, %mm1
195	psrlq	$4, %mm0
196
197	pand	REG_0F0F0F0F0F0F0F0F, %mm0
198	pand	REG_0F0F0F0F0F0F0F0F, %mm1
199
200	paddd	%mm1, %mm0	C bytes
201
202
203	psadbw(	%mm4, %mm0)
204
205	paddd	%mm0, %mm2	C add to total
206	jnz	L(top)
207
208
209	movd	%mm2, %eax
210	emms
211	ret
212
213EPILOGUE()
214