xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/mmx/popham.asm (revision b1bb3099bf4d47bbe8c7be5b78240a535263771f)
1dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
2dnl  hamming distance.
3
4dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C			     popcount	     hamdist
25C P3 model 9  (Banias)		?		?
26C P3 model 13 (Dothan)		6		6
27C P4 model 0  (Willamette)
28C P4 model 1  (?)
29C P4 model 2  (Northwood)	8		9
30C P4 model 3  (Prescott)	8		9
31C P4 model 4  (Nocona)
32
33C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
34C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
35C
36C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
37C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
38C and using them saves fiddling about with alignment testing on entry.
39C
40C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
41C might be possible, but 8 c/l relying on out-of-order execution is already
42C quite reasonable.
43
C Build-time guard.  This one source file provides two entry points, and
C the build selects between them by defining OPERATION_popcount or
C OPERATION_hamdist.  When neither symbol is defined, m4_error is invoked
C with the message below.
44ifdef(`OPERATION_popcount',,
45`ifdef(`OPERATION_hamdist',,
46`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
47')')')
48
C HAM(text) expands to its argument only when OPERATION_hamdist is defined,
C and to nothing otherwise.  It wraps the lines needed by the hamming
C distance variant alone.
C NOTE(review): m4_assert_numargs is defined outside this file; it
C presumably enforces the single-argument usage -- confirm.
49define(HAM,
50m4_assert_numargs(1)
51`ifdef(`OPERATION_hamdist',`$1')')
52
C POP(text) expands to its argument only when OPERATION_popcount is
C defined, and to nothing otherwise.  It wraps the lines needed by the
C population count variant alone.
C NOTE(review): m4_assert_numargs is defined outside this file; it
C presumably enforces the single-argument usage -- confirm.
53define(POP,
54m4_assert_numargs(1)
55`ifdef(`OPERATION_popcount',`$1')')
56
C Per-variant stack parameter offsets and exported function name.
C The offsets follow the prototypes given earlier in this file: hamdist
C takes (src, src2, size) and popcount takes (src, size), with successive
C arguments 4 bytes apart starting at offset 4.  M4_function is the name
C that is later passed to PROLOGUE.
C NOTE(review): defframe is defined outside this file; offset 4 presumably
C skips the return address on the stack -- confirm.
57HAM(`
58defframe(PARAM_SIZE, 12)
59defframe(PARAM_SRC2,  8)
60defframe(PARAM_SRC,   4)
61define(M4_function,mpn_hamdist)
62')
63POP(`
64defframe(PARAM_SIZE,  8)
65defframe(PARAM_SRC,   4)
66define(M4_function,mpn_popcount)
67')
68
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file.  It is
C given the names of both functions this source can be built as, and
C presumably announces them to the build machinery -- confirm.
69MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
70
71
C Bit masks for the three folding steps of the bit count.  Each mask is
C stored as two .long values, giving an 8-byte constant that the function
C body loads with a single movq; ALIGN(8) keeps those loads aligned.
C This table is emitted only when PIC is not defined.  For PIC builds the
C function body constructs the same three values in registers from
C immediates instead.
72ifdef(`PIC',,`
73	dnl  non-PIC
74	RODATA
75	ALIGN(8)
76L(rodata_AAAAAAAAAAAAAAAA):
77	.long	0xAAAAAAAA
78	.long	0xAAAAAAAA
79L(rodata_3333333333333333):
80	.long	0x33333333
81	.long	0x33333333
82L(rodata_0F0F0F0F0F0F0F0F):
83	.long	0x0F0F0F0F
84	.long	0x0F0F0F0F
85')
86
C M4_function -- mpn_popcount or mpn_hamdist, depending on which
C OPERATION_ symbol the build defined.
C
C Inputs (on the stack, at the defframe offsets): src, [hamdist] src2, size.
C Result: the low 32 bits of the accumulated bit count, returned in eax.
C
C popcount counts the set bits in the size limbs at src.  hamdist counts
C the set bits in src[i] xor src2[i].  Limbs are 32 bits wide and are
C processed two at a time in one 64-bit MMX register; L(last) handles a
C single limb, which is needed when size is odd.
C
C Registers: eax and ecx are overwritten, edx is overwritten for hamdist
C and for PIC builds, and mm0 to mm7 are used (mm3 only by hamdist).  emms
C is executed before returning.
C
C NOTE(review): a size of 0 is not handled.  The initial subl would borrow,
C ja would not be taken, and L(last) would load from index -1.  Callers are
C presumably required to pass size >= 1 -- confirm.
87	TEXT
88	ALIGN(16)
89
90PROLOGUE(M4_function)
91deflit(`FRAME',0)
92
	C ecx = size in limbs, eax = src pointer
93	movl	PARAM_SIZE, %ecx
94	movl	PARAM_SRC, %eax
95
	C Set up the three masks in mm7, mm6 and mm5, and for hamdist load the
	C src2 pointer into edx.  The PIC branch references no memory
	C constants: each mask is built from a 32-bit immediate, and punpckldq
	C of a register with itself copies the low dword into the high dword.
	C The non-PIC branch loads the 8-byte rodata constants with movq.
96ifdef(`PIC',`
97	movl	$0xAAAAAAAA, %edx
98	movd	%edx, %mm7
99	punpckldq %mm7, %mm7
100
101	movl	$0x33333333, %edx
102	movd	%edx, %mm6
103	punpckldq %mm6, %mm6
104
105	movl	$0x0F0F0F0F, %edx
106	movd	%edx, %mm5
107	punpckldq %mm5, %mm5
108
109HAM(`	movl	PARAM_SRC2, %edx')
110
111',`
112	dnl non-PIC
113HAM(`	movl	PARAM_SRC2, %edx')
114	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
115	movq	L(rodata_3333333333333333), %mm6
116	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
117')
118
119	pxor	%mm4, %mm4		C zero
120	pxor	%mm0, %mm0		C total
121
	C ecx = size-1.  ja is an unsigned test, so the two-limb loop is
	C entered only when size-1 is above zero without a borrow, that is
	C when size >= 2.  A size of 1 falls through to L(last).
122	subl	$1, %ecx
123	ja	L(top)
124
	C Single-limb path.  For any size >= 1, ecx is 0 whenever this label
	C is reached (size was 1, or the loop left exactly one limb), so the
	C indexed load simply reads the limb at eax.  movd clears the upper
	C half of the destination, so the unused high dword counts as zero.
125L(last):
126	movd	(%eax,%ecx,4), %mm1		C src high limb
127HAM(`	movd	(%edx,%ecx,4), %mm2
128	pxor	%mm2, %mm1
129')
130	jmp	L(loaded)
131
132
133L(top):
134	C eax	src
135	C ebx
136	C ecx	counter, size-1 to 2 or 1, inclusive
137	C edx	[hamdist] src2
138	C
139	C mm0	total (low dword)
140	C mm1	(scratch)
141	C mm2	(scratch)
142	C mm3
143	C mm4	0x0000000000000000
144	C mm5	0x0F0F0F0F0F0F0F0F
145	C mm6	0x3333333333333333
146	C mm7	0xAAAAAAAAAAAAAAAA
147
	C Fetch two limbs with a pair of movd loads and merge them into one
	C 64-bit value in mm1 (first limb in the low dword), then advance src.
148	movd	(%eax), %mm1
149	movd	4(%eax), %mm2
150	punpckldq %mm2, %mm1
151	addl	$8, %eax
152
	C hamdist only: fetch the matching two limbs of src2 the same way and
	C xor them in, so that the differing bits are what gets counted.
153HAM(`	movd	(%edx), %mm2
154	movd	4(%edx), %mm3
155	punpckldq %mm3, %mm2
156	pxor	%mm2, %mm1
157	addl	$8, %edx
158')
159
	C mm1 = 64 bits to count.
	C Step 1: x - ((x & 0xAA..) >> 1) leaves in every 2-bit field the
	C number of bits that were set in that field.
160L(loaded):
161	movq	%mm7, %mm2
162	pand	%mm1, %mm2
163	psrlq	$1, %mm2
164	psubd	%mm2, %mm1	C bit pairs
165
	C Step 2: (x & 0x33..) + ((x >> 2) & 0x33..) adds neighbouring 2-bit
	C counts, giving one count per nibble.
166	movq	%mm6, %mm2
167	pand	%mm1, %mm2
168	psrlq	$2, %mm1
169	pand	%mm6, %mm1
170	paddd	%mm2, %mm1	C nibbles
171
	C Step 3: (x & 0x0F..) + ((x >> 4) & 0x0F..) adds neighbouring nibble
	C counts, giving one count per byte.
172	movq	%mm5, %mm2
173	pand	%mm1, %mm2
174	psrlq	$4, %mm1
175	pand	%mm5, %mm1
176	paddd	%mm2, %mm1	C bytes
177
	C Sum the eight byte counts and add the sum to the running total.
	C NOTE(review): psadbw is written as a macro call here and is defined
	C outside this file; it presumably emits the psadbw instruction, whose
	C sum of absolute differences against the zero in mm4 is the sum of
	C the bytes of mm1 -- confirm.
178	psadbw(	%mm4, %mm1)
179	paddd	%mm1, %mm0	C to total
180
	C ecx is one less than the number of limbs still to do, and this
	C subtraction accounts for the two limbs just consumed.  A positive
	C result means at least two limbs remain, so loop again.
181	subl	$2, %ecx
182	jg	L(top)
183
184	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
	C (or -2 after a pass through L(last), which likewise means that no
	C further limbs remain, so both branches fall through)
185	jz	L(last)
186
187
	C Return the low dword of the total in eax, and leave the MMX state
	C with emms before returning.
188	movd	%mm0, %eax
189	emms
190	ret
191
192EPILOGUE()
193