xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/k10/popcount.asm (revision 6d322f2f4598f0d8a138f10ea648ec4fabe41f8b)
1dnl  AMD64 mpn_popcount -- population count.
2
3dnl  Copyright 2008, 2010, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C		    cycles/limb
23C AMD K8,K9		 n/a
24C AMD K10		 1.125
25C Intel P4		 n/a
26C Intel core2		 n/a
27C Intel corei		 1.25
28C Intel atom		 n/a
29C VIA nano		 n/a
30
31C * The zero-offset of popcount is misassembled to the offset-less form, which
32C   is one byte shorter and therefore will mess up the switching code.
33C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn,
34C   which is the main reason for our usage of '.byte'.
35
36C TODO
37C  * Improve switching code, the current code sucks.
38
C Register aliases for the two arguments as the function body uses them:
C up is the base address of the limbs, n the limb count.
39define(`up',		`%rdi')
40define(`n',		`%rsi')
41
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined
C outside this file.  Presumably they declare the supported calling
C conventions, open the assembly output, select the code section and align
C the function entry to 32 bytes -- confirm against their definitions.
42ABI_SUPPORT(DOS64)
43ABI_SUPPORT(STD64)
44
45ASM_START()
46	TEXT
47	ALIGN(32)
C mpn_popcount(up, n)
C
C Adds up the popcnt of n consecutive 64-bit limbs starting at up and leaves
C the total in %rax.  The main loop is unrolled eight times; the leftover
C (n mod 8) limbs are handled by jumping into the middle of the first pass.
C
C Registers: up (%rdi) limb pointer, n (%rsi) limb count and later a negative
C index, %rcx entry skip count / jump offset, %rdx jump target, %r8 and %r9
C per-limb counts, %rax running total.
C
C NOTE(review): for n = 0 the skip count is 0 and a full pass of eight limbs
C is read, so callers presumably guarantee n >= 1 -- confirm.
C NOTE(review): FUNC_ENTRY, FUNC_EXIT and R32 are defined outside this file;
C R32(reg) is presumably the 32-bit name of reg, and FUNC_ENTRY(2)/FUNC_EXIT()
C presumably the ABI-dependent entry/exit glue for two arguments -- confirm.
48PROLOGUE(mpn_popcount)
49	FUNC_ENTRY(2)
50
C The first ifelse branch is the one assembled (1 equals 1); the second is an
C alternative coding of the same setup and is never assembled.
51ifelse(1,1,`
52	lea	(up,n,8), up			C up = one past the last limb
53
54C	mov	R32(n), R32(%rcx)
55C	neg	R32(%rcx)
56	imul	$-1, R32(n), R32(%rcx)		C ecx = -n, replaces the mov/neg pair above
57	and	$8-1, R32(%rcx)			C ecx = (-n) mod 8 = table entries to skip
58
59	neg	n				C n = -n, a negative index counting up to zero
60
61	mov	R32(%rcx), R32(%rax)
62	neg	%rax				C rax = -skip
63	lea	(up,%rax,8),up			C up -= 8*skip, so the first entry run reads the original up[0]
64
65	xor	R32(%rax), R32(%rax)		C running total = 0
66
67	lea	(%rcx,%rcx,4), %rcx		C rcx = 5*skip
68
69	lea	L(top)(%rip), %rdx		C rdx = address of table entry 0
70	lea	(%rdx,%rcx,2), %rdx		C rdx += 10*skip, each entry being 10 bytes
71	jmp	*%rdx				C enter the unrolled pass at entry number skip
72',`
73	lea	(up,n,8), up			C up = one past the last limb
74
75	mov	R32(n), R32(%rcx)
76	neg	R32(%rcx)			C ecx = -n
77	and	$8-1, R32(%rcx)			C ecx = (-n) mod 8 = table entries to skip
78
79	neg	n				C n = -n
80
81	mov	R32(%rcx), R32(%rax)
82	shl	$3, R32(%rax)			C eax = 8*skip
83	sub	%rax, up			C up -= 8*skip
84
85	xor	R32(%rax), R32(%rax)		C running total = 0
86
87C	add	R32(%rcx), R32(%rcx)	C 2x
88C	lea	(%rcx,%rcx,4), %rcx	C 10x
89	imul	$10, R32(%rcx)			C ecx = 10*skip, byte offset into the table
90
91	lea	L(top)(%rip), %rdx
92	add	%rcx, %rdx			C rdx = address of entry number skip
93	jmp	*%rdx
94')
95
C Unrolled pass: eight entries, one limb each.  Every entry has to assemble
C to exactly 10 bytes (7 from .byte plus the add) because the dispatch code
C computes its target as the address of entry 0 plus 10*skip.  The comment on
C each entry gives the value of n mod 8 for which execution starts there.
96	ALIGN(32)
97L(top):
98C 0 = n mod 8
99	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00	C popcnt 0(up,n,8), %r8
100	add	%r8, %rax
101C 7 = n mod 8
102	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08	C popcnt 8(up,n,8), %r9
103	add	%r9, %rax
104C 6 = n mod 8
105	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10	C popcnt 16(up,n,8), %r8
106	add	%r8, %rax
107C 5 = n mod 8
108	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18	C popcnt 24(up,n,8), %r9
109	add	%r9, %rax
110C 4 = n mod 8
111	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20	C popcnt 32(up,n,8), %r8
112	add	%r8, %rax
113C 3 = n mod 8
114	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28	C popcnt 40(up,n,8), %r9
115	add	%r9, %rax
116C 2 = n mod 8
117	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30	C popcnt 48(up,n,8), %r8
118	add	%r8, %rax
119C 1 = n mod 8
120	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38	C popcnt 56(up,n,8), %r9
121	add	%r9, %rax
122
123	add	$8, n			C step the negative index past the pass just done
124	js	L(top)			C another full pass while the index is still negative
125	FUNC_EXIT()
126	ret
127EPILOGUE()
128