xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/k10/popcount.asm (revision 8e33eff89e26cf71871ead62f0d5063e1313c33a)
1dnl  AMD64 mpn_popcount -- population count.
2
3dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C		    cycles/limb
34C AMD K8,K9		 n/a
35C AMD K10		 1.125
36C Intel P4		 n/a
37C Intel core2		 n/a
38C Intel corei		 1.25
39C Intel atom		 n/a
40C VIA nano		 n/a
41
42C * Assemblers encode the zero-offset popcnt in the offset-less form, which is
43C   one byte shorter and would break the 10-byte spacing the switching code needs.
44C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn,
45C   which is the main reason for our usage of '.byte'.
46
47C TODO
48C  * Improve switching code, the current code sucks.
49
C Symbolic names for the two argument registers.  In the code below up is
C used as the base address of the limbs and n as the limb count.
50define(`up',		`%rdi')
51define(`n',		`%rsi')
52
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined
C outside this file (presumably reached through config.m4); their expansions
C are not visible here -- confirm before relying on details.
53ABI_SUPPORT(DOS64)
54ABI_SUPPORT(STD64)
55
56ASM_START()
57	TEXT
C Presumably requests 32-byte alignment of the function entry -- confirm.
58	ALIGN(32)
C mpn_popcount(up, n)
C
C Sums the population counts of the n 64-bit limbs starting at up and leaves
C the total in %rax.
C
C Method: the loop at L(top) is unrolled 8 ways.  Each entry is a 7-byte
C popcnt (emitted with .byte) followed by a 3-byte add, 10 bytes in all.  The
C setup code computes k = (-n) mod 8 and jumps to L(top) + 10*k, so the first
C pass handles the n mod 8 leftover limbs (or a full 8 when n is a multiple
C of 8) and every later pass handles 8.
C
C Register use:
C   up (%rdi)	limb pointer, biased to &up[n] - 8*k
C   n  (%rsi)	negated count, raised by 8 per pass until non-negative
C   %rcx		k, then 5*k
C   %rdx		computed jump target
C   %rax		running total (scratch during setup)
C   %r8, %r9	alternating per-limb popcnt results
C
C NOTE(review): there is no n = 0 path; with n = 0 the code enters at entry 0
C and reads eight limbs.  Presumably callers guarantee n >= 1 -- confirm.
C NOTE(review): FUNC_ENTRY(2) and FUNC_EXIT() are macros defined outside this
C file; presumably they adapt the two arguments for the DOS64 ABI -- confirm.
59PROLOGUE(mpn_popcount)
60	FUNC_ENTRY(2)
61
C The ifelse below compares 1 with 1, so only its first branch is assembled;
C the second branch is an alternative setup sequence kept for reference.
62ifelse(1,1,`
C up = &up[n], one past the last limb.
63	lea	(up,n,8), up
64
65C	mov	R32(n), R32(%rcx)
66C	neg	R32(%rcx)
C %ecx = -n in 32 bits, done in one insn instead of the mov/neg pair above.
67	imul	$-1, R32(n), R32(%rcx)
C k = (-n) mod 8, the number of table entries to skip on the first pass.
68	and	$8-1, R32(%rcx)
69
C n = -n, so that it counts up toward zero.
70	neg	n
71
C up -= 8*k, cancelling the 8*k displacement of the entry that is jumped to.
72	mov	R32(%rcx), R32(%rax)
73	neg	%rax
74	lea	(up,%rax,8),up
75
C Clear the running total.
76	xor	R32(%rax), R32(%rax)
77
C %rcx = 5*k; the scaled lea below doubles it to the 10*k byte offset.
78	lea	(%rcx,%rcx,4), %rcx
79
C Jump to L(top) + 10*k.
80	lea	L(top)(%rip), %rdx
81	lea	(%rdx,%rcx,2), %rdx
82	jmp	*%rdx
83',`
C Alternative setup computing the same up, n, %rax and jump target; it is not
C assembled while the ifelse condition above stays 1,1.
84	lea	(up,n,8), up
85
86	mov	R32(n), R32(%rcx)
87	neg	R32(%rcx)
88	and	$8-1, R32(%rcx)
89
90	neg	n
91
92	mov	R32(%rcx), R32(%rax)
93	shl	$3, R32(%rax)
94	sub	%rax, up
95
96	xor	R32(%rax), R32(%rax)
97
98C	add	R32(%rcx), R32(%rcx)	C 2x
99C	lea	(%rcx,%rcx,4), %rcx	C 10x
100	imul	$10, R32(%rcx)
101
102	lea	L(top)(%rip), %rdx
103	add	%rcx, %rdx
104	jmp	*%rdx
105')
106
C Unrolled loop.  The comment above each entry gives the n mod 8 for which
C the initial jump lands on that entry.  popcnt is emitted as raw bytes with
C an explicit 8-bit displacement (0x00 in entry 0) so that every entry is
C exactly 10 bytes, matching the 10*k offset computed by the setup code.
107	ALIGN(32)
108L(top):
109C 0 = n mod 8
110	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00	C popcnt 0(up,n,8), %r8
111	add	%r8, %rax
112C 7 = n mod 8
113	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08	C popcnt 8(up,n,8), %r9
114	add	%r9, %rax
115C 6 = n mod 8
116	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10	C popcnt 16(up,n,8), %r8
117	add	%r8, %rax
118C 5 = n mod 8
119	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18	C popcnt 24(up,n,8), %r9
120	add	%r9, %rax
121C 4 = n mod 8
122	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20	C popcnt 32(up,n,8), %r8
123	add	%r8, %rax
124C 3 = n mod 8
125	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28	C popcnt 40(up,n,8), %r9
126	add	%r9, %rax
127C 2 = n mod 8
128	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30	C popcnt 48(up,n,8), %r8
129	add	%r8, %rax
130C 1 = n mod 8
131	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38	C popcnt 56(up,n,8), %r9
132	add	%r9, %rax
133
C End of a pass: n += 8, then loop from entry 0 while n is still negative.
134	add	$8, n
135	js	L(top)
136	FUNC_EXIT()
137	ret
138EPILOGUE()
139