xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bd1/popcount.asm (revision 122b5006ee1bd67145794b4cde92f4fe4781a5ec)
dnl  AMD64 SSSE3/XOP mpn_popcount -- population count.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
C		    cycles/limb	  good for cpu?
C AMD K8,K9		n/a
C AMD K10		n/a
C AMD bd1		 1.27		y
C AMD bd2		 1.24		y
C AMD bd3		 ?
C AMD bd4		 1.22
C AMD zen		n/a
C AMD bobcat		n/a
C AMD jaguar		n/a
C Intel P4		n/a
C Intel CNR		n/a
C Intel PNR		n/a
C Intel NHM		n/a
C Intel SBR		n/a
C Intel IBR		n/a
C Intel HWL		n/a
C Intel BWL		n/a
C Intel SKL		n/a
C Intel atom		n/a
C Intel SLM		n/a
C VIA nano		n/a
56
C All XOP instructions (vpshlb, vpperm, vphaddubq) and all popcnt instructions
C in this file are emitted as raw .byte sequences, with the intended
C instruction given in the trailing comment, for the benefit of old systems
C whose assembler lacks these mnemonics.  The byte sequences hard-code the
C registers named in those comments (up is %rdi).

C We use vpshlb, vpperm and vphaddubq below, which are XOP extensions to AVX.
C Some systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for
C AVX.  On such systems we fall back to the core2 code.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_popcount)
include_mpn(`x86_64/core2/popcount.asm')
',`

C mpn_popcount -- return in %rax the number of 1 bits in the n limbs (8 bytes
C each) starting at up.
C
C Strategy:
C  * n mod 8 selects an entry point through the jump table that starts the
C    constants object, so that the 8-limb unrolled loop can be entered part
C    way through its first pass.  When n is odd, one leading limb is counted
C    with a scalar popcnt before entering the loop.
C  * Each loop pass handles 8 limbs: 6 limbs (3 x 16 bytes) through a nibble
C    table lookup done with vpperm, and 2 limbs with scalar popcnt.
C
C Register roles:
C   %xmm7  16-entry nibble bit count table
C   %xmm6  per-byte shift counts for vpshlb (-4 in every byte)
C   %xmm9  0x0f mask in every byte
C   %xmm4  byte-wise running counts
C   %xmm5  byte counts widened by vphaddubq to 2 x 64 bits
C   %xmm8  grand total of the SIMD counts, 2 x 64 bits
C   %rdx   total of the scalar popcnt results
C   %r8, %r9  scratch for popcnt (the table base in %r9 is dead by then)
C
C NOTE(review): n = 0 would dispatch to the loop top and read 8 limbs, so
C callers presumably never pass n = 0 -- confirm.
C NOTE(review): %xmm6-%xmm9 are written here without being saved, yet DOS64
C is declared as supported and the Microsoft x64 convention treats
C %xmm6-%xmm15 as callee-saved -- confirm whether that needs handling.
C NOTE(review): only two operands (up, n) are read, but the entry macro is
C invoked with 3 -- confirm against the macro definition.

define(`up',		`%rdi')
define(`n',		`%rsi')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)
	FUNC_ENTRY(3)
	lea	L(cnsts)(%rip), %r9	C base of jump table and constants

C The jump table is indexed with scale 4 under PIC and scale 8 otherwise, so
C its 8 entries take 32 or 64 bytes; the 16-byte constants follow it.
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
	     `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
	movdqa	OFF3`'(%r9), %xmm9	C masks
	pxor	%xmm4, %xmm4		C byte-wise running counts
	pxor	%xmm5, %xmm5		C 0-reg
	pxor	%xmm8, %xmm8		C grand total count

	xor	R32(%rdx), R32(%rdx)	C scalar popcnt total

	mov	R32(n), R32(%rax)
	and	$7, R32(%rax)		C n mod 8 picks the entry point
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax	C 32-bit entry, offset from table base
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)		C 64-bit entry, used as the target itself
')

C n mod 8 = 1: count one limb, then run whole loop passes if limbs remain.
L(1):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
	add	$8, up
	dec	n
	jnz	L(top)
	mov	%rdx, %rax
	FUNC_EXIT()
	ret

C n mod 8 = 2..7: an odd count first handles one limb with popcnt.  up is
C then biased downwards so that the remaining limbs coincide with the tail
C of the first loop pass, which is entered at the matching label.
L(2):	add	$-48, up
	jmp	L(e2)

L(3):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
	add	$-40, up
	jmp	L(e2)

L(4):	add	$-32, up
	jmp	L(e4)

L(5):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
	add	$-24, up
	jmp	L(e4)

L(6):	add	$-16, up
	jmp	L(e6)

L(7):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
	add	$-8, up
	jmp	L(e6)

C Main loop, 8 limbs per pass.  For each 16-byte block: %xmm0 gets the low
C nibbles and %xmm1 the high nibbles of every byte, both are used as vpperm
C selectors into the nibble counts table, and the two lookups are added.
	ALIGN(32)
L(top):	lddqu	(up), %xmm0			C limbs 0-1 of this pass
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm9, %xmm0
	pand	%xmm9, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0, %xmm7, %xmm7, %xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1, %xmm7, %xmm7, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e6):	lddqu	16(up), %xmm0			C limbs 2-3 of this pass
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm9, %xmm0
	pand	%xmm9, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0, %xmm7, %xmm7, %xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1, %xmm7, %xmm7, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e4):	lddqu	32(up), %xmm0			C limbs 4-5 of this pass
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm9, %xmm0
	pand	%xmm9, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0, %xmm7, %xmm7, %xmm2
C Widen the byte counts gathered so far before %xmm4 is overwritten with the
C counts of the current block.
	.byte	0x8f,0xe9,0x78,0xd3,0xec	C vphaddubq %xmm4, %xmm5
	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1, %xmm7, %xmm7, %xmm4
	paddb	%xmm2, %xmm4
L(e2):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x30	C popcnt 48(up), %r8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x38	C popcnt 56(up), %r9
	add	$64, up
	paddq	%xmm5, %xmm8			C sum to 2 x 64-bit counts
	add	%r8, %rdx
	add	%r9, %rdx
	sub	$8, n
	jg	L(top)

C Wind down: fold the byte counts still pending in %xmm4 into the grand total,
C add its two 64-bit halves together, then add the scalar total.
	.byte	0x8f,0xe9,0x78,0xd3,0xec	C vphaddubq %xmm4, %xmm5
	paddq	%xmm5, %xmm8
	pshufd	$14, %xmm8, %xmm0		C high half of %xmm8 to low half
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax
	add	%rdx, %rax
	FUNC_EXIT()
	ret
EPILOGUE()

C Constants object: the jump table indexed by n mod 8, followed by three
C 16-byte vectors loaded at OFF1, OFF2 and OFF3 in the function prologue.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(top), L(cnsts))
	JMPENT(	L(1), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(3), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(5), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	JMPENT(	L(7), L(cnsts))
C OFF1: number of 1 bits in each nibble value 0..15
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C OFF2: vpshlb shift count for every byte
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
C OFF3: nibble mask for every byte
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')
192