dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/bd1/hamdist.asm (revision d536862b7d93d77932ef5de7eebdc48d76921b77)
dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb	  good for cpu?
C AMD K8,K9		n/a
C AMD K10		n/a
C AMD bd1	     1.51-2.0		y
C AMD bd2	     1.50-1.9		y
C AMD bd3		 ?
C AMD bd4		 ?
C AMD zen		n/a
C AMD bobcat		n/a
C AMD jaguar		n/a
C Intel P4		n/a
C Intel PNR		n/a
C Intel NHM		n/a
C Intel SBR		n/a
C Intel IBR		n/a
C Intel HWL		n/a
C Intel BWL		n/a
C Intel SKL		n/a
C Intel atom		n/a
C Intel SLM		n/a
C VIA nano		n/a

C TODO
C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
C    intend to support old systems.

C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
C When XOP/AVX cannot really be used on this system, punt to the plain
C core2 implementation; otherwise assemble the XOP code below.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_hamdist)
include_mpn(`x86_64/core2/hamdist.asm')
',`

define(`up',		`%rdi')
define(`vp',		`%rsi')
define(`n',		`%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C in:  up = %rdi, vp = %rsi, n = %rdx (limb count)
C out: %rax = number of bit positions in which up[0..n-1] and vp[0..n-1] differ
C %r10 accumulates the scalar popcnt partial sums, %xmm4/%xmm8 the vector ones.
PROLOGUE(mpn_hamdist)
	FUNC_ENTRY(3)
	cmp	$5, n
	jl	L(sma)			C small n: plain scalar popcnt loop

	lea	L(cnsts)(%rip), %r9

C The pxor (vp) loads in the main loop need vp 16-byte aligned; if vp is at
C an odd 8-byte boundary, fold one limb in with scalar popcnt first.
	xor	R32(%r10), R32(%r10)
	test	$8, R8(vp)
	jz	L(ali)
	mov	(up), %r8
	xor	(vp), %r8
	add	$8, up
	add	$8, vp
	dec	n
	popcnt	%r8, %r10
L(ali):

C PIC jump-table entries are 4-byte offsets, non-PIC entries 8-byte absolute
C addresses, so the data constants start at different offsets from L(cnsts).
ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
	movdqa	OFF3`'(%r9), %xmm5	C masks
	pxor	%xmm4, %xmm4		C per-byte count accumulator
	pxor	%xmm8, %xmm8		C grand total count

C Enter the 8-limb unrolled loop at a point depending on n mod 8 (even part),
C after biasing up and vp so the fixed displacements below land correctly.
	mov	R32(n), R32(%rax)
	and	$6, R32(%rax)
	lea	-64(up,%rax,8), up
	lea	-64(vp,%rax,8), vp
ifdef(`PIC',`
	movslq	(%r9,%rax,2), %r11
	add	%r9, %r11
	jmp	*%r11
',`
	jmp	*(%r9,%rax,4)
')

L(0):	add	$64, up
	add	$64, vp
	sub	$2, n

	ALIGN(32)
L(top):	lddqu	(up), %xmm0
	pxor	(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(6):	lddqu	16(up), %xmm0
	pxor	16(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(4):	lddqu	32(up), %xmm0
	pxor	32(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
	paddb	%xmm2, %xmm3
	paddb	%xmm2, %xmm4
	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
L(2):	mov	48(up), %r8
	mov	56(up), %r9
	add	$64, up
	xor	48(vp), %r8
	xor	56(vp), %r9
	add	$64, vp
	popcnt	%r8, %r8
	popcnt	%r9, %r9
	add	%r8, %r10
	add	%r9, %r10
	sub	$8, n
	jg	L(top)

	test	$1, R8(n)		C one straggling limb left?
	jz	L(x)
	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %r8
	add	%r8, %r10
C Fold the vector byte counters into two 64-bit lanes, add the lanes
C together, then combine with the scalar total in %r10.
L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	paddq	%xmm0, %xmm8
	pshufd	$14, %xmm8, %xmm0
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax
	add	%r10, %rax
	FUNC_EXIT()
	ret

C Small-operand path, n in [1,4]: one scalar popcnt per limb.
L(sma):	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %rax
	dec	n
	jz	L(ed)
L(tp):	mov	8(up), %r8
	add	$8, up
	xor	8(vp), %r8
	add	$8, vp
	popcnt	%r8, %r8
	add	%r8, %rax
	dec	n
	jnz	L(tp)
L(ed):	FUNC_EXIT()
	ret
EPILOGUE()
C Jump table for the loop entry points, followed by the per-nibble popcount
C table, the vpshlb shift counts (-4 means shift right by 4), and the
C low-nibble masks.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(0), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')
