dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm64/hamdist.asm (revision d90047b5d07facf36e6c01dcc0bded8997ce9cc2)
dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)

ASM_START()

C mpn_hamdist(ap, bp, n) -- return the bit hamming distance of {ap,n} and
C {bp,n}, i.e., the population count of (ap[i] xor bp[i]) over all n limbs.
C
C In:	ap = x0, bp = x1, n = x2 (limb count)
C Out:	x0 = hamming distance
C
C Per-byte counts are accumulated into 16-bit lanes of v4/v5 via uadalp,
C which bounds one pass at maxsize limbs; L(gt8k) splits larger operands
C into chunks and sums the per-chunk results in x4.

PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)			C huge operand: process in chunks

L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel n mod 8 limbs off the front, testing one size bit at a time.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)				C <= 4 limbs left: finish up

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)

C Entry point used by the huge-count code below; expects v4/v5 cleared
C and n pre-adjusted by -8 (the chunk loop passes chunksize-8).
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs/iteration, software pipelined: the uadalp
C accumulation of one 4-limb group overlaps the eor/cnt of the next.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b		C fold in final 4 limbs already loaded
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save link register (clobbered by bl)
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C chunk byte offset, 8 bytes/limb
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0		C accumulate chunk result
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C more than maxsize limbs remaining?
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)			C handle final partial chunk
	add	x0, x4, x0		C total = chunk sums + final sum
	mov	x30, x8			C restore link register
	ret
EPILOGUE()