dnl  ARM64 Neon mpn_hamdist -- mpn bit Hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 4.5
C Cortex-A57	 1.9
C X-Gene	 4.36

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(blah)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

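C In plain C terms, this file computes the following (a reference sketch
C only, not part of the build; it assumes 64-bit limbs and uses GCC's
C __builtin_popcountll in place of the cnt/uadalp reduction below):
C
C	mp_bitcnt_t
C	mpn_hamdist (mp_srcptr ap, mp_srcptr bp, mp_size_t n)
C	{
C	  mp_bitcnt_t sum = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    sum += __builtin_popcountll (ap[i] ^ bp[i]);
C	  return sum;
C	}
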
C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8*(2^16-1) bits, or
C floor(8*(2^16-1)/64) = 0x1fff limbs.  We use a chunksize close to that,
C but which allows the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)
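
C Concretely: 8 * (2^16-1) = 524280 countable bits, and 524280/64 = 8191.875,
C so maxsize = 8191 = 0x1fff limbs.  chunksize = 0x1ff0 = 8176 is a multiple
C of 8, letting each full chunk of the huge-count driver enter at L(chu) and
C skip the n mod 8 dispatch.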

ASM_START()
PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)

L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

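C Dispatch on n mod 8: bits 0, 1 and 2 of n select 1-, 2- and 4-limb
C preludes, leaving a multiple of 8 limbs for the 8-limb-per-round main loop.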
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)

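C L(chu) doubles as the entry point for the huge-count driver at L(gt8k)
C below, which branch-and-links here with v4/v5 cleared and n preadjusted.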
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

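C Main loop: 8 limbs per iteration, software pipelined in two half-rounds
C (L(top) and L(mid)) so each uadalp folds the cnt result of the previous
C half-round into the 16-bit counters while new loads and eors issue.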
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operands and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
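
C In C-like pseudocode (hamdist_block is hypothetical shorthand for the
C code reached via bl, and pointer/count bookkeeping is simplified):
C
C	total = 0;
C	while (n > maxsize)		/* remaining count kept in x7 */
C	  {
C	    total += hamdist_block (ap, bp, chunksize);	/* bl L(chu) */
C	    ap += chunksize; bp += chunksize; n -= chunksize;
C	  }
C	return total + hamdist_block (ap, bp, n);	/* bl L(lt8k) */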
L(gt8k):
	mov	x8, x30
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8
	ret
EPILOGUE()