xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/hamdist.asm (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1dnl  IA-64 mpn_hamdist -- mpn hamming distance.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003-2005 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C           cycles/limb
36C Itanium:       2
37C Itanium 2:     1
38
39C INPUT PARAMETERS
C up and vp are the two limb pointers; every ld8 through them post-increments
C the pointer by 8.  n is the limb count.
40define(`up', `r32')
41define(`vp', `r33')
42define(`n', `r34')
43
C Working registers, four of each, used in rotation by the unrolled code:
C   u0-u3   limbs loaded through up
C   v0-v3   limbs loaded through vp
C   x0-x3   XOR of the u/v pair with the same index
C   c0-c3   popcnt of the x with the same index
C   s       running sum of the c values, kept in r8
44define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
45define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
46define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
47define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
48define(`s',`r8')
49
50
51ASM_START()
C mpn_hamdist: adds up popcnt(u XOR v) over the n limb pairs read through
C up and vp.  The total is in r8 (s) when the function returns.
C
C Entry and dispatch:
C   - the first limb pair is always loaded, into r10/r11
C   - ar.lc is copied to r2 so the long paths can put it back before returning
C   - r14 = n AND 3 selects one of four feed-in paths: 1 -> .Lb01, 2 -> .Lb10,
C     3 -> .Lb11, and 0 falls through to .Lb00 which follows the dispatch
C   - p15 is set when n > 4; the feed-in paths use it to choose between a
C     short straight-line finish and the pipelined code
C   - n is replaced by n - 5; the feed-in paths shift it right by 2 before
C     writing it to ar.lc as the br.cloop count
C NOTE(review): a limb pair is loaded unconditionally, so n >= 1 is assumed
C here -- confirm against the mpn calling conventions.
52PROLOGUE(mpn_hamdist)
53	.prologue
C Only when HAVE_ABI_32 is defined: addp4 is applied to both pointer
C arguments and n is zero-extended from 32 bits.
54ifdef(`HAVE_ABI_32',
55`	addp4		up = 0, up		C			M I
56	addp4		vp = 0, vp		C			M I
57	zxt4		n = n			C			I
58	;;
59')
60
61 {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
62	ld8		r11 = [vp], 8		C load first vlimb	M01
63	mov.i		r2 = ar.lc		C save ar.lc		I0
64}{.mmi;	and		r14 = 3, n		C r14 = n mod 4		M I
65	cmp.lt		p15, p0 = 4, n		C p15 set if n > 4	M I
66	add		n = -5, n		C n = n - 5		M I
67	;;
68}{.mmi;	cmp.eq		p6, p0 = 1, r14		C n mod 4 = 1?		M I
69	cmp.eq		p7, p0 = 2, r14		C n mod 4 = 2?		M I
70	cmp.eq		p8, p0 = 3, r14		C n mod 4 = 3?		M I
71}{.bbb
72  (p6)	br.dptk		.Lb01			C			B
73  (p7)	br.dptk		.Lb10			C			B
74  (p8)	br.dptk		.Lb11			C			B
75}
76
77
C Feed-in for n = 0 mod 4, reached by falling through the dispatch branches.
C The first pair, already in r10/r11, becomes x0.  Pairs 2-4 are loaded into
C u1-u3/v1-v3 and s is cleared.
C ar.lc is written before the p15 test.  On the short path no br.cloop
C consumes that value, and the code after .Lcj4 puts the saved r2 back.
78.Lb00:	ld8		u1 = [up], 8		C			M01
79	ld8		v1 = [vp], 8		C			M01
80	shr.u		n = n, 2		C loop count		I0
81	xor		x0 = r10, r11		C			M I
82	;;
83	ld8		u2 = [up], 8		C			M01
84	ld8		v2 = [vp], 8		C			M01
85	mov.i		ar.lc = n		C			I0
86	xor		x1 = u1, v1		C			M I
87	;;
88	ld8		u3 = [up], 8		C			M01
89	ld8		v3 = [vp], 8		C			M01
90	xor		x2 = u2, v2		C			M I
91	mov		s = 0			C			M I
92  (p15)	br.cond.dptk	.grt4			C			B
93	;;
C Short path, p15 clear: four pairs in total.  Count x0-x2 here and let the
C wind-down at .Lcj4 count x3 and do the additions.
94	popcnt		c0 = x0			C			I0
95	xor		x3 = u3, v3		C			M I
96	;;
97	popcnt		c1 = x1			C			I0
98	;;
99	popcnt		c2 = x2			C			I0
100	br		.Lcj4			C			B
101
C More than four pairs: load pairs 5-8 while working on pairs 1-4.
C NOTE(review): the XORs into x1 and x2 in the first two groups below repeat
C the values computed in .Lb00, because u1/v1 and u2/v2 are not reloaded
C until the group after each XOR.
102.grt4:	ld8		u0 = [up], 8		C			M01
103	ld8		v0 = [vp], 8		C			M01
104	xor		x1 = u1, v1		C			M I
105	;;
106	ld8		u1 = [up], 8		C			M01
107	ld8		v1 = [vp], 8		C			M01
108	xor		x2 = u2, v2		C			M I
109	;;
110	ld8		u2 = [up], 8		C			M01
111	ld8		v2 = [vp], 8		C			M01
112	popcnt		c0 = x0			C			I0
113	xor		x3 = u3, v3		C			M I
114	;;
115	ld8		u3 = [up], 8		C			M01
116	ld8		v3 = [vp], 8		C			M01
117	popcnt		c1 = x1			C			I0
118	xor		x0 = u0, v0		C			M I
C ar.lc nonzero means at least twelve pairs: go on to .grt8 and the loop.
119	br.cloop.dpnt	.grt8			C			B
120
C Exactly eight pairs: nothing more to load, join the wind-down at .Lcj8.
121	popcnt		c2 = x2			C			I0
122	xor		x1 = u1, v1		C			M I
123	br		.Lcj8			C			B
124
C Load pair 9 and enter the loop at its second group, .LL00.
125.grt8:	ld8		u0 = [up], 8		C			M01
126	ld8		v0 = [vp], 8		C			M01
127	popcnt		c2 = x2			C			I0
128	xor		x1 = u1, v1		C			M I
129	br		.LL00			C			B
130
131
C Feed-in for n = 1 mod 4.  The first pair, already in r10/r11, becomes x3.
131.Lb01:	xor		x3 = r10, r11		C			M I
132	shr.u		n = n, 2		C loop count		I0
133  (p15)	br.cond.dptk	.grt1			C			B
134	;;
C Short path, p15 clear: a single pair.  Its popcnt goes straight into r8.
C ar.lc has not been written on this path, so it is not restored.
135	popcnt		r8 = x3			C			I0
136	br.ret.sptk.many b0			C			B
137
C More than four pairs: set the loop count, clear s and load pairs 2-5.
138.grt1:	ld8		u0 = [up], 8		C			M01
139	ld8		v0 = [vp], 8		C			M01
140	mov.i		ar.lc = n		C			I0
141	;;
142	ld8		u1 = [up], 8		C			M01
143	ld8		v1 = [vp], 8		C			M01
144	mov		s = 0			C			M I
145	;;
146	ld8		u2 = [up], 8		C			M01
147	ld8		v2 = [vp], 8		C			M01
148	;;
149	ld8		u3 = [up], 8		C			M01
150	ld8		v3 = [vp], 8		C			M01
151	xor		x0 = u0, v0		C			M I
C ar.lc nonzero means at least nine pairs: go on to .grt5.
152	br.cloop.dpnt	.grt5			C			B
153
C Exactly five pairs: finish the XORs, start the popcnts and join the
C wind-down at .Lcj5.
154	xor		x1 = u1, v1		C			M I
155	;;
156	popcnt		c3 = x3			C			I0
157	xor		x2 = u2, v2		C			M I
158	;;
159	popcnt		c0 = x0			C			I0
160	xor		x3 = u3, v3		C			M I
161	;;
162	popcnt		c1 = x1			C			I0
163	br		.Lcj5			C			B
164
C Load pairs 6-9 while counting the earlier ones.  These four groups have
C the same shape as the loop body except that nothing is added to s yet.
165.grt5:	ld8		u0 = [up], 8		C			M01
166	ld8		v0 = [vp], 8		C			M01
167	xor		x1 = u1, v1		C			M I
168	;;
169	ld8		u1 = [up], 8		C			M01
170	ld8		v1 = [vp], 8		C			M01
171	popcnt		c3 = x3			C			I0
172	xor		x2 = u2, v2		C			M I
173	;;
174	ld8		u2 = [up], 8		C			M01
175	ld8		v2 = [vp], 8		C			M01
176	popcnt		c0 = x0			C			I0
177	xor		x3 = u3, v3		C			M I
178	;;
179	ld8		u3 = [up], 8		C			M01
180	ld8		v3 = [vp], 8		C			M01
181	popcnt		c1 = x1			C			I0
182	xor		x0 = u0, v0		C			M I
C Enter the loop at its top if iterations remain, otherwise drain at .Lend.
183	br.cloop.dpnt	.Loop			C			B
184	br		.Lend			C			B
186
187
C Feed-in for n = 2 mod 4.  The first pair, already in r10/r11, becomes x2
C and the second pair is loaded into u3/v3.
188.Lb10:	ld8		u3 = [up], 8		C			M01
189	ld8		v3 = [vp], 8		C			M01
190	xor		x2 = r10, r11		C			M I
191  (p15)	br.cond.dptk	.grt2			C			B
192	;;
C Short path, p15 clear: two pairs.  s = c2 + c3 and return.  ar.lc has not
C been written on this path, so it is not restored.
193	xor		x3 = u3, v3		C			M I
194	;;
195	popcnt		c2 = x2			C			I0
196	;;
197	popcnt		c3 = x3			C			I0
198	;;
199	add		s = c2, c3		C			M I
200	br.ret.sptk.many b0			C			B
201
C More than four pairs: form the loop count, clear s and load pairs 3-6.
202.grt2:	ld8		u0 = [up], 8		C			M01
203	ld8		v0 = [vp], 8		C			M01
204	shr.u		n = n, 2		C loop count		I0
205	;;
206	ld8		u1 = [up], 8		C			M01
207	ld8		v1 = [vp], 8		C			M01
208	mov.i		ar.lc = n		C			I0
209	mov		s = 0			C			M I
210	;;
211	ld8		u2 = [up], 8		C			M01
212	ld8		v2 = [vp], 8		C			M01
213	xor		x3 = u3, v3		C			M I
214	;;
215	ld8		u3 = [up], 8		C			M01
216	ld8		v3 = [vp], 8		C			M01
217	xor		x0 = u0, v0		C			M I
C ar.lc nonzero means at least ten pairs: go on to .grt6 and the loop.
218	br.cloop.dptk	.grt6			C			B
219
C Exactly six pairs: nothing more to load, join the wind-down at .Lcj6.
220	popcnt		c2 = x2			C			I0
221	xor		x1 = u1, v1		C			M I
222	;;
223	popcnt		c3 = x3			C			I0
224	xor		x2 = u2, v2		C			M I
225	;;
226	popcnt		c0 = x0			C			I0
227	xor		x3 = u3, v3		C			M I
228	br		.Lcj6			C			B
229
C Load pairs 7-9 and enter the loop at its fourth group, .LL10.
230.grt6:	ld8		u0 = [up], 8		C			M01
231	ld8		v0 = [vp], 8		C			M01
232	popcnt		c2 = x2			C			I0
233	xor		x1 = u1, v1		C			M I
234	;;
235	ld8		u1 = [up], 8		C			M01
236	ld8		v1 = [vp], 8		C			M01
237	popcnt		c3 = x3			C			I0
238	xor		x2 = u2, v2		C			M I
239	;;
240	ld8		u2 = [up], 8		C			M01
241	ld8		v2 = [vp], 8		C			M01
242	popcnt		c0 = x0			C			I0
243	xor		x3 = u3, v3		C			M I
244	br		.LL10			C			B
245
246
C Feed-in for n = 3 mod 4.  The first pair, already in r10/r11, becomes x1.
C Pairs 2 and 3 are loaded into u2/v2 and u3/v3.
247.Lb11:	ld8		u2 = [up], 8		C			M01
248	ld8		v2 = [vp], 8		C			M01
249	shr.u		n = n, 2		C loop count		I0
250	xor		x1 = r10, r11		C			M I
251	;;
252	ld8		u3 = [up], 8		C			M01
253	ld8		v3 = [vp], 8		C			M01
254	xor		x2 = u2, v2		C			M I
255  (p15)	br.cond.dptk	.grt3			C			B
256	;;
C Short path, p15 clear: three pairs.  s = c1 + c2 + c3 and return.  ar.lc
C has not been written on this path, so it is not restored.
257	xor		x3 = u3, v3		C			M I
258	;;
259	popcnt		c1 = x1			C			I0
260	;;
261	popcnt		c2 = x2			C			I0
262	;;
263	popcnt		c3 = x3			C			I0
264	;;
265	add		s = c1, c2		C			M I
266	;;
267	add		s = s, c3		C			M I
268	br.ret.sptk.many b0			C			B
269
C More than four pairs: set the loop count, clear s and load pairs 4-7.
270.grt3:	ld8		u0 = [up], 8		C			M01
271	ld8		v0 = [vp], 8		C			M01
272	mov.i		ar.lc = n		C			I0
273	;;
274	ld8		u1 = [up], 8		C			M01
275	ld8		v1 = [vp], 8		C			M01
276	mov		s = 0			C			M I
277	;;
278	ld8		u2 = [up], 8		C			M01
279	ld8		v2 = [vp], 8		C			M01
280	xor		x3 = u3, v3		C			M I
281	;;
282	ld8		u3 = [up], 8		C			M01
283	ld8		v3 = [vp], 8		C			M01
284	popcnt		c1 = x1			C			I0
285	xor		x0 = u0, v0		C			M I
C ar.lc nonzero means at least eleven pairs: go on to .grt7 and the loop.
C Otherwise fall through with exactly seven pairs and join the wind-down
C at .Lcj7.
286	br.cloop.dptk	.grt7			C			B
287	popcnt		c2 = x2			C			I0
288	xor		x1 = u1, v1		C			M I
289	;;
290	popcnt		c3 = x3			C			I0
291	xor		x2 = u2, v2		C			M I
292	br		.Lcj7			C			B
293
C Load pairs 8-9 and enter the loop at its third group, .LL11.
294.grt7:	ld8		u0 = [up], 8		C			M01
295	ld8		v0 = [vp], 8		C			M01
296	popcnt		c2 = x2			C			I0
297	xor		x1 = u1, v1		C			M I
298	;;
299	ld8		u1 = [up], 8		C			M01
300	ld8		v1 = [vp], 8		C			M01
301	popcnt		c3 = x3			C			I0
302	xor		x2 = u2, v2		C			M I
303	br		.LL11			C			B
304
305
C Main loop: four instruction groups, one limb pair per group, so each trip
C handles four pairs.  Every group does four independent things:
C   - loads the next pair into one u/v slot
C   - popcnts an x produced by an earlier group
C   - adds a c produced by an earlier group into s
C   - XORs a pair loaded by an earlier group
C .LL00, .LL11 and .LL10 are mid-loop entry points used by .grt8, .grt7 and
C .grt6; .grt5 enters at .Loop itself.  br.cloop repeats while ar.lc is
C nonzero, then control falls into .Lend.
C NOTE(review): the nop.b in the first three groups presumably pads each
C group to the same six-instruction shape as the fourth -- confirm.
306	ALIGN(32)
307.Loop:	ld8		u0 = [up], 8		C			M01
308	ld8		v0 = [vp], 8		C			M01
309	popcnt		c2 = x2			C			I0
310	add		s = s, c3		C			M I
311	xor		x1 = u1, v1		C			M I
312	nop.b		1			C			-
313	;;
314.LL00:	ld8		u1 = [up], 8		C			M01
315	ld8		v1 = [vp], 8		C			M01
316	popcnt		c3 = x3			C			I0
317	add		s = s, c0		C			M I
318	xor		x2 = u2, v2		C			M I
319	nop.b		1			C			-
320	;;
321.LL11:	ld8		u2 = [up], 8		C			M01
322	ld8		v2 = [vp], 8		C			M01
323	popcnt		c0 = x0			C			I0
324	add		s = s, c1		C			M I
325	xor		x3 = u3, v3		C			M I
326	nop.b		1			C			-
327	;;
328.LL10:	ld8		u3 = [up], 8		C			M01
329	ld8		v3 = [vp], 8		C			M01
330	popcnt		c1 = x1			C			I0
331	add		s = s, c2		C			M I
332	xor		x0 = u0, v0		C			M I
333	br.cloop.dptk	.Loop			C			B
334	;;
335
C Wind-down: no more loads, only the XORs, popcnts and additions still owed
C for pairs that are already in registers.  The join points are:
C   .Lend  loop exit, and .grt5 when no loop trip is needed (nine pairs)
C   .Lcj8  exactly eight pairs, from .grt4
C   .Lcj7  exactly seven pairs, from .grt3
C   .Lcj6  exactly six pairs, from .grt2
C   .Lcj5  exactly five pairs, from .grt1
C   .Lcj4  exactly four pairs, from .Lb00
C Within a group the add reads the c value from before that group, while the
C popcnt in the same group writes a different c register.
336.Lend:	popcnt		c2 = x2			C			I0
337	add		s = s, c3		C			M I
338	xor		x1 = u1, v1		C			M I
339	;;
340.Lcj8:	popcnt		c3 = x3			C			I0
341	add		s = s, c0		C			M I
342	xor		x2 = u2, v2		C			M I
343	;;
344.Lcj7:	popcnt		c0 = x0			C			I0
345	add		s = s, c1		C			M I
346	xor		x3 = u3, v3		C			M I
347	;;
348.Lcj6:	popcnt		c1 = x1			C			I0
349	add		s = s, c2		C			M I
350	;;
351.Lcj5:	popcnt		c2 = x2			C			I0
352	add		s = s, c3		C			M I
353	;;
354.Lcj4:	popcnt		c3 = x3			C			I0
355	add		s = s, c0		C			M I
356	;;
C The last three counts are added one per group because each add both reads
C and writes s.
357	add		s = s, c1		C			M I
358	;;
359	add		s = s, c2		C			M I
360	;;
361	add		s = s, c3		C			M I
362	mov.i		ar.lc = r2		C restore ar.lc		I0
363	br.ret.sptk.many b0			C			B
364EPILOGUE()
365ASM_END()
366