xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/hamdist.asm (revision 8450a7c42673d65e3b1f6560d3b6ecd317a6cbe8)
1dnl  IA-64 mpn_hamdist -- mpn hamming distance.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C           cycles/limb
25C Itanium:       2
26C Itanium 2:     1
27
28C INPUT PARAMETERS
C   up  r32  pointer to the first operand, read one 8-byte limb at a time
C   vp  r33  pointer to the second operand, read in step with up
C   n   r34  limb count
29define(`up', `r32')
30define(`vp', `r33')
31define(`n', `r34')
32
C Working registers.  uK and vK receive limbs loaded from up and vp, xK
C receives uK xor vK, and cK receives popcnt of xK.  K = 0..3 names the
C four pipeline slots used by the 4-way unrolled code.
33define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
34define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
35define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
36define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
C s is the running sum of the popcnt results.  It is r8, the register that
C holds the result at every br.ret in this file.
37define(`s',`r8')
38
39
40ASM_START()
C mpn_hamdist -- return in r8 the sum over i = 0 .. n-1 of
C popcnt of up[i] xor vp[i].
C
C Structure:
C   1. Entry: load limb 0 of both operands, save ar.lc in r2, and dispatch
C      on n mod 4 to .Lb00, .Lb01, .Lb10 or .Lb11.
C   2. Feed-in: each .LbXX block finishes small n on its own, otherwise it
C      primes the pipeline and then either joins the main loop at the
C      matching entry point or jumps into the wind-down code.
C   3. .Loop: four limbs per iteration in four instruction groups.  Every
C      limb passes through load, xor, popcnt and add stages, each stage
C      three groups after the previous one.
C   4. .Lend and .Lcj8 .. .Lcj4: wind-down that drains the pipeline, then
C      restores ar.lc and returns.
C
C NOTE(review): ar.lc is copied to r2 below, but no .save ar.lc or .body
C directive follows .prologue in this file -- confirm whether the unwind
C information is meant to be complete.
41PROLOGUE(mpn_hamdist)
42	.prologue
C With HAVE_ABI_32 defined, both pointers go through addp4 and n is
C zero-extended from 32 bits before use.
43ifdef(`HAVE_ABI_32',
44`	addp4		up = 0, up		C			M I
45	addp4		vp = 0, vp		C			M I
46	zxt4		n = n			C			I
47	;;
48')
49
C Entry.  Both first limbs are loaded before n is examined, so up[0] and
C vp[0] are read even if n = 0 -- presumably callers guarantee n >= 1,
C confirm against the mpn interface.
C r14 = n mod 4 selects the feed-in block, p15 is set when n > 4, and n is
C replaced by n - 5 so that a later logical shift right by 2 gives the
C value written to ar.lc.  The fall-through case is n mod 4 = 0.
50 {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
51	ld8		r11 = [vp], 8		C load first vlimb	M01
52	mov.i		r2 = ar.lc		C save ar.lc		I0
53}{.mmi;	and		r14 = 3, n		C			M I
54	cmp.lt		p15, p0 = 4, n		C small count?		M I
55	add		n = -5, n		C			M I
56	;;
57}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
58	cmp.eq		p7, p0 = 2, r14		C			M I
59	cmp.eq		p8, p0 = 3, r14		C			M I
60}{.bbb
61  (p6)	br.dptk		.Lb01			C			B
62  (p7)	br.dptk		.Lb10			C			B
63  (p8)	br.dptk		.Lb11			C			B
64}
65
66
C n mod 4 = 0.  Limb 0 is in r10/r11 and goes into x0; limbs 1 .. 3 are
C loaded here.  For n = 4 the value written to ar.lc is derived from
C n - 5 = -1 and is meaningless, but no br.cloop executes on that path and
C ar.lc is restored from r2 after .Lcj4.
67.Lb00:	ld8		u1 = [up], 8		C			M01
68	ld8		v1 = [vp], 8		C			M01
69	shr.u		n = n, 2		C			I0
70	xor		x0 = r10, r11		C			M I
71	;;
72	ld8		u2 = [up], 8		C			M01
73	ld8		v2 = [vp], 8		C			M01
74	mov.i		ar.lc = n		C			I0
75	xor		x1 = u1, v1		C			M I
76	;;
77	ld8		u3 = [up], 8		C			M01
78	ld8		v3 = [vp], 8		C			M01
79	xor		x2 = u2, v2		C			M I
80	mov		s = 0			C			M I
81  (p15)	br.cond.dptk	.grt4			C			B
82	;;
C n = 4: all limbs are loaded, finish through the tail at .Lcj4.
83	popcnt		c0 = x0			C			I0
84	xor		x3 = u3, v3		C			M I
85	;;
86	popcnt		c1 = x1			C			I0
87	;;
88	popcnt		c2 = x2			C			I0
89	br		.Lcj4			C			B
90
C n >= 8.  The xors into x1 and x2 below repeat the ones already done in
C .Lb00, their operands being unchanged at that point.
91.grt4:	ld8		u0 = [up], 8		C			M01
92	ld8		v0 = [vp], 8		C			M01
93	xor		x1 = u1, v1		C			M I
94	;;
95	ld8		u1 = [up], 8		C			M01
96	ld8		v1 = [vp], 8		C			M01
97	xor		x2 = u2, v2		C			M I
98	;;
99	ld8		u2 = [up], 8		C			M01
100	ld8		v2 = [vp], 8		C			M01
101	popcnt		c0 = x0			C			I0
102	xor		x3 = u3, v3		C			M I
103	;;
104	ld8		u3 = [up], 8		C			M01
105	ld8		v3 = [vp], 8		C			M01
106	popcnt		c1 = x1			C			I0
107	xor		x0 = u0, v0		C			M I
108	br.cloop.dpnt	.grt8			C			B
109
C n = 8: no loop iteration is needed, drain from .Lcj8.
110	popcnt		c2 = x2			C			I0
111	xor		x1 = u1, v1		C			M I
112	br		.Lcj8			C			B
113
C n >= 12: issue the first group of a loop round here and join at .LL00.
114.grt8:	ld8		u0 = [up], 8		C			M01
115	ld8		v0 = [vp], 8		C			M01
116	popcnt		c2 = x2			C			I0
117	xor		x1 = u1, v1		C			M I
118	br		.LL00			C			B
119
120
C n mod 4 = 1.  Limb 0 goes into x3.
121.Lb01:	xor		x3 = r10, r11		C			M I
122	shr.u		n = n, 2		C			I0
123  (p15)	br.cond.dptk	.grt1			C			B
124	;;
C n = 1: ar.lc has not been written, so return without restoring it.
125	popcnt		r8 = x3			C			I0
126	br.ret.sptk.many b0			C			B
127
C n >= 5.
128.grt1:	ld8		u0 = [up], 8		C			M01
129	ld8		v0 = [vp], 8		C			M01
130	mov.i		ar.lc = n		C			I0
131	;;
132	ld8		u1 = [up], 8		C			M01
133	ld8		v1 = [vp], 8		C			M01
134	mov		s = 0			C			M I
135	;;
136	ld8		u2 = [up], 8		C			M01
137	ld8		v2 = [vp], 8		C			M01
138	;;
139	ld8		u3 = [up], 8		C			M01
140	ld8		v3 = [vp], 8		C			M01
141	xor		x0 = u0, v0		C			M I
142	br.cloop.dpnt	.grt5			C			B
143
C n = 5: no further loads, drain from .Lcj5.
144	xor		x1 = u1, v1		C			M I
145	;;
146	popcnt		c3 = x3			C			I0
147	xor		x2 = u2, v2		C			M I
148	;;
149	popcnt		c0 = x0			C			I0
150	xor		x3 = u3, v3		C			M I
151	;;
152	popcnt		c1 = x1			C			I0
153	br		.Lcj5			C			B
154
C n >= 9: run one full round without the add stage, since nothing has been
C counted into s yet, then let br.cloop choose between .Loop and .Lend.
155.grt5:	ld8		u0 = [up], 8		C			M01
156	ld8		v0 = [vp], 8		C			M01
157	xor		x1 = u1, v1		C			M I
158	;;
159	ld8		u1 = [up], 8		C			M01
160	ld8		v1 = [vp], 8		C			M01
161	popcnt		c3 = x3			C			I0
162	xor		x2 = u2, v2		C			M I
163	;;
164	ld8		u2 = [up], 8		C			M01
165	ld8		v2 = [vp], 8		C			M01
166	popcnt		c0 = x0			C			I0
167	xor		x3 = u3, v3		C			M I
168	;;
169	ld8		u3 = [up], 8		C			M01
170	ld8		v3 = [vp], 8		C			M01
171	popcnt		c1 = x1			C			I0
172	xor		x0 = u0, v0		C			M I
173	br.cloop.dpnt	.Loop			C			B
174	br		.Lend			C			B
175
176
C n mod 4 = 2.  Limb 0 goes into x2, limb 1 is loaded into u3/v3.
177.Lb10:	ld8		u3 = [up], 8		C			M01
178	ld8		v3 = [vp], 8		C			M01
179	xor		x2 = r10, r11		C			M I
180  (p15)	br.cond.dptk	.grt2			C			B
181	;;
C n = 2: sum the two counts and return; ar.lc has not been written.
182	xor		x3 = u3, v3		C			M I
183	;;
184	popcnt		c2 = x2			C			I0
185	;;
186	popcnt		c3 = x3			C			I0
187	;;
188	add		s = c2, c3		C			M I
189	br.ret.sptk.many b0			C			B
190
C n >= 6.
191.grt2:	ld8		u0 = [up], 8		C			M01
192	ld8		v0 = [vp], 8		C			M01
193	shr.u		n = n, 2		C			I0
194	;;
195	ld8		u1 = [up], 8		C			M01
196	ld8		v1 = [vp], 8		C			M01
197	mov.i		ar.lc = n		C			I0
198	mov		s = 0			C			M I
199	;;
200	ld8		u2 = [up], 8		C			M01
201	ld8		v2 = [vp], 8		C			M01
202	xor		x3 = u3, v3		C			M I
203	;;
204	ld8		u3 = [up], 8		C			M01
205	ld8		v3 = [vp], 8		C			M01
206	xor		x0 = u0, v0		C			M I
207	br.cloop.dptk	.grt6			C			B
208
C n = 6: no further loads, drain from .Lcj6.
209	popcnt		c2 = x2			C			I0
210	xor		x1 = u1, v1		C			M I
211	;;
212	popcnt		c3 = x3			C			I0
213	xor		x2 = u2, v2		C			M I
214	;;
215	popcnt		c0 = x0			C			I0
216	xor		x3 = u3, v3		C			M I
217	br		.Lcj6			C			B
218
C n >= 10: issue three groups of a loop round here and join at .LL10.
219.grt6:	ld8		u0 = [up], 8		C			M01
220	ld8		v0 = [vp], 8		C			M01
221	popcnt		c2 = x2			C			I0
222	xor		x1 = u1, v1		C			M I
223	;;
224	ld8		u1 = [up], 8		C			M01
225	ld8		v1 = [vp], 8		C			M01
226	popcnt		c3 = x3			C			I0
227	xor		x2 = u2, v2		C			M I
228	;;
229	ld8		u2 = [up], 8		C			M01
230	ld8		v2 = [vp], 8		C			M01
231	popcnt		c0 = x0			C			I0
232	xor		x3 = u3, v3		C			M I
233	br		.LL10			C			B
234
235
C n mod 4 = 3.  Limb 0 goes into x1, limb 1 into x2, limb 2 is loaded into
C u3/v3.
236.Lb11:	ld8		u2 = [up], 8		C			M01
237	ld8		v2 = [vp], 8		C			M01
238	shr.u		n = n, 2		C			I0
239	xor		x1 = r10, r11		C			M I
240	;;
241	ld8		u3 = [up], 8		C			M01
242	ld8		v3 = [vp], 8		C			M01
243	xor		x2 = u2, v2		C			M I
244  (p15)	br.cond.dptk	.grt3			C			B
245	;;
C n = 3: sum the three counts and return.  n was shifted above but ar.lc
C has not been written, so no restore is needed.
246	xor		x3 = u3, v3		C			M I
247	;;
248	popcnt		c1 = x1			C			I0
249	;;
250	popcnt		c2 = x2			C			I0
251	;;
252	popcnt		c3 = x3			C			I0
253	;;
254	add		s = c1, c2		C			M I
255	;;
256	add		s = s, c3		C			M I
257	br.ret.sptk.many b0			C			B
258
C n >= 7.
259.grt3:	ld8		u0 = [up], 8		C			M01
260	ld8		v0 = [vp], 8		C			M01
261	mov.i		ar.lc = n		C			I0
262	;;
263	ld8		u1 = [up], 8		C			M01
264	ld8		v1 = [vp], 8		C			M01
265	mov		s = 0			C			M I
266	;;
267	ld8		u2 = [up], 8		C			M01
268	ld8		v2 = [vp], 8		C			M01
269	xor		x3 = u3, v3		C			M I
270	;;
271	ld8		u3 = [up], 8		C			M01
272	ld8		v3 = [vp], 8		C			M01
273	popcnt		c1 = x1			C			I0
274	xor		x0 = u0, v0		C			M I
275	br.cloop.dptk	.grt7			C			B
C n = 7: no further loads, drain from .Lcj7.
276	popcnt		c2 = x2			C			I0
277	xor		x1 = u1, v1		C			M I
278	;;
279	popcnt		c3 = x3			C			I0
280	xor		x2 = u2, v2		C			M I
281	br		.Lcj7			C			B
282
C n >= 11: issue two groups of a loop round here and join at .LL11.
283.grt7:	ld8		u0 = [up], 8		C			M01
284	ld8		v0 = [vp], 8		C			M01
285	popcnt		c2 = x2			C			I0
286	xor		x1 = u1, v1		C			M I
287	;;
288	ld8		u1 = [up], 8		C			M01
289	ld8		v1 = [vp], 8		C			M01
290	popcnt		c3 = x3			C			I0
291	xor		x2 = u2, v2		C			M I
292	br		.LL11			C			B
293
294
C Main loop: four instruction groups, one limb pair loaded per group.
C Group K loads uK/vK, takes popcnt of the x value produced three groups
C earlier, adds the c value produced three groups earlier, and xors the
C limb pair loaded three groups earlier.  .LL00, .LL11 and .LL10 are the
C mid-loop entry points used by the feed-in code.  br.cloop repeats the
C loop according to ar.lc, which was set from n - 5 shifted right by 2.
295	ALIGN(32)
296.Loop:	ld8		u0 = [up], 8		C			M01
297	ld8		v0 = [vp], 8		C			M01
298	popcnt		c2 = x2			C			I0
299	add		s = s, c3		C			M I
300	xor		x1 = u1, v1		C			M I
301	nop.b		1			C			-
302	;;
303.LL00:	ld8		u1 = [up], 8		C			M01
304	ld8		v1 = [vp], 8		C			M01
305	popcnt		c3 = x3			C			I0
306	add		s = s, c0		C			M I
307	xor		x2 = u2, v2		C			M I
308	nop.b		1			C			-
309	;;
310.LL11:	ld8		u2 = [up], 8		C			M01
311	ld8		v2 = [vp], 8		C			M01
312	popcnt		c0 = x0			C			I0
313	add		s = s, c1		C			M I
314	xor		x3 = u3, v3		C			M I
315	nop.b		1			C			-
316	;;
317.LL10:	ld8		u3 = [up], 8		C			M01
318	ld8		v3 = [vp], 8		C			M01
319	popcnt		c1 = x1			C			I0
320	add		s = s, c2		C			M I
321	xor		x0 = u0, v0		C			M I
322	br.cloop.dptk	.Loop			C			B
323	;;
324
C Wind-down.  No more loads are issued; the xor, popcnt and add stages
C still in flight are drained.  The feed-in code for n = 4, 5, 6, 7 and 8
C enters directly at .Lcj4, .Lcj5, .Lcj6, .Lcj7 and .Lcj8 respectively.
325.Lend:	popcnt		c2 = x2			C			I0
326	add		s = s, c3		C			M I
327	xor		x1 = u1, v1		C			M I
328	;;
329.Lcj8:	popcnt		c3 = x3			C			I0
330	add		s = s, c0		C			M I
331	xor		x2 = u2, v2		C			M I
332	;;
333.Lcj7:	popcnt		c0 = x0			C			I0
334	add		s = s, c1		C			M I
335	xor		x3 = u3, v3		C			M I
336	;;
337.Lcj6:	popcnt		c1 = x1			C			I0
338	add		s = s, c2		C			M I
339	;;
340.Lcj5:	popcnt		c2 = x2			C			I0
341	add		s = s, c3		C			M I
342	;;
343.Lcj4:	popcnt		c3 = x3			C			I0
344	add		s = s, c0		C			M I
345	;;
346	add		s = s, c1		C			M I
347	;;
348	add		s = s, c2		C			M I
349	;;
C Final add, then put back the ar.lc value saved in r2 at entry.
350	add		s = s, c3		C			M I
351	mov.i		ar.lc = r2		C			I0
352	br.ret.sptk.many b0			C			B
353EPILOGUE()
354ASM_END()
355