xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/hamdist.asm (revision 567219e1d7461bff1b180e494a9674a287b057a7)
1dnl  IA-64 mpn_hamdist -- mpn hamming distance.
2
3dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C           cycles/limb
23C Itanium:       2
24C Itanium 2:     1
25
26C INPUT PARAMETERS
C up and vp are the two limb pointers (read with post-incrementing ld8),
C n is the limb count.
27define(`up', `r32')
28define(`vp', `r33')
29define(`n', `r34')
30
C Working registers, by the way the code below uses them:
C   u0-u3   limbs loaded through up
C   v0-v3   limbs loaded through vp
C   x0-x3   xor of a u limb with the matching v limb
C   c0-c3   popcnt of x0-x3
C   s       running sum of the popcnt values (r8), holds the result at return
31define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
32define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
33define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
34define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
35define(`s',`r8')
36
37
38ASM_START()
C mpn_hamdist(up, vp, n)
C
C Adds up popcnt(up[i] xor vp[i]) over the n limbs at up and vp and leaves
C the total in s (r8) on return.
C
C Layout:
C   - Entry: load the first limb pair into r10/r11, save ar.lc in r2 and
C     dispatch on n mod 4 to .Lb00, .Lb01, .Lb10 or .Lb11.
C   - Each .LbXX block finishes n <= 4 without entering the loop.  For n > 4
C     (p15 set) its .grtN continuation preloads limbs and then either joins
C     the 4-way unrolled loop at the stage matching n mod 4 (.Loop, .LL00,
C     .LL11, .LL10) or, when the loop count is already zero, jumps into the
C     wind-down code.
C   - .Lend/.LcjN: wind-down.  Drains the xor/popcnt/add pipeline, restores
C     ar.lc from r2 and returns.
C
C The loop count placed in ar.lc is (n - 5) >> 2.  The direct returns for
C n = 1, 2, 3 happen before ar.lc is written, so they need no restore.
C
C NOTE(review): n = 0 takes the .Lb00 path and is processed like n = 4
C (four limb pairs are read); presumably callers guarantee n >= 1.
39PROLOGUE(mpn_hamdist)
40	.prologue
C 32-bit ABI only: widen the pointer arguments (addp4) and zero-extend the
C count (zxt4) before they are used.
41ifdef(`HAVE_ABI_32',
42`	addp4		up = 0, up		C			M I
43	addp4		vp = 0, vp		C			M I
44	zxt4		n = n			C			I
45	;;
46')
47
C r10/r11 = first limb pair, r2 = saved ar.lc, r14 = n mod 4,
C p15 = (n > 4), n = n - 5.  Then branch on r14; r14 = 0 falls through
C to .Lb00.
48 {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
49	ld8		r11 = [vp], 8		C load first vlimb	M01
50	mov.i		r2 = ar.lc		C save ar.lc		I0
51}{.mmi;	and		r14 = 3, n		C			M I
52	cmp.lt		p15, p0 = 4, n		C small count?		M I
53	add		n = -5, n		C			M I
54	;;
55}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
56	cmp.eq		p7, p0 = 2, r14		C			M I
57	cmp.eq		p8, p0 = 3, r14		C			M I
58}{.bbb
59  (p6)	br.dptk		.Lb01			C			B
60  (p7)	br.dptk		.Lb10			C			B
61  (p8)	br.dptk		.Lb11			C			B
62}
63
64
C n mod 4 = 0.  Loads limb pairs 2-4, sets ar.lc and clears s.  With p15
C clear (n = 4) the four popcounts are finished in the .Lcj4 wind-down.
65.Lb00:	ld8		u1 = [up], 8		C			M01
66	ld8		v1 = [vp], 8		C			M01
67	shr.u		n = n, 2		C			I0
68	xor		x0 = r10, r11		C			M I
69	;;
70	ld8		u2 = [up], 8		C			M01
71	ld8		v2 = [vp], 8		C			M01
72	mov.i		ar.lc = n		C			I0
73	xor		x1 = u1, v1		C			M I
74	;;
75	ld8		u3 = [up], 8		C			M01
76	ld8		v3 = [vp], 8		C			M01
77	xor		x2 = u2, v2		C			M I
78	mov		s = 0			C			M I
79  (p15)	br.cond.dptk	.grt4			C			B
80	;;
81	popcnt		c0 = x0			C			I0
82	xor		x3 = u3, v3		C			M I
83	;;
84	popcnt		c1 = x1			C			I0
85	;;
86	popcnt		c2 = x2			C			I0
87	br		.Lcj4			C			B
88
C n >= 8: keep loading while the first popcounts start.  br.cloop goes to
C .grt8 when ar.lc is nonzero (n >= 12); otherwise n = 8 is finished from
C .Lcj8.
89.grt4:	ld8		u0 = [up], 8		C			M01
90	ld8		v0 = [vp], 8		C			M01
91	xor		x1 = u1, v1		C			M I
92	;;
93	ld8		u1 = [up], 8		C			M01
94	ld8		v1 = [vp], 8		C			M01
95	xor		x2 = u2, v2		C			M I
96	;;
97	ld8		u2 = [up], 8		C			M01
98	ld8		v2 = [vp], 8		C			M01
99	popcnt		c0 = x0			C			I0
100	xor		x3 = u3, v3		C			M I
101	;;
102	ld8		u3 = [up], 8		C			M01
103	ld8		v3 = [vp], 8		C			M01
104	popcnt		c1 = x1			C			I0
105	xor		x0 = u0, v0		C			M I
106	br.cloop.dpnt	.grt8			C			B
107
108	popcnt		c2 = x2			C			I0
109	xor		x1 = u1, v1		C			M I
110	br		.Lcj8			C			B
111
C Performs the work of the first loop stage, then joins the loop at .LL00.
112.grt8:	ld8		u0 = [up], 8		C			M01
113	ld8		v0 = [vp], 8		C			M01
114	popcnt		c2 = x2			C			I0
115	xor		x1 = u1, v1		C			M I
116	br		.LL00			C			B
117
118
C n mod 4 = 1.  n = 1 returns the popcount of the single limb pair directly.
119.Lb01:	xor		x3 = r10, r11		C			M I
120	shr.u		n = n, 2		C			I0
121  (p15)	br.cond.dptk	.grt1			C			B
122	;;
123	popcnt		r8 = x3			C			I0
124	br.ret.sptk.many b0			C			B
125
C n >= 5: set ar.lc, clear s and preload four more limb pairs.  With
C ar.lc = 0 (n = 5) finish from .Lcj5, otherwise continue at .grt5.
126.grt1:	ld8		u0 = [up], 8		C			M01
127	ld8		v0 = [vp], 8		C			M01
128	mov.i		ar.lc = n		C			I0
129	;;
130	ld8		u1 = [up], 8		C			M01
131	ld8		v1 = [vp], 8		C			M01
132	mov		s = 0			C			M I
133	;;
134	ld8		u2 = [up], 8		C			M01
135	ld8		v2 = [vp], 8		C			M01
136	;;
137	ld8		u3 = [up], 8		C			M01
138	ld8		v3 = [vp], 8		C			M01
139	xor		x0 = u0, v0		C			M I
140	br.cloop.dpnt	.grt5			C			B
141
142	xor		x1 = u1, v1		C			M I
143	;;
144	popcnt		c3 = x3			C			I0
145	xor		x2 = u2, v2		C			M I
146	;;
147	popcnt		c0 = x0			C			I0
148	xor		x3 = u3, v3		C			M I
149	;;
150	popcnt		c1 = x1			C			I0
151	br		.Lcj5			C			B
152
C n >= 9: one more round of loads, then enter .Loop if iterations remain,
C else go to .Lend.
153.grt5:	ld8		u0 = [up], 8		C			M01
154	ld8		v0 = [vp], 8		C			M01
155	xor		x1 = u1, v1		C			M I
156	;;
157	ld8		u1 = [up], 8		C			M01
158	ld8		v1 = [vp], 8		C			M01
159	popcnt		c3 = x3			C			I0
160	xor		x2 = u2, v2		C			M I
161	;;
162	ld8		u2 = [up], 8		C			M01
163	ld8		v2 = [vp], 8		C			M01
164	popcnt		c0 = x0			C			I0
165	xor		x3 = u3, v3		C			M I
166	;;
167	ld8		u3 = [up], 8		C			M01
168	ld8		v3 = [vp], 8		C			M01
169	popcnt		c1 = x1			C			I0
170	xor		x0 = u0, v0		C			M I
171	br.cloop.dpnt	.Loop			C			B
172	br		.Lend			C			B
173
174
C n mod 4 = 2.  n = 2 is summed here and returned directly.
175.Lb10:	ld8		u3 = [up], 8		C			M01
176	ld8		v3 = [vp], 8		C			M01
177	xor		x2 = r10, r11		C			M I
178  (p15)	br.cond.dptk	.grt2			C			B
179	;;
180	xor		x3 = u3, v3		C			M I
181	;;
182	popcnt		c2 = x2			C			I0
183	;;
184	popcnt		c3 = x3			C			I0
185	;;
186	add		s = c2, c3		C			M I
187	br.ret.sptk.many b0			C			B
188
C n >= 6: form the loop count, set ar.lc, clear s and preload.  n = 6
C finishes from .Lcj6, larger counts continue at .grt6.
189.grt2:	ld8		u0 = [up], 8		C			M01
190	ld8		v0 = [vp], 8		C			M01
191	shr.u		n = n, 2		C			I0
192	;;
193	ld8		u1 = [up], 8		C			M01
194	ld8		v1 = [vp], 8		C			M01
195	mov.i		ar.lc = n		C			I0
196	mov		s = 0			C			M I
197	;;
198	ld8		u2 = [up], 8		C			M01
199	ld8		v2 = [vp], 8		C			M01
200	xor		x3 = u3, v3		C			M I
201	;;
202	ld8		u3 = [up], 8		C			M01
203	ld8		v3 = [vp], 8		C			M01
204	xor		x0 = u0, v0		C			M I
205	br.cloop.dptk	.grt6			C			B
206
207	popcnt		c2 = x2			C			I0
208	xor		x1 = u1, v1		C			M I
209	;;
210	popcnt		c3 = x3			C			I0
211	xor		x2 = u2, v2		C			M I
212	;;
213	popcnt		c0 = x0			C			I0
214	xor		x3 = u3, v3		C			M I
215	br		.Lcj6			C			B
216
C Three more rounds of loads, then join the loop at .LL10 (its last stage).
217.grt6:	ld8		u0 = [up], 8		C			M01
218	ld8		v0 = [vp], 8		C			M01
219	popcnt		c2 = x2			C			I0
220	xor		x1 = u1, v1		C			M I
221	;;
222	ld8		u1 = [up], 8		C			M01
223	ld8		v1 = [vp], 8		C			M01
224	popcnt		c3 = x3			C			I0
225	xor		x2 = u2, v2		C			M I
226	;;
227	ld8		u2 = [up], 8		C			M01
228	ld8		v2 = [vp], 8		C			M01
229	popcnt		c0 = x0			C			I0
230	xor		x3 = u3, v3		C			M I
231	br		.LL10			C			B
232
233
C n mod 4 = 3.  n = 3 is summed here and returned directly.
234.Lb11:	ld8		u2 = [up], 8		C			M01
235	ld8		v2 = [vp], 8		C			M01
236	shr.u		n = n, 2		C			I0
237	xor		x1 = r10, r11		C			M I
238	;;
239	ld8		u3 = [up], 8		C			M01
240	ld8		v3 = [vp], 8		C			M01
241	xor		x2 = u2, v2		C			M I
242  (p15)	br.cond.dptk	.grt3			C			B
243	;;
244	xor		x3 = u3, v3		C			M I
245	;;
246	popcnt		c1 = x1			C			I0
247	;;
248	popcnt		c2 = x2			C			I0
249	;;
250	popcnt		c3 = x3			C			I0
251	;;
252	add		s = c1, c2		C			M I
253	;;
254	add		s = s, c3		C			M I
255	br.ret.sptk.many b0			C			B
256
C n >= 7: set ar.lc, clear s and preload.  n = 7 finishes from .Lcj7,
C larger counts continue at .grt7.
257.grt3:	ld8		u0 = [up], 8		C			M01
258	ld8		v0 = [vp], 8		C			M01
259	mov.i		ar.lc = n		C			I0
260	;;
261	ld8		u1 = [up], 8		C			M01
262	ld8		v1 = [vp], 8		C			M01
263	mov		s = 0			C			M I
264	;;
265	ld8		u2 = [up], 8		C			M01
266	ld8		v2 = [vp], 8		C			M01
267	xor		x3 = u3, v3		C			M I
268	;;
269	ld8		u3 = [up], 8		C			M01
270	ld8		v3 = [vp], 8		C			M01
271	popcnt		c1 = x1			C			I0
272	xor		x0 = u0, v0		C			M I
273	br.cloop.dptk	.grt7			C			B
274	popcnt		c2 = x2			C			I0
275	xor		x1 = u1, v1		C			M I
276	;;
277	popcnt		c3 = x3			C			I0
278	xor		x2 = u2, v2		C			M I
279	br		.Lcj7			C			B
280
C Two more rounds of loads, then join the loop at .LL11.
281.grt7:	ld8		u0 = [up], 8		C			M01
282	ld8		v0 = [vp], 8		C			M01
283	popcnt		c2 = x2			C			I0
284	xor		x1 = u1, v1		C			M I
285	;;
286	ld8		u1 = [up], 8		C			M01
287	ld8		v1 = [vp], 8		C			M01
288	popcnt		c3 = x3			C			I0
289	xor		x2 = u2, v2		C			M I
290	br		.LL11			C			B
291
292
C Main loop, four limb pairs per iteration.  Every stage loads one limb
C pair, takes the popcnt of an earlier xor, adds an earlier popcnt into s
C and forms one new xor.  .LL00, .LL11 and .LL10 are the mid-loop entry
C points used by the feed-in code above.
293	ALIGN(32)
294.Loop:	ld8		u0 = [up], 8		C			M01
295	ld8		v0 = [vp], 8		C			M01
296	popcnt		c2 = x2			C			I0
297	add		s = s, c3		C			M I
298	xor		x1 = u1, v1		C			M I
299	nop.b		1			C			-
300	;;
301.LL00:	ld8		u1 = [up], 8		C			M01
302	ld8		v1 = [vp], 8		C			M01
303	popcnt		c3 = x3			C			I0
304	add		s = s, c0		C			M I
305	xor		x2 = u2, v2		C			M I
306	nop.b		1			C			-
307	;;
308.LL11:	ld8		u2 = [up], 8		C			M01
309	ld8		v2 = [vp], 8		C			M01
310	popcnt		c0 = x0			C			I0
311	add		s = s, c1		C			M I
312	xor		x3 = u3, v3		C			M I
313	nop.b		1			C			-
314	;;
315.LL10:	ld8		u3 = [up], 8		C			M01
316	ld8		v3 = [vp], 8		C			M01
317	popcnt		c1 = x1			C			I0
318	add		s = s, c2		C			M I
319	xor		x0 = u0, v0		C			M I
320	br.cloop.dptk	.Loop			C			B
321	;;
322
C Wind-down: no more loads.  Finishes the outstanding xor/popcnt/add work.
C .Lcj8 down to .Lcj4 are the entry points for n = 8 down to n = 4, which
C never enter the loop.
323.Lend:	popcnt		c2 = x2			C			I0
324	add		s = s, c3		C			M I
325	xor		x1 = u1, v1		C			M I
326	;;
327.Lcj8:	popcnt		c3 = x3			C			I0
328	add		s = s, c0		C			M I
329	xor		x2 = u2, v2		C			M I
330	;;
331.Lcj7:	popcnt		c0 = x0			C			I0
332	add		s = s, c1		C			M I
333	xor		x3 = u3, v3		C			M I
334	;;
335.Lcj6:	popcnt		c1 = x1			C			I0
336	add		s = s, c2		C			M I
337	;;
338.Lcj5:	popcnt		c2 = x2			C			I0
339	add		s = s, c3		C			M I
340	;;
341.Lcj4:	popcnt		c3 = x3			C			I0
342	add		s = s, c0		C			M I
343	;;
344	add		s = s, c1		C			M I
345	;;
346	add		s = s, c2		C			M I
347	;;
C Last add, then put the value saved in r2 back into ar.lc and return.
348	add		s = s, c3		C			M I
349	mov.i		ar.lc = r2		C			I0
350	br.ret.sptk.many b0			C			B
351EPILOGUE()
352ASM_END()
353