xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/popcount.asm (revision fa28c6faa16e0b00edee7acdcaf4899797043def)
1dnl  IA-64 mpn_popcount -- mpn population count.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
6dnl  Inc.
7
8dnl  This file is part of the GNU MP Library.
9
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25C           cycles/limb
26C Itanium:       1.5
27C Itanium 2:     1
28
29C INPUT PARAMETERS
C up and n are m4 names for the two incoming registers: up is used as the
C address operand of every ld8 below, n is the count the code dispatches on.
30define(`up', `r32')
31define(`n', `r33')
32
C Working registers: u0-u3 are ld8 destinations (limbs), c0-c3 are popcnt
C destinations (per-limb bit counts), s is the running sum the counts are
C added into.
33define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
34define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
35define(`s',`r8')
36
37
38ASM_START()
C mpn_popcount: add up popcnt of every limb read through up and leave the
C total in s (r8) at br.ret.
C
C Registers as used below:
C   up (r32)    limb pointer, post-incremented by 8 on every ld8
C   n (r33)     limb count on entry, then reduced to (n - 5) >> 2 for ar.lc
C   r10         first limb, loaded before the dispatch on n mod 4
C   u0-u3       limbs in flight
C   c0-c3       popcnt results not yet added into s
C   r9          lfetch pointer, starts at up + 512, advances 32 per loop pass
C   r2          entry value of ar.lc, written back in the .Lcj2 group
C   r14         n mod 4
C   p15 / p14   p15 = (n > 4), p14 = (n <= 4)
C
C The loop handles four limbs per pass.  One of four entry sequences is
C chosen by n mod 4; each primes u0-u3 / c0-c3 and then either finishes
C directly (n <= 4) or continues into .Loop / .Lend (n > 4).
C
C The rightmost column (M I, M01, I0, B) is kept as found; presumably it
C names the unit types each instruction can issue on -- TODO confirm.
39PROLOGUE(mpn_popcount)
40	.prologue
41ifdef(`HAVE_ABI_32',
42`	addp4		up = 0, up		C 32-bit pointer to 64-bit address	M I
43	zxt4		n = n			C zero-extend 32-bit count		I
44	;;
45')
46
47 {.mmi;	add		r9 = 512, up		C prefetch pointer	M I
48	ld8		r10 = [up], 8		C load first limb	M01
49	mov.i		r2 = ar.lc		C save ar.lc		I0
50}{.mmi;	and		r14 = 3, n		C r14 = n mod 4		M I
51	cmp.lt		p15, p14 = 4, n		C small count?  p15 = n > 4, p14 = n <= 4	M I
52	add		n = -5, n		C n - 5, shifted right by 2 later	M I
53	;;
54}{.mmi;	cmp.eq		p6, p0 = 1, r14		C n mod 4 = 1?		M I
55	cmp.eq		p7, p0 = 2, r14		C n mod 4 = 2?		M I
56	cmp.eq		p8, p0 = 3, r14		C n mod 4 = 3?		M I
57}{.bbb
58  (p6)	br.dptk		.Lb01			C n mod 4 = 1		B
59  (p7)	br.dptk		.Lb10			C n mod 4 = 2		B
60  (p8)	br.dptk		.Lb11			C n mod 4 = 3		B
61}
62
63
C n mod 4 = 0 (fall-through).  r10, u1, u2, u3 receive the first four limbs.
64.Lb00:	ld8		u1 = [up], 8		C second limb		M01
65	shr.u		n = n, 2		C n = (n - 5) >> 2	I0
66	mov		s = 0			C sum starts at 0	M I
67	;;
68	ld8		u2 = [up], 8		C third limb		M01
69	popcnt		c0 = r10		C count first limb	I0
70	mov.i		ar.lc = n		C loop count, no br.cloop reached if n <= 4	I0
71	;;
72	ld8		u3 = [up], 8		C fourth limb		M01
73	popcnt		c1 = u1			C			I0
74  (p15)	br.cond.dptk	.grt4			C more than 4 limbs	B
75	;;
76	nop.m	0				C			-
77	nop.m	0				C			-
78	popcnt		c2 = u2			C			I0
79	;;
80	mov		s = c0			C s = c0; c1, c2, c3 are added from .Lcj4 on	M I
81	popcnt		c3 = u3			C			I0
82	br		.Lcj4			C			B
83
C n mod 4 = 0 and n > 4: c0, c1 already counted, join the loop at .LL00.
84.grt4:	ld8		u0 = [up], 8		C			M01
85	popcnt		c2 = u2			C			I0
86	br		.LL00			C			B
87
88
C n mod 4 = 1.  s takes the count of the first limb directly.
89.Lb01:
90	popcnt		s = r10			C			I0
91  (p14)	br.ret.sptk.many b0			C n <= 4: return; ar.lc not written on this path	B
92
C n mod 4 = 1 and n > 4: load four more limbs, then enter at the loop top.
93.grt1:	ld8		u0 = [up], 8		C			M01
94	shr.u		n = n, 2		C n = (n - 5) >> 2	I0
95	;;
96	ld8		u1 = [up], 8		C			M01
97	mov.i		ar.lc = n		C loop count		I0
98	;;
99	ld8		u2 = [up], 8		C			M01
100	popcnt		c0 = u0			C			I0
101	mov		c3 = 0			C first add of c3 at .Loop/.Lend adds 0	I0
102
103	;;
104	ld8		u3 = [up], 8		C			M01
105	popcnt		c1 = u1			C			I0
106	br.cloop.dptk	.Loop			C			B
107	br		.Lend			C ar.lc was 0: straight to the drain	B
108
109
C n mod 4 = 2.  r10 and u3 hold the first two limbs.
110.Lb10:	ld8		u3 = [up], 8		C second limb		M01
111	shr.u		n = n, 2		C n = (n - 5) >> 2	I0
112  (p15)	br.cond.dptk	.grt2			C more than 4 limbs	B
113
114	popcnt		s = r10			C s = count of first limb	I0
115	;;
116	popcnt		c3 = u3			C added at .Lcj2	I0
117	br		.Lcj2			C			B
118
C n mod 4 = 2 and n > 4: first limb count goes to c2, second to c3,
C then join the loop at .LL10 where c2 is added.
119.grt2:	ld8		u0 = [up], 8		C			M01
120	mov.i		ar.lc = n		C loop count		I0
121	popcnt		c2 = r10		C			I0
122	;;
123	ld8		u1 = [up], 8		C			M01
124	popcnt		c3 = u3			C			I0
125	mov		s = 0			C sum starts at 0	M I
126	;;
127	ld8		u2 = [up], 8		C			M01
128	popcnt		c0 = u0			C			I0
129	br		.LL10			C			B
130
131
C n mod 4 = 3.  r10, u2, u3 hold the first three limbs.
C NOTE(review): the mov s = 0 below is overwritten by popcnt s = r10 in the
C next group before s is read; it looks like a slot filler -- confirm.
132.Lb11:	ld8		u2 = [up], 8		C second limb		M01
133	shr.u		n = n, 2		C n = (n - 5) >> 2	I0
134	mov		s = 0			C			M I
135	;;
136	ld8		u3 = [up], 8		C third limb		M01
137	popcnt		s = r10			C s = count of first limb	I0
138  (p15)	br.cond.dptk	.grt3			C more than 4 limbs	B
139
140	popcnt		c2 = u2			C added at .Lcj3	I0
141	;;
142	popcnt		c3 = u3			C added at .Lcj2	I0
143	br		.Lcj3			C			B
144
C n mod 4 = 3 and n > 4: join the loop at .LL11.
145.grt3:	ld8		u0 = [up], 8		C			M01
146	popcnt		c2 = u2			C			I0
147	mov.i		ar.lc = n		C loop count		I0
148	mov		c1 = 0			C first add of c1 at .LL11 adds 0
149	;;
150	ld8		u1 = [up], 8		C			M01
151	popcnt		c3 = u3			C			I0
152	br		.LL11			C			B
153
154
C Main loop, four limbs per pass.  Each step loads one limb, counts the limb
C loaded two steps earlier and adds the count produced two steps earlier.
C .LL00, .LL11 and .LL10 are mid-iteration entry points for the setup code.
155.Loop:	ld8		u0 = [up], 8		C			M01
156	popcnt		c2 = u2			C			I0
157	add		s = s, c3		C			M I
158	;;
159.LL00:	ld8		u1 = [up], 8		C			M01
160	popcnt		c3 = u3			C			I0
161	add		s = s, c0		C			M I
162	;;
163.LL11:	ld8		u2 = [up], 8		C			M01
164	popcnt		c0 = u0			C			I0
165	add		s = s, c1		C			M I
166	;;
167.LL10:	ld8		u3 = [up], 8		C			M01
168	popcnt		c1 = u1			C			I0
169	add		s = s, c2		C			M I
170	lfetch		[r9], 32		C prefetch, pointer advances 32 per pass	M01
171	nop.m		0			C			-
172	br.cloop.dptk	.Loop			C repeat while ar.lc is not 0	B
173	;;
174
C Drain: u2 and u3 are loaded but not yet counted; c3, c0, c1 are counted
C but not yet added.
175.Lend:	popcnt		c2 = u2			C			I0
176	add		s = s, c3		C			M I
177	;;
178	popcnt		c3 = u3			C			I0
179	add		s = s, c0		C			M I
180	;;
C Shared tail.  The n <= 4 paths join here with 3, 2 or 1 counts left to add.
181.Lcj4:	add		s = s, c1		C			M I
182	;;
183.Lcj3:	add		s = s, c2		C			M I
184	;;
185.Lcj2:	add		s = s, c3		C final add, total is in s	M I
186	mov.i		ar.lc = r2		C restore ar.lc saved on entry	I0
187	br.ret.sptk.many b0			C			B
188EPILOGUE()
189ASM_END()
190