xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/popcount.asm (revision eceb233b9bd0dfebb902ed73b531ae6964fa3f9b)
1dnl  IA-64 mpn_popcount -- mpn population count.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2000-2005 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C           cycles/limb
36C Itanium:       1.5
37C Itanium 2:     1
38
39C INPUT PARAMETERS
C up is the pointer the routine loads limbs from (post-incremented by 8 on
C every ld8); n is the limb count, later reused to derive the ar.lc value.
40define(`up', `r32')
41define(`n', `r33')
42
C u0-u3: limbs loaded from [up] that have not been popcounted yet.
C c0-c3: popcnt results that have not been added into the sum yet.
C s:     running sum of the popcnt results (aliased to r8).
43define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
44define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
45define(`s',`r8')
46
47
C mpn_popcount -- sum of the popcnt of n limbs read from [up].
C
C The result is accumulated in s and is left there on return.
C NOTE(review): every path loads and counts at least one limb, and n = 0
C would fall into .Lb00 and count four limbs, so n >= 1 is presumably
C required of callers -- confirm against the mpn interface contract.
C
C Structure: the first limb is loaded into r10 at entry, then control is
C dispatched on n mod 4 to one of four feed-in blocks (.Lb00, .Lb01, .Lb10,
C .Lb11).  Each feed-in block either finishes a short operand through the
C .Lcj tail, or primes the u and c registers and joins the matching stage of
C the 4-way unrolled loop at .Loop.
C
C ar.lc is used as the loop counter.  Its entry value is kept in r2 and is
C written back just before the final return.
48ASM_START()
49PROLOGUE(mpn_popcount)
C NOTE(review): no .save or .body directive follows .prologue here, so the
C unwind info presumably does not record that ar.lc is kept in r2 -- confirm
C whether that matters for this routine (it contains no calls).
50	.prologue
51ifdef(`HAVE_ABI_32',
52`	addp4		up = 0, up		C			M I
53	nop.m		0
54	zxt4		n = n			C			I
55	;;
56')
57
C Entry setup:
C   r9  = up + 512, the lfetch pointer used in the loop
C   r10 = first limb (up is post-incremented past it)
C   r2  = saved ar.lc
C   r14 = n mod 4, selects the feed-in block
C   p15 = (n > 4), p14 = (n <= 4): loop needed versus short operand
C   n   = n - 5, so that the later shift right by 2 gives the number of
C         times br.cloop has to be taken
C The and/cmp.lt read n in the same group in which the add overwrites it;
C they see the original value.
58 {.mmi;	add		r9 = 512, up		C prefetch pointer	M I
59	ld8		r10 = [up], 8		C load first limb	M01
60	mov.i		r2 = ar.lc		C save ar.lc		I0
61}{.mmi;	and		r14 = 3, n		C			M I
62	cmp.lt		p15, p14 = 4, n		C small count?		M I
63	add		n = -5, n		C			M I
64	;;
C Dispatch: n mod 4 == 1, 2, 3 branch away; n mod 4 == 0 falls into .Lb00.
65}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
66	cmp.eq		p7, p0 = 2, r14		C			M I
67	cmp.eq		p8, p0 = 3, r14		C			M I
68}{.bbb
69  (p6)	br.dptk		.Lb01			C			B
70  (p7)	br.dptk		.Lb10			C			B
71  (p8)	br.dptk		.Lb11			C			B
72}
73
74
C n mod 4 == 0.  Limbs 0..3 end up in r10, u1, u2, u3 and their counts in
C c0..c3.  ar.lc is written before the p15 test, so for n = 4 it receives a
C value derived from n - 5 = -1; the loop is not entered on that path and
C ar.lc is restored at .Lcj2.
75.Lb00:	ld8		u1 = [up], 8		C			M01
76	shr.u		n = n, 2		C			I0
77	mov		s = 0			C			M I
78	;;
79	ld8		u2 = [up], 8		C			M01
80	popcnt		c0 = r10		C			I0
81	mov.i		ar.lc = n		C			I0
82	;;
83	ld8		u3 = [up], 8		C			M01
84	popcnt		c1 = u1			C			I0
85  (p15)	br.cond.dptk	.grt4			C			B
86	;;
C n = 4: start the sum from c0; the .Lcj4 tail adds c1, c2 and c3.
87	nop.m	0				C			-
88	nop.m	0				C			-
89	popcnt		c2 = u2			C			I0
90	;;
91	mov		s = c0			C			M I
92	popcnt		c3 = u3			C			I0
93	br		.Lcj4			C			B
94
C n >= 8: load limb 4 and join the loop at .LL00 with s = 0; c0 is added
C there.
95.grt4:	ld8		u0 = [up], 8		C			M01
96	popcnt		c2 = u2			C			I0
97	br		.LL00			C			B
98
99
C n mod 4 == 1.  The sum starts as the count of limb 0.  With p14 set
C (n <= 4, which here means n = 1) return at once; ar.lc has not been
C written on this path, so there is nothing to restore.
100.Lb01:
101	popcnt		s = r10			C			I0
102  (p14)	br.ret.sptk.many b0			C			B
103
C n >= 5: prime u0..u3 with limbs 1..4 and c0, c1 with the counts of u0, u1.
C c3 is zeroed because the first add at .Loop or .Lend consumes c3 before a
C real count has been placed there.  br.cloop enters the loop at its top
C when ar.lc is nonzero; otherwise go straight to the wind-down at .Lend.
104.grt1:	ld8		u0 = [up], 8		C			M01
105	shr.u		n = n, 2		C			I0
106	;;
107	ld8		u1 = [up], 8		C			M01
108	mov.i		ar.lc = n		C			I0
109	;;
110	ld8		u2 = [up], 8		C			M01
111	popcnt		c0 = u0			C			I0
112	mov		c3 = 0			C			I0
113
114	;;
115	ld8		u3 = [up], 8		C			M01
116	popcnt		c1 = u1			C			I0
117	br.cloop.dptk	.Loop			C			B
118	br		.Lend			C			B
119
120
C n mod 4 == 2.  u3 receives limb 1.
C Short case (n = 2): s = count of limb 0, c3 = count of limb 1, and .Lcj2
C adds them.
121.Lb10:	ld8		u3 = [up], 8		C			M01
122	shr.u		n = n, 2		C			I0
123  (p15)	br.cond.dptk	.grt2			C			B
124
125	popcnt		s = r10			C			I0
126	;;
127	popcnt		c3 = u3			C			I0
128	br		.Lcj2			C			B
129
C n >= 6: the count of limb 0 goes into c2, which the add at .LL10 consumes,
C so the sum itself starts at 0.  Limbs 2, 3, 4 go to u0, u1, u2.
130.grt2:	ld8		u0 = [up], 8		C			M01
131	mov.i		ar.lc = n		C			I0
132	popcnt		c2 = r10		C			I0
133	;;
134	ld8		u1 = [up], 8		C			M01
135	popcnt		c3 = u3			C			I0
136	mov		s = 0			C			M I
137	;;
138	ld8		u2 = [up], 8		C			M01
139	popcnt		c0 = u0			C			I0
140	br		.LL10			C			B
141
142
C n mod 4 == 3.  u2, u3 receive limbs 1, 2 and the sum starts as the count
C of limb 0.
C NOTE(review): the mov s = 0 below is overwritten by the popcnt into s in
C the next group without being read in between; it looks like a slot filler
C -- confirm before removing.
143.Lb11:	ld8		u2 = [up], 8		C			M01
144	shr.u		n = n, 2		C			I0
145	mov		s = 0			C			M I
146	;;
147	ld8		u3 = [up], 8		C			M01
148	popcnt		s = r10			C			I0
149  (p15)	br.cond.dptk	.grt3			C			B
150
C Short case (n = 3): count limbs 1 and 2; .Lcj3 adds c2 and then c3.
151	popcnt		c2 = u2			C			I0
152	;;
153	popcnt		c3 = u3			C			I0
154	br		.Lcj3			C			B
155
C n >= 7: u0, u1 receive limbs 3, 4.  c1 is zeroed because the add at .LL11
C consumes c1 before a real count has been placed there.
156.grt3:	ld8		u0 = [up], 8		C			M01
157	popcnt		c2 = u2			C			I0
158	mov.i		ar.lc = n		C			I0
159	mov		c1 = 0
160	;;
161	ld8		u1 = [up], 8		C			M01
162	popcnt		c3 = u3			C			I0
163	br		.LL11			C			B
164
165
C Main loop, four limbs per iteration.  Each of the four stages loads one
C limb, counts a limb loaded by an earlier stage, and adds a count produced
C by an earlier stage.  .LL00, .LL11 and .LL10 are the mid-loop entry points
C used by the feed-in blocks above.
C The lfetch touches memory at r9 and advances r9 by 32 bytes, the same
C distance up advances per iteration (four loads of 8 bytes each).
166.Loop:	ld8		u0 = [up], 8		C			M01
167	popcnt		c2 = u2			C			I0
168	add		s = s, c3		C			M I
169	;;
170.LL00:	ld8		u1 = [up], 8		C			M01
171	popcnt		c3 = u3			C			I0
172	add		s = s, c0		C			M I
173	;;
174.LL11:	ld8		u2 = [up], 8		C			M01
175	popcnt		c0 = u0			C			I0
176	add		s = s, c1		C			M I
177	;;
178.LL10:	ld8		u3 = [up], 8		C			M01
179	popcnt		c1 = u1			C			I0
180	add		s = s, c2		C			M I
181	lfetch		[r9], 32		C			M01
182	nop.m		0			C			-
183	br.cloop.dptk	.Loop			C			B
184	;;
185
C Wind-down: no more loads.  Count the two limbs still pending in u2 and u3
C while adding c3 and c0, then fall through the tail which adds c1, c2, c3
C in turn.  .Lcj4, .Lcj3 and .Lcj2 are also the join points for the n = 4,
C n = 3 and n = 2 short paths respectively.
186.Lend:	popcnt		c2 = u2			C			I0
187	add		s = s, c3		C			M I
188	;;
189	popcnt		c3 = u3			C			I0
190	add		s = s, c0		C			M I
191	;;
192.Lcj4:	add		s = s, c1		C			M I
193	;;
194.Lcj3:	add		s = s, c2		C			M I
195	;;
C Final add, restore the ar.lc value saved in r2 at entry, and return.
196.Lcj2:	add		s = s, c3		C			M I
197	mov.i		ar.lc = r2		C			I0
198	br.ret.sptk.many b0			C			B
199EPILOGUE()
200ASM_END()
201