xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/popcount.asm (revision d25ffa98a4bfca1fe272f3c182496ec9934faac7)
1dnl  IA-64 mpn_popcount -- mpn population count.
2
3dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C           cycles/limb
24C Itanium:       1.5
25C Itanium 2:     1
26
27C INPUT PARAMETERS
C   up   address of the next limb to load; every ld8 below post-increments it
C        by 8, so limbs are 8 bytes each.
C   n    number of limbs.  The entry code rewrites it to n - 5 and each
C        lead-in then shifts it right by 2 to form the loop count.
28define(`up', `r32')
29define(`n', `r33')
30
C WORKING REGISTERS
C   u0..u3   limbs loaded from memory and waiting to be counted.
C   c0..c3   popcnt results waiting to be added to the total.
C   s        running total; it holds the result when the function returns.
31define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
32define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
33define(`s',`r8')
34
35
C mpn_popcount
C
C Returns in s the total number of set bits in the n 8-byte limbs starting at
C the address in up.
C
C Outline:
C   * The first limb is loaded into r10 unconditionally, before n is examined,
C     so callers presumably always pass n >= 1.  NOTE(review): confirm against
C     the mpn interface.
C   * n mod 4 selects one of four lead-in sequences: .Lb00, .Lb01, .Lb10 and
C     .Lb11.  Each lead-in either finishes a short operand directly (n <= 4,
C     predicate p14) or preloads limbs and counts and then joins the 4-way
C     unrolled loop at the matching stage (n > 4, predicate p15).
C   * The loop count written to ar.lc is (n - 5) >> 2.
C   * ar.lc is saved in r2 on entry and restored just before the final return.
C     The early return in .Lb01 is taken before ar.lc has been written.
C   * r9 starts 512 bytes past up and is used only as the lfetch address.
C
C NOTE(review): the rightmost column of the existing per-line comments (M I,
C M01, I0, B) looks like an execution unit annotation -- confirm.
36ASM_START()
37PROLOGUE(mpn_popcount)
38	.prologue
C With the 32-bit ABI, convert the pointer with addp4 and zero-extend the count
C with zxt4 before either of them is used.
39ifdef(`HAVE_ABI_32',
40`	addp4		up = 0, up		C			M I
41	zxt4		n = n			C			I
42	;;
43')
44
C Entry: start the first load, save ar.lc and classify n.
C   r14      = n mod 4, selects the lead-in; 0 falls through to .Lb00.
C   p15      = (4 < n), p14 is its complement.
C   n        = n - 5.
C The and, the cmp.lt and the add that rewrites n are in the same instruction
C group, so the first two still read the incoming value of n.
45 {.mmi;	add		r9 = 512, up		C prefetch pointer	M I
46	ld8		r10 = [up], 8		C load first limb	M01
47	mov.i		r2 = ar.lc		C save ar.lc		I0
48}{.mmi;	and		r14 = 3, n		C			M I
49	cmp.lt		p15, p14 = 4, n		C small count?		M I
50	add		n = -5, n		C			M I
51	;;
52}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
53	cmp.eq		p7, p0 = 2, r14		C			M I
54	cmp.eq		p8, p0 = 3, r14		C			M I
55}{.bbb
56  (p6)	br.dptk		.Lb01			C			B
57  (p7)	br.dptk		.Lb10			C			B
58  (p8)	br.dptk		.Lb11			C			B
59}
60
61
C n mod 4 == 0.  The first four limbs end up in r10, u1, u2, u3.
C   n <= 4: s is replaced by c0 and .Lcj4 adds c1, c2 and c3.
C   n  > 4: s stays 0, a fifth limb is loaded into u0 and the loop is joined
C           at .LL00 with c0, c1 and c2 already computed.
62.Lb00:	ld8		u1 = [up], 8		C			M01
63	shr.u		n = n, 2		C			I0
64	mov		s = 0			C			M I
65	;;
66	ld8		u2 = [up], 8		C			M01
67	popcnt		c0 = r10		C			I0
68	mov.i		ar.lc = n		C			I0
69	;;
70	ld8		u3 = [up], 8		C			M01
71	popcnt		c1 = u1			C			I0
72  (p15)	br.cond.dptk	.grt4			C			B
73	;;
74	nop.m	0				C			-
75	nop.m	0				C			-
76	popcnt		c2 = u2			C			I0
77	;;
78	mov		s = c0			C			M I
79	popcnt		c3 = u3			C			I0
80	br		.Lcj4			C			B
81
82.grt4:	ld8		u0 = [up], 8		C			M01
83	popcnt		c2 = u2			C			I0
84	br		.LL00			C			B
85
86
C n mod 4 == 1.  s starts as the count of the first limb.
C   n <= 4: return at once.  ar.lc has not been written on this path, so
C           there is nothing to restore.
C   n  > 4: load four more limbs into u0..u3, compute c0 and c1, and clear c3
C           because both .Loop and .Lend add c3 before anything has been
C           counted into it.  br.cloop enters the loop at its top; when the
C           loop count is zero control goes straight to .Lend instead.
87.Lb01:
88	popcnt		s = r10			C			I0
89  (p14)	br.ret.sptk.many b0			C			B
90
91.grt1:	ld8		u0 = [up], 8		C			M01
92	shr.u		n = n, 2		C			I0
93	;;
94	ld8		u1 = [up], 8		C			M01
95	mov.i		ar.lc = n		C			I0
96	;;
97	ld8		u2 = [up], 8		C			M01
98	popcnt		c0 = u0			C			I0
99	mov		c3 = 0			C			I0
100
101	;;
102	ld8		u3 = [up], 8		C			M01
103	popcnt		c1 = u1			C			I0
104	br.cloop.dptk	.Loop			C			B
105	br		.Lend			C			B
106
107
C n mod 4 == 2.  The second limb is loaded into u3.
C   n <= 4: s = count of the first limb, c3 = count of the second, and .Lcj2
C           adds c3.
C   n  > 4: c2 and c3 receive the counts of the first two limbs, s = 0, three
C           more limbs go to u0, u1, u2, c0 is computed, and the loop is
C           joined at .LL10.
108.Lb10:	ld8		u3 = [up], 8		C			M01
109	shr.u		n = n, 2		C			I0
110  (p15)	br.cond.dptk	.grt2			C			B
111
112	popcnt		s = r10			C			I0
113	;;
114	popcnt		c3 = u3			C			I0
115	br		.Lcj2			C			B
116
117.grt2:	ld8		u0 = [up], 8		C			M01
118	mov.i		ar.lc = n		C			I0
119	popcnt		c2 = r10		C			I0
120	;;
121	ld8		u1 = [up], 8		C			M01
122	popcnt		c3 = u3			C			I0
123	mov		s = 0			C			M I
124	;;
125	ld8		u2 = [up], 8		C			M01
126	popcnt		c0 = u0			C			I0
127	br		.LL10			C			B
128
129
C n mod 4 == 3.  The second and third limbs are loaded into u2 and u3, and s
C is set to the count of the first limb on both paths.
C NOTE(review): the mov s = 0 in the first group is overwritten by that popcnt
C before s is read, so it has no effect on the result.
C   n <= 4: c2 and c3 are computed and .Lcj3 adds them.
C   n  > 4: c1 is cleared because .LL11 adds c1 before anything has been
C           counted into it; two more limbs go to u0 and u1 and the loop is
C           joined at .LL11.
130.Lb11:	ld8		u2 = [up], 8		C			M01
131	shr.u		n = n, 2		C			I0
132	mov		s = 0			C			M I
133	;;
134	ld8		u3 = [up], 8		C			M01
135	popcnt		s = r10			C			I0
136  (p15)	br.cond.dptk	.grt3			C			B
137
138	popcnt		c2 = u2			C			I0
139	;;
140	popcnt		c3 = u3			C			I0
141	br		.Lcj3			C			B
142
143.grt3:	ld8		u0 = [up], 8		C			M01
144	popcnt		c2 = u2			C			I0
145	mov.i		ar.lc = n		C			I0
146	mov		c1 = 0
147	;;
148	ld8		u1 = [up], 8		C			M01
149	popcnt		c3 = u3			C			I0
150	br		.LL11			C			B
151
152
C Main loop: four limbs per iteration, software pipelined.  Each of the four
C stages
C   - loads the next limb,
C   - takes popcnt of the limb loaded two stages earlier,
C   - adds to s the count produced three stages earlier.
C .LL00, .LL11 and .LL10 are the mid-loop entry points used by the lead-ins
C above.  The last stage also issues one lfetch through r9 and advances r9 by
C 32 bytes, the amount of data consumed per iteration.
153.Loop:	ld8		u0 = [up], 8		C			M01
154	popcnt		c2 = u2			C			I0
155	add		s = s, c3		C			M I
156	;;
157.LL00:	ld8		u1 = [up], 8		C			M01
158	popcnt		c3 = u3			C			I0
159	add		s = s, c0		C			M I
160	;;
161.LL11:	ld8		u2 = [up], 8		C			M01
162	popcnt		c0 = u0			C			I0
163	add		s = s, c1		C			M I
164	;;
165.LL10:	ld8		u3 = [up], 8		C			M01
166	popcnt		c1 = u1			C			I0
167	add		s = s, c2		C			M I
168	lfetch		[r9], 32		C			M01
169	nop.m		0			C			-
170	br.cloop.dptk	.Loop			C			B
171	;;
172
C Drain the pipeline: count the two limbs still held in u2 and u3 and add the
C five counts that are still outstanding (c3, c0, c1, c2, c3).  No further
C loads are issued.  .Lcj4, .Lcj3 and .Lcj2 are also the exits used by the
C short operand paths, which arrive with 3, 2 or 1 counts left to add.
C Finally ar.lc is restored from r2 and the function returns with the total
C in s.
173.Lend:	popcnt		c2 = u2			C			I0
174	add		s = s, c3		C			M I
175	;;
176	popcnt		c3 = u3			C			I0
177	add		s = s, c0		C			M I
178	;;
179.Lcj4:	add		s = s, c1		C			M I
180	;;
181.Lcj3:	add		s = s, c2		C			M I
182	;;
183.Lcj2:	add		s = s, c3		C			M I
184	mov.i		ar.lc = r2		C			I0
185	br.ret.sptk.many b0			C			B
186EPILOGUE()
187ASM_END()
188