dnl  IA-64 mpn_popcount -- mpn population count.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:       1.5
C Itanium 2:     1

C OVERVIEW
C Reads n 64-bit limbs with ld8 starting at address up, runs popcnt on each
C limb and accumulates the counts in s (r8), then returns through b0.
C
C The count is split by n mod 4 into four entry sequences (.Lb00, .Lb01,
C .Lb10, .Lb11).  Each one either finishes directly when n <= 4 or primes a
C four-limb-per-iteration software-pipelined loop and jumps into it at the
C stage matching the number of limbs already in flight.
C
C NOTE(review): the first limb is loaded unconditionally and n = 0 would take
C the .Lb00 small path, summing four limbs.  Callers presumably guarantee
C n >= 1 -- confirm against the mpn interface documentation.

C INPUT PARAMETERS
define(`up', `r32')
define(`n', `r33')

C REGISTER ROLES (as used below)
C   r10      first limb, loaded in the entry bundle
C   u0-u3    limbs in flight
C   c0-c3    popcnt results waiting to be added
C   s        running total
C   r9       prefetch pointer, starts at up + 512
C   r2       copy of ar.lc taken on entry
C   r14      n mod 4
C   p15/p14  n > 4 / n <= 4
define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
define(`s',`r8')


ASM_START()
PROLOGUE(mpn_popcount)
	.prologue
C Only when HAVE_ABI_32 is defined: addp4 rewrites the pointer argument and
C zxt4 zero-extends the low 32 bits of the count.
ifdef(`HAVE_ABI_32',
`	addp4		up = 0, up		C			M I
	zxt4		n = n			C			I
	;;
')

C Entry: load the first limb, save ar.lc, classify n.  After the stop, n holds
C the original count minus 5; the entry sequences shift it right by 2 to get
C the value loaded into ar.lc for br.cloop.
 {.mmi;	add		r9 = 512, up		C prefetch pointer	M I
	ld8		r10 = [up], 8		C load first limb	M01
	mov.i		r2 = ar.lc		C save ar.lc		I0
}{.mmi;	and		r14 = 3, n		C			M I
	cmp.lt		p15, p14 = 4, n		C small count?		M I
	add		n = -5, n		C			M I
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
	cmp.eq		p7, p0 = 2, r14		C			M I
	cmp.eq		p8, p0 = 3, r14		C			M I
}{.bbb
  (p6)	br.dptk		.Lb01			C			B
  (p7)	br.dptk		.Lb10			C			B
  (p8)	br.dptk		.Lb11			C			B
}


C n mod 4 = 0 (fall-through).  Loads limbs 2-4.  With n = 4 the four counts
C are summed through .Lcj4; ar.lc has been written with a value that is never
C used for looping on that path and is restored from r2 before returning.
.Lb00:	ld8		u1 = [up], 8		C			M01
	shr.u		n = n, 2		C			I0
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	popcnt		c0 = r10		C			I0
	mov.i		ar.lc = n		C			I0
	;;
	ld8		u3 = [up], 8		C			M01
	popcnt		c1 = u1			C			I0
  (p15)	br.cond.dptk	.grt4			C			B
	;;
	nop.m		0			C			-
	nop.m		0			C			-
	popcnt		c2 = u2			C			I0
	;;
	mov		s = c0			C			M I
	popcnt		c3 = u3			C			I0
	br		.Lcj4			C			B

C n mod 4 = 0 and n > 4: load the fifth limb and join the loop at .LL00.
.grt4:	ld8		u0 = [up], 8		C			M01
	popcnt		c2 = u2			C			I0
	br		.LL00			C			B


C n mod 4 = 1.  With n <= 4 (so n = 1) the count of the first limb is the
C result; ar.lc has not been modified yet, so the return needs no restore.
.Lb01:
	popcnt		s = r10			C			I0
  (p14)	br.ret.sptk.many b0			C			B

C n mod 4 = 1 and n > 4: s already holds the count of the first limb.  c3 is
C zeroed because the first add executed at .Loop or .Lend adds c3 before any
C popcnt has produced it.
.grt1:	ld8		u0 = [up], 8		C			M01
	shr.u		n = n, 2		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	mov.i		ar.lc = n		C			I0
	;;
	ld8		u2 = [up], 8		C			M01
	popcnt		c0 = u0			C			I0
	mov		c3 = 0			C			I0

	;;
	ld8		u3 = [up], 8		C			M01
	popcnt		c1 = u1			C			I0
	br.cloop.dptk	.Loop			C			B
	br		.Lend			C			B


C n mod 4 = 2.  With n = 2 the two counts are summed through .Lcj2.
.Lb10:	ld8		u3 = [up], 8		C			M01
	shr.u		n = n, 2		C			I0
  (p15)	br.cond.dptk	.grt2			C			B

	popcnt		s = r10			C			I0
	;;
	popcnt		c3 = u3			C			I0
	br		.Lcj2			C			B

C n mod 4 = 2 and n > 4: the first limb count goes to c2 and is added at
C .LL10, where the loop is entered.
.grt2:	ld8		u0 = [up], 8		C			M01
	mov.i		ar.lc = n		C			I0
	popcnt		c2 = r10		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	popcnt		c3 = u3			C			I0
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	popcnt		c0 = u0			C			I0
	br		.LL10			C			B


C n mod 4 = 3.  With n = 3 the three counts are summed through .Lcj3.
C NOTE(review): the mov s = 0 below looks redundant, since popcnt s = r10 in
C the next group overwrites s before anything reads it -- confirm before
C removing, as it also fills a slot in the group.
.Lb11:	ld8		u2 = [up], 8		C			M01
	shr.u		n = n, 2		C			I0
	mov		s = 0			C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	popcnt		s = r10			C			I0
  (p15)	br.cond.dptk	.grt3			C			B

	popcnt		c2 = u2			C			I0
	;;
	popcnt		c3 = u3			C			I0
	br		.Lcj3			C			B

C n mod 4 = 3 and n > 4: c1 is zeroed because .LL11, where the loop is
C entered, adds c1 before any popcnt has produced it.
.grt3:	ld8		u0 = [up], 8		C			M01
	popcnt		c2 = u2			C			I0
	mov.i		ar.lc = n		C			I0
	mov		c1 = 0
	;;
	ld8		u1 = [up], 8		C			M01
	popcnt		c3 = u3			C			I0
	br		.LL11			C			B


C Main loop, four limbs per trip.  Every stage loads one new limb, counts the
C limb loaded two stages earlier and adds the count produced three stages
C earlier.  .LL00, .LL11 and .LL10 are the mid-loop entry points used above.
C The lfetch advances r9 by 32 bytes, the same amount the four loads advance
C up per trip.
.Loop:	ld8		u0 = [up], 8		C			M01
	popcnt		c2 = u2			C			I0
	add		s = s, c3		C			M I
	;;
.LL00:	ld8		u1 = [up], 8		C			M01
	popcnt		c3 = u3			C			I0
	add		s = s, c0		C			M I
	;;
.LL11:	ld8		u2 = [up], 8		C			M01
	popcnt		c0 = u0			C			I0
	add		s = s, c1		C			M I
	;;
.LL10:	ld8		u3 = [up], 8		C			M01
	popcnt		c1 = u1			C			I0
	add		s = s, c2		C			M I
	lfetch		[r9], 32		C			M01
	nop.m		0			C			-
	br.cloop.dptk	.Loop			C			B
	;;

C Drain: count the last two loaded limbs (u2, u3) and add every count still
C outstanding.  .Lcj4, .Lcj3 and .Lcj2 are also the tails of the small-count
C paths; each label adds one more pending count before the shared return.
.Lend:	popcnt		c2 = u2			C			I0
	add		s = s, c3		C			M I
	;;
	popcnt		c3 = u3			C			I0
	add		s = s, c0		C			M I
	;;
.Lcj4:	add		s = s, c1		C			M I
	;;
.Lcj3:	add		s = s, c2		C			M I
	;;
.Lcj2:	add		s = s, c3		C			M I
	mov.i		ar.lc = r2		C restore ar.lc		I0
	br.ret.sptk.many b0			C			B
EPILOGUE()
ASM_END()