dnl  IA-64 mpn_hamdist -- mpn hamming distance.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:       2
C Itanium 2:     1

C OVERVIEW
C   For each of the n limb pairs the code forms up[i] xor vp[i], takes popcnt
C   of that value, and accumulates the counts in s (r8), which is the value
C   left in r8 at every br.ret.
C
C   The main loop at .Loop handles four limb pairs per iteration and is
C   software pipelined: every stage issues two loads, one popcnt of an xor
C   formed earlier, one add of an earlier popcnt result, and one xor of a pair
C   loaded earlier.  The code before the loop dispatches on n mod 4, fills the
C   pipeline, and enters the loop at .Loop, .LL00, .LL11 or .LL10.  The code
C   from .Lend onwards drains the pipeline; the labels .Lcj8 down to .Lcj4 are
C   entry points into the drain for counts too small to reach the loop.
C
C   ar.lc is copied to r2 on entry and written back before the final return.
C   The three early returns (reached for n = 1, 2, 3) are taken before any
C   write to ar.lc, so they do not need the restore.
C
C   NOTE(review): n = 0 gives n mod 4 = 0 and takes the .Lb00 short path,
C   which loads four limb pairs.  Presumably callers guarantee n >= 1, to be
C   confirmed against the mpn interface documentation.
C
C   The trailing M01 / I0 / M I / B notes are the original per-instruction
C   annotations; presumably they name the execution unit kinds each
C   instruction can issue on.

C INPUT PARAMETERS
define(`up', `r32')
define(`vp', `r33')
define(`n', `r34')

C WORKING REGISTERS
C   u0-u3, v0-v3   limbs loaded from up and vp
C   x0-x3          xor of a loaded limb pair
C   c0-c3          popcnt of an xor result
C   s              running sum, also the return register
define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
define(`s',`r8')


ASM_START()
PROLOGUE(mpn_hamdist)
	.prologue
ifdef(`HAVE_ABI_32',
`	addp4		up = 0, up		C M I
	addp4		vp = 0, vp		C M I
	zxt4		n = n			C I
	;;
')

C Entry: load the first limb pair into r10/r11, save ar.lc in r2, compute
C r14 = n mod 4 for the dispatch, set p15 when 4 < n, and bias n by -5 so
C that the later shift right by 2 yields the value written to ar.lc.
 {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
	ld8		r11 = [vp], 8		C load first vlimb	M01
	mov.i		r2 = ar.lc		C save ar.lc		I0
}{.mmi;	and		r14 = 3, n		C			M I
	cmp.lt		p15, p0 = 4, n		C small count?		M I
	add		n = -5, n		C			M I
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
	cmp.eq		p7, p0 = 2, r14		C			M I
	cmp.eq		p8, p0 = 3, r14		C			M I
}{.bbb
  (p6)	br.dptk		.Lb01			C			B
  (p7)	br.dptk		.Lb10			C			B
  (p8)	br.dptk		.Lb11			C			B
}


C n mod 4 = 0.  Falls through to the four limb case when p15 is clear.
.Lb00:	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	shr.u		n = n, 2		C			I0
	xor		x0 = r10, r11		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x2 = u2, v2		C			M I
	mov		s = 0			C			M I
  (p15)	br.cond.dptk	.grt4			C			B
	;;
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c1 = x1			C			I0
	;;
	popcnt		c2 = x2			C			I0
	br		.Lcj4			C			B

.grt4:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	xor		x2 = u2, v2		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	xor		x0 = u0, v0		C			M I
	br.cloop.dpnt	.grt8			C			B

	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	br		.Lcj8			C			B

.grt8:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	br		.LL00			C			B


C n mod 4 = 1.  With p15 clear the first limb pair is the only one, and
C the result is returned straight from popcnt without touching ar.lc.
.Lb01:	xor		x3 = r10, r11		C			M I
	shr.u		n = n, 2		C			I0
  (p15)	br.cond.dptk	.grt1			C			B
	;;
	popcnt		r8 = x3			C			I0
	br.ret.sptk.many b0			C			B

.grt1:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x0 = u0, v0		C			M I
	br.cloop.dpnt	.grt5			C			B

	xor		x1 = u1, v1		C			M I
	;;
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c1 = x1			C			I0
	br		.Lcj5			C			B

.grt5:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	xor		x0 = u0, v0		C			M I
	br.cloop.dpnt	.Loop			C			B
	br		.Lend			C			B


C n mod 4 = 2.  With p15 clear the two limb result is summed and returned
C here without touching ar.lc.
.Lb10:	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x2 = r10, r11		C			M I
  (p15)	br.cond.dptk	.grt2			C			B
	;;
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c2 = x2			C			I0
	;;
	popcnt		c3 = x3			C			I0
	;;
	add		s = c2, c3		C			M I
	br.ret.sptk.many b0			C			B

.grt2:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	shr.u		n = n, 2		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x0 = u0, v0		C			M I
	br.cloop.dptk	.grt6			C			B

	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	br		.Lcj6			C			B

.grt6:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	br		.LL10			C			B


C n mod 4 = 3.  With p15 clear the three limb result is summed and returned
C here without touching ar.lc.
.Lb11:	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	shr.u		n = n, 2		C			I0
	xor		x1 = r10, r11		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x2 = u2, v2		C			M I
  (p15)	br.cond.dptk	.grt3			C			B
	;;
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c1 = x1			C			I0
	;;
	popcnt		c2 = x2			C			I0
	;;
	popcnt		c3 = x3			C			I0
	;;
	add		s = c1, c2		C			M I
	;;
	add		s = s, c3		C			M I
	br.ret.sptk.many b0			C			B

.grt3:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	xor		x0 = u0, v0		C			M I
	br.cloop.dptk	.grt7			C			B
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	br		.Lcj7			C			B

.grt7:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	br		.LL11			C			B


C Main loop, four limb pairs per iteration.  .LL00, .LL11 and .LL10 are
C the mid-loop entry points used by the feed-in code above.
	ALIGN(32)
.Loop:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	add		s = s, c3		C			M I
	xor		x1 = u1, v1		C			M I
	nop.b		1			C			-
	;;
.LL00:	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	add		s = s, c0		C			M I
	xor		x2 = u2, v2		C			M I
	nop.b		1			C			-
	;;
.LL11:	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	add		s = s, c1		C			M I
	xor		x3 = u3, v3		C			M I
	nop.b		1			C			-
	;;
.LL10:	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	add		s = s, c2		C			M I
	xor		x0 = u0, v0		C			M I
	br.cloop.dptk	.Loop			C			B
	;;

C Drain: no further loads, finish the pending xor, popcnt and add work.
.Lend:	popcnt		c2 = x2			C			I0
	add		s = s, c3		C			M I
	xor		x1 = u1, v1		C			M I
	;;
.Lcj8:	popcnt		c3 = x3			C			I0
	add		s = s, c0		C			M I
	xor		x2 = u2, v2		C			M I
	;;
.Lcj7:	popcnt		c0 = x0			C			I0
	add		s = s, c1		C			M I
	xor		x3 = u3, v3		C			M I
	;;
.Lcj6:	popcnt		c1 = x1			C			I0
	add		s = s, c2		C			M I
	;;
.Lcj5:	popcnt		c2 = x2			C			I0
	add		s = s, c3		C			M I
	;;
.Lcj4:	popcnt		c3 = x3			C			I0
	add		s = s, c0		C			M I
	;;
	add		s = s, c1		C			M I
	;;
	add		s = s, c2		C			M I
	;;
	add		s = s, c3		C			M I
	mov.i		ar.lc = r2		C restore ar.lc		I0
	br.ret.sptk.many b0			C			B
EPILOGUE()
ASM_END()