dnl  IA-64 mpn_hamdist -- mpn hamming distance.

dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:       2
C Itanium 2:     1

C INPUT PARAMETERS
define(`up', `r32')
define(`vp', `r33')
define(`n', `r34')

define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
define(`s',`r8')

C mpn_hamdist (up, vp, n)
C
C Returns in r8 the number of bit positions in which the n-limb operands at
C up and vp differ: each 8-byte limb pair is loaded with ld8, combined with
C xor, counted with popcnt, and the counts are summed.
C
C Register roles:
C   r10, r11   first limb pair, loaded before the count is classified
C   u0-u3      limbs loaded from up (pointer post-incremented by 8)
C   v0-v3      limbs loaded from vp (pointer post-incremented by 8)
C   x0-x3      xor of the corresponding u/v pair
C   c0-c3      popcnt of the corresponding x value
C   s          running sum and return value (r8)
C   r2         copy of ar.lc taken at entry
C   r14        n mod 4
C   p15        set when n > 4
C
C Structure: the count is split by n mod 4 into four feed-in paths (.Lb00,
C .Lb01, .Lb10, .Lb11).  Each path either finishes a short operand directly,
C or primes the pipeline and then joins the 4-way unrolled loop at one of
C .Loop/.LL00/.LL11/.LL10, or joins the wind-down code at .Lend/.LcjN.
C
C ar.lc is restored from r2 on the common exit after .Lcj4.  The direct
C returns for one, two and three limbs never write ar.lc and therefore
C return without restoring it.
C
C NOTE(review): the first limb pair is loaded unconditionally, and a count of
C zero would take the .Lb00 path and load four limb pairs.  Presumably callers
C guarantee n >= 1 -- confirm against the mpn interface contract.

ASM_START()
PROLOGUE(mpn_hamdist)
	.prologue
C With HAVE_ABI_32 defined, both pointers go through addp4 and the count is
C zero-extended from 32 bits before anything else uses them.
ifdef(`HAVE_ABI_32',
`	addp4		up = 0, up		C			M I
	addp4		vp = 0, vp		C			M I
	zxt4		n = n			C			I
	;;
')

C Entry: load the first limb pair, save ar.lc, and classify the count.
C   r14 = n mod 4 selects the feed-in path (0 falls through to .Lb00).
C   p15 = (n > 4) tells each feed-in path whether further limbs follow.
C   n is biased by -5 here; every path that programs ar.lc first shifts
C   this biased value right by 2.
 {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
	ld8		r11 = [vp], 8		C load first vlimb	M01
	mov.i		r2 = ar.lc		C save ar.lc		I0
}{.mmi;	and		r14 = 3, n		C			M I
	cmp.lt		p15, p0 = 4, n		C small count?		M I
	add		n = -5, n		C			M I
	;;
}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
	cmp.eq		p7, p0 = 2, r14		C			M I
	cmp.eq		p8, p0 = 3, r14		C			M I
}{.bbb
  (p6)	br.dptk		.Lb01			C			B
  (p7)	br.dptk		.Lb10			C			B
  (p8)	br.dptk		.Lb11			C			B
}


C n mod 4 == 0.  Three more limb pairs are loaded, so four pairs are in
C flight.  With p15 clear those four are popcounted and summed via .Lcj4;
C otherwise .grt4 keeps loading.
.Lb00:	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	shr.u		n = n, 2		C			I0
	xor		x0 = r10, r11		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x2 = u2, v2		C			M I
	mov		s = 0			C			M I
  (p15)	br.cond.dptk	.grt4			C			B
	;;
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c1 = x1			C			I0
	;;
	popcnt		c2 = x2			C			I0
	br		.Lcj4			C			B

C Four more limb pairs are loaded while the first four move through the
C xor/popcnt stages.  br.cloop falls through when ar.lc is zero, in which
C case nothing further is loaded and the wind-down is entered at .Lcj8.
.grt4:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	xor		x2 = u2, v2		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	xor		x0 = u0, v0		C			M I
	br.cloop.dpnt	.grt8			C			B

	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	br		.Lcj8			C			B

C Performs the work of the .Loop group (minus the add, as c3 is not yet
C valid) and enters the loop at its second group.
.grt8:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	br		.LL00			C			B


C n mod 4 == 1.  x3 is the xor of the first limb pair.  With p15 clear the
C popcnt of x3 is written straight to r8 and returned; ar.lc has not been
C written on this path.
.Lb01:	xor		x3 = r10, r11		C			M I
	shr.u		n = n, 2		C			I0
  (p15)	br.cond.dptk	.grt1			C			B
	;;
	popcnt		r8 = x3			C			I0
	br.ret.sptk.many b0			C			B

C Four more limb pairs are loaded.  If ar.lc is zero the fall-through code
C starts popcounting and joins the wind-down at .Lcj5.
.grt1:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x0 = u0, v0		C			M I
	br.cloop.dpnt	.grt5			C			B

	xor		x1 = u1, v1		C			M I
	;;
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c1 = x1			C			I0
	br		.Lcj5			C			B

C Four further limb pairs are loaded; afterwards either the loop is entered
C at its top (.Loop) or, with ar.lc exhausted, the wind-down at .Lend.
.grt5:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	xor		x0 = u0, v0		C			M I
	br.cloop.dpnt	.Loop			C			B
	br		.Lend			C			B


C n mod 4 == 2.  x2 is the xor of the first limb pair and the second pair
C is loaded into u3/v3.  With p15 clear the two popcounts are added and
C returned; ar.lc has not been written on this path.
.Lb10:	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x2 = r10, r11		C			M I
  (p15)	br.cond.dptk	.grt2			C			B
	;;
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c2 = x2			C			I0
	;;
	popcnt		c3 = x3			C			I0
	;;
	add		s = c2, c3		C			M I
	br.ret.sptk.many b0			C			B

C Four more limb pairs are loaded.  If ar.lc is zero the fall-through code
C joins the wind-down at .Lcj6.
.grt2:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	shr.u		n = n, 2		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x0 = u0, v0		C			M I
	br.cloop.dptk	.grt6			C			B

	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	br		.Lcj6			C			B

C Three further limb pairs are loaded, then the loop is entered at its
C fourth group (.LL10).
.grt6:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	xor		x3 = u3, v3		C			M I
	br		.LL10			C			B


C n mod 4 == 3.  x1 is the xor of the first limb pair; two more pairs are
C loaded.  With p15 clear the three popcounts are added and returned;
C ar.lc has not been written on this path.
.Lb11:	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	shr.u		n = n, 2		C			I0
	xor		x1 = r10, r11		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	xor		x2 = u2, v2		C			M I
  (p15)	br.cond.dptk	.grt3			C			B
	;;
	xor		x3 = u3, v3		C			M I
	;;
	popcnt		c1 = x1			C			I0
	;;
	popcnt		c2 = x2			C			I0
	;;
	popcnt		c3 = x3			C			I0
	;;
	add		s = c1, c2		C			M I
	;;
	add		s = s, c3		C			M I
	br.ret.sptk.many b0			C			B

C Four more limb pairs are loaded.  If ar.lc is zero the fall-through code
C joins the wind-down at .Lcj7.
.grt3:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	mov.i		ar.lc = n		C			I0
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	mov		s = 0			C			M I
	;;
	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	xor		x3 = u3, v3		C			M I
	;;
	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	xor		x0 = u0, v0		C			M I
	br.cloop.dptk	.grt7			C			B
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	br		.Lcj7			C			B

C Two further limb pairs are loaded, then the loop is entered at its third
C group (.LL11).
.grt7:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	xor		x1 = u1, v1		C			M I
	;;
	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	xor		x2 = u2, v2		C			M I
	br		.LL11			C			B


C Main loop, four limb pairs per iteration.  Each instruction group loads
C one new limb pair, popcounts a previously formed xor, adds a previously
C formed popcount into s, and xors a previously loaded pair.  .LL00, .LL11
C and .LL10 are the mid-loop entry points used by .grt8, .grt7 and .grt6.
	ALIGN(32)
.Loop:	ld8		u0 = [up], 8		C			M01
	ld8		v0 = [vp], 8		C			M01
	popcnt		c2 = x2			C			I0
	add		s = s, c3		C			M I
	xor		x1 = u1, v1		C			M I
	nop.b		1			C			-
	;;
.LL00:	ld8		u1 = [up], 8		C			M01
	ld8		v1 = [vp], 8		C			M01
	popcnt		c3 = x3			C			I0
	add		s = s, c0		C			M I
	xor		x2 = u2, v2		C			M I
	nop.b		1			C			-
	;;
.LL11:	ld8		u2 = [up], 8		C			M01
	ld8		v2 = [vp], 8		C			M01
	popcnt		c0 = x0			C			I0
	add		s = s, c1		C			M I
	xor		x3 = u3, v3		C			M I
	nop.b		1			C			-
	;;
.LL10:	ld8		u3 = [up], 8		C			M01
	ld8		v3 = [vp], 8		C			M01
	popcnt		c1 = x1			C			I0
	add		s = s, c2		C			M I
	xor		x0 = u0, v0		C			M I
	br.cloop.dptk	.Loop			C			B
	;;

C Wind-down: no further loads; the values still in flight are xored,
C popcounted and added into s.  .Lcj8, .Lcj7, .Lcj6, .Lcj5 and .Lcj4 are the
C entry points for the feed-in paths that never enter the loop (.grt4,
C .grt3, .grt2, .grt1 and .Lb00 respectively).
.Lend:	popcnt		c2 = x2			C			I0
	add		s = s, c3		C			M I
	xor		x1 = u1, v1		C			M I
	;;
.Lcj8:	popcnt		c3 = x3			C			I0
	add		s = s, c0		C			M I
	xor		x2 = u2, v2		C			M I
	;;
.Lcj7:	popcnt		c0 = x0			C			I0
	add		s = s, c1		C			M I
	xor		x3 = u3, v3		C			M I
	;;
.Lcj6:	popcnt		c1 = x1			C			I0
	add		s = s, c2		C			M I
	;;
.Lcj5:	popcnt		c2 = x2			C			I0
	add		s = s, c3		C			M I
	;;
.Lcj4:	popcnt		c3 = x3			C			I0
	add		s = s, c0		C			M I
	;;
	add		s = s, c1		C			M I
	;;
	add		s = s, c2		C			M I
	;;
C Final add, then ar.lc is put back from the copy taken at entry.
	add		s = s, c3		C			M I
	mov.i		ar.lc = r2		C			I0
	br.ret.sptk.many b0			C			B
EPILOGUE()
ASM_END()