dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
dnl  store the result to an (n+1)-limb number.

dnl  Copyright 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.15
C Itanium 2:  1.625

C Note that this is very similar to addmul_2.asm.  If you change this file,
C please change that file too.

C TODO
C  * Clean up variable names, and try to decrease the number of distinct
C    registers used.
C  * Cleanup feed-in code to not require zeroing several registers.
C  * Make sure we don't depend on uninitialized predicate registers.
C  * We currently cross-jump very aggressively, at the expense of a few cycles
C    per operation.  Consider changing that.
C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
C    wind-down code.
C  * Ultimately rewrite.  The problem with this code is that it first uses a
C    loaded u value in one xma pair, then leaves it live over several unrelated
C    xma pairs, before it uses it again.  It should actually be quite possible
C    to just swap some aligned xma pairs around.  But we should then schedule
C    u loads further from the first use.
C INPUT PARAMETERS
define(`rp',`r32')
define(`up',`r33')
define(`n',`r34')
define(`vp',`r35')

C NOTE(review): srp is defined here but not referenced anywhere below.
define(`srp',`r3')

define(`v0',`f6')
define(`v1',`f7')

define(`s0',`r14')
define(`acc0',`r15')

define(`pr0_0',`r16')  define(`pr0_1',`r17')
define(`pr0_2',`r18')  define(`pr0_3',`r19')

define(`pr1_0',`r20')  define(`pr1_1',`r21')
define(`pr1_2',`r22')  define(`pr1_3',`r23')

define(`acc1_0',`r24')  define(`acc1_1',`r25')
define(`acc1_2',`r26')  define(`acc1_3',`r27')

dnl define(`',`r28')
dnl define(`',`r29')
dnl define(`',`r30')
dnl define(`',`r31')

define(`fp0b_0',`f8')  define(`fp0b_1',`f9')
define(`fp0b_2',`f10')  define(`fp0b_3',`f11')

define(`fp1a_0',`f12')  define(`fp1a_1',`f13')
define(`fp1a_2',`f14')  define(`fp1a_3',`f15')

define(`fp1b_0',`f32')  define(`fp1b_1',`f33')
define(`fp1b_2',`f34')  define(`fp1b_3',`f35')

define(`fp2a_0',`f36')  define(`fp2a_1',`f37')
define(`fp2a_2',`f38')  define(`fp2a_3',`f39')

define(`u_0',`f44')  define(`u_1',`f45')
define(`u_2',`f46')  define(`u_3',`f47')

define(`ux',`f49')
define(`uy',`f51')

C mpn_mul_2 (rp, up, n, vp)
C
C Multiplies the n limbs at up by the two limbs v0 = vp[0] and v1 = vp[1].
C Result limbs are stored through rp with post-increment and the last limb
C is left in r8 at the return.  In the n = 2 path this is three stores plus
C r8.
C
C Register roles, as used below:
C   ux, uy         first two u limbs, loaded before the dispatch
C   u_0 .. u_3     further u limbs, each loaded with post-increment of up
C   fp0b_i         xma.l  result of u * v0            low half
C   fp1a_i         xma.hu result of u * v0            high half
C   fp1b_i         xma.l  result of u * v1 + fp1a_i   low half
C   fp2a_i         xma.hu result of u * v1 + fp1a_i   high half
C   pr0_i          integer copy, via getf.sig, of fp0b_i
C   pr1_i          integer copy, via getf.sig, of fp1b_i
C   acc1_i         integer copy, via getf.sig, of fp2a_i
C   acc0, s0       sums of the carry chain; s0 is the value that gets stored
C   p6 .. p13      carry flags, kept as complementary predicate pairs
C   r2             copy of ar.lc taken at entry, written back before br.ret
C
C Carry idiom used throughout: a complementary predicate pair selects between
C an add with a +1 carry-in and a plain add.  The carry-out is then found by
C comparing the sum with one of the addends, using cmp.leu when there was a
C carry-in and cmp.ltu when there was not.
C
C NOTE(review): for n = 1 the dispatch below reaches .Lb01 with a loop count
C formed by an unsigned shift of n - 2, and uy is loaded from the second u
C limb unconditionally.  Presumably callers guarantee n >= 2 -- confirm.

ASM_START()
PROLOGUE(mpn_mul_2)
        .prologue
        .save           ar.lc, r2
        .body

C With the 32-bit ABI, extend the three pointer arguments with addp4 and
C zero-extend n with zxt4.
ifdef(`HAVE_ABI_32',
`       addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        addp4           vp = 0, vp              C M I
        zxt4            n = n                   C I
        ;;')

C Load ux, uy, v0 and v1, save ar.lc in r2, set ar.lc = (n - 2) >> 2, and
C dispatch on n mod 4.  The case n mod 4 = 0 falls through to .Lb00.
{.mmi                                           C 00
        ldf8            ux = [up], 8            C M
        ldf8            v0 = [vp], 8            C M
        mov.i           r2 = ar.lc              C I0
}{.mmi
        nop             0                       C M
        and             r14 = 3, n              C M I
        add             n = -2, n               C M I
        ;;
}{.mmi                                          C 01
        ldf8            uy = [up], 8            C M
        ldf8            v1 = [vp]               C M
        shr.u           n = n, 2                C I
}{.mmi
        nop             0                       C M
        cmp.eq          p10, p0 = 1, r14        C M I
        cmp.eq          p11, p0 = 2, r14        C M I
        ;;
}{.mmi                                          C 02
        nop             0                       C M
        cmp.eq          p12, p0 = 3, r14        C M I
        mov.i           ar.lc = n               C I0
}{.bbb
  (p10) br.dptk         .Lb01                   C B
  (p11) br.dptk         .Lb10                   C B
  (p12) br.dptk         .Lb11                   C B
        ;;
}

C Feed-in for n mod 4 = 0.  The cmp.ne of r0 with itself clears the first
C predicate of each pair and sets the second, i.e. no carry-in.
        ALIGN(32)
.Lb00:  ldf8            u_1 = [up], 8
        mov             acc1_2 = 0
        mov             pr1_2 = 0
        mov             pr0_3 = 0
        cmp.ne          p8, p9 = r0, r0
        ;;
        xma.l           fp0b_3 = ux, v0, f0
        cmp.ne          p12, p13 = r0, r0
        ldf8            u_2 = [up], 8
        xma.hu          fp1a_3 = ux, v0, f0
        br.cloop.dptk   .grt4

C Loop count was zero: no further loads, finish in the wind-down at .Lcj4.
        xma.l           fp0b_0 = uy, v0, f0
        xma.hu          fp1a_0 = uy, v0, f0
        ;;
        getf.sig        acc0 = fp0b_3
        xma.l           fp1b_3 = ux, v1, fp1a_3
        xma.hu          fp2a_3 = ux, v1, fp1a_3
        ;;
        xma.l           fp0b_1 = u_1, v0, f0
        xma.hu          fp1a_1 = u_1, v0, f0
        ;;
        getf.sig        pr0_0 = fp0b_0
        xma.l           fp1b_0 = uy, v1, fp1a_0
        xma.hu          fp2a_0 = uy, v1, fp1a_0
        ;;
        getf.sig        pr1_3 = fp1b_3
        getf.sig        acc1_3 = fp2a_3
        xma.l           fp0b_2 = u_2, v0, f0
        xma.hu          fp1a_2 = u_2, v0, f0
        br              .Lcj4

C Loop count was nonzero: keep loading and enter the main loop at .LL00.
.grt4:  xma.l           fp0b_0 = uy, v0, f0
        xma.hu          fp1a_0 = uy, v0, f0
        ;;
        getf.sig        acc0 = fp0b_3
        xma.l           fp1b_3 = ux, v1, fp1a_3
        ldf8            u_3 = [up], 8
        xma.hu          fp2a_3 = ux, v1, fp1a_3
        ;;
        xma.l           fp0b_1 = u_1, v0, f0
        xma.hu          fp1a_1 = u_1, v0, f0
        ;;
        getf.sig        pr0_0 = fp0b_0
        xma.l           fp1b_0 = uy, v1, fp1a_0
        xma.hu          fp2a_0 = uy, v1, fp1a_0
        ;;
        ldf8            u_0 = [up], 8
        getf.sig        pr1_3 = fp1b_3
        ;;
        getf.sig        acc1_3 = fp2a_3
        xma.l           fp0b_2 = u_2, v0, f0
        xma.hu          fp1a_2 = u_2, v0, f0
        br              .LL00


C Feed-in for n mod 4 = 1.
        ALIGN(32)
.Lb01:  ldf8            u_0 = [up], 8           C M
        mov             acc1_1 = 0              C M I
        mov             pr1_1 = 0               C M I
        mov             pr0_2 = 0               C M I
        cmp.ne          p6, p7 = r0, r0         C M I
        ;;
        xma.l           fp0b_2 = ux, v0, f0     C F
        cmp.ne          p10, p11 = r0, r0       C M I
        ldf8            u_1 = [up], 8           C M
        xma.hu          fp1a_2 = ux, v0, f0     C F
        ;;
        xma.l           fp0b_3 = uy, v0, f0     C F
        xma.hu          fp1a_3 = uy, v0, f0     C F
        ;;
        getf.sig        acc0 = fp0b_2           C M
        xma.l           fp1b_2 = ux, v1,fp1a_2  C F
        xma.hu          fp2a_2 = ux, v1,fp1a_2  C F
        ldf8            u_2 = [up], 8           C M
        br.cloop.dptk   .grt5

C Loop count was zero: finish in the wind-down at .Lcj5.
        xma.l           fp0b_0 = u_0, v0, f0    C F
        xma.hu          fp1a_0 = u_0, v0, f0    C F
        ;;
        getf.sig        pr0_3 = fp0b_3          C M
        xma.l           fp1b_3 = uy, v1,fp1a_3  C F
        xma.hu          fp2a_3 = uy, v1,fp1a_3  C F
        ;;
        getf.sig        pr1_2 = fp1b_2          C M
        getf.sig        acc1_2 = fp2a_2         C M
        xma.l           fp0b_1 = u_1, v0, f0    C F
        xma.hu          fp1a_1 = u_1, v0, f0    C F
        br              .Lcj5

C Loop count was nonzero: enter the main loop at .LL01.
.grt5:  xma.l           fp0b_0 = u_0, v0, f0
        xma.hu          fp1a_0 = u_0, v0, f0
        ;;
        getf.sig        pr0_3 = fp0b_3
        xma.l           fp1b_3 = uy, v1, fp1a_3
        xma.hu          fp2a_3 = uy, v1, fp1a_3
        ;;
        ldf8            u_3 = [up], 8
        getf.sig        pr1_2 = fp1b_2
        ;;
        getf.sig        acc1_2 = fp2a_2
        xma.l           fp0b_1 = u_1, v0, f0
        xma.hu          fp1a_1 = u_1, v0, f0
        br              .LL01


C We have two variants for n = 2.  They turn out to run at exactly the same
C speed.  But the first, odd variant might allow one cycle to be trimmed.
C The first variant is wrapped in an ifdef on an empty macro name, so it is
C kept for reference only; the second .Lb10 is the one that gets assembled.
        ALIGN(32)
ifdef(`',`
.Lb10:                                          C 03
        br.cloop.dptk   .grt2
                                                C 04
                                                C 05
                                                C 06
        xma.l           fp0b_1 = ux, v0, f0     C 0
        xma.hu          fp1a_1 = ux, v0, f0     C 1
        ;;                                      C 07
        xma.l           fp0b_2 = uy, v0, f0     C 1
        xma.l           fp1b_1 = ux, v1, f0     C 1
        ;;                                      C 08
        xma.hu          fp1a_2 = uy, v0, f0     C 2
        xma.hu          fp2a_1 = ux, v1, f0     C 2
        ;;                                      C 09
        xma.l           fp1b_2 = uy, v1, f0     C 2
        xma.hu          fp2a_2 = uy, v1, f0     C 3
        ;;                                      C 10
        getf.sig        r16 = fp1a_1
        stf8            [rp] = fp0b_1, 8
        ;;                                      C 11
        getf.sig        r17 = fp0b_2
                                                C 12
        getf.sig        r18 = fp1b_1
                                                C 13
        getf.sig        r19 = fp1a_2
                                                C 14
        getf.sig        r20 = fp2a_1
                                                C 15
        getf.sig        r21 = fp1b_2
        ;;                                      C 16
        getf.sig        r8 = fp2a_2
        add             r24 = r16, r17
        ;;                                      C 17
        cmp.ltu         p6, p7 = r24, r16
        add             r26 = r24, r18
        ;;                                      C 18
        cmp.ltu         p8, p9 = r26, r24
        ;;                                      C 19
        st8             [rp] = r26, 8
  (p6)  add             r25 = r19, r20, 1
  (p7)  add             r25 = r19, r20
        ;;                                      C 20
  (p8)  add             r27 = r25, r21, 1
  (p9)  add             r27 = r25, r21
  (p6)  cmp.leu         p10, p0 = r25, r19
  (p7)  cmp.ltu         p10, p0 = r25, r19
        ;;                                      C 21
  (p10) add             r8 = 1, r8
  (p8)  cmp.leu         p12, p0 = r27, r25
  (p9)  cmp.ltu         p12, p0 = r27, r25
        ;;                                      C 22
        st8             [rp] = r27, 8
        mov.i           ar.lc = r2
  (p12) add             r8 = 1, r8
        br.ret.sptk.many b0
')

C Feed-in for n mod 4 = 2.  When the loop count is zero the whole product is
C computed straight-line here: one stf8 and two st8 to rp, top limb in r8.
.Lb10:                                          C 03
        br.cloop.dptk   .grt2
                                                C 04
                                                C 05
                                                C 06
        xma.l           fp0b_1 = ux, v0, f0
        xma.hu          fp1a_1 = ux, v0, f0
        ;;                                      C 07
        xma.l           fp0b_2 = uy, v0, f0
        xma.hu          fp1a_2 = uy, v0, f0
        ;;                                      C 08
                                                C 09
                                                C 10
        stf8            [rp] = fp0b_1, 8
        xma.l           fp1b_1 = ux, v1, fp1a_1
        xma.hu          fp2a_1 = ux, v1, fp1a_1
        ;;                                      C 11
        getf.sig        acc0 = fp0b_2
        xma.l           fp1b_2 = uy, v1, fp1a_2
        xma.hu          fp2a_2 = uy, v1, fp1a_2
        ;;                                      C 12
                                                C 13
                                                C 14
        getf.sig        pr1_1 = fp1b_1
                                                C 15
        getf.sig        acc1_1 = fp2a_1
                                                C 16
        getf.sig        pr1_2 = fp1b_2
                                                C 17
        getf.sig        r8 = fp2a_2
        ;;                                      C 18
                                                C 19
        add             s0 = pr1_1, acc0
        ;;                                      C 20
        st8             [rp] = s0, 8
        cmp.ltu         p8, p9 = s0, pr1_1
C r31 = -1 - acc1_1, so the compares below against pr1_2 detect whether
C pr1_2 + acc1_1, plus the carry-in under p8, overflows.
        sub             r31 = -1, acc1_1
        ;;                                      C 21
        .pred.rel "mutex", p8, p9
  (p8)  add             acc0 = pr1_2, acc1_1, 1
  (p9)  add             acc0 = pr1_2, acc1_1
  (p8)  cmp.leu         p10, p0 = r31, pr1_2
  (p9)  cmp.ltu         p10, p0 = r31, pr1_2
        ;;                                      C 22
        st8             [rp] = acc0, 8
        mov.i           ar.lc = r2
  (p10) add             r8 = 1, r8
        br.ret.sptk.many b0


C n mod 4 = 2 with a nonzero loop count: enter the main loop at .LL10.
.grt2:  ldf8            u_3 = [up], 8
        mov             acc1_0 = 0
        mov             pr1_0 = 0
        ;;
        mov             pr0_1 = 0
        xma.l           fp0b_1 = ux, v0, f0
        ldf8            u_0 = [up], 8
        xma.hu          fp1a_1 = ux, v0, f0
        ;;
        xma.l           fp0b_2 = uy, v0, f0
        xma.hu          fp1a_2 = uy, v0, f0
        ;;
        getf.sig        acc0 = fp0b_1
        xma.l           fp1b_1 = ux, v1, fp1a_1
        xma.hu          fp2a_1 = ux, v1, fp1a_1
        ;;
        ldf8            u_1 = [up], 8
        xma.l           fp0b_3 = u_3, v0, f0
        xma.hu          fp1a_3 = u_3, v0, f0
        ;;
        getf.sig        pr0_2 = fp0b_2
        xma.l           fp1b_2 = uy, v1, fp1a_2
        xma.hu          fp2a_2 = uy, v1, fp1a_2
        ;;
        ldf8            u_2 = [up], 8
        getf.sig        pr1_1 = fp1b_1
        ;;
        getf.sig        acc1_1 = fp2a_1
        xma.l           fp0b_0 = u_0, v0, f0
        cmp.ne          p8, p9 = r0, r0
        cmp.ne          p12, p13 = r0, r0
        xma.hu          fp1a_0 = u_0, v0, f0
        br              .LL10


C Feed-in for n mod 4 = 3.
        ALIGN(32)
.Lb11:  mov             acc1_3 = 0
        mov             pr1_3 = 0
        mov             pr0_0 = 0
        cmp.ne          p6, p7 = r0, r0
        ;;
        ldf8            u_2 = [up], 8
        br.cloop.dptk   .grt3
        ;;
C Loop count was zero: finish in the wind-down at .Lcj3.
        xma.l           fp0b_0 = ux, v0, f0
        xma.hu          fp1a_0 = ux, v0, f0
        ;;
        cmp.ne          p10, p11 = r0, r0
        xma.l           fp0b_1 = uy, v0, f0
        xma.hu          fp1a_1 = uy, v0, f0
        ;;
        getf.sig        acc0 = fp0b_0
        xma.l           fp1b_0 = ux, v1, fp1a_0
        xma.hu          fp2a_0 = ux, v1, fp1a_0
        ;;
        xma.l           fp0b_2 = u_2, v0, f0
        xma.hu          fp1a_2 = u_2, v0, f0
        ;;
        getf.sig        pr0_1 = fp0b_1
        xma.l           fp1b_1 = uy, v1, fp1a_1
        xma.hu          fp2a_1 = uy, v1, fp1a_1
        ;;
        getf.sig        pr1_0 = fp1b_0
        getf.sig        acc1_0 = fp2a_0
        br              .Lcj3

C Loop count was nonzero: enter the main loop at .LL11.
.grt3:  xma.l           fp0b_0 = ux, v0, f0
        cmp.ne          p10, p11 = r0, r0
        ldf8            u_3 = [up], 8
        xma.hu          fp1a_0 = ux, v0, f0
        ;;
        xma.l           fp0b_1 = uy, v0, f0
        xma.hu          fp1a_1 = uy, v0, f0
        ;;
        getf.sig        acc0 = fp0b_0
        xma.l           fp1b_0 = ux, v1, fp1a_0
        ldf8            u_0 = [up], 8
        xma.hu          fp2a_0 = ux, v1, fp1a_0
        ;;
        xma.l           fp0b_2 = u_2, v0, f0
        xma.hu          fp1a_2 = u_2, v0, f0
        ;;
        getf.sig        pr0_1 = fp0b_1
        xma.l           fp1b_1 = uy, v1, fp1a_1
        xma.hu          fp2a_1 = uy, v1, fp1a_1
        ;;
        ldf8            u_1 = [up], 8
        getf.sig        pr1_0 = fp1b_0
        ;;
        getf.sig        acc1_0 = fp2a_0
        xma.l           fp0b_3 = u_3, v0, f0
        xma.hu          fp1a_3 = u_3, v0, f0
        br              .LL11


C *** MAIN LOOP START ***
C One iteration holds four ldf8 of u limbs, eight xma.l/xma.hu pairs and four
C st8 to rp.  The labels .LL01, .LL00, .LL11 and .LL10 are the entry points
C used by the feed-in code above.
        ALIGN(32)
.Loop:                                          C 00
        .pred.rel "mutex", p12, p13
        getf.sig        pr0_3 = fp0b_3
        xma.l           fp1b_3 = u_3, v1, fp1a_3
  (p12) add             s0 = pr1_0, acc0, 1
  (p13) add             s0 = pr1_0, acc0
        xma.hu          fp2a_3 = u_3, v1, fp1a_3
        ;;                                      C 01
        .pred.rel "mutex", p8, p9
        .pred.rel "mutex", p12, p13
        ldf8            u_3 = [up], 8
        getf.sig        pr1_2 = fp1b_2
  (p8)  cmp.leu         p6, p7 = acc0, pr0_1
  (p9)  cmp.ltu         p6, p7 = acc0, pr0_1
  (p12) cmp.leu         p10, p11 = s0, pr1_0
  (p13) cmp.ltu         p10, p11 = s0, pr1_0
        ;;                                      C 02
        .pred.rel "mutex", p6, p7
        getf.sig        acc1_2 = fp2a_2
        st8             [rp] = s0, 8
        xma.l           fp0b_1 = u_1, v0, f0
  (p6)  add             acc0 = pr0_2, acc1_0, 1
  (p7)  add             acc0 = pr0_2, acc1_0
        xma.hu          fp1a_1 = u_1, v0, f0
        ;;                                      C 03
.LL01:
        .pred.rel "mutex", p10, p11
        getf.sig        pr0_0 = fp0b_0
        xma.l           fp1b_0 = u_0, v1, fp1a_0
  (p10) add             s0 = pr1_1, acc0, 1
  (p11) add             s0 = pr1_1, acc0
        xma.hu          fp2a_0 = u_0, v1, fp1a_0
        ;;                                      C 04
        .pred.rel "mutex", p6, p7
        .pred.rel "mutex", p10, p11
        ldf8            u_0 = [up], 8
        getf.sig        pr1_3 = fp1b_3
  (p6)  cmp.leu         p8, p9 = acc0, pr0_2
  (p7)  cmp.ltu         p8, p9 = acc0, pr0_2
  (p10) cmp.leu         p12, p13 = s0, pr1_1
  (p11) cmp.ltu         p12, p13 = s0, pr1_1
        ;;                                      C 05
        .pred.rel "mutex", p8, p9
        getf.sig        acc1_3 = fp2a_3
        st8             [rp] = s0, 8
        xma.l           fp0b_2 = u_2, v0, f0
  (p8)  add             acc0 = pr0_3, acc1_1, 1
  (p9)  add             acc0 = pr0_3, acc1_1
        xma.hu          fp1a_2 = u_2, v0, f0
        ;;                                      C 06
.LL00:
        .pred.rel "mutex", p12, p13
        getf.sig        pr0_1 = fp0b_1
        xma.l           fp1b_1 = u_1, v1, fp1a_1
  (p12) add             s0 = pr1_2, acc0, 1
  (p13) add             s0 = pr1_2, acc0
        xma.hu          fp2a_1 = u_1, v1, fp1a_1
        ;;                                      C 07
        .pred.rel "mutex", p8, p9
        .pred.rel "mutex", p12, p13
        ldf8            u_1 = [up], 8
        getf.sig        pr1_0 = fp1b_0
  (p8)  cmp.leu         p6, p7 = acc0, pr0_3
  (p9)  cmp.ltu         p6, p7 = acc0, pr0_3
  (p12) cmp.leu         p10, p11 = s0, pr1_2
  (p13) cmp.ltu         p10, p11 = s0, pr1_2
        ;;                                      C 08
        .pred.rel "mutex", p6, p7
        getf.sig        acc1_0 = fp2a_0
        st8             [rp] = s0, 8
        xma.l           fp0b_3 = u_3, v0, f0
  (p6)  add             acc0 = pr0_0, acc1_2, 1
  (p7)  add             acc0 = pr0_0, acc1_2
        xma.hu          fp1a_3 = u_3, v0, f0
        ;;                                      C 09
.LL11:
        .pred.rel "mutex", p10, p11
        getf.sig        pr0_2 = fp0b_2
        xma.l           fp1b_2 = u_2, v1, fp1a_2
  (p10) add             s0 = pr1_3, acc0, 1
  (p11) add             s0 = pr1_3, acc0
        xma.hu          fp2a_2 = u_2, v1, fp1a_2
        ;;                                      C 10
        .pred.rel "mutex", p6, p7
        .pred.rel "mutex", p10, p11
        ldf8            u_2 = [up], 8
        getf.sig        pr1_1 = fp1b_1
  (p6)  cmp.leu         p8, p9 = acc0, pr0_0
  (p7)  cmp.ltu         p8, p9 = acc0, pr0_0
  (p10) cmp.leu         p12, p13 = s0, pr1_3
  (p11) cmp.ltu         p12, p13 = s0, pr1_3
        ;;                                      C 11
        .pred.rel "mutex", p8, p9
        getf.sig        acc1_1 = fp2a_1
        st8             [rp] = s0, 8
        xma.l           fp0b_0 = u_0, v0, f0
  (p8)  add             acc0 = pr0_1, acc1_3, 1
  (p9)  add             acc0 = pr0_1, acc1_3
        xma.hu          fp1a_0 = u_0, v0, f0
.LL10:  br.cloop.dptk   .Loop                   C 12
        ;;
C *** MAIN LOOP END ***

C Wind-down.  Same carry chain as the loop body but with no further loads of
C u limbs.  .Lcj6 is reached only by falling out of the loop; .Lcj5, .Lcj4
C and .Lcj3 are also branched to from the feed-in code when the loop count
C is zero.
.Lcj6:
        .pred.rel "mutex", p12, p13
        getf.sig        pr0_3 = fp0b_3
        xma.l           fp1b_3 = u_3, v1, fp1a_3
  (p12) add             s0 = pr1_0, acc0, 1
  (p13) add             s0 = pr1_0, acc0
        xma.hu          fp2a_3 = u_3, v1, fp1a_3
        ;;
        .pred.rel "mutex", p8, p9
        .pred.rel "mutex", p12, p13
        getf.sig        pr1_2 = fp1b_2
  (p8)  cmp.leu         p6, p7 = acc0, pr0_1
  (p9)  cmp.ltu         p6, p7 = acc0, pr0_1
  (p12) cmp.leu         p10, p11 = s0, pr1_0
  (p13) cmp.ltu         p10, p11 = s0, pr1_0
        ;;
        .pred.rel "mutex", p6, p7
        getf.sig        acc1_2 = fp2a_2
        st8             [rp] = s0, 8
        xma.l           fp0b_1 = u_1, v0, f0
  (p6)  add             acc0 = pr0_2, acc1_0, 1
  (p7)  add             acc0 = pr0_2, acc1_0
        xma.hu          fp1a_1 = u_1, v0, f0
        ;;
.Lcj5:
        .pred.rel "mutex", p10, p11
        getf.sig        pr0_0 = fp0b_0
        xma.l           fp1b_0 = u_0, v1, fp1a_0
  (p10) add             s0 = pr1_1, acc0, 1
  (p11) add             s0 = pr1_1, acc0
        xma.hu          fp2a_0 = u_0, v1, fp1a_0
        ;;
        .pred.rel "mutex", p6, p7
        .pred.rel "mutex", p10, p11
        getf.sig        pr1_3 = fp1b_3
  (p6)  cmp.leu         p8, p9 = acc0, pr0_2
  (p7)  cmp.ltu         p8, p9 = acc0, pr0_2
  (p10) cmp.leu         p12, p13 = s0, pr1_1
  (p11) cmp.ltu         p12, p13 = s0, pr1_1
        ;;
        .pred.rel "mutex", p8, p9
        getf.sig        acc1_3 = fp2a_3
        st8             [rp] = s0, 8
        xma.l           fp0b_2 = u_2, v0, f0
  (p8)  add             acc0 = pr0_3, acc1_1, 1
  (p9)  add             acc0 = pr0_3, acc1_1
        xma.hu          fp1a_2 = u_2, v0, f0
        ;;
.Lcj4:
        .pred.rel "mutex", p12, p13
        getf.sig        pr0_1 = fp0b_1
        xma.l           fp1b_1 = u_1, v1, fp1a_1
  (p12) add             s0 = pr1_2, acc0, 1
  (p13) add             s0 = pr1_2, acc0
        xma.hu          fp2a_1 = u_1, v1, fp1a_1
        ;;
        .pred.rel "mutex", p8, p9
        .pred.rel "mutex", p12, p13
        getf.sig        pr1_0 = fp1b_0
  (p8)  cmp.leu         p6, p7 = acc0, pr0_3
  (p9)  cmp.ltu         p6, p7 = acc0, pr0_3
  (p12) cmp.leu         p10, p11 = s0, pr1_2
  (p13) cmp.ltu         p10, p11 = s0, pr1_2
        ;;
        .pred.rel "mutex", p6, p7
        getf.sig        acc1_0 = fp2a_0
        st8             [rp] = s0, 8
  (p6)  add             acc0 = pr0_0, acc1_2, 1
  (p7)  add             acc0 = pr0_0, acc1_2
        ;;
.Lcj3:
        .pred.rel "mutex", p10, p11
        getf.sig        pr0_2 = fp0b_2
        xma.l           fp1b_2 = u_2, v1, fp1a_2
  (p10) add             s0 = pr1_3, acc0, 1
  (p11) add             s0 = pr1_3, acc0
        xma.hu          fp2a_2 = u_2, v1, fp1a_2
        ;;
        .pred.rel "mutex", p6, p7
        .pred.rel "mutex", p10, p11
        getf.sig        pr1_1 = fp1b_1
  (p6)  cmp.leu         p8, p9 = acc0, pr0_0
  (p7)  cmp.ltu         p8, p9 = acc0, pr0_0
  (p10) cmp.leu         p12, p13 = s0, pr1_3
  (p11) cmp.ltu         p12, p13 = s0, pr1_3
        ;;
        .pred.rel "mutex", p8, p9
        getf.sig        acc1_1 = fp2a_1
        st8             [rp] = s0, 8
  (p8)  add             acc0 = pr0_1, acc1_3, 1
  (p9)  add             acc0 = pr0_1, acc1_3
        ;;
C From here on no multiplies remain; only the carry chain is drained.
        .pred.rel "mutex", p12, p13
  (p12) add             s0 = pr1_0, acc0, 1
  (p13) add             s0 = pr1_0, acc0
        ;;
        .pred.rel "mutex", p8, p9
        .pred.rel "mutex", p12, p13
        getf.sig        pr1_2 = fp1b_2
  (p8)  cmp.leu         p6, p7 = acc0, pr0_1
  (p9)  cmp.ltu         p6, p7 = acc0, pr0_1
  (p12) cmp.leu         p10, p11 = s0, pr1_0
  (p13) cmp.ltu         p10, p11 = s0, pr1_0
        ;;
        .pred.rel "mutex", p6, p7
        getf.sig        acc1_2 = fp2a_2
        st8             [rp] = s0, 8
  (p6)  add             acc0 = pr0_2, acc1_0, 1
  (p7)  add             acc0 = pr0_2, acc1_0
        ;;
        .pred.rel "mutex", p10, p11
  (p10) add             s0 = pr1_1, acc0, 1
  (p11) add             s0 = pr1_1, acc0
        ;;
        .pred.rel "mutex", p6, p7
        .pred.rel "mutex", p10, p11
  (p6)  cmp.leu         p8, p9 = acc0, pr0_2
  (p7)  cmp.ltu         p8, p9 = acc0, pr0_2
  (p10) cmp.leu         p12, p13 = s0, pr1_1
  (p11) cmp.ltu         p12, p13 = s0, pr1_1
        ;;
        .pred.rel "mutex", p8, p9
        st8             [rp] = s0, 8
  (p8)  add             acc0 = pr1_2, acc1_1, 1
  (p9)  add             acc0 = pr1_2, acc1_1
        ;;
        .pred.rel "mutex", p8, p9
  (p8)  cmp.leu         p10, p11 = acc0, pr1_2
  (p9)  cmp.ltu         p10, p11 = acc0, pr1_2
  (p12) add             acc0 = 1, acc0
        ;;
        st8             [rp] = acc0, 8
C If the p12 increment wrapped acc0 to zero, that is also a carry into r8.
  (p12) cmp.eq.or       p10, p0 = 0, acc0
        mov             r8 = acc1_2
        ;;
        .pred.rel "mutex", p10, p11
  (p10) add             r8 = 1, r8
C Put back the ar.lc value that was saved in r2 at entry.
        mov.i           ar.lc = r2
        br.ret.sptk.many b0
EPILOGUE()
ASM_END()