dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
dnl  add the result to a (n+1)-limb number.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.65
C Itanium 2:  1.625

C TODO
C  * Clean up variable names, and try to decrease the number of distinct
C    registers used.
C  * Clean up feed-in code to not require zeroing several registers.
C  * Make sure we don't depend on uninitialised predicate registers.
C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
C    wind-down code.
C  * Ultimately rewrite.  The problem with this code is that it first uses a
C    loaded u value in one xma pair, then leaves it live over several unrelated
C    xma pairs, before it uses it again.  It should actually be quite possible
C    to just swap some aligned xma pairs around.  But we should then schedule
C    u loads further from the first use.
C INPUT PARAMETERS
define(`rp',`r32')
define(`up',`r33')
define(`n',`r34')
define(`vp',`r35')

C Second pointer into the rp operand; it is set to rp+16 during feed-in and
C is used for all further loads of r limbs, while rp is used for the stores.
define(`srp',`r3')

C The two multiplier limbs, loaded from vp[0] and vp[1].
define(`v0',`f6')
define(`v1',`f7')

C s0 holds the limb about to be stored, acc0 the running column sum.
define(`s0',`r14')
define(`acc0',`r15')

C Integer copies (via getfsig) of the fp0b_k registers.
define(`pr0_0',`r16') define(`pr0_1',`r17')
define(`pr0_2',`r18') define(`pr0_3',`r19')

C Integer copies (via getfsig) of the fp1b_k registers.
define(`pr1_0',`r20') define(`pr1_1',`r21')
define(`pr1_2',`r22') define(`pr1_3',`r23')

C Integer copies (via getfsig) of the fp2a_k registers.
define(`acc1_0',`r24') define(`acc1_1',`r25')
define(`acc1_2',`r26') define(`acc1_3',`r27')

dnl define(`',`r28')
dnl define(`',`r29')
dnl define(`',`r30')
dnl define(`',`r31')

C fp0b_k = xma.l  of u*v0 + r       fp1a_k = xma.hu of u*v0 + r
C fp1b_k = xma.l  of u*v1 + fp1a_k  fp2a_k = xma.hu of u*v1 + fp1a_k
define(`fp0b_0',`f8') define(`fp0b_1',`f9')
define(`fp0b_2',`f10') define(`fp0b_3',`f11')

define(`fp1a_0',`f12') define(`fp1a_1',`f13')
define(`fp1a_2',`f14') define(`fp1a_3',`f15')

define(`fp1b_0',`f32') define(`fp1b_1',`f33')
define(`fp1b_2',`f34') define(`fp1b_3',`f35')

define(`fp2a_0',`f36') define(`fp2a_1',`f37')
define(`fp2a_2',`f38') define(`fp2a_3',`f39')

C Limbs loaded from the rp operand (r_k) and from the up operand (u_k).
define(`r_0',`f40') define(`r_1',`f41')
define(`r_2',`f42') define(`r_3',`f43')

define(`u_0',`f44') define(`u_1',`f45')
define(`u_2',`f46') define(`u_3',`f47')

C First two limbs of each operand, loaded by the feed-in code.
define(`rx',`f48')
define(`ux',`f49')
define(`ry',`f50')
define(`uy',`f51')

ASM_START()

C mpn_addmul_2s
C
C Alternate entry point.  It runs the same feed-in as mpn_addmul_2 (loads of
C ux, uy, rx, ry, v0, v1, loop count and n mod 4 dispatch), but instead of
C branching straight to L(b00)..L(b11) it first goes through L(x00)..L(x11).
C Each of those clears the predicate that guards the initial ux*v1 xma pair
C of the corresponding feed-in block and zeroes the matching fp2a register,
C then joins the code of mpn_addmul_2.  All labels branched to from here
C (L(b00), L(b01), L(b10), L(b11)) live in mpn_addmul_2 below.
PROLOGUE(mpn_addmul_2s)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',`
.mmi;		addp4	rp = 0, rp		C M I
		addp4	up = 0, up		C M I
		addp4	vp = 0, vp		C M I
.mmi;		nop	1
		nop	1
		zxt4	n = n			C I
	;;')

.mmi;		ldf8	ux = [up], 8		C M
		ldf8	v0 = [vp], 8		C M
		mov	r2 = ar.lc		C I0	save ar.lc, restored before return
.mmi;		ldf8	rx = [rp], 8		C M
		and	r14 = 3, n		C M I	r14 = n mod 4
		add	n = -2, n		C M I
	;;
.mmi;		ldf8	uy = [up], 8		C M
		ldf8	v1 = [vp]		C M
		shr.u	n = n, 2		C I0	loop count = (n-2) / 4
.mmi;		ldf8	ry = [rp], -8		C M	rp is back at the first limb
		cmp.eq	p14, p0 = 1, r14	C M I
		cmp.eq	p11, p0 = 2, r14	C M I
	;;
.mmi;		add	srp = 16, rp		C M I
		cmp.eq	p15, p0 = 3, r14	C M I
		mov	ar.lc = n		C I0
.bbb;	(p14)	br.dptk	L(x01)			C B
	(p11)	br.dptk	L(x10)			C B
	(p15)	br.dptk	L(x11)			C B
	;;

C cmp.ne of r0 with itself is always false, so the first target predicate of
C each compare below is cleared.
L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
		mov	fp2a_3 = f0
		br	L(b00)
L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_2 = f0
		br	L(b01)
L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_1 = f0
		br	L(b10)
L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
		mov	fp2a_0 = f0
		br	L(b11)

EPILOGUE()

C mpn_addmul_2
C
C Inputs:  rp (r32), up (r33), n (r34), vp (r35); see the defines above.
C Effect:  multiplies the n limbs at up by the two limbs at vp and adds the
C          product to the limbs at rp (see the description at the top of the
C          file).  n+1 limbs are stored through rp; the remaining most
C          significant limb is left in r8.
C Saves:   ar.lc in r2, restored before each br.ret.
C
C Structure: feed-in code selected by n mod 4 (L(b00), L(b01), L(b10),
C L(b11)), a main loop that stores four limbs per iteration (L(top)), and
C wind-down code (L(end), L(cj5), L(cj4), L(cj3)).  The loop count is
C (n-2) >> 2 using an unsigned shift.
C NOTE(review): n = 1 would give a huge loop count here, so callers
C presumably guarantee n >= 2 -- TODO confirm.
PROLOGUE(mpn_addmul_2)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',`
.mmi;		addp4	rp = 0, rp		C M I
		addp4	up = 0, up		C M I
		addp4	vp = 0, vp		C M I
.mmi;		nop	1
		nop	1
		zxt4	n = n			C I
	;;')

.mmi;		ldf8	ux = [up], 8		C M
		ldf8	v0 = [vp], 8		C M
		mov	r2 = ar.lc		C I0	save ar.lc, restored before return
.mmi;		ldf8	rx = [rp], 8		C M
		and	r14 = 3, n		C M I	r14 = n mod 4
		add	n = -2, n		C M I
	;;
.mmi;		ldf8	uy = [up], 8		C M
		ldf8	v1 = [vp]		C M
		shr.u	n = n, 2		C I0	loop count = (n-2) / 4
.mmi;		ldf8	ry = [rp], -8		C M	rp is back at the first limb
		cmp.eq	p14, p0 = 1, r14	C M I
		cmp.eq	p11, p0 = 2, r14	C M I
	;;
.mmi;		add	srp = 16, rp		C M I
		cmp.eq	p15, p6 = 3, r14	C M I	p6 = complement of p15
		mov	ar.lc = n		C I0
.bbb;	(p14)	br.dptk	L(b01)			C B
	(p11)	br.dptk	L(b10)			C B
	(p15)	br.dptk	L(b11)			C B
	;;

C Feed-in for n mod 4 = 0.  Reached by fall-through, so p15 is false and p6
C is true here unless mpn_addmul_2s cleared it.
	ALIGN(32)
L(b00):
.mmi;		ldf8	r_1 = [srp], 8
		ldf8	u_1 = [up], 8
		mov	acc1_2 = 0
.mmi;		mov	pr1_2 = 0
		mov	pr0_3 = 0
		cmp.ne	p8, p9 = r0, r0		C p8 = 0, p9 = 1: no carry-in
	;;
.mfi;		ldf8	r_2 = [srp], 8
		xma.l	fp0b_3 = ux, v0, rx
		cmp.ne	p12, p13 = r0, r0	C p12 = 0, p13 = 1: no carry-in
.mfb;		ldf8	u_2 = [up], 8
		xma.hu	fp1b_3 = ux, v0, rx
		br.cloop.dptk	L(gt4)

C Loop count was zero: no main loop iterations, go straight to wind-down.
		xma.l	fp0b_0 = uy, v0, ry
		xma.hu	fp1a_0 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_3
	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
	;;
		xma.l	fp0b_1 = u_1, v0, r_1
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = uy, v1, fp1a_0
		xma.hu	fp2a_0 = uy, v1, fp1a_0
	;;
		getfsig	pr1_3 = fp1b_3
		getfsig	acc1_3 = fp2a_3
		xma.l	fp0b_2 = u_2, v0, r_2
		xma.hu	fp1a_2 = u_2, v0, r_2
		br	L(cj4)

C Loop count was non-zero: enter the main loop at L(00).
L(gt4):		xma.l	fp0b_0 = uy, v0, ry
		xma.hu	fp1a_0 = uy, v0, ry
	;;
		ldf8	r_3 = [srp], 8
		getfsig	acc0 = fp0b_3
	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
		ldf8	u_3 = [up], 8
	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
	;;
		xma.l	fp0b_1 = u_1, v0, r_1
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
		ldf8	r_0 = [srp], 8
		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = uy, v1, fp1a_0
		xma.hu	fp2a_0 = uy, v1, fp1a_0
	;;
		ldf8	u_0 = [up], 8
		getfsig	pr1_3 = fp1b_3
		xma.l	fp0b_2 = u_2, v0, r_2
	;;
		getfsig	acc1_3 = fp2a_3
		xma.hu	fp1a_2 = u_2, v0, r_2
		br	L(00)


C Feed-in for n mod 4 = 1.  p14 is true here unless mpn_addmul_2s cleared it.
	ALIGN(32)
L(b01):
.mmi;		ldf8	r_0 = [srp], 8		C M
		ldf8	u_0 = [up], 8		C M
		mov	acc1_1 = 0		C M I
.mmi;		mov	pr1_1 = 0		C M I
		mov	pr0_2 = 0		C M I
		cmp.ne	p6, p7 = r0, r0		C M I	no carry-in
	;;
.mfi;		ldf8	r_1 = [srp], 8		C M
		xma.l	fp0b_2 = ux, v0, rx	C F
		cmp.ne	p10, p11 = r0, r0	C M I	no carry-in
.mfi;		ldf8	u_1 = [up], 8		C M
		xma.hu	fp1b_2 = ux, v0, rx	C F
		nop	1
	;;
		xma.l	fp0b_3 = uy, v0, ry	C F
		xma.hu	fp1a_3 = uy, v0, ry	C F
	;;
.mmf;		getfsig	acc0 = fp0b_2		C M
		ldf8	r_2 = [srp], 8		C M
	(p14)	xma.hu	fp2a_2 = ux, v1, fp1b_2	C F	suppressed for addmul_2s
.mfb;		ldf8	u_2 = [up], 8		C M
	(p14)	xma.l	fp1b_2 = ux, v1, fp1b_2	C F	suppressed for addmul_2s
		br.cloop.dptk	L(gt5)

C Loop count was zero: no main loop iterations, go straight to wind-down.
		xma.l	fp0b_0 = u_0, v0, r_0	C F
		xma.hu	fp1a_0 = u_0, v0, r_0	C F
	;;
		getfsig	pr0_3 = fp0b_3		C M
		xma.l	fp1b_3 = uy, v1, fp1a_3	C F
		xma.hu	fp2a_3 = uy, v1, fp1a_3	C F
	;;
		getfsig	pr1_2 = fp1b_2		C M
		getfsig	acc1_2 = fp2a_2		C M
		xma.l	fp0b_1 = u_1, v0, r_1	C F
		xma.hu	fp1a_1 = u_1, v0, r_1	C F
		br	L(cj5)

C Loop count was non-zero: enter the main loop at L(01).
L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
		xma.hu	fp1a_0 = u_0, v0, r_0
	;;
		getfsig	pr0_3 = fp0b_3
		ldf8	r_3 = [srp], 8
		xma.l	fp1b_3 = uy, v1, fp1a_3
		xma.hu	fp2a_3 = uy, v1, fp1a_3
	;;
		ldf8	u_3 = [up], 8
		getfsig	pr1_2 = fp1b_2
		xma.l	fp0b_1 = u_1, v0, r_1
	;;
		getfsig	acc1_2 = fp2a_2
		xma.hu	fp1a_1 = u_1, v0, r_1
		br	L(01)


C Feed-in for n mod 4 = 2.  p11 is true here unless mpn_addmul_2s cleared it.
C When the loop count is zero (n = 2) the whole operation is done right here,
C without using the shared wind-down code.
	ALIGN(32)
L(b10):		br.cloop.dptk	L(gt2)
		xma.l	fp0b_1 = ux, v0, rx
		xma.hu	fp1b_1 = ux, v0, rx
	;;
		xma.l	fp0b_2 = uy, v0, ry
		xma.hu	fp1a_2 = uy, v0, ry
	;;
		stf8	[rp] = fp0b_1, 8		C lowest result limb
	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	;;
		getfsig	acc0 = fp0b_2
		xma.l	fp1b_2 = uy, v1, fp1a_2
		xma.hu	fp2a_2 = uy, v1, fp1a_2
	;;
		getfsig	pr1_1 = fp1b_1
		getfsig	acc1_1 = fp2a_1
		mov	ar.lc = r2			C restore caller ar.lc
		getfsig	pr1_2 = fp1b_2
		getfsig	r8 = fp2a_2
	;;
		add	s0 = pr1_1, acc0
	;;
		st8	[rp] = s0, 8
		cmp.ltu	p8, p9 = s0, pr1_1		C p8 = carry out of the add
		sub	r31 = -1, acc1_1		C r31 = -1 - acc1_1
	;;
	.pred.rel "mutex", p8, p9
	(p8)	add	acc0 = pr1_2, acc1_1, 1
	(p9)	add	acc0 = pr1_2, acc1_1
	(p8)	cmp.leu	p10, p0 = r31, pr1_2		C p10 = carry out of the add
	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
	;;
		st8	[rp] = acc0, 8
	(p10)	add	r8 = 1, r8
		br.ret.sptk.many b0


C Loop count was non-zero: prime the pipeline and enter the loop at L(top).
L(gt2):
.mmi;		ldf8	r_3 = [srp], 8
		ldf8	u_3 = [up], 8
		mov	acc1_0 = 0
	;;
.mfi;		ldf8	r_0 = [srp], 8
		xma.l	fp0b_1 = ux, v0, rx
		mov	pr1_0 = 0
.mfi;		ldf8	u_0 = [up], 8
		xma.hu	fp1b_1 = ux, v0, rx
		mov	pr0_1 = 0
	;;
		xma.l	fp0b_2 = uy, v0, ry
		xma.hu	fp1a_2 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_1
		ldf8	r_1 = [srp], 8
	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
	;;
		ldf8	u_1 = [up], 8
		xma.l	fp0b_3 = u_3, v0, r_3
		xma.hu	fp1a_3 = u_3, v0, r_3
	;;
		getfsig	pr0_2 = fp0b_2
		ldf8	r_2 = [srp], 8
		xma.l	fp1b_2 = uy, v1, fp1a_2
		xma.hu	fp2a_2 = uy, v1, fp1a_2
	;;
		ldf8	u_2 = [up], 8
		getfsig	pr1_1 = fp1b_1
	;;
.mfi;		getfsig	acc1_1 = fp2a_1
		xma.l	fp0b_0 = u_0, v0, r_0
		cmp.ne	p8, p9 = r0, r0			C no carry-in
.mfb;		cmp.ne	p12, p13 = r0, r0		C no carry-in
		xma.hu	fp1a_0 = u_0, v0, r_0
		br.cloop.sptk.clr L(top)
		br.many	L(end)


C Feed-in for n mod 4 = 3.  p15 is true here unless mpn_addmul_2s cleared it.
	ALIGN(32)
L(b11):		ldf8	r_2 = [srp], 8
		mov	pr1_3 = 0
		mov	pr0_0 = 0
	;;
		ldf8	u_2 = [up], 8
		mov	acc1_3 = 0
		br.cloop.dptk	L(gt3)
	;;
C Loop count was zero: no main loop iterations, go straight to wind-down.
		cmp.ne	p6, p7 = r0, r0			C no carry-in
		xma.l	fp0b_0 = ux, v0, rx
		xma.hu	fp1b_0 = ux, v0, rx
	;;
		cmp.ne	p10, p11 = r0, r0		C no carry-in
		xma.l	fp0b_1 = uy, v0, ry
		xma.hu	fp1a_1 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_0
	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
	;;
C Note that this path pairs uy with v1 and u_2 with v0 in the next two xma
C pairs, unlike L(gt3) below which uses u_2*v0 + r_2 and uy*v1 + fp1a_1.
		xma.l	fp0b_2 = uy, v1, r_2
		xma.hu	fp1a_2 = uy, v1, r_2
	;;
		getfsig	pr0_1 = fp0b_1
		xma.l	fp1b_1 = u_2, v0, fp1a_1
		xma.hu	fp2a_1 = u_2, v0, fp1a_1
	;;
		getfsig	pr1_0 = fp1b_0
		getfsig	acc1_0 = fp2a_0
		br	L(cj3)

C Loop count was non-zero: enter the main loop at L(11).
L(gt3):		ldf8	r_3 = [srp], 8
		xma.l	fp0b_0 = ux, v0, rx
		cmp.ne	p10, p11 = r0, r0		C no carry-in
		ldf8	u_3 = [up], 8
		xma.hu	fp1b_0 = ux, v0, rx
		cmp.ne	p6, p7 = r0, r0			C no carry-in
	;;
		xma.l	fp0b_1 = uy, v0, ry
		xma.hu	fp1a_1 = uy, v0, ry
	;;
		getfsig	acc0 = fp0b_0
		ldf8	r_0 = [srp], 8
	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
		ldf8	u_0 = [up], 8
	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
	;;
		xma.l	fp0b_2 = u_2, v0, r_2
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;
		getfsig	pr0_1 = fp0b_1
		ldf8	r_1 = [srp], 8
		xma.l	fp1b_1 = uy, v1, fp1a_1
		xma.hu	fp2a_1 = uy, v1, fp1a_1
	;;
		ldf8	u_1 = [up], 8
		getfsig	pr1_0 = fp1b_0
	;;
		getfsig	acc1_0 = fp2a_0
		xma.l	fp0b_3 = u_3, v0, r_3
		xma.hu	fp1a_3 = u_3, v0, r_3
		br	L(11)


C *** MAIN LOOP START ***
C Each iteration stores four limbs through rp.  Carries are kept in predicate
C pairs: an add with carry-in uses the ", 1" form under the true predicate,
C and the following cmp.leu/cmp.ltu pair derives the carry-out.
C L(01), L(00) and L(11) are the entry points used by the feed-in code.
	ALIGN(32)
L(top):						C 00
	.pred.rel "mutex", p12, p13
		getfsig	pr0_3 = fp0b_3
		ldf8	r_3 = [srp], 8
		xma.l	fp1b_3 = u_3, v1, fp1a_3
	(p12)	add	s0 = pr1_0, acc0, 1
	(p13)	add	s0 = pr1_0, acc0
		xma.hu	fp2a_3 = u_3, v1, fp1a_3
	;;					C 01
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
		ldf8	u_3 = [up], 8
		getfsig	pr1_2 = fp1b_2
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;					C 02
	.pred.rel "mutex", p6, p7
		getfsig	acc1_2 = fp2a_2
		st8	[rp] = s0, 8
		xma.l	fp0b_1 = u_1, v0, r_1
	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;					C 03
L(01):
	.pred.rel "mutex", p10, p11
		getfsig	pr0_0 = fp0b_0
		ldf8	r_0 = [srp], 8
		xma.l	fp1b_0 = u_0, v1, fp1a_0
	(p10)	add	s0 = pr1_1, acc0, 1
	(p11)	add	s0 = pr1_1, acc0
		xma.hu	fp2a_0 = u_0, v1, fp1a_0
	;;					C 04
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
		ldf8	u_0 = [up], 8
		getfsig	pr1_3 = fp1b_3
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;					C 05
	.pred.rel "mutex", p8, p9
		getfsig	acc1_3 = fp2a_3
		st8	[rp] = s0, 8
		xma.l	fp0b_2 = u_2, v0, r_2
	(p8)	add	acc0 = pr0_3, acc1_1, 1
	(p9)	add	acc0 = pr0_3, acc1_1
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;					C 06
L(00):
	.pred.rel "mutex", p12, p13
		getfsig	pr0_1 = fp0b_1
		ldf8	r_1 = [srp], 8
		xma.l	fp1b_1 = u_1, v1, fp1a_1
	(p12)	add	s0 = pr1_2, acc0, 1
	(p13)	add	s0 = pr1_2, acc0
		xma.hu	fp2a_1 = u_1, v1, fp1a_1
	;;					C 07
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
		ldf8	u_1 = [up], 8
		getfsig	pr1_0 = fp1b_0
	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
	(p12)	cmp.leu	p10, p11 = s0, pr1_2
	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
	;;					C 08
	.pred.rel "mutex", p6, p7
		getfsig	acc1_0 = fp2a_0
		st8	[rp] = s0, 8
		xma.l	fp0b_3 = u_3, v0, r_3
	(p6)	add	acc0 = pr0_0, acc1_2, 1
	(p7)	add	acc0 = pr0_0, acc1_2
		xma.hu	fp1a_3 = u_3, v0, r_3
	;;					C 09
L(11):
	.pred.rel "mutex", p10, p11
		getfsig	pr0_2 = fp0b_2
		ldf8	r_2 = [srp], 8
		xma.l	fp1b_2 = u_2, v1, fp1a_2
	(p10)	add	s0 = pr1_3, acc0, 1
	(p11)	add	s0 = pr1_3, acc0
		xma.hu	fp2a_2 = u_2, v1, fp1a_2
	;;					C 10
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
		ldf8	u_2 = [up], 8
		getfsig	pr1_1 = fp1b_1
	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
	(p10)	cmp.leu	p12, p13 = s0, pr1_3
	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
	;;					C 11
	.pred.rel "mutex", p8, p9
		getfsig	acc1_1 = fp2a_1
		st8	[rp] = s0, 8
		xma.l	fp0b_0 = u_0, v0, r_0
	(p8)	add	acc0 = pr0_1, acc1_3, 1
	(p9)	add	acc0 = pr0_1, acc1_3
		xma.hu	fp1a_0 = u_0, v0, r_0
L(10):		br.cloop.sptk.clr L(top)	C 12
	;;
C *** MAIN LOOP END ***

C Wind-down: same dataflow as the loop body but without further loads.
C L(cj5), L(cj4) and L(cj3) are the entry points used by the feed-in paths
C that run no loop iterations.
L(end):
	.pred.rel "mutex", p12, p13
.mfi;		getfsig	pr0_3 = fp0b_3
		xma.l	fp1b_3 = u_3, v1, fp1a_3
	(p12)	add	s0 = pr1_0, acc0, 1
.mfi;	(p13)	add	s0 = pr1_0, acc0
		xma.hu	fp2a_3 = u_3, v1, fp1a_3
		nop	1
	;;
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
.mmi;		getfsig	pr1_2 = fp1b_2
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;
	.pred.rel "mutex", p6, p7
.mfi;		getfsig	acc1_2 = fp2a_2
		xma.l	fp0b_1 = u_1, v0, r_1
		nop	1
.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
		xma.hu	fp1a_1 = u_1, v0, r_1
	;;
L(cj5):
	.pred.rel "mutex", p10, p11
.mfi;		getfsig	pr0_0 = fp0b_0
		xma.l	fp1b_0 = u_0, v1, fp1a_0
	(p10)	add	s0 = pr1_1, acc0, 1
.mfi;	(p11)	add	s0 = pr1_1, acc0
		xma.hu	fp2a_0 = u_0, v1, fp1a_0
		nop	1
	;;
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
.mmi;		getfsig	pr1_3 = fp1b_3
		st8	[rp] = s0, 8
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;
	.pred.rel "mutex", p8, p9
.mfi;		getfsig	acc1_3 = fp2a_3
		xma.l	fp0b_2 = u_2, v0, r_2
		nop	1
.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
	(p9)	add	acc0 = pr0_3, acc1_1
		xma.hu	fp1a_2 = u_2, v0, r_2
	;;
L(cj4):
	.pred.rel "mutex", p12, p13
.mfi;		getfsig	pr0_1 = fp0b_1
		xma.l	fp1b_1 = u_1, v1, fp1a_1
	(p12)	add	s0 = pr1_2, acc0, 1
.mfi;	(p13)	add	s0 = pr1_2, acc0
		xma.hu	fp2a_1 = u_1, v1, fp1a_1
		nop	1
	;;
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
.mmi;		getfsig	pr1_0 = fp1b_0
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
	(p12)	cmp.leu	p10, p11 = s0, pr1_2
	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
	;;
	.pred.rel "mutex", p6, p7
.mmi;		getfsig	acc1_0 = fp2a_0
	(p6)	add	acc0 = pr0_0, acc1_2, 1
	(p7)	add	acc0 = pr0_0, acc1_2
	;;
L(cj3):
	.pred.rel "mutex", p10, p11
.mfi;		getfsig	pr0_2 = fp0b_2
		xma.l	fp1b_2 = u_2, v1, fp1a_2
	(p10)	add	s0 = pr1_3, acc0, 1
.mfi;	(p11)	add	s0 = pr1_3, acc0
		xma.hu	fp2a_2 = u_2, v1, fp1a_2
		nop	1
	;;
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
.mmi;		getfsig	pr1_1 = fp1b_1
		st8	[rp] = s0, 8
	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
	(p10)	cmp.leu	p12, p13 = s0, pr1_3
	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
	;;
	.pred.rel "mutex", p8, p9
.mmi;		getfsig	acc1_1 = fp2a_1
	(p8)	add	acc0 = pr0_1, acc1_3, 1
	(p9)	add	acc0 = pr0_1, acc1_3
	;;
	.pred.rel "mutex", p12, p13
.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
	(p13)	add	s0 = pr1_0, acc0
		nop	1
	;;
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
.mmi;		getfsig	pr1_2 = fp1b_2
		st8	[rp] = s0, 8
	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
	(p12)	cmp.leu	p10, p11 = s0, pr1_0
	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
	;;
	.pred.rel "mutex", p6, p7
.mmi;		getfsig	r8 = fp2a_2		C most significant limb, carry added below
	(p6)	add	acc0 = pr0_2, acc1_0, 1
	(p7)	add	acc0 = pr0_2, acc1_0
	;;
	.pred.rel "mutex", p10, p11
.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
	(p11)	add	s0 = pr1_1, acc0
	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
	;;
	.pred.rel "mutex", p10, p11
.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
	(p10)	cmp.leu	p12, p13 = s0, pr1_1
	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
	;;
	.pred.rel "mutex", p8, p9
.mmi;		st8	[rp] = s0, 8
	(p8)	add	acc0 = pr1_2, acc1_1, 1
	(p9)	add	acc0 = pr1_2, acc1_1
	;;
	.pred.rel "mutex", p8, p9
.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
	(p12)	add	acc0 = 1, acc0
	;;
.mmi;		st8	[rp] = acc0, 8
	(p12)	cmpeqor	p10, p0 = 0, acc0	C the +1 above wrapped to zero: carry
		nop	1
	;;
.mib;	(p10)	add	r8 = 1, r8
		mov	ar.lc = r2		C restore caller ar.lc
		br.ret.sptk.many b0
EPILOGUE()
ASM_END()