dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl  result to a second limb vector.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
C  * Work out final differences with mul_1.asm.  That function is 300 bytes
C    smaller than this due to better loop scheduling and thus simpler feed-in
C    code.
C ---------------------------------------------------------------------------
C mpn_addmul_1 (rp, up, n, vl)
C
C For each of the n limbs, form up[i] * vl + rp[i] with xma.l / xma.hu, chain
C the high half of one product into the low half of the next through the
C integer adder, and store the result through r20 (a copy of rp).  At return
C r8 holds the high half of the last product plus the final carry.
C
C Register use, as visible in the code below:
C   r2		saved ar.lc, restored before each br.ret
C   r20		store pointer (rp itself is post-incremented by the loads)
C   r15		n - 1
C   r14		n mod 4 during dispatch, afterwards a scratch sum
C   r16		scratch sum
C   r31		(n - 1) >> 2 while ar.lc is set up, later a high product
C   f6		vl
C   f7, f8	up[0] and rp[0]
C   f32-f35	up limbs		f44-f47	  rp limbs
C   f36-f39	low products		r24-r27	  the same as integers
C   f40-f43	high products		r28-r31	  the same as integers
C   p6/p7	carry / no carry into the sums written to r14
C   p8/p9	carry / no carry into the sums written to r16
C
C NOTE(review): n = 0 is not handled -- up[0] and rp[0] are loaded
C unconditionally and the loop count is (n - 1) >> 2 computed unsigned.
C Presumably callers guarantee n >= 1; confirm against the mpn contract.
C ---------------------------------------------------------------------------

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

ASM_START()
PROLOGUE(mpn_addmul_1)
C NOTE(review): .save is announced here but .body follows at once, so the
C mov.i that actually copies ar.lc to r2 sits in the body region.  Verify
C that the resulting unwind information is what is intended.
	.prologue
	.save	ar.lc, r2
	.body

C 32-bit ABI only: widen the two pointers and zero-extend the count.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
C Setup.  Keep a store pointer in r20, save ar.lc in r2, start loading the
C first limb pair, and put (n - 1) >> 2 into ar.lc.  cmp.ne r0, r0 is always
C false, so p6 = p8 = 0 and p7 = p9 = 1: the first sum has no carry-in.
C n mod 4 picks the feed-in variant: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11,
C and 1 falls through into .Lb01.
{.mmi
	adds		r15 = -1, n		C M I
	mov		r20 = rp		C M I
	mov.i		r2 = ar.lc		C I0
}
{.mmi
	ldf8		f7 = [up], 8		C M
	ldf8		f8 = [rp], 8		C M
	and		r14 = 3, n		C M I
	;;
}
{.mmi
	setf.sig	f6 = vl			C M2 M3
	cmp.eq		p10, p0 = 0, r14	C M I
	shr.u		r31 = r15, 2		C I0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	nop.i		0			C I
	;;
}
{.mii
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
	cmp.ne		p8, p9 = r0, r0		C M I
}
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C ---- n mod 4 = 1 ----------------------------------------------------------
C The br.cloop falls through only when ar.lc is 0, which here means n = 1:
C one product, store the low half, return the high half in r8.
.Lb01:	br.cloop.dptk	.grt1			C B

	xma.l		f39 = f7, f6, f8	C F
	xma.hu		f43 = f7, f6, f8	C F
	;;
	getf.sig	r8 = f43		C M2
	stf8		[r20] = f39		C M2 M3
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B

C n > 1.  Load four more limb pairs; the fall-through after the second
C br.cloop is the n = 5 case, which joins the wind-down at .Lcj5.
.grt1:
	ldf8		f32 = [up], 8
	ldf8		f44 = [rp], 8
	;;
	ldf8		f33 = [up], 8
	ldf8		f45 = [rp], 8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt5

	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	stf8		[r20] = f39, 8
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	getf.sig	r24 = f36
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	getf.sig	r25 = f37
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	getf.sig	r26 = f38
	br		.Lcj5

C n > 5.  r30 = 0 stands in for the missing high half below the first limb,
C so the first sum at .Loop is 0 + low product.  Enter the loop at the top,
C or go straight to .Le0 when ar.lc has run out.
.grt5:
	mov		r30 = 0
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36
	;;
	getf.sig	r28 = f40
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37
	br.cloop.dptk	.Loop
	br		.Le0


C ---- n mod 4 = 2 ----------------------------------------------------------
C Fall-through after the br.cloop is n = 2: two products, first low limb
C stored directly, the rest finished at .Lcj2.
.Lb10:	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt2

	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r30 = f42
	stf8		[r20] = f38, 8
	getf.sig	r27 = f39
	getf.sig	r8 = f43
	br		.Lcj2

C n > 2.  The fall-through after the second br.cloop is n = 6 (-> .Lcj6).
.grt2:
	ldf8		f32 = [up], 8
	ldf8		f44 = [rp], 8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt6

	stf8		[r20] = f38, 8
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	getf.sig	r24 = f36
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	getf.sig	r25 = f37
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C n > 6.  r29 = 0 is the zero high half for the first sum, which is formed
C at loop entry point .LL10.
.grt6:
	mov		r29 = 0
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36
	br		.LL10


C ---- n mod 4 = 3 ----------------------------------------------------------
C Fall-through after the br.cloop is n = 3: three products, first low limb
C stored directly, the rest finished at .Lcj3.
.Lb11:	ldf8		f34 = [up], 8
	ldf8		f46 = [rp], 8
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt3
	;;

	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	stf8		[r20] = f37, 8
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	getf.sig	r8 = f43
	br		.Lcj3

C n > 3.  The fall-through after the second br.cloop is n = 7 (-> .Lcj7).
C On that path the first low limb is stored with stf8 and r25 is reloaded
C from f37 again before it is next read, so the getf.sig marked FIXME is
C only consumed on the .grt7 path.
.grt3:
	ldf8		f32 = [up], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37		C FIXME
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt7

	getf.sig	r29 = f41
	stf8		[r20] = f37, 8		C FIXME
	xma.l		f36 = f32, f6, f44
	getf.sig	r26 = f38
	xma.hu		f40 = f32, f6, f44
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	getf.sig	r27 = f39
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C n > 7.  r28 = 0 is the zero high half for the first sum, which is formed
C at loop entry point .LL11.
.grt7:
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
	mov		r28 = 0
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	br		.LL11


C ---- n mod 4 = 0 ----------------------------------------------------------
C Fall-through after the br.cloop is n = 4: four products, first low limb
C stored directly, the rest finished at .Lcj4.
.Lb00:	ldf8		f33 = [up], 8
	ldf8		f45 = [rp], 8
	;;
	ldf8		f34 = [up], 8
	ldf8		f46 = [rp], 8
	;;
	ldf8		f35 = [up], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f47 = [rp], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	stf8		[r20] = f36, 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	br		.Lcj4

C n > 4.  The fall-through after the second br.cloop is n = 8 (-> .Lcj8).
C On that path the first low limb is stored with stf8 and r24 is reloaded
C from f36 at .Lcj8, so the getf.sig marked FIXME is only consumed on the
C .grt8 path.
.grt4:
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36		C FIXME
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt8

	getf.sig	r29 = f41
	stf8		[r20] = f36, 8		C FIXME
	xma.l		f36 = f32, f6, f44
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	xma.hu		f40 = f32, f6, f44
	;;
	xma.l		f37 = f33, f6, f45
	getf.sig	r27 = f39
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C n > 8.  r31 = 0 is the zero high half for the first sum, which is formed
C at loop entry point .LL00.
.grt8:
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
	mov		r31 = 0
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	br		.LL00


C Four limbs per iteration, in four similar quarters.  Each quarter adds a
C high product to the next low product (plus 1 when the carry predicate from
C the previous quarter is set), derives the next carry predicate pair with
C cmp.leu / cmp.ltu against the low product, stores the sum through r20,
C and issues the loads and multiplies for later limbs.  .LL00, .LL11 and
C .LL10 are mid-loop entry points used by the feed-in code above.
C *** MAIN LOOP START ***
	ALIGN(32)				C insn	fed	cycle #
.Loop:
	.pred.rel "mutex", p6, p7		C num	by	i1 i2
	getf.sig	r29 = f41		C 00	16	0   0
	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
  (p6)	add		r14 = r30, r27, 1	C 02		0   0
	ldf8		f47 = [rp], 8		C 03		0   0
	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
  (p7)	add		r14 = r30, r27		C 05		0   0
	;;
	.pred.rel "mutex", p6, p7
	ldf8		f32 = [up], 8		C 06		1   1
  (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
  (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
	getf.sig	r26 = f38		C 09	25	2   1
	st8		[r20] = r14, 8		C 10		2   1
	nop.b		0			C 11		2   1
	;;
.LL00:
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42		C 12	28	3   2
	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
  (p8)	add		r16 = r31, r24, 1	C 14		3   2
	ldf8		f44 = [rp], 8		C 15		3   2
	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
  (p9)	add		r16 = r31, r24		C 17		3   2
	;;
	.pred.rel "mutex", p8, p9
	ldf8		f33 = [up], 8		C 18		4   3
  (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
  (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
	getf.sig	r27 = f39		C 21	37	5   3
	st8		[r20] = r16, 8		C 22		5   3
	nop.b		0			C 23		5   3
	;;
.LL11:
	.pred.rel "mutex", p6, p7
	getf.sig	r31 = f43		C 24	40	6   4
	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
  (p6)	add		r14 = r28, r25, 1	C 26		6   4
	ldf8		f45 = [rp], 8		C 27		6   4
	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
  (p7)	add		r14 = r28, r25		C 29		6   4
	;;
	.pred.rel "mutex", p6, p7
	ldf8		f34 = [up], 8		C 30		7   5
  (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
  (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
	getf.sig	r24 = f36		C 33	01	8   5
	st8		[r20] = r14, 8		C 34		8   5
	nop.b		0			C 35		8   5
	;;
.LL10:
	.pred.rel "mutex", p8, p9
	getf.sig	r28 = f40		C 36	04	9   6
	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
  (p8)	add		r16 = r29, r26, 1	C 38		9   6
	ldf8		f46 = [rp], 8		C 39		9   6
	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
  (p9)	add		r16 = r29, r26		C 41		9   6
	;;
	.pred.rel "mutex", p8, p9
	ldf8		f35 = [up], 8		C 42		10  7
  (p8)	cmp.leu		p6, p7 = r16, r26	C 43		10  7
  (p9)	cmp.ltu		p6, p7 = r16, r26	C 44		10  7
	getf.sig	r25 = f37		C 45	13	11  7
	st8		[r20] = r16, 8		C 46		11  7
	br.cloop.dptk	.Loop			C 47		11  7
C *** MAIN LOOP END ***
	;;

C Wind-down.  The same add / compare / store chain as in the loop, with no
C further loads from up; each later stage drops the work that is no longer
C needed.  .LcjK is where the feed-in code for n = K joins in.
.Le0:
	.pred.rel "mutex", p6, p7
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
  (p6)	add		r14 = r30, r27, 1
	ldf8		f47 = [rp], 8
	xma.hu		f40 = f32, f6, f44
  (p7)	add		r14 = r30, r27
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r27
  (p7)	cmp.ltu		p8, p9 = r14, r27
	getf.sig	r26 = f38
	st8		[r20] = r14, 8
	;;
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
  (p8)	add		r16 = r31, r24, 1
	xma.hu		f41 = f33, f6, f45
  (p9)	add		r16 = r31, r24
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r24
  (p9)	cmp.ltu		p6, p7 = r16, r24
	getf.sig	r27 = f39
	st8		[r20] = r16, 8
	;;
.Lcj8:
	.pred.rel "mutex", p6, p7
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
  (p6)	add		r14 = r28, r25, 1
	xma.hu		f42 = f34, f6, f46
  (p7)	add		r14 = r28, r25
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r25
  (p7)	cmp.ltu		p8, p9 = r14, r25
	getf.sig	r24 = f36
	st8		[r20] = r14, 8
	;;
.Lcj7:
	.pred.rel "mutex", p8, p9
	getf.sig	r28 = f40
	xma.l		f39 = f35, f6, f47
  (p8)	add		r16 = r29, r26, 1
	xma.hu		f43 = f35, f6, f47
  (p9)	add		r16 = r29, r26
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r26
  (p9)	cmp.ltu		p6, p7 = r16, r26
	getf.sig	r25 = f37
	st8		[r20] = r16, 8
	;;
.Lcj6:
	.pred.rel "mutex", p6, p7
	getf.sig	r29 = f41
  (p6)	add		r14 = r30, r27, 1
  (p7)	add		r14 = r30, r27
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r27
  (p7)	cmp.ltu		p8, p9 = r14, r27
	getf.sig	r26 = f38
	st8		[r20] = r14, 8
	;;
.Lcj5:
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42
  (p8)	add		r16 = r31, r24, 1
  (p9)	add		r16 = r31, r24
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r24
  (p9)	cmp.ltu		p6, p7 = r16, r24
	getf.sig	r27 = f39
	st8		[r20] = r16, 8
	;;
C From here on r8 holds the high half of the last product.
.Lcj4:
	.pred.rel "mutex", p6, p7
	getf.sig	r8 = f43
  (p6)	add		r14 = r28, r25, 1
  (p7)	add		r14 = r28, r25
	;;
	.pred.rel "mutex", p6, p7
	st8		[r20] = r14, 8
  (p6)	cmp.leu		p8, p9 = r14, r25
  (p7)	cmp.ltu		p8, p9 = r14, r25
	;;
.Lcj3:
	.pred.rel "mutex", p8, p9
  (p8)	add		r16 = r29, r26, 1
  (p9)	add		r16 = r29, r26
	;;
	.pred.rel "mutex", p8, p9
	st8		[r20] = r16, 8
  (p8)	cmp.leu		p6, p7 = r16, r26
  (p9)	cmp.ltu		p6, p7 = r16, r26
	;;
.Lcj2:
	.pred.rel "mutex", p6, p7
  (p6)	add		r14 = r30, r27, 1
  (p7)	add		r14 = r30, r27
	;;
	.pred.rel "mutex", p6, p7
	st8		[r20] = r14
  (p6)	cmp.leu		p8, p9 = r14, r27
  (p7)	cmp.ltu		p8, p9 = r14, r27
	;;
C Fold the carry out of the last sum into the returned limb, restore ar.lc.
  (p8)	add		r8 = 1, r8		C M I
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B
EPILOGUE()
ASM_END()