dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl  result to a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
C  * Work out final differences with mul_1.asm.  That function is 300 bytes
C    smaller than this due to better loop scheduling and thus simpler feed-in
C    code.
C INPUT PARAMETERS
C   rp  (r32)  limb vector that is read and updated in place
C   up  (r33)  limb vector that is multiplied by vl
C   n   (r34)  number of limbs to process
C   vl  (r35)  the single multiplier limb
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

C OVERVIEW
C
C Every limb is handled by an xma.l/xma.hu pair that forms the low and high
C 64 bits of up[i] * vl + rp[i].  The value stored to rp[i] is the low half
C of product i plus the high half of product i-1 plus the carry out of the
C previous such addition.  That carry lives in the predicate pairs p6/p7 and
C p8/p9, which alternate from limb to limb.  The high half of the last
C product, plus the final carry, is left in r8 when the function returns.
C
C REGISTER USE
C   r2        copy of ar.lc taken on entry, written back before each br.ret
C   r20       store pointer, a copy of rp taken before the loads advance rp
C   r14       n mod 4 during dispatch, afterwards a sum being stored
C   r16       a sum being stored
C   r31       loop count before it is moved to ar.lc, afterwards a high half
C   ar.lc     (n - 1) >> 2, consumed by the br.cloop instructions
C   f6        vl
C   f7, f8    first limb loaded from up and from rp
C   f32-f35   further limbs loaded from up
C   f44-f47   further limbs loaded from rp
C   f36-f39   low product halves, moved to r24-r27
C   f40-f43   high product halves, moved to r28-r31 (the last one to r8)
C
C NOTE(review): no special handling of n = 0 is visible here; presumably
C callers guarantee n >= 1 -- confirm against the mpn interface.

ASM_START()
PROLOGUE(mpn_addmul_1)
        .prologue
        .save   ar.lc, r2
        .body

ifdef(`HAVE_ABI_32',
`       addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        zxt4            n = n                   C I
        ;;
')
C Set-up: r15 = n - 1, r20 = rp, r2 = ar.lc.  Load the first limb of each
C vector, form r14 = n mod 4 and the loop count (n - 1) >> 2, and clear the
C carry state (cmp.ne r0, r0 is always false, so p6 = p8 = 0, p7 = p9 = 1).
C Then dispatch on n mod 4; the value 1 falls through to .Lb01.
{.mmi
        adds            r15 = -1, n             C M I
        mov             r20 = rp                C M I
        mov.i           r2 = ar.lc              C I0
}
{.mmi
        ldf8            f7 = [up], 8            C M
        ldf8            f8 = [rp], 8            C M
        and             r14 = 3, n              C M I
        ;;
}
{.mmi
        setf.sig        f6 = vl                 C M2 M3
        cmp.eq          p10, p0 = 0, r14        C M I
        shr.u           r31 = r15, 2            C I0
}
{.mmi
        cmp.eq          p11, p0 = 2, r14        C M I
        cmp.eq          p12, p0 = 3, r14        C M I
        nop.i           0                       C I
        ;;
}
{.mii
        cmp.ne          p6, p7 = r0, r0         C M I
        mov.i           ar.lc = r31             C I0
        cmp.ne          p8, p9 = r0, r0         C M I
}
{.bbb
  (p10) br.dptk         .Lb00                   C B
  (p11) br.dptk         .Lb10                   C B
  (p12) br.dptk         .Lb11                   C B
        ;;
}

C n mod 4 = 1.  The branch is taken for n >= 5; for n = 1 the single product
C is stored and its high half returned in r8 right here.
.Lb01:  br.cloop.dptk   .grt1                   C B

        xma.l           f39 = f7, f6, f8        C F
        xma.hu          f43 = f7, f6, f8        C F
        ;;
        getf.sig        r8 = f43                C M2
        stf8            [r20] = f39             C M2 M3
        mov.i           ar.lc = r2              C I0
        br.ret.sptk.many b0                     C B

C n mod 4 = 1, n >= 5.  Load four more limb pairs; the branch to .grt5 is
C taken for n >= 9, otherwise (n = 5) the code falls through and joins the
C wind-down at .Lcj5.
.grt1:
        ldf8            f32 = [up], 8
        ldf8            f44 = [rp], 8
        ;;
        ldf8            f33 = [up], 8
        ldf8            f45 = [rp], 8
        ;;
        ldf8            f34 = [up], 8
        xma.l           f39 = f7, f6, f8
        ldf8            f46 = [rp], 8
        xma.hu          f43 = f7, f6, f8
        ;;
        ldf8            f35 = [up], 8
        ldf8            f47 = [rp], 8
        br.cloop.dptk   .grt5

        xma.l           f36 = f32, f6, f44
        xma.hu          f40 = f32, f6, f44
        ;;
        stf8            [r20] = f39, 8
        xma.l           f37 = f33, f6, f45
        xma.hu          f41 = f33, f6, f45
        ;;
        getf.sig        r31 = f43
        getf.sig        r24 = f36
        xma.l           f38 = f34, f6, f46
        xma.hu          f42 = f34, f6, f46
        ;;
        getf.sig        r28 = f40
        getf.sig        r25 = f37
        xma.l           f39 = f35, f6, f47
        xma.hu          f43 = f35, f6, f47
        ;;
        getf.sig        r29 = f41
        getf.sig        r26 = f38
        br              .Lcj5

C n mod 4 = 1, n >= 9.  r30 = 0 stands in for the high half that does not
C exist before the first product.  Enters the loop at its top when ar.lc is
C still nonzero, otherwise goes straight to the wind-down at .Le0.
.grt5:
        mov             r30 = 0
        xma.l           f36 = f32, f6, f44
        xma.hu          f40 = f32, f6, f44
        ;;
        ldf8            f32 = [up], 8
        xma.l           f37 = f33, f6, f45
        ldf8            f44 = [rp], 8
        xma.hu          f41 = f33, f6, f45
        ;;
        ldf8            f33 = [up], 8
        getf.sig        r27 = f39
        ;;
        getf.sig        r31 = f43
        xma.l           f38 = f34, f6, f46
        ldf8            f45 = [rp], 8
        xma.hu          f42 = f34, f6, f46
        ;;
        ldf8            f34 = [up], 8
        getf.sig        r24 = f36
        ;;
        getf.sig        r28 = f40
        xma.l           f39 = f35, f6, f47
        ldf8            f46 = [rp], 8
        xma.hu          f43 = f35, f6, f47
        ;;
        ldf8            f35 = [up], 8
        getf.sig        r25 = f37
        br.cloop.dptk   .Loop
        br              .Le0


C n mod 4 = 2.  The branch is taken for n >= 6; for n = 2 both products are
C formed here, the first low half is stored and the rest is left to .Lcj2.
.Lb10:  ldf8            f35 = [up], 8
        ldf8            f47 = [rp], 8
        br.cloop.dptk   .grt2

        xma.l           f38 = f7, f6, f8
        xma.hu          f42 = f7, f6, f8
        ;;
        xma.l           f39 = f35, f6, f47
        xma.hu          f43 = f35, f6, f47
        ;;
        getf.sig        r30 = f42
        stf8            [r20] = f38, 8
        getf.sig        r27 = f39
        getf.sig        r8 = f43
        br              .Lcj2

C n mod 4 = 2, n >= 6.  The branch to .grt6 is taken for n >= 10, otherwise
C (n = 6) the code falls through and joins the wind-down at .Lcj6.
.grt2:
        ldf8            f32 = [up], 8
        ldf8            f44 = [rp], 8
        ;;
        ldf8            f33 = [up], 8
        xma.l           f38 = f7, f6, f8
        ldf8            f45 = [rp], 8
        xma.hu          f42 = f7, f6, f8
        ;;
        ldf8            f34 = [up], 8
        xma.l           f39 = f35, f6, f47
        ldf8            f46 = [rp], 8
        xma.hu          f43 = f35, f6, f47
        ;;
        ldf8            f35 = [up], 8
        ldf8            f47 = [rp], 8
        br.cloop.dptk   .grt6

        stf8            [r20] = f38, 8
        xma.l           f36 = f32, f6, f44
        xma.hu          f40 = f32, f6, f44
        ;;
        getf.sig        r30 = f42
        getf.sig        r27 = f39
        xma.l           f37 = f33, f6, f45
        xma.hu          f41 = f33, f6, f45
        ;;
        getf.sig        r31 = f43
        getf.sig        r24 = f36
        xma.l           f38 = f34, f6, f46
        xma.hu          f42 = f34, f6, f46
        ;;
        getf.sig        r28 = f40
        getf.sig        r25 = f37
        xma.l           f39 = f35, f6, f47
        xma.hu          f43 = f35, f6, f47
        br              .Lcj6

C n mod 4 = 2, n >= 10.  r29 = 0 stands in for the missing high half before
C the first product.  Enters the loop at .LL10.
.grt6:
        mov             r29 = 0
        xma.l           f36 = f32, f6, f44
        xma.hu          f40 = f32, f6, f44
        ;;
        ldf8            f32 = [up], 8
        getf.sig        r26 = f38
        ;;
        getf.sig        r30 = f42
        xma.l           f37 = f33, f6, f45
        ldf8            f44 = [rp], 8
        xma.hu          f41 = f33, f6, f45
        ;;
        ldf8            f33 = [up], 8
        getf.sig        r27 = f39
        ;;
        getf.sig        r31 = f43
        xma.l           f38 = f34, f6, f46
        ldf8            f45 = [rp], 8
        xma.hu          f42 = f34, f6, f46
        ;;
        ldf8            f34 = [up], 8
        getf.sig        r24 = f36
        br              .LL10


C n mod 4 = 3.  The branch is taken for n >= 7; for n = 3 all three products
C are formed here, the first low half is stored and the rest is left to
C .Lcj3.
.Lb11:  ldf8            f34 = [up], 8
        ldf8            f46 = [rp], 8
        ;;
        ldf8            f35 = [up], 8
        ldf8            f47 = [rp], 8
        br.cloop.dptk   .grt3
        ;;

        xma.l           f37 = f7, f6, f8
        xma.hu          f41 = f7, f6, f8
        xma.l           f38 = f34, f6, f46
        xma.hu          f42 = f34, f6, f46
        xma.l           f39 = f35, f6, f47
        xma.hu          f43 = f35, f6, f47
        ;;
        getf.sig        r29 = f41
        stf8            [r20] = f37, 8
        getf.sig        r26 = f38
        getf.sig        r30 = f42
        getf.sig        r27 = f39
        getf.sig        r8 = f43
        br              .Lcj3

C n mod 4 = 3, n >= 7.  The branch to .grt7 is taken for n >= 11, otherwise
C (n = 7) the code falls through and joins the wind-down at .Lcj7.
.grt3:
        ldf8            f32 = [up], 8
        xma.l           f37 = f7, f6, f8
        ldf8            f44 = [rp], 8
        xma.hu          f41 = f7, f6, f8
        ;;
        ldf8            f33 = [up], 8
        xma.l           f38 = f34, f6, f46
        ldf8            f45 = [rp], 8
        xma.hu          f42 = f34, f6, f46
        ;;
        ldf8            f34 = [up], 8
        xma.l           f39 = f35, f6, f47
        ldf8            f46 = [rp], 8
        xma.hu          f43 = f35, f6, f47
        ;;
        ldf8            f35 = [up], 8
        getf.sig        r25 = f37               C FIXME
        ldf8            f47 = [rp], 8
        br.cloop.dptk   .grt7

        getf.sig        r29 = f41
        stf8            [r20] = f37, 8          C FIXME
        xma.l           f36 = f32, f6, f44
        getf.sig        r26 = f38
        xma.hu          f40 = f32, f6, f44
        ;;
        getf.sig        r30 = f42
        xma.l           f37 = f33, f6, f45
        getf.sig        r27 = f39
        xma.hu          f41 = f33, f6, f45
        ;;
        getf.sig        r31 = f43
        xma.l           f38 = f34, f6, f46
        getf.sig        r24 = f36
        xma.hu          f42 = f34, f6, f46
        br              .Lcj7

C n mod 4 = 3, n >= 11.  r28 = 0 stands in for the missing high half before
C the first product, whose low half is already in r25.  Enters the loop at
C .LL11.
.grt7:
        getf.sig        r29 = f41
        xma.l           f36 = f32, f6, f44
        mov             r28 = 0
        xma.hu          f40 = f32, f6, f44
        ;;
        ldf8            f32 = [up], 8
        getf.sig        r26 = f38
        ;;
        getf.sig        r30 = f42
        xma.l           f37 = f33, f6, f45
        ldf8            f44 = [rp], 8
        xma.hu          f41 = f33, f6, f45
        ;;
        ldf8            f33 = [up], 8
        getf.sig        r27 = f39
        br              .LL11


C n mod 4 = 0.  The branch is taken for n >= 8; for n = 4 all four products
C are formed here, the first low half is stored and the rest is left to
C .Lcj4.
.Lb00:  ldf8            f33 = [up], 8
        ldf8            f45 = [rp], 8
        ;;
        ldf8            f34 = [up], 8
        ldf8            f46 = [rp], 8
        ;;
        ldf8            f35 = [up], 8
        xma.l           f36 = f7, f6, f8
        ldf8            f47 = [rp], 8
        xma.hu          f40 = f7, f6, f8
        br.cloop.dptk   .grt4

        xma.l           f37 = f33, f6, f45
        xma.hu          f41 = f33, f6, f45
        xma.l           f38 = f34, f6, f46
        xma.hu          f42 = f34, f6, f46
        ;;
        getf.sig        r28 = f40
        stf8            [r20] = f36, 8
        xma.l           f39 = f35, f6, f47
        getf.sig        r25 = f37
        xma.hu          f43 = f35, f6, f47
        ;;
        getf.sig        r29 = f41
        getf.sig        r26 = f38
        getf.sig        r30 = f42
        getf.sig        r27 = f39
        br              .Lcj4

C n mod 4 = 0, n >= 8.  The branch to .grt8 is taken for n >= 12, otherwise
C (n = 8) the code falls through and joins the wind-down at .Lcj8.
.grt4:
        ldf8            f32 = [up], 8
        xma.l           f37 = f33, f6, f45
        ldf8            f44 = [rp], 8
        xma.hu          f41 = f33, f6, f45
        ;;
        ldf8            f33 = [up], 8
        xma.l           f38 = f34, f6, f46
        ldf8            f45 = [rp], 8
        xma.hu          f42 = f34, f6, f46
        ;;
        ldf8            f34 = [up], 8
        getf.sig        r24 = f36               C FIXME
        xma.l           f39 = f35, f6, f47
        ldf8            f46 = [rp], 8
        getf.sig        r28 = f40
        xma.hu          f43 = f35, f6, f47
        ;;
        ldf8            f35 = [up], 8
        getf.sig        r25 = f37
        ldf8            f47 = [rp], 8
        br.cloop.dptk   .grt8

        getf.sig        r29 = f41
        stf8            [r20] = f36, 8          C FIXME
        xma.l           f36 = f32, f6, f44
        getf.sig        r26 = f38
        getf.sig        r30 = f42
        xma.hu          f40 = f32, f6, f44
        ;;
        xma.l           f37 = f33, f6, f45
        getf.sig        r27 = f39
        xma.hu          f41 = f33, f6, f45
        br              .Lcj8

C n mod 4 = 0, n >= 12.  r31 = 0 stands in for the missing high half before
C the first product, whose low half is already in r24.  Enters the loop at
C .LL00.
.grt8:
        getf.sig        r29 = f41
        xma.l           f36 = f32, f6, f44
        mov             r31 = 0
        xma.hu          f40 = f32, f6, f44
        ;;
        ldf8            f32 = [up], 8
        getf.sig        r26 = f38
        br              .LL00


C *** MAIN LOOP START ***
C One iteration handles four limbs.  Each limb takes two instruction groups:
C   group 1: sum = previous high half + current low half, plus 1 when the
C            incoming carry predicate (p6 or p8) is set; the multiply for a
C            later limb is started alongside
C   group 2: store the sum and derive the outgoing carry by comparing the
C            sum with the low half (cmp.leu when a carry came in, cmp.ltu
C            when none did)
C .LL00, .LL11 and .LL10 are the entry points used by .grt8, .grt7 and .grt6.
C The trailing columns give the instruction number, the number(s) of the
C instruction(s) producing its inputs, and an issue cycle for two machines
C labelled i1 and i2 (presumably Itanium and Itanium 2 -- confirm).
        ALIGN(32)                               C insn fed    cycle #
.Loop:
        .pred.rel "mutex", p6, p7               C num  by     i1  i2
        getf.sig        r29 = f41               C 00   16      0   0
        xma.l           f36 = f32, f6, f44      C 01   06,15   0   0
  (p6)  add             r14 = r30, r27, 1       C 02           0   0
        ldf8            f47 = [rp], 8           C 03           0   0
        xma.hu          f40 = f32, f6, f44      C 04   06,15   0   0
  (p7)  add             r14 = r30, r27          C 05           0   0
        ;;
        .pred.rel "mutex", p6, p7
        ldf8            f32 = [up], 8           C 06           1   1
  (p6)  cmp.leu         p8, p9 = r14, r27       C 07           1   1
  (p7)  cmp.ltu         p8, p9 = r14, r27       C 08           1   1
        getf.sig        r26 = f38               C 09   25      2   1
        st8             [r20] = r14, 8          C 10           2   1
        nop.b           0                       C 11           2   1
        ;;
.LL00:
        .pred.rel "mutex", p8, p9
        getf.sig        r30 = f42               C 12   28      3   2
        xma.l           f37 = f33, f6, f45      C 13   18,27   3   2
  (p8)  add             r16 = r31, r24, 1       C 14           3   2
        ldf8            f44 = [rp], 8           C 15           3   2
        xma.hu          f41 = f33, f6, f45      C 16   18,27   3   2
  (p9)  add             r16 = r31, r24          C 17           3   2
        ;;
        .pred.rel "mutex", p8, p9
        ldf8            f33 = [up], 8           C 18           4   3
  (p8)  cmp.leu         p6, p7 = r16, r24       C 19           4   3
  (p9)  cmp.ltu         p6, p7 = r16, r24       C 20           4   3
        getf.sig        r27 = f39               C 21   37      5   3
        st8             [r20] = r16, 8          C 22           5   3
        nop.b           0                       C 23           5   3
        ;;
.LL11:
        .pred.rel "mutex", p6, p7
        getf.sig        r31 = f43               C 24   40      6   4
        xma.l           f38 = f34, f6, f46      C 25   30,39   6   4
  (p6)  add             r14 = r28, r25, 1       C 26           6   4
        ldf8            f45 = [rp], 8           C 27           6   4
        xma.hu          f42 = f34, f6, f46      C 28   30,39   6   4
  (p7)  add             r14 = r28, r25          C 29           6   4
        ;;
        .pred.rel "mutex", p6, p7
        ldf8            f34 = [up], 8           C 30           7   5
  (p6)  cmp.leu         p8, p9 = r14, r25       C 31           7   5
  (p7)  cmp.ltu         p8, p9 = r14, r25       C 32           7   5
        getf.sig        r24 = f36               C 33   01      8   5
        st8             [r20] = r14, 8          C 34           8   5
        nop.b           0                       C 35           8   5
        ;;
.LL10:
        .pred.rel "mutex", p8, p9
        getf.sig        r28 = f40               C 36   04      9   6
        xma.l           f39 = f35, f6, f47      C 37   42,03   9   6
  (p8)  add             r16 = r29, r26, 1       C 38           9   6
        ldf8            f46 = [rp], 8           C 39           9   6
        xma.hu          f43 = f35, f6, f47      C 40   42,03   9   6
  (p9)  add             r16 = r29, r26          C 41           9   6
        ;;
        .pred.rel "mutex", p8, p9
        ldf8            f35 = [up], 8           C 42          10   7
  (p8)  cmp.leu         p6, p7 = r16, r26       C 43          10   7
  (p9)  cmp.ltu         p6, p7 = r16, r26       C 44          10   7
        getf.sig        r25 = f37               C 45   13     11   7
        st8             [r20] = r16, 8          C 46          11   7
        br.cloop.dptk   .Loop                   C 47          11   7
C *** MAIN LOOP END ***
        ;;
C Wind-down.  Same add/compare/store chain as the loop, but nothing more is
C loaded from up; the multiplies and moves thin out as the last limbs drain.
C Each .LcjN label is where the feed-in path for n = N joins.
.Le0:
        .pred.rel "mutex", p6, p7
        getf.sig        r29 = f41               C
        xma.l           f36 = f32, f6, f44      C
  (p6)  add             r14 = r30, r27, 1       C
        ldf8            f47 = [rp], 8           C
        xma.hu          f40 = f32, f6, f44      C
  (p7)  add             r14 = r30, r27          C
        ;;
        .pred.rel "mutex", p6, p7
  (p6)  cmp.leu         p8, p9 = r14, r27       C
  (p7)  cmp.ltu         p8, p9 = r14, r27       C
        getf.sig        r26 = f38               C
        st8             [r20] = r14, 8          C
        ;;
        .pred.rel "mutex", p8, p9
        getf.sig        r30 = f42               C
        xma.l           f37 = f33, f6, f45      C
  (p8)  add             r16 = r31, r24, 1       C
        xma.hu          f41 = f33, f6, f45      C
  (p9)  add             r16 = r31, r24          C
        ;;
        .pred.rel "mutex", p8, p9
  (p8)  cmp.leu         p6, p7 = r16, r24       C
  (p9)  cmp.ltu         p6, p7 = r16, r24       C
        getf.sig        r27 = f39               C
        st8             [r20] = r16, 8          C
        ;;
.Lcj8:
        .pred.rel "mutex", p6, p7
        getf.sig        r31 = f43               C
        xma.l           f38 = f34, f6, f46      C
  (p6)  add             r14 = r28, r25, 1       C
        xma.hu          f42 = f34, f6, f46      C
  (p7)  add             r14 = r28, r25          C
        ;;
        .pred.rel "mutex", p6, p7
  (p6)  cmp.leu         p8, p9 = r14, r25       C
  (p7)  cmp.ltu         p8, p9 = r14, r25       C
        getf.sig        r24 = f36               C
        st8             [r20] = r14, 8          C
        ;;
.Lcj7:
        .pred.rel "mutex", p8, p9
        getf.sig        r28 = f40               C
        xma.l           f39 = f35, f6, f47      C
  (p8)  add             r16 = r29, r26, 1       C
        xma.hu          f43 = f35, f6, f47      C
  (p9)  add             r16 = r29, r26          C
        ;;
        .pred.rel "mutex", p8, p9
  (p8)  cmp.leu         p6, p7 = r16, r26       C
  (p9)  cmp.ltu         p6, p7 = r16, r26       C
        getf.sig        r25 = f37               C
        st8             [r20] = r16, 8          C
        ;;
.Lcj6:
        .pred.rel "mutex", p6, p7
        getf.sig        r29 = f41               C
  (p6)  add             r14 = r30, r27, 1       C
  (p7)  add             r14 = r30, r27          C
        ;;
        .pred.rel "mutex", p6, p7
  (p6)  cmp.leu         p8, p9 = r14, r27       C
  (p7)  cmp.ltu         p8, p9 = r14, r27       C
        getf.sig        r26 = f38               C
        st8             [r20] = r14, 8          C
        ;;
.Lcj5:
        .pred.rel "mutex", p8, p9
        getf.sig        r30 = f42               C
  (p8)  add             r16 = r31, r24, 1       C
  (p9)  add             r16 = r31, r24          C
        ;;
        .pred.rel "mutex", p8, p9
  (p8)  cmp.leu         p6, p7 = r16, r24       C
  (p9)  cmp.ltu         p6, p7 = r16, r24       C
        getf.sig        r27 = f39               C
        st8             [r20] = r16, 8          C
        ;;
C From here on r8 holds the high half of the last product.
.Lcj4:
        .pred.rel "mutex", p6, p7
        getf.sig        r8 = f43                C
  (p6)  add             r14 = r28, r25, 1       C
  (p7)  add             r14 = r28, r25          C
        ;;
        .pred.rel "mutex", p6, p7
        st8             [r20] = r14, 8          C
  (p6)  cmp.leu         p8, p9 = r14, r25       C
  (p7)  cmp.ltu         p8, p9 = r14, r25       C
        ;;
.Lcj3:
        .pred.rel "mutex", p8, p9
  (p8)  add             r16 = r29, r26, 1       C
  (p9)  add             r16 = r29, r26          C
        ;;
        .pred.rel "mutex", p8, p9
        st8             [r20] = r16, 8          C
  (p8)  cmp.leu         p6, p7 = r16, r26       C
  (p9)  cmp.ltu         p6, p7 = r16, r26       C
        ;;
.Lcj2:
        .pred.rel "mutex", p6, p7
  (p6)  add             r14 = r30, r27, 1       C
  (p7)  add             r14 = r30, r27          C
        ;;
        .pred.rel "mutex", p6, p7
        st8             [r20] = r14             C
  (p6)  cmp.leu         p8, p9 = r14, r27       C
  (p7)  cmp.ltu         p8, p9 = r14, r27       C
        ;;
C Fold the carry out of the last stored limb into r8, put the saved ar.lc
C back and return.
  (p8)  add             r8 = 1, r8              C M I
        mov.i           ar.lc = r2              C I0
        br.ret.sptk.many b0                     C B
EPILOGUE()
ASM_END()