1dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the 2dnl result from a second limb vector. 3 4dnl Contributed to the GNU project by Torbjorn Granlund. 5 6dnl Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc. 7 8dnl This file is part of the GNU MP Library. 9 10dnl The GNU MP Library is free software; you can redistribute it and/or modify 11dnl it under the terms of the GNU Lesser General Public License as published 12dnl by the Free Software Foundation; either version 3 of the License, or (at 13dnl your option) any later version. 14 15dnl The GNU MP Library is distributed in the hope that it will be useful, but 16dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 17dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 18dnl License for more details. 19 20dnl You should have received a copy of the GNU Lesser General Public License 21dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 22 23include(`../config.m4') 24 25C cycles/limb 26C Itanium: 4.0 27C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l) 28 29C TODO 30C * Optimize feed-in and wind-down code, both for speed and code size. 31C * Handle low limb input and results specially, using a common stf8 in the 32C epilogue. 33C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in 34C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and 35C save a cycle. 
C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

C mpn_submul_1 (rp, up, n, vl)
C
C For each of the n limbs, form rp[i] - up[i] * vl, store the low limb of the
C result back through a second rp pointer, and propagate a carry limb in r8.
C r8 is what is live at br.ret, presumably as the function return value
C (TODO confirm against the IA-64 calling convention).
C
C Method
C   vl is negated up front and moved to f6.  Each limb is then computed with
C   the fused multiply-add pair
C	xma.l   lo = up[i] * f6 + rp[i]		low 64 bits
C	xma.hu  hi = up[i] * f6 + rp[i]		high 64 bits, unsigned
C   For nonzero vl the 64-bit pattern in f6 is 2^64 - vl, so the full value is
C   2^64 * up[i] + (rp[i] - up[i] * vl).  Hence lo is the low limb of the
C   difference and up[i] - hi is the limb borrowed from the next position.
C   That is why an integer copy of up[i] is loaded as well (the hi sub below).
C   The identity does not hold for vl == 0, which is branched to .Ldone
C   before anything is stored.
C
C Register roles
C   r2		saved ar.lc, restored before both returns
C   r8		running carry limb (cylimb), zero on entry
C   r9		second pointer into up, for the integer ld8 copies
C   r10		second pointer into rp, for the st8 of results
C   r14		n mod 4 during dispatch, afterwards the limb about to be stored
C   r19		(n - 1) >> 2, written to ar.lc as the loop count
C   r20-r23	integer copies of up limbs
C   r24-r27	low product halves, fetched from f36-f39
C   r28-r31	high product halves, fetched from f40-f43
C   f6		negated vl
C   f7, f8	first up limb, first rp limb
C   f32-f35	up limbs, f44-f47 rp limbs (floating point side loads)
C
C Structure
C   The main loop is unrolled four ways.  The code dispatches on n mod 4 to
C   one of .Lb01, .Lb10, .Lb11, .Lb00.  Each feed-in uses br.cloop to test
C   whether further groups of four limbs remain (.grtK labels) and then either
C   jumps straight into the wind-down chain or enters the loop at the
C   matching point (.Loop, .LL00, .LL11, .LL10).  Entering the wind-down at
C   .LcjK leaves exactly K limbs to be stored before the return.

ASM_START()
PROLOGUE(mpn_submul_1)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
C Set up the integer side pointers and negate the multiplier.
{.mmi
	mov		r10 = rp		C M I
	mov		r9 = up			C M I
	sub		vl = r0, vl		C M I	negate vl
}
C Start loading limb 0 of both operands, post-incrementing rp and up.
{.mmi
	ldf8		f8 = [rp], 8		C M
	ldf8		f7 = [up], 8		C M
	add		r19 = -1, n		C M I	n - 1
	;;
}
{.mmi
	cmp.eq		p6, p0 = 0, vl		C M I
	mov		r8 = 0			C M I	zero cylimb
	mov		r2 = ar.lc		C I0
}
{.mmi
	setf.sig	f6 = vl			C M2 M3
	and		r14 = 3, n		C M I
	shr.u		r19 = r19, 2		C I0
	;;
}
C p6 is set when the negated vl is zero, which happens only for vl == 0.
{.mmb
	nop		0
	cmp.eq		p10, p0 = 0, r14	C M I
   (p6)	br.spnt		.Ldone			C B	vl == 0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	mov		ar.lc = r19		C I0
}
C Dispatch on n mod 4: 0, 2 and 3 branch away, 1 falls through to .Lb01.
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C n mod 4 == 1.  The fall-through path handles a single limb.
.Lb01:	br.cloop.dptk	.grt1

	xma.l		f39 = f7, f6, f8
	xma.hu		f43 = f7, f6, f8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj1

C n mod 4 == 1 with more groups: load four more limb pairs, start products.
.grt1:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f34 = [up], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f32, f6, f44
	ldf8		f35 = [up], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt5
	;;

C No further group: finish the five pending limbs via the wind-down chain.
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	br		.Lcj5

C Fill the pipeline completely, then enter the loop at its top if ar.lc
C allows another iteration, otherwise go to the wind-down at .Lend.
.grt5:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.Loop
	br		.Lend


C n mod 4 == 2.  The fall-through path handles exactly two limbs.
.Lb10:	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt2

	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj2

C n mod 4 == 2 with more groups.
.grt2:	ldf8		f44 = [rp], 8
	ldf8		f32 = [up], 8
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
C f47 and f35 are reloaded here, after the products above have consumed them.
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt6

C No further group: six limbs pending, finish via .Lcj6.
	getf.sig	r30 = f42			C hi
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C Fill the pipeline up to the state expected at the loop entry .LL10.
.grt6:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	br		.LL10


C n mod 4 == 3.  The fall-through path handles exactly three limbs.
.Lb11:	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3

	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	;;
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	ld8		r22 = [r9], 8
	;;
	getf.sig	r26 = f38			C lo
	getf.sig	r30 = f42			C hi
	ld8		r23 = [r9], 8
	;;
	getf.sig	r27 = f39			C lo
	getf.sig	r31 = f43			C hi
	ld8		r20 = [r9], 8
	br		.Lcj3

C n mod 4 == 3 with more groups.
.grt3:	ldf8		f44 = [rp], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f32 = [up], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f45 = [rp], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f33 = [up], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	getf.sig	r25 = f37			C lo
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt7
	;;

C No further group: seven limbs pending, finish via .Lcj7.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43			C hi
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C Fill the pipeline up to the state expected at the loop entry .LL11.
.grt7:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	br		.LL11


C n mod 4 == 0.  The fall-through path handles exactly four limbs.
.Lb00:	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	;;
	ldf8		f46 = [rp], 8
	ldf8		f34 = [up], 8
	;;
	ldf8		f47 = [rp], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f35 = [up], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r24 = f36			C lo
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40			C hi
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41			C hi
	getf.sig	r26 = f38			C lo
	ld8		r23 = [r9], 8
	;;
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	ld8		r20 = [r9], 8
	br		.Lcj4

C n mod 4 == 0 with more groups.
.grt4:	ldf8		f44 = [rp], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f32 = [up], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f45 = [rp], 8
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36			C lo
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37			C lo
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f47 = [rp], 8
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	;;
	getf.sig	r26 = f38			C lo
	xma.l		f36 = f32, f6, f44
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
	br.cloop.dptk	.grt8
	;;

C No further group: eight limbs pending, finish via .Lcj8.
	getf.sig	r30 = f42			C hi
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C Fill the pipeline up to the state expected at the loop entry .LL00.
.grt8:	ldf8		f44 = [rp], 8
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	;;
	getf.sig	r27 = f39			C lo
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
	br		.LL00

C Main loop, four limbs per iteration.  Each quarter does the same work on a
C rotated register set:
C   - compare lo against the incoming carry limb and subtract it (p6 = borrow)
C   - form the next carry limb as up limb minus hi
C   - store the result limb and add the borrow from p6 into the carry limb
C   - fetch the next lo/hi halves, load the next limbs, start the next product
C .LL00, .LL11 and .LL10 are the mid-loop entry points used by the feed-in code.
	ALIGN(32)
.Loop:
{.mmi
	ldf8		f44 = [rp], 8
	cmp.ltu		p6, p0 = r27, r8	C lo cmp
	sub		r14 = r27, r8		C lo sub
}
{.mmi
	getf.sig	r30 = f42			C hi
	ldf8		f32 = [up], 8
	sub		r8 = r20, r31		C hi sub
	;;				C 01
}
{.mmf
	getf.sig	r27 = f39			C lo
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
}
{.mfi
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;				C 02
}
{.mmi
.LL00:	ldf8		f45 = [rp], 8
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
}
{.mmi
	getf.sig	r31 = f43			C hi
	ldf8		f33 = [up], 8
	sub		r8 = r21, r28
	;;				C 03
}
{.mmf
	getf.sig	r24 = f36			C lo
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
}
{.mfi
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;				C 04
}
{.mmi
.LL11:	ldf8		f46 = [rp], 8
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
}
{.mmi
	getf.sig	r28 = f40			C hi
	ldf8		f34 = [up], 8
	sub		r8 = r22, r29
	;;				C 05
}
{.mmf
	getf.sig	r25 = f37			C lo
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
}
{.mfi
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;				C 06
}
{.mmi
.LL10:	ldf8		f47 = [rp], 8
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
}
{.mmi
	getf.sig	r29 = f41			C hi
	ldf8		f35 = [up], 8
	sub		r8 = r23, r30
	;;				C 07
}
{.mmf
	getf.sig	r26 = f38			C lo
	st8		[r10] = r14, 8
	xma.l		f36 = f32, f6, f44
}
{.mfi
	ld8		r23 = [r9], 8
	xma.hu		f40 = f32, f6, f44
   (p6)	add		r8 = 1, r8
}
	br.cloop.dptk	.Loop
	;;

C Wind-down.  Same per-limb steps as the loop but with no further ldf8 loads.
C The xma and ld8 instructions drop out stage by stage as the pipeline drains.
.Lend:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	xma.l		f37 = f33, f6, f45
	ld8		r20 = [r9], 8
	xma.hu		f41 = f33, f6, f45
   (p6)	add		r8 = 1, r8
	;;
.Lcj8:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	getf.sig	r24 = f36
	st8		[r10] = r14, 8
	xma.l		f38 = f34, f6, f46
	ld8		r21 = [r9], 8
	xma.hu		f42 = f34, f6, f46
   (p6)	add		r8 = 1, r8
	;;
.Lcj7:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	getf.sig	r28 = f40
	sub		r8 = r22, r29
	;;
	getf.sig	r25 = f37
	st8		[r10] = r14, 8
	xma.l		f39 = f35, f6, f47
	ld8		r22 = [r9], 8
	xma.hu		f43 = f35, f6, f47
   (p6)	add		r8 = 1, r8
	;;
.Lcj6:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	getf.sig	r29 = f41
	sub		r8 = r23, r30
	;;
	getf.sig	r26 = f38
	st8		[r10] = r14, 8
	ld8		r23 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj5:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	getf.sig	r30 = f42
	sub		r8 = r20, r31
	;;
	getf.sig	r27 = f39
	st8		[r10] = r14, 8
	ld8		r20 = [r9], 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj4:
	cmp.ltu		p6, p0 = r24, r8
	sub		r14 = r24, r8
	getf.sig	r31 = f43
	sub		r8 = r21, r28
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj3:
	cmp.ltu		p6, p0 = r25, r8
	sub		r14 = r25, r8
	sub		r8 = r22, r29
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
.Lcj2:
	cmp.ltu		p6, p0 = r26, r8
	sub		r14 = r26, r8
	sub		r8 = r23, r30
	;;
	st8		[r10] = r14, 8
   (p6)	add		r8 = 1, r8
	;;
C Last limb: store it, restore the saved ar.lc, fold in the final borrow.
.Lcj1:
	cmp.ltu		p6, p0 = r27, r8
	sub		r14 = r27, r8
	sub		r8 = r20, r31
	;;
	st8		[r10] = r14, 8
	mov		ar.lc = r2
   (p6)	add		r8 = 1, r8
	br.ret.sptk.many b0
C vl == 0 exit: nothing has been stored and r8 is still zero.
.Ldone:	mov		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()