dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl  store the result in a second limb vector.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer register for `up' to speed up feed-in loads.
C  * Work out final differences with addmul_1.asm.
C INPUT PARAMETERS
C   rp  r32  destination pointer, product limbs are stored through it
C   up  r33  source pointer, limbs are loaded through it with post-increment
C   n   r34  limb count, n mod 4 selects the feed-in path
C   vl  r35  the multiplier limb, moved to f6
C   cy  r36  carry-in limb, only read by mpn_mul_1c, moved to f9
C
C RESULT
C   r8 receives the high half of the last product, plus the final carry.
C
C REGISTER ROLES (as used by the code below)
C   f6       multiplier limb
C   f7       first source limb
C   f9       addend for the first product: f0 in mpn_mul_1, cy in mpn_mul_1c
C   f32-f35  source limbs loaded from up
C   f36-f39  low product halves (xma.l)
C   f40-f43  high product halves (xma.hu)
C   r16-r19  low halves moved to integer registers
C   r20-r23  high halves moved to integer registers
C   r24      sum of a low half, the previous high half and the carry; this is
C            the value stored to rp
C   p6/p7 and p8/p9  alternating carry / no-carry predicate pairs
C   r2       saved copy of ar.lc, restored before every return
C
C NOTE(review): n is presumably at least 1.  With n = 0 the count n-1 wraps
C and shr.u yields a huge loop count -- confirm against the caller contract.
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')
define(`cy', `r36')	C for mpn_mul_1c

ASM_START()
PROLOGUE(mpn_mul_1)
	.prologue
	.save	ar.lc, r2
	.body

C Under HAVE_ABI_32 the two pointers go through addp4 and n through zxt4
C before use.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
C Entry for mpn_mul_1: the addend f9 is f0, so no carry-in is added.
C r15 = n - 1, r14 = n mod 4, f7 = first source limb, r2 = saved ar.lc.
{.mfi
	adds		r15 = -1, n		C M I
	mov		f9 = f0			C F
	mov.i		r2 = ar.lc		C I0
}
{.mmi
	ldf8		f7 = [up], 8		C M
	nop.m		0			C M
	and		r14 = 3, n		C M I
	;;
}
C Shared by mpn_mul_1 and mpn_mul_1c (which branches here with f9 = cy).
C r31 = (n - 1) >> 2 becomes the loop count in ar.lc.  p10, p11, p12 select
C the feed-in path for n mod 4 = 0, 2, 3; n mod 4 = 1 falls through to .Lb01.
.Lcommon:
{.mii
	setf.sig	f6 = vl			C M2 M3
	shr.u		r31 = r15, 2		C I0
	cmp.eq		p10, p0 = 0, r14	C M I
}
{.mii
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	nop.i		0			C I
	;;
}
C r0 compared with itself for inequality: p6 and p8 (carry) start false,
C p7 and p9 (no carry) start true.
{.mii
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
	cmp.ne		p8, p9 = r0, r0		C M I
}
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C ---- n mod 4 = 1 ----
C r20 = 0 because the first low half has no previous high half to add.
.Lb01:	mov		r20 = 0
	br.cloop.dptk	.grt1			C B

C n = 1: one product, low half stored directly, high half returned in r8.
	xma.l		f39 = f7, f6, f9	C F
	xma.hu		f43 = f7, f6, f9	C F
	;;
	getf.sig	r8 = f43		C M2
	stf8		[rp] = f39		C M2 M3
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B

.grt1:
	ldf8		f32 = [up], 8
	;;
	ldf8		f33 = [up], 8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f7, f6, f9
	xma.hu		f43 = f7, f6, f9
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt5

C n = 5: the first low half is stored straight from f39, the remaining limbs
C are finished in the wind-down code starting at .Lcj5.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	stf8		[rp] = f39, 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	getf.sig	r16 = f38
	br		.Lcj5

C n > 5: prime the pipeline, then enter the main loop at .LL01.
.grt5:
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	;;
	getf.sig	r18 = f36
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	;;
	getf.sig	r19 = f37
	xma.hu		f43 = f35, f6, f0
	br		.LL01


C ---- n mod 4 = 2 ----
C r23 = 0 because the first low half has no previous high half to add.
.Lb10:	ldf8		f35 = [up], 8
	mov		r23 = 0
	br.cloop.dptk	.grt2

C n = 2: the second product takes the first high half (f42) as its addend,
C so the carry is folded in by xma and no integer add is needed.
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	;;
	stf8		[rp] = f38, 8
	xma.l		f39 = f35, f6, f42
	xma.hu		f43 = f35, f6, f42
	;;
	getf.sig	r8 = f43
	stf8		[rp] = f39
	mov.i		ar.lc = r2
	br.ret.sptk.many b0


.grt2:
	ldf8		f32 = [up], 8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt6

C n = 6: the first low half is stored straight from f38, the remaining limbs
C are finished in the wind-down code starting at .Lcj6.
	stf8		[rp] = f38, 8
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	br		.Lcj6

C n > 6: prime the pipeline, then enter the main loop at .LL10.
.grt6:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	;;
	getf.sig	r17 = f39
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	;;
	getf.sig	r18 = f36
	xma.hu		f42 = f34, f6, f0
	br		.LL10


C ---- n mod 4 = 3 ----
C r22 = 0 because the first low half has no previous high half to add.
.Lb11:	ldf8		f34 = [up], 8
	mov		r22 = 0
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3
	;;

C n = 3: all three products are formed here, the first low half is stored
C straight from f37, and the rest is finished at .Lcj3.
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	stf8		[rp] = f37, 8
	getf.sig	r16 = f38
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	getf.sig	r8 = f43
	br		.Lcj3

.grt3:
	ldf8		f32 = [up], 8
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r19 = f37
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt7

C n = 7: the first low half is stored from r19, the remaining limbs are
C finished in the wind-down code starting at .Lcj7.
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	getf.sig	r20 = f42
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
	getf.sig	r21 = f43
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r18 = f36
	st8		[rp] = r19, 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	br		.Lcj7

C n > 7: prime the pipeline, then enter the main loop at .LL11.
.grt7:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	;;
	getf.sig	r17 = f39
	xma.hu		f41 = f33, f6, f0
	br		.LL11


C ---- n mod 4 = 0 ----
C r21 = 0 because the first low half has no previous high half to add.
.Lb00:	ldf8		f33 = [up], 8
	mov		r21 = 0
	;;
	ldf8		f34 = [up], 8
	;;
	ldf8		f35 = [up], 8
	xma.l		f36 = f7, f6, f9
	xma.hu		f40 = f7, f6, f9
	br.cloop.dptk	.grt4

C n = 4: the first low half is stored straight from f36, the remaining limbs
C are finished in the wind-down code starting at .Lcj4.
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	stf8		[rp] = f36, 8
	xma.l		f39 = f35, f6, f0
	getf.sig	r19 = f37
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	getf.sig	r16 = f38
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	br		.Lcj4

.grt4:
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r18 = f36
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	;;
	getf.sig	r19 = f37
	getf.sig	r23 = f41
	xma.hu		f43 = f35, f6, f0
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt8

C n = 8: the first low half is stored from r18, the remaining limbs are
C finished in the wind-down code starting at .Lcj8.
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	getf.sig	r20 = f42
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	st8		[rp] = r18, 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	br		.Lcj8

C n > 8: prime the pipeline, then enter the main loop at .LL00.
.grt8:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	br		.LL00


C *** MAIN LOOP START ***
C Each iteration stores four result limbs.  Every stage has two halves:
C   - add a low half to the previous high half, plus 1 when the carry
C     predicate of the current pair is set, giving r24;
C   - store r24 and derive the next carry by comparing r24 with the low half:
C     cmp.leu when a carry was added, cmp.ltu otherwise.
C The predicate pairs p6/p7 and p8/p9 alternate from stage to stage.
C .LL00, .LL11, .LL10 and .LL01 are the entry points used by the feed-in code.
	ALIGN(32)
.Loop:
	.pred.rel "mutex",p6,p7
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	st8		[rp] = r24, 8
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
.LL00:
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
	nop.b		0
	ldf8		f32 = [up], 8
   (p9)	add		r24 = r18, r21
	nop.b		0
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	st8		[rp] = r24, 8
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	;;
.LL11:
	.pred.rel "mutex",p6,p7
	getf.sig	r21 = f43
   (p6)	add		r24 = r19, r22, 1
	nop.b		0
	ldf8		f33 = [up], 8
   (p7)	add		r24 = r19, r22
	nop.b		0
	;;
	.pred.rel "mutex",p6,p7
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	st8		[rp] = r24, 8
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.LL10:
	.pred.rel "mutex",p8,p9
	getf.sig	r22 = f40
   (p8)	add		r24 = r16, r23, 1
	nop.b		0
	ldf8		f34 = [up], 8
   (p9)	add		r24 = r16, r23
	nop.b		0
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	st8		[rp] = r24, 8
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.LL01:
	.pred.rel "mutex",p6,p7
	getf.sig	r23 = f41
   (p6)	add		r24 = r17, r20, 1
	nop.b		0
	ldf8		f35 = [up], 8
   (p7)	add		r24 = r17, r20
	br.cloop.dptk	.Loop
C *** MAIN LOOP END ***
	;;

C Wind-down: same add / store / compare stages as the main loop but without
C further loads.  The loop falls through into .Lcj9; the short-count feed-in
C paths enter at .Lcj8 down to .Lcj3.  The xma instructions stop after
C .Lcj7; from .Lcj6 on only already computed halves are drained.
.Lcj9:
	.pred.rel "mutex",p6,p7
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	st8		[rp] = r24, 8
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	st8		[rp] = r24, 8
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	;;
.Lcj8:
	.pred.rel "mutex",p6,p7
	getf.sig	r21 = f43
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22
	;;
	.pred.rel "mutex",p6,p7
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	st8		[rp] = r24, 8
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.Lcj7:
	.pred.rel "mutex",p8,p9
	getf.sig	r22 = f40
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	st8		[rp] = r24, 8
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.Lcj6:
	.pred.rel "mutex",p6,p7
	getf.sig	r23 = f41
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	getf.sig	r16 = f38
	st8		[rp] = r24, 8
	;;
.Lcj5:
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21
	;;
	.pred.rel "mutex",p8,p9
   (p8)	cmp.leu		p6, p7 = r24, r18
   (p9)	cmp.ltu		p6, p7 = r24, r18
	getf.sig	r17 = f39
	st8		[rp] = r24, 8
	;;
C r8 picks up the high half of the last product here; the final carry is
C added to it just before the return.
.Lcj4:
	.pred.rel "mutex",p6,p7
	getf.sig	r8 = f43
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22
	;;
	.pred.rel "mutex",p6,p7
	st8		[rp] = r24, 8
   (p6)	cmp.leu		p8, p9 = r24, r19
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.Lcj3:
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23
	;;
	.pred.rel "mutex",p8,p9
	st8		[rp] = r24, 8
   (p8)	cmp.leu		p6, p7 = r24, r16
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.Lcj2:
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
	;;
	.pred.rel "mutex",p6,p7
	st8		[rp] = r24, 8
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
C Fold the last carry into the returned high limb, restore ar.lc, return.
	.pred.rel "mutex",p8,p9
   (p8)	add		r8 = 1, r8
	mov.i		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()

C mpn_mul_1c: same set-up as the start of mpn_mul_1, except that f9 is loaded
C from cy instead of f0, so cy is added into the first product.  Control then
C joins mpn_mul_1 at .Lcommon.
PROLOGUE(mpn_mul_1c)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
{.mmi
	adds		r15 = -1, n		C M I
	setf.sig	f9 = cy			C M2 M3
	mov.i		r2 = ar.lc		C I0
}
{.mmb
	ldf8		f7 = [up], 8		C M
	and		r14 = 3, n		C M I
	br.sptk		.Lcommon
	;;
}
EPILOGUE()
ASM_END()