dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl  store the result in a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer register for `up' to speed up feed-in loads.
C  * Work out final differences with addmul_1.asm.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')
define(`cy', `r36')	C for mpn_mul_1c

C mpn_mul_1 -- multiply the n limbs at up by the single limb vl and store the
C low limb of each step at rp.  The most significant limb, plus any final
C carry, is left in r8 when the function returns.
C
C Register roles as used below:
C   f6        multiplier vl moved to the FP side
C   f7        first source limb
C   f9        addend for the first product (f0 here, cy in mpn_mul_1c)
C   f32-f35   source limbs in flight
C   f36-f39   low product halves (xma.l)
C   f40-f43   high product halves (xma.hu)
C   r16-r19   low halves moved to integer registers
C   r20-r23   high halves moved to integer registers
C   r24       sum currently being stored
C   p6-p9     carry predicates; each pair is kept mutually exclusive
C   r2        saved copy of ar.lc, restored before every return

ASM_START()
PROLOGUE(mpn_mul_1)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
C r15 = n - 1, f9 = addend for the first product, r2 = saved ar.lc.
{.mfi
	adds		r15 = -1, n		C M I
	mov		f9 = f0			C F
	mov.i		r2 = ar.lc		C I0
}
C Load the first source limb and compute r14 = n mod 4, which selects the
C feed-in path below.
{.mmi
	ldf8		f7 = [up], 8		C M
	nop.m		0			C M
	and		r14 = 3, n		C M I
	;;
}
C mpn_mul_1c branches here after loading f9 from cy.
.Lcommon:
C r31 = (n - 1) / 4 becomes the loop count; p10 is set when n mod 4 is 0.
{.mii
	setf.sig	f6 = vl			C M2 M3
	shr.u		r31 = r15, 2		C I0
	cmp.eq		p10, p0 = 0, r14	C M I
}
C p11 is set when n mod 4 is 2, p12 when it is 3.
{.mii
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	nop.i		0			C I
	;;
}
C Comparing r0 with itself for inequality is always false, so this clears
C p6 and p8 and sets p7 and p9: no carry is pending on entry.
{.mii
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
	cmp.ne		p8, p9 = r0, r0		C M I
}
C Dispatch on n mod 4; the remaining case, n mod 4 equal to 1, falls through.
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n mod 4 = 1.  r20 = 0 is the high-limb addend used for the
C first sum at .LL01.
.Lb01:	mov		r20 = 0
	br.cloop.dptk	.grt1			C B

C Loop count was zero, so n = 1: one product, store it and return.
	xma.l		f39 = f7, f6, f9	C F
	xma.hu		f43 = f7, f6, f9	C F
	;;
	getf.sig	r8 = f43		C M2
	stf8		[rp] = f39		C M2 M3
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B

.grt1:
	ldf8		f32 = [up], 8
	;;
	ldf8		f33 = [up], 8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f7, f6, f9
	xma.hu		f43 = f7, f6, f9
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt5

C No further loop iterations: finish the products and enter the wind-down
C chain at .Lcj5.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	stf8		[rp] = f39, 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	getf.sig	r16 = f38
	br		.Lcj5

C More iterations remain: prime the pipeline and join the main loop at .LL01.
.grt5:
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	;;
	getf.sig	r18 = f36
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	;;
	getf.sig	r19 = f37
	xma.hu		f43 = f35, f6, f0
	br		.LL01


C Feed-in for n mod 4 = 2.  r23 = 0 is the high-limb addend used for the
C first sum at .LL10.
.Lb10:	ldf8		f35 = [up], 8
	mov		r23 = 0
	br.cloop.dptk	.grt2

C Loop count was zero, so n = 2: the high half of the first product is fed
C directly into the second product as its addend.
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	;;
	stf8		[rp] = f38, 8
	xma.l		f39 = f35, f6, f42
	xma.hu		f43 = f35, f6, f42
	;;
	getf.sig	r8 = f43
	stf8		[rp] = f39
	mov.i		ar.lc = r2
	br.ret.sptk.many b0


.grt2:
	ldf8		f32 = [up], 8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt6

C No further loop iterations: enter the wind-down chain at .Lcj6.
	stf8		[rp] = f38, 8
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	br		.Lcj6

C More iterations remain: join the main loop at .LL10.
.grt6:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	;;
	getf.sig	r17 = f39
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	;;
	getf.sig	r18 = f36
	xma.hu		f42 = f34, f6, f0
	br		.LL10


C Feed-in for n mod 4 = 3.  r22 = 0 is the high-limb addend used for the
C first sum at .LL11.
.Lb11:	ldf8		f34 = [up], 8
	mov		r22 = 0
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3
	;;

C Loop count was zero, so n = 3: compute all three products, store the first
C low limb and let the tail of the wind-down chain (.Lcj3) add up the rest.
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	stf8		[rp] = f37, 8
	getf.sig	r16 = f38
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	getf.sig	r8 = f43
	br		.Lcj3

.grt3:
	ldf8		f32 = [up], 8
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r19 = f37
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt7

C No further loop iterations: store the first low limb from r19 and enter
C the wind-down chain at .Lcj7.
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	getf.sig	r20 = f42
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
	getf.sig	r21 = f43
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r18 = f36
	st8		[rp] = r19, 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	br		.Lcj7

C More iterations remain: join the main loop at .LL11.
.grt7:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	;;
	getf.sig	r17 = f39
	xma.hu		f41 = f33, f6, f0
	br		.LL11


C Feed-in for n mod 4 = 0.  r21 = 0 is the high-limb addend used for the
C first sum at .LL00.
.Lb00:	ldf8		f33 = [up], 8
	mov		r21 = 0
	;;
	ldf8		f34 = [up], 8
	;;
	ldf8		f35 = [up], 8
	xma.l		f36 = f7, f6, f9
	xma.hu		f40 = f7, f6, f9
	br.cloop.dptk	.grt4

C Loop count was zero, so n = 4: finish the products and enter the wind-down
C chain at .Lcj4.
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	stf8		[rp] = f36, 8
	xma.l		f39 = f35, f6, f0
	getf.sig	r19 = f37
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	getf.sig	r16 = f38
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	br		.Lcj4

.grt4:
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r18 = f36
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	;;
	getf.sig	r19 = f37
	getf.sig	r23 = f41
	xma.hu		f43 = f35, f6, f0
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt8

C No further loop iterations: store the first low limb from r18 and enter
C the wind-down chain at .Lcj8.
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	getf.sig	r20 = f42
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	st8		[rp] = r18, 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	br		.Lcj8

C More iterations remain: join the main loop at .LL00.
.grt8:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	br		.LL00


C *** MAIN LOOP START ***
C One pass handles four limbs.  Each step stores the previous sum from r24,
C compares it against one of its addends to derive the next carry predicate
C pair, then forms the next sum as low product half plus previous high
C product half, plus one when the carry predicate is set.  With a carry
C pending the compare is leu, otherwise ltu.  The labels .LL00, .LL11, .LL10
C and .LL01 are the entry points used by the feed-in code above.
	ALIGN(32)
.Loop:
	.pred.rel "mutex",p6,p7
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	st8		[rp] = r24, 8
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
.LL00:
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
	nop.b		0
	ldf8		f32 = [up], 8
   (p9)	add		r24 = r18, r21
	nop.b		0
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	st8		[rp] = r24, 8
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	;;
.LL11:
	.pred.rel "mutex",p6,p7
	getf.sig	r21 = f43
   (p6)	add		r24 = r19, r22, 1
	nop.b		0
	ldf8		f33 = [up], 8
   (p7)	add		r24 = r19, r22
	nop.b		0
	;;
	.pred.rel "mutex",p6,p7
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	st8		[rp] = r24, 8
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.LL10:
	.pred.rel "mutex",p8,p9
	getf.sig	r22 = f40
   (p8)	add		r24 = r16, r23, 1
	nop.b		0
	ldf8		f34 = [up], 8
   (p9)	add		r24 = r16, r23
	nop.b		0
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	st8		[rp] = r24, 8
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.LL01:
	.pred.rel "mutex",p6,p7
	getf.sig	r23 = f41
   (p6)	add		r24 = r17, r20, 1
	nop.b		0
	ldf8		f35 = [up], 8
   (p7)	add		r24 = r17, r20
	br.cloop.dptk	.Loop
C *** MAIN LOOP END ***
	;;

C Wind-down: the same store / compare / add steps as the loop but without
C further loads.  .Lcj9 is reached by falling out of the loop; the feed-in
C code for short operands branches to .Lcj8, .Lcj7, .Lcj6, .Lcj5, .Lcj4 and
C .Lcj3 directly.
.Lcj9:
	.pred.rel "mutex",p6,p7
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	st8		[rp] = r24, 8
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	st8		[rp] = r24, 8
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	;;
.Lcj8:
	.pred.rel "mutex",p6,p7
	getf.sig	r21 = f43
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22
	;;
	.pred.rel "mutex",p6,p7
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	st8		[rp] = r24, 8
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.Lcj7:
	.pred.rel "mutex",p8,p9
	getf.sig	r22 = f40
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	st8		[rp] = r24, 8
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.Lcj6:
	.pred.rel "mutex",p6,p7
	getf.sig	r23 = f41
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	getf.sig	r16 = f38
	st8		[rp] = r24, 8
	;;
.Lcj5:
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21
	;;
	.pred.rel "mutex",p8,p9
   (p8)	cmp.leu		p6, p7 = r24, r18
   (p9)	cmp.ltu		p6, p7 = r24, r18
	getf.sig	r17 = f39
	st8		[rp] = r24, 8
	;;
C From here on r8 holds the high half of the last product.
.Lcj4:
	.pred.rel "mutex",p6,p7
	getf.sig	r8 = f43
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22
	;;
	.pred.rel "mutex",p6,p7
	st8		[rp] = r24, 8
   (p6)	cmp.leu		p8, p9 = r24, r19
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.Lcj3:
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23
	;;
	.pred.rel "mutex",p8,p9
	st8		[rp] = r24, 8
   (p8)	cmp.leu		p6, p7 = r24, r16
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.Lcj2:
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
	;;
	.pred.rel "mutex",p6,p7
	st8		[rp] = r24, 8
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
C Fold the last carry into r8, restore ar.lc and return.
   (p8)	add		r8 = 1, r8
	mov.i		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()

C mpn_mul_1c -- same operation as mpn_mul_1, except that f9, the addend of
C the first product, is loaded from cy.  After the shared setup it branches
C into mpn_mul_1 at .Lcommon.
PROLOGUE(mpn_mul_1c)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
{.mmi
	adds		r15 = -1, n		C M I
	setf.sig	f9 = cy			C M2 M3
	mov.i		r2 = ar.lc		C I0
}
{.mmb
	ldf8		f7 = [up], 8		C M
	and		r14 = 3, n		C M I
	br.sptk		.Lcommon
	;;
}
EPILOGUE()
ASM_END()