#include "sparc_arch.h"

.section ".text",#alloc,#execinstr

!-----------------------------------------------------------------------
! Montgomery multiplication for UltraSPARC, computed on the FPU.
!
! NOTE(review): presumably the standard OpenSSL bn_mul_mont contract,
!   int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap,
!                       const BN_ULONG *bp, const BN_ULONG *np,
!                       const BN_ULONG *n0, int num)
! with arguments arriving in %i0..%i5 per the SPARC register-window ABI
! -- confirm against the CRYPTOGAMS sparcv9a-mont.pl generator.
!
! Returns 1 in %i0 on success; returns 0 ("unsupported input value")
! without computing anything when num < 4 or num is odd.
!
! Strategy visible in the code below: operands are "smashed" into
! 16-bit digits, converted to IEEE doubles (fxtod), and all the
! digit-by-digit products are formed with fmuld/faddd on the FP pipes,
! while the integer unit concurrently reassembles 64-bit words from the
! 16-bit partial sums (srlx/and/sllx/or against the 0xffff mask) and
! propagates carries.
!
! Register roles (taken from the comments and uses below):
!   %g4       64-bit n0 value, n0[1].n0[0]
!   %g1       running 34-bit carry between digit groups
!   %i4       after its initial use as the n0 pointer, reused to hold
!             the carry word of tp[num-1] across outer iterations
!   %i5       num, scaled to num/2 words * 8 bytes early on
!   %l5, %l6  i and j loop counters, kept as negative byte offsets
!   %l0       pointer into tp[], the stack scratch result vector
!   %l1-%l4   ends of the four stack vectors of "smashed" operand
!             digits in double format ("[an]p_[lh]" per the comment
!             below -- presumably ap-low/ap-high/np-low/np-high;
!             confirm against the generator source)
!   %l7       0xffff, mask for extracting 16-bit digits
!   %f0-%f6   b[i] as 4x16-bit digits in double format
!   %f8-%f14  the per-iteration Montgomery multiplier m (= product*n0)
!             as 4x16-bit digits in double format
!-----------------------------------------------------------------------

.global bn_mul_mont_fpu
.align  32
bn_mul_mont_fpu:
	save	%sp,-STACK_FRAME-64,%sp

	! Reject unsupported sizes: num must be >= 4 and even.
	cmp	%i5,4
	bl,a,pn	%icc,.Lret
	clr	%i0
	andcc	%i5,1,%g0	! %i5 has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0		! signal "unsupported input value"

	! The routine operates on 64-bit words, so halve the 32-bit word
	! count, and assemble the two 32-bit halves of n0 into %g4.
	srl	%i5,1,%i5
	sethi	%hi(0xffff),%l7
	ld	[%i4+0],%g4	! %g4 reassigned, remember?
	or	%l7,%lo(0xffff),%l7
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,%g4,%g4	! %g4=n0[1].n0[0]

	sll	%i5,3,%i5	! num*=8

	! Carve 5*num*8 bytes of scratch off the stack: tp[] plus the
	! four smashed-digit vectors addressed through %l1..%l4.
	add	%sp,STACK_BIAS,%o0	! real top of stack
	sll	%i5,2,%o1
	add	%o1,%i5,%o1	! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0	! optimize TLB utilization
	sub	%o0,STACK_BIAS,%sp	! alloca(5*num*8)

	rd	%asi,%o7	! save %asi
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0
	add	%l0,%i5,%l1
	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
	add	%l1,%i5,%l2
	add	%l2,%i5,%l3
	add	%l3,%i5,%l4

	! NOTE(review): 210 = 0xd2 -- presumably ASI_FL16_PL, the
	! little-endian 16-bit FP load ASI used by the ldda's below;
	! confirm against the SPARC V9 ASI assignments.
	wr	%g0,210,%asi	! setup %asi for 16-bit FP loads

	add	%i0,%i5,%i0	! readjust input pointers to point
	add	%i1,%i5,%i1	! at the ends too...
	add	%i2,%i5,%i2
	add	%i3,%i5,%i3

	stx	%o7,[%sp+STACK_BIAS+STACK_FRAME+48]	! save %asi

	! ===== first outer iteration (i = 0), first column (j = 0) =====
	sub	%g0,%i5,%l5	! i=-num
	sub	%g0,%i5,%l6	! j=-num

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1	! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	%i3,%l6,%o5

	! Montgomery multiplier for this iteration, kept in the scratch
	! slot at +0 so it can be re-read as 16-bit digits via %asi.
	mulx	%o1,%o0,%o0	! ap[0]*bp[0]
	mulx	%g4,%o0,%o0	! ap[0]*bp[0]*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	! "fzeros" has no mnemonic in older assemblers, hence raw .word
	! encodings; each clears the even register of a pair so the
	! 32-bit ld into the odd half yields a zero-extended 64-bit value
	! ready for fxtod.
	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o3+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	fxtod	%f16,%f16
	ldda	[%o4+0]%asi,%f2
	fxtod	%f18,%f18
	ldda	[%o4+6]%asi,%f4
	fxtod	%f20,%f20
	ldda	[%o4+4]%asi,%f6
	fxtod	%f22,%f22

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6

	! Cache the digit-form operands so later outer iterations can
	! ldd them back directly (see .Louter/.Linner).
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fxtod	%f8,%f8
	std	%f18,[%l2+%l6]
	fxtod	%f10,%f10
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fxtod	%f12,%f12
	std	%f22,[%l4+%l6]
	fxtod	%f14,%f14

	! Digit products: a[j]*b[i] and n[j]*m accumulated pairwise.
	! All products of 16-bit digits are exact in double precision.
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	! %f24/%f26 carry the two highest partial sums into the next
	! software-pipelined stage.
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	! Convert the four partial sums to 64-bit integers and park them
	! in the scratch slots for the integer unit to pick up.
	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	add	%l6,8,%l6
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	add	%i1,%l6,%o4
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%i3,%l6,%o5
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	! ===== second column (j = 1): prologue of the .L1st pipeline =====
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	! Integer unit: fetch previous column's partial sums while the
	! FPU starts on this column's products.
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f20,%f10,%f50

	! Ripple 16-bit carries up the four partial sums.
	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
	fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	! NOTE(review): the digit reassembly is deliberately commented
	! out here -- for this first tp word (the Montgomery low word)
	! only the carry is kept; the live equivalent appears in .L1st.
	!and	%o0,%l7,%o0
	!and	%o1,%l7,%o1
	!and	%o2,%l7,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0	! 64-bit result
	srlx	%o3,16,%g1	! 34-bit carry
	fmuld	%f18,%f2,%f42

	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	! Fold in the high partial sums carried over from the previous
	! column, then stage this column's high sums the same way.
	faddd	%f24,%f48,%f48
	faddd	%f26,%f50,%f50
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.L1stskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

! ===== first-iteration inner loop: columns j = 2 .. num-1 =====
.align	32	! incidentally already aligned !
.L1st:
	add	%i1,%l6,%o4
	add	%i3,%l6,%o5
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
	fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! Reassemble the 64-bit tp word from four 16-bit digits.
	and	%o0,%l7,%o0
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	fmuld	%f18,%f2,%f42
	sllx	%o1,16,%o1
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	sllx	%o2,32,%o2
	fmuld	%f18,%f4,%f44
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	or	%o2,%o0,%o0
	fmuld	%f18,%f6,%f46
	or	%o7,%o0,%o0	! 64-bit result
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	addcc	%g1,%o0,%o0
	faddd	%f24,%f48,%f48
	srlx	%o3,16,%g1	! 34-bit carry
	faddd	%f26,%f50,%f50
	! Annulled branch over the delay slot: the add executes only if
	! the addcc above carried, bumping the saved carry by one.
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]=

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	addcc	%l6,8,%l6
	bnz,pt	%icc,.L1st
	add	%l0,8,%l0

! ===== first-iteration epilogue: drain the pipeline, store the two
! ===== top tp words =====
.L1stskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]=
	add	%l0,8,%l0

	! Top word: only two 16-bit digits remain in %o4/%o5.
	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,%i4		! carry into tp[num] kept across outer loop
	stx	%o4,[%l0]	! tp[num-1]=

	ba	.Louter
	add	%l5,8,%l5

! ===== outer loop: iterations i = 1 .. num-1; operand digits are now
! ===== re-loaded in double format from the %l1..%l4 vectors =====
.align	32
.Louter:
	sub	%g0,%i5,%l6	! j=-num
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1	! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	! Montgomery multiplier for this iteration now folds in tp[0].
	ldx	[%l0],%o2	! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	%g4,%o0,%o0	! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	ldda	[%o4+0]%asi,%f2
	ldda	[%o4+6]%asi,%f4
	ldda	[%o4+4]%asi,%f6

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	fxtod	%f8,%f8
	ldd	[%l2+%l6],%f18
	fxtod	%f10,%f10
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	fxtod	%f12,%f12
	ldd	[%l4+%l6],%f22
	fxtod	%f14,%f14

	! Column j = 0 (same product lattice as the first iteration).
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	! Column j = 1: prologue of the .Linner pipeline.
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
	fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
	fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! NOTE(review): original author's own "why?" marker -- unlike the
	! corresponding spot in the first iteration, the full word IS
	! assembled and tp[0] folded in, though the word itself is never
	! stored.  Left exactly as found.
	! why?
	and	%o0,%l7,%o0
	fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
	faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
	faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
	faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%l0],%o7
	faddd	%f52,%f56,%f52
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.Linnerskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ba	.Linner
	nop

! ===== outer-iteration inner loop: columns j = 2 .. num-1; adds the
! ===== previous tp[j] into each reassembled word =====
.align	32
.Linner:
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
	fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
	fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
	faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
	faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
	faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0	! 64-bit result
	faddd	%f52,%f56,%f52
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7	! tp[j]
	faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	%f50,%f50
	addcc	%o7,%o0,%o0
	fdtox	%f52,%f52
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	addcc	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
	bnz,pt	%icc,.Linner
	add	%l0,8,%l0

! ===== outer-iteration epilogue: drain pipeline, fold in tp[j] and the
! ===== saved %i4 carry, advance i =====
.Linnerskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7	! tp[j]
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	! Add the carry word saved from the previous outer iteration and
	! re-save the new one in %i4.
	addcc	%i4,%o4,%o4
	stx	%o4,[%l0]	! tp[num-1]
	mov	%g1,%i4
	bcs,a	%xcc,.+8
	add	%i4,1,%i4

	addcc	%l5,8,%l5
	bnz	%icc,.Louter
	nop

! ===== final reduction: rp = tp - np (borrow tracked in %g4) =====
	add	%l0,8,%l0	! adjust tp to point at the end
	orn	%g0,%g0,%g4	! %g4 = all-ones
	sub	%g0,%i5,%o7	! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c

.align	32
.Lsub:
	ldx	[%l0+%o7],%o0
	add	%i3,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2	! tp[] - np[], carry-chained halves
	add	%i0,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	! %g4 becomes a select mask: reflects the final borrow vs. the
	! accumulated top carry in %i4.
	subc	%i4,0,%g4
	sub	%g0,%i5,%o7	! n=-num
	ba	.Lcopy
	nop

! ===== select result: keep tp (masked by %g4) or the subtracted value
! ===== already in rp, and wipe tp as it goes =====
.align	32
.Lcopy:
	ldx	[%l0+%o7],%o0
	add	%i0,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[%l0+%o7]	! zap tp[] word behind us
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,%i5,%o7	! n=-num

! ===== wipe the four smashed-operand scratch vectors =====
.Lzap:
	stx	%g0,[%l1+%o7]
	stx	%g0,[%l2+%o7]
	stx	%g0,[%l3+%o7]
	stx	%g0,[%l4+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+STACK_BIAS+STACK_FRAME+48],%o7
	wr	%g0,%o7,%asi	! restore %asi

	mov	1,%i0		! success
.Lret:
	ret
	restore
.type	bn_mul_mont_fpu,#function
.size	bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
.align	32