#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

!-----------------------------------------------------------------------
! Montgomery multiplication for UltraSPARC, performed on the FPU:
! operands are split into 16-bit limbs, converted to double precision,
! and accumulated with fmuld/faddd (a 53-bit mantissa holds the partial
! products exactly).  CRYPTOGAMS code — see the .asciz tag at the bottom.
!
! C-equivalent signature (standard bn_mul_mont):
!   int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap,
!                       const BN_ULONG *bp, const BN_ULONG *np,
!                       const BN_ULONG *n0, int num);
! In:    %i0=rp (result), %i1/%i2=ap/bp (multiplicands; see NOTE below),
!        %i3=np (modulus), %i4=n0 (two 32-bit words), %i5=num
! Out:   %i0 = 1 on success, 0 for unsupported input (num<4 or odd num)
! Locals (after setup):
!        %l0      = tp, pointer into the temporary result vector
!        %l1-%l4  = on-stack vectors of doubles: "smashed" (16-bit-limbed)
!                   copies of ap and np, low/high halves
!        %l5 = i (outer index, runs -num..0 in steps of 8)
!        %l6 = j (inner index, runs -num..0 in steps of 8)
!        %l7 = 0xffff limb mask
!        %g1 = running carry ("34-bit carry" per the annotations below)
!        %g4 = n0[1].n0[0] during the loops; reused as select mask at the end
!        %i4 = reused after setup to hold the top carry word of tp
! Stack: alloca of 5*num*8 bytes below the register save area (tp plus the
!        four double vectors); scratch at %sp+STACK_BIAS+STACK_FRAME+0..47,
!        saved %asi at +48.
!-----------------------------------------------------------------------
.section	".text",#alloc,#execinstr

.global bn_mul_mont_fpu
.align  32
bn_mul_mont_fpu:
	save	%sp,-STACK_FRAME-64,%sp

	! Reject unsupported sizes: num must be >= 4 and even.
	! "b??,a" + delay-slot: the annulled delay instruction (clr %i0)
	! executes only when the branch is taken — a recurring idiom here.
	cmp	%i5,4
	bl,a,pn %icc,.Lret
	clr	%i0
	andcc	%i5,1,%g0	! %i5 has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0		! signal "unsupported input value"

	srl	%i5,1,%i5
	sethi	%hi(0xffff),%l7
	ld	[%i4+0],%g4	! %g4 reassigned, remember?
	or	%l7,%lo(0xffff),%l7
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,%g4,%g4	! %g4=n0[1].n0[0]

	sll	%i5,3,%i5	! num*=8

	add	%sp,STACK_BIAS,%o0	! real top of stack
	sll	%i5,2,%o1
	add	%o1,%i5,%o1	! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0	! optimize TLB utilization
	sub	%o0,STACK_BIAS,%sp	! alloca(5*num*8)

	rd	%asi,%o7	! save %asi
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0
	add	%l0,%i5,%l1
	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
	add	%l1,%i5,%l2
	add	%l2,%i5,%l3
	add	%l3,%i5,%l4

	! 210 = 0xD2; presumably ASI_FL16_P, the "16-bit floating-point
	! load" address space — TODO confirm against the SPARC V9 manual.
	wr	%g0,210,%asi	! setup %asi for 16-bit FP loads

	add	%i0,%i5,%i0	! readjust input pointers to point
	add	%i1,%i5,%i1	! at the ends too...
	add	%i2,%i5,%i2
	add	%i3,%i5,%i3

	stx	%o7,[%sp+STACK_BIAS+STACK_FRAME+48]	! save %asi

	sub	%g0,%i5,%l5	! i=-num
	sub	%g0,%i5,%l6	! j=-num

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	! NOTE(review): the "! bp[0]" / "! ap[0]" annotations here look
	! swapped relative to %i1/%i2 (in .L1st below, a[j] is loaded from
	! %i1 and b[i] is smashed from %o4=%i2+i).  Harmless, since the
	! product is commutative — confirm against the CRYPTOGAMS origin.
	ld	[%o3+4],%g1	! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	%i3,%l6,%o5

	! m = ap[0]*bp[0]*n0 mod 2^64, stored to scratch so it can be
	! re-read through %asi as four 16-bit limbs.
	mulx	%o1,%o0,%o0	! ap[0]*bp[0]
	mulx	%g4,%o0,%o0	! ap[0]*bp[0]*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	! %f16-%f23 are built as integer-in-double pairs: fzeros clears the
	! even half, ld fills the odd half, then fxtod converts in place.
	! The .word encodings are fzeros (VIS) spelled out for old assemblers.
	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o3+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	fxtod	%f16,%f16
	ldda	[%o4+0]%asi,%f2
	fxtod	%f18,%f18
	ldda	[%o4+6]%asi,%f4
	fxtod	%f20,%f20
	ldda	[%o4+4]%asi,%f6
	fxtod	%f22,%f22

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6

	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fxtod	%f8,%f8
	std	%f18,[%l2+%l6]
	fxtod	%f10,%f10
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fxtod	%f12,%f12
	std	%f22,[%l4+%l6]
	fxtod	%f14,%f14

	! 4x4 limb cross-products: a[j]*b[i] accumulated with n[j]*m.
	! FP multiplies/adds are interleaved with integer work throughout
	! to keep both pipelines busy — do not reorder.
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	! %f24/%f26 carry the two highest partial sums into the next
	! iteration (software pipelining).
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	add	%l6,8,%l6
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	add	%i1,%l6,%o4
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%i3,%l6,%o5
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	! Re-read previous iteration's fdtox results as integers while the
	! FPU starts on the next set of products.
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f20,%f10,%f50

	! Carry-propagate the four 16-bit-aligned partial sums
	! %o0..%o3 into one 64-bit word (integer side of the pipeline).
	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
	fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	! The first output word of a pass is the one Montgomery reduction
	! forces to zero, so its assembly below is commented out; only the
	! carry (%g1) is kept.
	!and	%o0,%l7,%o0
	!and	%o1,%l7,%o1
	!and	%o2,%l7,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0	! 64-bit result
	srlx	%o3,16,%g1	! 34-bit carry
	fmuld	%f18,%f2,%f42

	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f24,%f48,%f48
	faddd	%f26,%f50,%f50
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.L1stskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

! First-pass inner loop (i=0): compute tp[] = a[]*b[0] + n[]*m with the
! structure already established above; from here on each iteration also
! stores the finished 64-bit word tp[j-1].
.align	32			! incidentally already aligned !
.L1st:
	add	%i1,%l6,%o4
	add	%i3,%l6,%o5
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
	fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	fmuld	%f18,%f2,%f42
	sllx	%o1,16,%o1
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	sllx	%o2,32,%o2
	fmuld	%f18,%f4,%f44
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	or	%o2,%o0,%o0
	fmuld	%f18,%f6,%f46
	or	%o7,%o0,%o0	! 64-bit result
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	addcc	%g1,%o0,%o0
	faddd	%f24,%f48,%f48
	srlx	%o3,16,%g1	! 34-bit carry
	faddd	%f26,%f50,%f50
	! annulled-branch carry bump: the delay-slot add runs only if C set
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]=

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	addcc	%l6,8,%l6
	bnz,pt	%icc,.L1st
	add	%l0,8,%l0

! Drain the software pipeline: flush the two pending accumulators
! (%f24/%f26) and emit the last two words of tp, including the top
! carry word which is parked in %i4 for the outer loop.
.L1stskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]=
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,%i4
	stx	%o4,[%l0]	! tp[num-1]=

	ba	.Louter
	add	%l5,8,%l5

! Outer loop over i=1..num-1: same as the first pass, but a[j]/n[j] are
! now reloaded as ready-made doubles from %l1-%l4, and the previous
! tp[] is folded in (tp[j] added before each store).
.align	32
.Louter:
	sub	%g0,%i5,%l6	! j=-num
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1	! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	! m = (ap[0]*bp[i]+tp[0])*n0 mod 2^64
	ldx	[%l0],%o2	! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	%g4,%o0,%o0	! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	ldda	[%o4+0]%asi,%f2
	ldda	[%o4+6]%asi,%f4
	ldda	[%o4+4]%asi,%f6

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	fxtod	%f8,%f8
	ldd	[%l2+%l6],%f18
	fxtod	%f10,%f10
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	fxtod	%f12,%f12
	ldd	[%l4+%l6],%f22
	fxtod	%f14,%f14

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ldd	[%l1+%l6],%f16	! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
	fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
	fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,%l7,%o0
	fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
	faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
	faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
	faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%l0],%o7
	faddd	%f52,%f56,%f52
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.Linnerskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ba	.Linner
	nop
! Inner loop: tp[j-1] = low64(a[j]*b[i] + n[j]*m + tp[j] + carry);
! two separate carry bumps because both additions can overflow.
.align	32
.Linner:
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
	fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
	fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
	faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
	faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
	faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0	! 64-bit result
	faddd	%f52,%f56,%f52
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7	! tp[j]
	faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	%f50,%f50
	addcc	%o7,%o0,%o0
	fdtox	%f52,%f52
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	addcc	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
	bnz,pt	%icc,.Linner
	add	%l0,8,%l0

! Pipeline drain for the inner loop, mirroring .L1stskip; the final
! top-word carry is accumulated into %i4 across outer iterations.
.Linnerskip:
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7	! tp[j]
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%i4,%o4,%o4
	stx	%o4,[%l0]	! tp[num-1]
	mov	%g1,%i4
	bcs,a	%xcc,.+8
	add	%i4,1,%i4

	addcc	%l5,8,%l5
	bnz	%icc,.Louter
	nop

	add	%l0,8,%l0	! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,%i5,%o7	! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c

! Final reduction, step 1: store tp - np into rp, 32 bits at a time,
! tracking the borrow in %icc.c across iterations (subccc).
.align	32
.Lsub:
	ldx	[%l0+%o7],%o0
	add	%i3,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	%i0,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	! %g4 = top-carry minus final borrow: 0 keeps the difference already
	! in rp[], all-ones rolls back to tp[] (tp < np case) — see .Lcopy.
	subc	%i4,0,%g4
	sub	%g0,%i5,%o7	! n=-num
	ba	.Lcopy
	nop

! Final reduction, step 2: rp[] = (tp & %g4) | (rp & ~%g4), i.e. select
! tp or tp-np via the mask, while zeroing tp[] behind us.
.align	32
.Lcopy:
	ldx	[%l0+%o7],%o0
	add	%i0,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[%l0+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,%i5,%o7	! n=-num

! Zero the four on-stack double vectors — presumably so no operand-derived
! data is left on the stack (TODO confirm intent against CRYPTOGAMS notes).
.Lzap:
	stx	%g0,[%l1+%o7]
	stx	%g0,[%l2+%o7]
	stx	%g0,[%l3+%o7]
	stx	%g0,[%l4+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+STACK_BIAS+STACK_FRAME+48],%o7
	wr	%g0,%o7,%asi	! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type		bn_mul_mont_fpu,#function
.size		bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
.align	32