! Poly1305 MAC for SPARCv9: plain integer code path plus VIS3 and FMA
! variants.  poly1305_init probes OPENSSL_sparcv9cap_P and, through the
! function-pointer table at %i2, installs the best blocks/emit pair.

#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
# define STPTR	stx
# define SIZE_T	8
#else
# define STPTR	st
# define SIZE_T	4
#endif
#define LOCALS	(STACK_BIAS+STACK_FRAME)

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

! int poly1305_init(void *ctx=%i0, const u8 key[32]=%i1, void *func[2]=%i2)
!
! Zeroes the hash value, clamps and stores the first 16 key bytes, and,
! when the CPU has VIS3 (FMADD-only CPUs branch to .Lpoly1305_init_fma),
! publishes pointers to the matching blocks/emit routines through %i2.
! Returns 1 when custom routines were installed, 0 otherwise.
! ASI 0x88 loads are little-endian reads (presumably ASI_PRIMARY_LITTLE
! — confirm against sparc_arch.h).
.globl	poly1305_init
.align	32
poly1305_init:
	save	%sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld	[%g1],%g1

	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp	%g1,SPARCV9_FMADD
	be	.Lpoly1305_init_fma
	nop

	stx	%g0,[%i0+0]
	stx	%g0,[%i0+8]		! zero hash value
	brz,pn	%i1,.Lno_key
	stx	%g0,[%i0+16]

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	sll	%i5,3,%i5		! *8
	neg	%i5,%i4

	sethi	%hi(0x0ffffffc),%o4
	set	8,%o1
	or	%o4,%lo(0x0ffffffc),%o4
	set	16,%o2
	sllx	%o4,32,%o5
	or	%o4,%o5,%o5		! 0x0ffffffc0ffffffc
	or	%o5,3,%o4		! 0x0ffffffc0fffffff

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian key
	brz,pt	%i5,.Lkey_aligned
	ldxa	[%i1+%o1]0x88,%o1

	! misaligned key: merge adjacent 8-byte loads by the byte offset
	ldxa	[%i1+%o2]0x88,%o2
	srlx	%o0,%i5,%o0
	sllx	%o1,%i4,%o7
	srlx	%o1,%i5,%o1
	or	%o7,%o0,%o0
	sllx	%o2,%i4,%o2
	or	%o2,%o1,%o1

.Lkey_aligned:
	and	%o4,%o0,%o0		! clamp r with the Poly1305 masks above
	and	%o5,%o1,%o1
	stx	%o0,[%i0+32+0]		! store key
	stx	%o1,[%i0+32+8]

	andcc	%g1,SPARCV9_VIS3,%g0
	be	.Lno_key
	nop

1:	call	.+8			! pick up PC to compute PIC addresses
	add	%o7,poly1305_blocks_vis3-1b,%o7

	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR	%o7,[%i2]		! func[0] = poly1305_blocks_vis3
	STPTR	%o5,[%i2+SIZE_T]	! func[1] = poly1305_emit

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init,#function
.size	poly1305_init,.-poly1305_init

! void poly1305_blocks(void *ctx=%i0, const u8 *inp=%i1, size_t len=%i2,
!                      int padbit=%i3)
!
! Integer-only path: schoolbook multiply over 32-bit limbs with umul.
! srln is presumably a perlasm macro for srl/srlx per word size — it
! turns the byte count into a count of complete 16-byte blocks.
.globl	poly1305_blocks
.align	32
poly1305_blocks:
	save	%sp,-STACK_FRAME,%sp
	srln	%i2,4,%i2		! number of complete blocks

	brz,pn	%i2,.Lno_data
	nop

	ld	[%i0+32+0],%l1		! load key
	ld	[%i0+32+4],%l0
	ld	[%i0+32+8],%l3
	ld	[%i0+32+12],%l2

	ld	[%i0+0],%o1		!
	! load hash value (comment continues the "ld [%i0+0],%o1" just above)
	ld	[%i0+4],%o0
	ld	[%i0+8],%o3
	ld	[%i0+12],%o2
	ld	[%i0+16],%l7

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	set	8,%g2
	sll	%i5,3,%i5		! *8
	set	16,%g3
	neg	%i5,%i4

	! precompute r_i + r_i>>2 (the "times 5/4" values used for the
	! reduced high-limb products)
	srl	%l1,2,%l4
	srl	%l2,2,%l5
	add	%l1,%l4,%l4
	srl	%l3,2,%l6
	add	%l2,%l5,%l5
	add	%l3,%l6,%l6

.Loop:
	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
	brz,pt	%i5,.Linp_aligned
	ldxa	[%i1+%g2]0x88,%g2

	! misaligned input: merge adjacent 8-byte loads
	ldxa	[%i1+%g3]0x88,%g3
	srlx	%g1,%i5,%g1
	sllx	%g2,%i4,%o5
	srlx	%g2,%i5,%g2
	or	%o5,%g1,%g1
	sllx	%g3,%i4,%g3
	or	%g3,%g2,%g2

.Linp_aligned:
	srlx	%g1,32,%o4
	addcc	%g1,%o0,%o0		! accumulate input
	srlx	%g2,32,%o5
	addccc	%o4,%o1,%o1
	addccc	%g2,%o2,%o2
	addccc	%o5,%o3,%o3
	addc	%i3,%l7,%l7		! + padbit into top limb

	umul	%l0,%o0,%g1
	umul	%l1,%o0,%g2
	umul	%l2,%o0,%g3
	umul	%l3,%o0,%g4
	sub	%i2,1,%i2
	add	%i1,16,%i1

	umul	%l6,%o1,%o4
	umul	%l0,%o1,%o5
	umul	%l1,%o1,%o7
	add	%o4,%g1,%g1
	add	%o5,%g2,%g2
	umul	%l2,%o1,%o4
	add	%o7,%g3,%g3
	add	%o4,%g4,%g4

	umul	%l5,%o2,%o5
	umul	%l6,%o2,%o7
	umul	%l0,%o2,%o4
	add	%o5,%g1,%g1
	add	%o7,%g2,%g2
	umul	%l1,%o2,%o5
	add	%o4,%g3,%g3
	add	%o5,%g4,%g4

	umul	%l4,%o3,%o7
	umul	%l5,%o3,%o4
	umul	%l6,%o3,%o5
	add	%o7,%g1,%g1
	add	%o4,%g2,%g2
	umul	%l0,%o3,%o7
	add	%o5,%g3,%g3
	add	%o7,%g4,%g4

	umul	%l4,%l7,%o4
	umul	%l5,%l7,%o5
	umul	%l6,%l7,%o7
	umul	%l0,%l7,%l7
	add	%o4,%g2,%g2
	add	%o5,%g3,%g3
	srlx	%g1,32,%o1
	add	%o7,%g4,%g4
	srlx	%g2,32,%o2

	! propagate the 64-bit partial products' high halves upward
	addcc	%g2,%o1,%o1
	srlx	%g3,32,%o3
	set	8,%g2			! restore offsets clobbered above
	addccc	%g3,%o2,%o2
	srlx	%g4,32,%o4
	set	16,%g3
	addccc	%g4,%o3,%o3
	addc	%o4,%l7,%l7

	srl	%l7,2,%o4		!
	! final reduction step (continues the srl above): fold the top
	! limb's bits >= 2 back in as h mod 4 + (h>>2) + (h & ~3),
	! i.e. reduce modulo 2^130-5
	andn	%l7,3,%o5
	and	%l7,3,%l7
	add	%o5,%o4,%o4

	addcc	%o4,%g1,%o0
	addccc	%g0,%o1,%o1
	addccc	%g0,%o2,%o2
	addccc	%g0,%o3,%o3
	brnz,pt	%i2,.Loop
	addc	%g0,%l7,%l7

	st	%o1,[%i0+0]		! store hash value
	st	%o0,[%i0+4]
	st	%o3,[%i0+8]
	st	%o2,[%i0+12]
	st	%l7,[%i0+16]

.Lno_data:
	ret
	restore
.type	poly1305_blocks,#function
.size	poly1305_blocks,.-poly1305_blocks

! void poly1305_blocks_vis3(void *ctx=%i0, const u8 *inp=%i1,
!                           size_t len=%i2, int padbit=%i3)
!
! VIS3 path: 64-bit limbs using mulx/umulxhi plus the addxc/addxccc
! extended-carry instructions.  The .word opcodes are those VIS3
! instructions hand-encoded for pre-VIS3 assemblers; the decoded
! mnemonic follows each in the trailing comment.
.align	32
poly1305_blocks_vis3:
	save	%sp,-STACK_FRAME,%sp
	srln	%i2,4,%i2		! number of complete blocks

	brz,pn	%i2,.Lno_data
	nop

	ldx	[%i0+32+0],%o3		! load key
	ldx	[%i0+32+8],%o4

	ldx	[%i0+0],%o0		! load hash value
	ldx	[%i0+8],%o1
	ld	[%i0+16],%o2

	and	%i1,7,%i5		! alignment factor
	andn	%i1,7,%i1
	set	8,%l1
	sll	%i5,3,%i5		! *8
	set	16,%l2
	neg	%i5,%i4

	srlx	%o4,2,%o5
	b	.Loop_vis3
	add	%o4,%o5,%o5		! s1 = r1 + r1>>2

.Loop_vis3:
	ldxa	[%i1+%g0]0x88,%g1	! load little-endian input
	brz,pt	%i5,.Linp_aligned_vis3
	ldxa	[%i1+%l1]0x88,%g2

	! misaligned input: merge adjacent 8-byte loads
	ldxa	[%i1+%l2]0x88,%g3
	srlx	%g1,%i5,%g1
	sllx	%g2,%i4,%o7
	srlx	%g2,%i5,%g2
	or	%o7,%g1,%g1
	sllx	%g3,%i4,%g3
	or	%g3,%g2,%g2

.Linp_aligned_vis3:
	addcc	%g1,%o0,%o0		! accumulate input
	sub	%i2,1,%i2
	.word	0x93b08269	!addxccc %g2,%o1,%o1
	add	%i1,16,%i1

	mulx	%o3,%o0,%g1		! r0*h0
	.word	0x95b6c22a	!addxc %i3,%o2,%o2
	.word	0x85b2c2c8	!umulxhi %o3,%o0,%g2
	mulx	%o5,%o1,%g4		! s1*h1
	.word	0x9fb342c9	!umulxhi %o5,%o1,%o7
	addcc	%g4,%g1,%g1
	mulx	%o4,%o0,%g4		! r1*h0
	.word	0x85b3c222	!addxc %o7,%g2,%g2
	.word	0x87b302c8	!umulxhi %o4,%o0,%g3
	addcc	%g4,%g2,%g2
	mulx	%o3,%o1,%g4		! r0*h1
	.word	0x87b00223	!addxc %g0,%g3,%g3
	.word	0x9fb2c2c9	!umulxhi %o3,%o1,%o7
	addcc	%g4,%g2,%g2
	mulx	%o5,%o2,%g4		! s1*h2
	.word	0x87b3c223	!addxc %o7,%g3,%g3
	mulx	%o3,%o2,%o7		! r0*h2
	addcc	%g4,%g2,%g2
	.word	0x87b3c223	!addxc %o7,%g3,%g3

	srlx	%g3,2,%g4		!
	! final reduction step (continues the srlx above):
	! h2 mod 4 + (h2>>2) + (h2 & ~3), reducing modulo 2^130-5
	andn	%g3,3,%o7
	and	%g3,3,%o2
	add	%o7,%g4,%g4

	addcc	%g4,%g1,%o0
	.word	0x93b00262	!addxccc %g0,%g2,%o1
	brnz,pt	%i2,.Loop_vis3
	.word	0x95b0022a	!addxc %g0,%o2,%o2

	stx	%o0,[%i0+0]		! store hash value
	stx	%o1,[%i0+8]
	st	%o2,[%i0+16]

	ret
	restore
.type	poly1305_blocks_vis3,#function
.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3

! void poly1305_emit(void *ctx=%i0, u8 mac[16]=%i1, const u32 nonce[4]=%i2)
!
! Final conditional subtraction of 2^130-5 (via the h+5 trick), nonce
! addition, and little-endian store of the 16-byte tag.
.globl	poly1305_emit
.align	32
poly1305_emit:
	save	%sp,-STACK_FRAME,%sp

	ld	[%i0+0],%o1		! load hash value
	ld	[%i0+4],%o0
	ld	[%i0+8],%o3
	ld	[%i0+12],%o2
	ld	[%i0+16],%l7

	addcc	%o0,5,%l0		! compare to modulus
	addccc	%o1,0,%l1
	addccc	%o2,0,%l2
	addccc	%o3,0,%l3
	addc	%l7,0,%l7
	andcc	%l7,4,%g0		! did it carry/borrow?

	movnz	%icc,%l0,%o0		! select h+5-2^130 when h >= 2^130-5
	ld	[%i2+0],%l0		! load nonce
	movnz	%icc,%l1,%o1
	ld	[%i2+4],%l1
	movnz	%icc,%l2,%o2
	ld	[%i2+8],%l2
	movnz	%icc,%l3,%o3
	ld	[%i2+12],%l3

	addcc	%l0,%o0,%o0		! accumulate nonce
	addccc	%l1,%o1,%o1
	addccc	%l2,%o2,%o2
	addc	%l3,%o3,%o3

	srl	%o0,8,%l0
	stb	%o0,[%i1+0]		! store little-endian result
	srl	%o0,16,%l1
	stb	%l0,[%i1+1]
	srl	%o0,24,%l2
	stb	%l1,[%i1+2]
	stb	%l2,[%i1+3]

	srl	%o1,8,%l0
	stb	%o1,[%i1+4]
	srl	%o1,16,%l1
	stb	%l0,[%i1+5]
	srl	%o1,24,%l2
	stb	%l1,[%i1+6]
	stb	%l2,[%i1+7]

	srl	%o2,8,%l0
	stb	%o2,[%i1+8]
	srl	%o2,16,%l1
	stb	%l0,[%i1+9]
	srl	%o2,24,%l2
	stb	%l1,[%i1+10]
	stb	%l2,[%i1+11]

	srl	%o3,8,%l0
	stb	%o3,[%i1+12]
	srl	%o3,16,%l1
	stb	%l0,[%i1+13]
	srl	%o3,24,%l2
	stb	%l1,[%i1+14]
	stb	%l2,[%i1+15]

	ret
	restore
.type	poly1305_emit,#function
.size	poly1305_emit,.-poly1305_emit

! FMA (floating-point) initialization, entered from poly1305_init on
! FMADD-only CPUs.  The key is split into 32-bit pieces embedded into
! 2^52-biased double "templates"; see .Lconsts_fma for the bias values.
.align	32
poly1305_init_fma:
	save	%sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1:	call	.+8			! pick up PC for PIC addressing
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],%f16		!
	! load constants (continues the ldd above): 2^(52+32*i) biases
	ldd	[%o7+8*1],%f18
	ldd	[%o7+8*2],%f20
	ldd	[%o7+8*3],%f22
	ldd	[%o7+8*5],%f26		! 5/2^130

	std	%f16,[%i0+8*0]		! initial hash value, biased 0
	std	%f18,[%i0+8*1]
	std	%f20,[%i0+8*2]
	std	%f22,[%i0+8*3]

	brz,pn	%i1,.Lno_key_fma
	nop

	stx	%fsr,[%sp+LOCALS]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr (truncate, no exceptions)

	std	%f16,[%i0+8*4]		! key "template"
	std	%f18,[%i0+8*5]
	std	%f20,[%i0+8*6]
	std	%f22,[%i0+8*7]

	and	%i1,7,%l2
	andn	%i1,7,%i1		! align pointer
	mov	8,%l0
	sll	%l2,3,%l2
	mov	16,%l1
	neg	%l2,%l3

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian key
	ldxa	[%i1+%l0]0x88,%o2

	brz	%l2,.Lkey_aligned_fma
	sethi	%hi(0xf0000000),%l0	! 0xf0000000

	ldxa	[%i1+%l1]0x88,%o4

	srlx	%o0,%l2,%o0		! align data
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o2
	or	%o1,%o0,%o0
	sllx	%o4,%l3,%o3
	or	%o3,%o2,%o2

.Lkey_aligned_fma:
	or	%l0,3,%l1		! 0xf0000003
	srlx	%o0,32,%o1
	andn	%o0,%l0,%o0		! &=0x0fffffff
	andn	%o1,%l1,%o1		! &=0x0ffffffc
	srlx	%o2,32,%o3
	andn	%o2,%l1,%o2
	andn	%o3,%l1,%o3

	st	%o0,[%i0+36]		! fill "template" (low words of 8*4..8*7)
	st	%o1,[%i0+44]
	st	%o2,[%i0+52]
	st	%o3,[%i0+60]

	ldd	[%i0+8*4],%f0		! load [biased] key
	ldd	[%i0+8*5],%f4
	ldd	[%i0+8*6],%f8
	ldd	[%i0+8*7],%f12

	fsubd	%f0,%f16, %f0		! r0
	ldd	[%o7+8*7],%f16		! more constants
	fsubd	%f4,%f18,%f4		! r1
	ldd	[%o7+8*8],%f18
	fsubd	%f8,%f20,%f8		! r2
	ldd	[%o7+8*9],%f20
	fsubd	%f12,%f22,%f12		! r3
	ldd	[%o7+8*10],%f22

	fmuld	%f26,%f4,%f52		! s1
	fmuld	%f26,%f8,%f40		! s2
	fmuld	%f26,%f12,%f44		! s3

	! add/subtract a larger bias to round each limb into its high part
	faddd	%f0,%f16, %f2
	faddd	%f4,%f18,%f6
	faddd	%f8,%f20,%f10
	faddd	%f12,%f22,%f14

	fsubd	%f2,%f16, %f2
	ldd	[%o7+8*11],%f16		!
	! more constants (continues the ldd above)
	fsubd	%f6,%f18,%f6
	ldd	[%o7+8*12],%f18
	fsubd	%f10,%f20,%f10
	ldd	[%o7+8*13],%f20
	fsubd	%f14,%f22,%f14

	! low part = limb - high part; store both halves of each r_i
	fsubd	%f0,%f2,%f0
	std	%f2,[%i0+8*5]		! r0hi
	fsubd	%f4,%f6,%f4
	std	%f6,[%i0+8*7]		! r1hi
	fsubd	%f8,%f10,%f8
	std	%f10,[%i0+8*9]		! r2hi
	fsubd	%f12,%f14,%f12
	std	%f14,[%i0+8*11]		! r3hi

	! same high/low split for the s_i values
	faddd	%f52,%f16, %f54
	faddd	%f40,%f18,%f42
	faddd	%f44,%f20,%f46

	fsubd	%f54,%f16, %f54
	fsubd	%f42,%f18,%f42
	fsubd	%f46,%f20,%f46

	fsubd	%f52,%f54,%f52
	fsubd	%f40,%f42,%f40
	fsubd	%f44,%f46,%f44

	ldx	[%sp+LOCALS],%fsr	! restore %fsr

	std	%f0,[%i0+8*4]		! r0lo
	std	%f4,[%i0+8*6]		! r1lo
	std	%f8,[%i0+8*8]		! r2lo
	std	%f12,[%i0+8*10]		! r3lo

	std	%f54,[%i0+8*13]		! s1hi
	std	%f42,[%i0+8*15]		! s2hi
	std	%f46,[%i0+8*17]		! s3hi

	std	%f52,[%i0+8*12]		! s1lo
	std	%f40,[%i0+8*14]		! s2lo
	std	%f44,[%i0+8*16]		! s3lo

	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR	%o0,[%i2]		! func[0] = poly1305_blocks_fma
	STPTR	%o1,[%i2+SIZE_T]	! func[1] = poly1305_emit_fma

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key_fma:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init_fma,#function
.size	poly1305_init_fma,.-poly1305_init_fma

! void poly1305_blocks_fma(void *ctx=%i0, const u8 *inp=%i1,
!                          size_t len=%i2, int padbit=%i3)
!
! FMA path: the hash is kept as biased doubles.  Input words are boxed
! into doubles by storing them into 2^52-biased "templates" in the
! stack frame; the input load is modulo-scheduled across iterations.
.align	32
poly1305_blocks_fma:
	save	%sp,-STACK_FRAME-48,%sp
	srln	%i2,4,%i2		! number of complete blocks

	brz,pn	%i2,.Labort
	sub	%i2,1,%i2

1:	call	.+8			! pick up PC for PIC addressing
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],%f16		! load constants
	ldd	[%o7+8*1],%f18
	ldd	[%o7+8*2],%f20
	ldd	[%o7+8*3],%f22
	ldd	[%o7+8*4],%f24
	ldd	[%o7+8*5],%f26

	ldd	[%i0+8*0],%f0		! load [biased] hash value
	ldd	[%i0+8*1],%f4
	ldd	[%i0+8*2],%f8
	ldd	[%i0+8*3],%f12

	std	%f16,[%sp+LOCALS+8*0]	! input "template"
	sethi	%hi((1023+52+96)<<20),%o3
	std	%f18,[%sp+LOCALS+8*1]
	or	%i3,%o3,%o3		! merge padbit into top template word
	std	%f20,[%sp+LOCALS+8*2]
	st	%o3,[%sp+LOCALS+8*3]

	and	%i1,7,%l2
	andn	%i1,7,%i1		!
	! align pointer (continues the andn above)
	mov	8,%l0
	sll	%l2,3,%l2
	mov	16,%l1
	neg	%l2,%l3

	ldxa	[%i1+%g0]0x88,%o0	! load little-endian input
	brz	%l2,.Linp_aligned_fma
	ldxa	[%i1+%l0]0x88,%o2

	ldxa	[%i1+%l1]0x88,%o4
	add	%i1,8,%i1

	srlx	%o0,%l2,%o0		! align data
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o2
	or	%o1,%o0,%o0
	sllx	%o4,%l3,%o3
	srlx	%o4,%l2,%o4		! pre-shift
	or	%o3,%o2,%o2

.Linp_aligned_fma:
	srlx	%o0,32,%o1
	movrz	%i2,0,%l1		! don't advance past the last block
	srlx	%o2,32,%o3
	add	%l1,%i1,%i1		! conditional advance

	st	%o0,[%sp+LOCALS+8*0+4]	! fill "template"
	st	%o1,[%sp+LOCALS+8*1+4]
	st	%o2,[%sp+LOCALS+8*2+4]
	st	%o3,[%sp+LOCALS+8*3+4]

	ldd	[%i0+8*4],%f28		! load key
	ldd	[%i0+8*5],%f30
	ldd	[%i0+8*6],%f32
	ldd	[%i0+8*7],%f34
	ldd	[%i0+8*8],%f36
	ldd	[%i0+8*9],%f38
	ldd	[%i0+8*10],%f48
	ldd	[%i0+8*11],%f50
	ldd	[%i0+8*12],%f52
	ldd	[%i0+8*13],%f54
	ldd	[%i0+8*14],%f40
	ldd	[%i0+8*15],%f42
	ldd	[%i0+8*16],%f44
	ldd	[%i0+8*17],%f46

	stx	%fsr,[%sp+LOCALS+8*4]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr

	subcc	%i2,1,%i2
	movrz	%i2,0,%l1

	ldd	[%sp+LOCALS+8*0],%f56	! load biased input
	ldd	[%sp+LOCALS+8*1],%f58
	ldd	[%sp+LOCALS+8*2],%f60
	ldd	[%sp+LOCALS+8*3],%f62

	fsubd	%f0,%f16, %f0		! de-bias hash value
	fsubd	%f4,%f18,%f4
	ldxa	[%i1+%g0]0x88,%o0	! modulo-scheduled input load
	fsubd	%f8,%f20,%f8
	fsubd	%f12,%f22,%f12
	ldxa	[%i1+%l0]0x88,%o2

	fsubd	%f56,%f16, %f56		! de-bias input
	fsubd	%f58,%f18,%f58
	fsubd	%f60,%f20,%f60
	fsubd	%f62,%f22,%f62

	brz	%l2,.Linp_aligned_fma2
	add	%l1,%i1,%i1		! conditional advance

	sllx	%o0,%l3,%o1		! align data
	srlx	%o0,%l2,%o3
	or	%o1,%o4,%o0
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o4		! pre-shift
	or	%o3,%o1,%o2
.Linp_aligned_fma2:
	srlx	%o0,32,%o1
	srlx	%o2,32,%o3

	faddd	%f0,%f56,%f56		!
	! accumulate input (continues the faddd above)
	stw	%o0,[%sp+LOCALS+8*0+4]
	faddd	%f4,%f58,%f58
	stw	%o1,[%sp+LOCALS+8*1+4]
	faddd	%f8,%f60,%f60
	stw	%o2,[%sp+LOCALS+8*2+4]
	faddd	%f12,%f62,%f62
	stw	%o3,[%sp+LOCALS+8*3+4]

	b	.Lentry_fma
	nop

.align	16
.Loop_fma:
	ldxa	[%i1+%g0]0x88,%o0	! modulo-scheduled input load
	ldxa	[%i1+%l0]0x88,%o2
	movrz	%i2,0,%l1		! don't advance past the last block

	faddd	%f52,%f0,%f0		! accumulate input
	faddd	%f54,%f2,%f2
	faddd	%f62,%f8,%f8
	faddd	%f60,%f10,%f10

	brz,pn	%l2,.Linp_aligned_fma3
	add	%l1,%i1,%i1		! conditional advance

	sllx	%o0,%l3,%o1		! align data
	srlx	%o0,%l2,%o3
	or	%o1,%o4,%o0
	sllx	%o2,%l3,%o1
	srlx	%o2,%l2,%o4		! pre-shift
	or	%o3,%o1,%o2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	%f20,%f4,%f52
	srlx	%o0,32,%o1
	faddd	%f20,%f6,%f54
	srlx	%o2,32,%o3
	faddd	%f24,%f12,%f60
	st	%o0,[%sp+LOCALS+8*0+4]	! fill "template"
	faddd	%f24,%f14,%f62
	st	%o1,[%sp+LOCALS+8*1+4]
	faddd	%f18,%f0,%f48
	st	%o2,[%sp+LOCALS+8*2+4]
	faddd	%f18,%f2,%f50
	st	%o3,[%sp+LOCALS+8*3+4]
	faddd	%f22,%f8,%f56
	faddd	%f22,%f10,%f58

	! add/subtract the next-limb bias to round out each high part...
	fsubd	%f52,%f20,%f52
	fsubd	%f54,%f20,%f54
	fsubd	%f60,%f24,%f60
	fsubd	%f62,%f24,%f62
	fsubd	%f48,%f18,%f48
	fsubd	%f50,%f18,%f50
	fsubd	%f56,%f22,%f56
	fsubd	%f58,%f22,%f58

	! ...subtract it out of the limb and carry it into the next one
	fsubd	%f4,%f52,%f4
	fsubd	%f6,%f54,%f6
	fsubd	%f12,%f60,%f12
	fsubd	%f14,%f62,%f14
	fsubd	%f8,%f56,%f8
	fsubd	%f10,%f58,%f10
	fsubd	%f0,%f48,%f0
	fsubd	%f2,%f50,%f2

	faddd	%f4,%f48,%f4
	faddd	%f6,%f50,%f6
	faddd	%f12,%f56,%f12
	faddd	%f14,%f58,%f14
	faddd	%f8,%f52,%f8
	faddd	%f10,%f54,%f10
	.word	0x81be805d	!fmaddd %f26,%f60,%f0,%f0	! top carry folded back via 5/2^130
	.word	0x85be845f	!fmaddd %f26,%f62,%f2,%f2

	faddd	%f4,%f6,%f58
	ldd	[%i0+8*12],%f52		!
	! reload constants (continues the ldd above): %f48-%f54 held key
	! doubles that were clobbered as temporaries, re-fetch from ctx
	faddd	%f12,%f14,%f62
	ldd	[%i0+8*13],%f54
	faddd	%f8,%f10,%f60
	ldd	[%i0+8*10],%f48
	faddd	%f0,%f2,%f56
	ldd	[%i0+8*11],%f50

.Lentry_fma:
	! h*r mod 2^130-5: every term is an fmaddd chain seeded by these
	! products of h1 with the appropriate r/s halves
	fmuld	%f58,%f44,%f0
	fmuld	%f58,%f46,%f2
	fmuld	%f58,%f32,%f8
	fmuld	%f58,%f34,%f10
	fmuld	%f58,%f28,%f4
	fmuld	%f58,%f30,%f6
	fmuld	%f58,%f36,%f12
	fmuld	%f58,%f38,%f14

	.word	0x81bfc055	!fmaddd %f62,%f52,%f0,%f0
	.word	0x85bfc457	!fmaddd %f62,%f54,%f2,%f2
	.word	0x91bfd04d	!fmaddd %f62,%f44,%f8,%f8
	.word	0x95bfd44f	!fmaddd %f62,%f46,%f10,%f10
	.word	0x89bfc849	!fmaddd %f62,%f40,%f4,%f4
	.word	0x8dbfcc4b	!fmaddd %f62,%f42,%f6,%f6
	.word	0x99bfd85c	!fmaddd %f62,%f28,%f12,%f12
	.word	0x9dbfdc5e	!fmaddd %f62,%f30,%f14,%f14

	.word	0x81bf4049	!fmaddd %f60,%f40,%f0,%f0
	.word	0x85bf444b	!fmaddd %f60,%f42,%f2,%f2
	.word	0x91bf505c	!fmaddd %f60,%f28,%f8,%f8
	.word	0x95bf545e	!fmaddd %f60,%f30,%f10,%f10
	.word	0x89bf484d	!fmaddd %f60,%f44,%f4,%f4
	ldd	[%sp+LOCALS+8*0],%f52	! load [biased] input
	.word	0x8dbf4c4f	!fmaddd %f60,%f46,%f6,%f6
	ldd	[%sp+LOCALS+8*1],%f54
	.word	0x99bf5841	!fmaddd %f60,%f32,%f12,%f12
	ldd	[%sp+LOCALS+8*2],%f62
	.word	0x9dbf5c43	!fmaddd %f60,%f34,%f14,%f14
	ldd	[%sp+LOCALS+8*3],%f60

	.word	0x81be405c	!fmaddd %f56,%f28,%f0,%f0
	fsubd	%f52,%f16, %f52		! de-bias input
	.word	0x85be445e	!fmaddd %f56,%f30,%f2,%f2
	fsubd	%f54,%f18,%f54
	.word	0x91be5045	!fmaddd %f56,%f36,%f8,%f8
	fsubd	%f62,%f20,%f62
	.word	0x95be5447	!fmaddd %f56,%f38,%f10,%f10
	fsubd	%f60,%f22,%f60
	.word	0x89be4841	!fmaddd %f56,%f32,%f4,%f4
	.word	0x8dbe4c43	!fmaddd %f56,%f34,%f6,%f6
	.word	0x99be5851	!fmaddd %f56,%f48,%f12,%f12
	.word	0x9dbe5c53	!fmaddd %f56,%f50,%f14,%f14

	bcc	SIZE_T_CC,.Loop_fma	! loop until the subcc below borrows
	subcc	%i2,1,%i2

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	! base 2^48 -> base 2^32 (label for the bang row above): final
	! limb renormalization before storing the hash
	faddd	%f0,%f18,%f48
	faddd	%f2,%f18,%f50
	faddd	%f8,%f22,%f56
	faddd	%f10,%f22,%f58
	faddd	%f4,%f20,%f52
	faddd	%f6,%f20,%f54
	faddd	%f12,%f24,%f60
	faddd	%f14,%f24,%f62

	fsubd	%f48,%f18,%f48
	fsubd	%f50,%f18,%f50
	fsubd	%f56,%f22,%f56
	fsubd	%f58,%f22,%f58
	fsubd	%f52,%f20,%f52
	fsubd	%f54,%f20,%f54
	fsubd	%f60,%f24,%f60
	fsubd	%f62,%f24,%f62

	fsubd	%f4,%f52,%f4
	fsubd	%f6,%f54,%f6
	fsubd	%f12,%f60,%f12
	fsubd	%f14,%f62,%f14
	fsubd	%f8,%f56,%f8
	fsubd	%f10,%f58,%f10
	fsubd	%f0,%f48,%f0
	fsubd	%f2,%f50,%f2

	faddd	%f4,%f48,%f4
	faddd	%f6,%f50,%f6
	faddd	%f12,%f56,%f12
	faddd	%f14,%f58,%f14
	faddd	%f8,%f52,%f8
	faddd	%f10,%f54,%f10
	.word	0x81be805d	!fmaddd %f26,%f60,%f0,%f0	! top carry folded back via 5/2^130
	.word	0x85be845f	!fmaddd %f26,%f62,%f2,%f2

	faddd	%f4,%f6,%f58
	faddd	%f12,%f14,%f62
	faddd	%f8,%f10,%f60
	faddd	%f0,%f2,%f56

	faddd	%f58,%f18,%f58		! bias
	faddd	%f62,%f22,%f62
	faddd	%f60,%f20,%f60
	faddd	%f56,%f16, %f56

	ldx	[%sp+LOCALS+8*4],%fsr	! restore saved %fsr

	std	%f58,[%i0+8*1]		! store [biased] hash value
	std	%f62,[%i0+8*3]
	std	%f60,[%i0+8*2]
	std	%f56,[%i0+8*0]

.Labort:
	ret
	restore
.type	poly1305_blocks_fma,#function
.size	poly1305_blocks_fma,.-poly1305_blocks_fma

! void poly1305_emit_fma(void *ctx=%i0, u8 mac[16]=%i1, const u32 nonce[4]=%i2)
!
! Unboxes the biased-double hash limbs by masking the exponent bits off
! each double's high word, completes carry propagation and the modular
! reduction, adds the nonce and stores the 16-byte tag little-endian.
.align	32
poly1305_emit_fma:
	save	%sp,-STACK_FRAME,%sp

	ld	[%i0+8*0+0],%l5		! load hash
	ld	[%i0+8*0+4],%l0
	ld	[%i0+8*1+0],%o0
	ld	[%i0+8*1+4],%l1
	ld	[%i0+8*2+0],%o1
	ld	[%i0+8*2+4],%l2
	ld	[%i0+8*3+0],%o2
	ld	[%i0+8*3+4],%l3

	sethi	%hi(0xfff00000),%o3
	andn	%l5,%o3,%l5		! mask exponent
	andn	%o0,%o3,%o0
	andn	%o1,%o3,%o1
	andn	%o2,%o3,%o2		! can be partially reduced...
	mov	3,%o3

	srl	%o2,2,%i3		!
	! ... so reduce (continues the srl above): fold the top limb's
	! bits >= 2 back in as (h>>2) + (h & ~3), keeping h mod 4 in %l4
	and	%o2,%o3,%l4
	andn	%o2,%o3,%o2
	add	%i3,%o2,%o2

	addcc	%o2,%l0,%l0		! carry-propagate across the 32-bit limbs
	addccc	%l5,%l1,%l1
	addccc	%o0,%l2,%l2
	addccc	%o1,%l3,%l3
	addc	%g0,%l4,%l4

	addcc	%l0,5,%l5		! compare to modulus
	addccc	%l1,0,%o0
	addccc	%l2,0,%o1
	addccc	%l3,0,%o2
	addc	%l4,0,%o3

	srl	%o3,2,%o3		! did it carry/borrow?
	neg	%o3,%o3
	sra	%o3,31,%o3		! mask

	! branchless select: h+5-2^130 when the mask is set, h otherwise
	andn	%l0,%o3,%l0
	and	%l5,%o3,%l5
	andn	%l1,%o3,%l1
	and	%o0,%o3,%o0
	or	%l5,%l0,%l0
	ld	[%i2+0],%l5		! load nonce
	andn	%l2,%o3,%l2
	and	%o1,%o3,%o1
	or	%o0,%l1,%l1
	ld	[%i2+4],%o0
	andn	%l3,%o3,%l3
	and	%o2,%o3,%o2
	or	%o1,%l2,%l2
	ld	[%i2+8],%o1
	or	%o2,%l3,%l3
	ld	[%i2+12],%o2

	addcc	%l5,%l0,%l0		! accumulate nonce
	addccc	%o0,%l1,%l1
	addccc	%o1,%l2,%l2
	addc	%o2,%l3,%l3

	stb	%l0,[%i1+0]		! write little-endian result
	srl	%l0,8,%l0
	stb	%l1,[%i1+4]
	srl	%l1,8,%l1
	stb	%l2,[%i1+8]
	srl	%l2,8,%l2
	stb	%l3,[%i1+12]
	srl	%l3,8,%l3

	stb	%l0,[%i1+1]
	srl	%l0,8,%l0
	stb	%l1,[%i1+5]
	srl	%l1,8,%l1
	stb	%l2,[%i1+9]
	srl	%l2,8,%l2
	stb	%l3,[%i1+13]
	srl	%l3,8,%l3

	stb	%l0,[%i1+2]
	srl	%l0,8,%l0
	stb	%l1,[%i1+6]
	srl	%l1,8,%l1
	stb	%l2,[%i1+10]
	srl	%l2,8,%l2
	stb	%l3,[%i1+14]
	srl	%l3,8,%l3

	stb	%l0,[%i1+3]
	stb	%l1,[%i1+7]
	stb	%l2,[%i1+11]
	stb	%l3,[%i1+15]

	ret
	restore
.type	poly1305_emit_fma,#function
.size	poly1305_emit_fma,.-poly1305_emit_fma

! Constant pool for the FMA code path: 2^(52+n) biases used to box
! integers in doubles, 5/2^130 for the modular fold, and a replacement
! %fsr image.
.align	64
.Lconsts_fma:
.word	0x43300000,0x00000000		! 2^(52+0)
.word	0x45300000,0x00000000		! 2^(52+32)
.word	0x47300000,0x00000000		! 2^(52+64)
.word	0x49300000,0x00000000		! 2^(52+96)
.word	0x4b500000,0x00000000		! 2^(52+130)

.word	0x37f40000,0x00000000		! 5/2^130
.word	0,1<<30				! fsr: truncate, no exceptions

.word	0x44300000,0x00000000		! 2^(52+16+0)
.word	0x46300000,0x00000000		!
	! 2^(52+16+32) (comment for the .word pair just above)
.word	0x48300000,0x00000000		! 2^(52+16+64)
.word	0x4a300000,0x00000000		! 2^(52+16+96)
.word	0x3e300000,0x00000000		! 2^(52+16+0-96)
.word	0x40300000,0x00000000		! 2^(52+16+32-96)
.word	0x42300000,0x00000000		! 2^(52+16+64-96)
.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro@openssl.org>"
.align	4