// Keccak-1600 absorb and squeeze for ARMv8 (AArch64), GNU assembler syntax.
// The file has to be run through the C preprocessor: byte-order handling
// is selected with #ifdef __AARCH64EB__.
//
// Two implementations live here:
//   scalar           KeccakF1600_int, KeccakF1600, SHA3_absorb, SHA3_squeeze
//   SHA3 extension   KeccakF1600_ce, KeccakF1600_cext, SHA3_absorb_cext,
//                    SHA3_squeeze_cext (EOR3/RAX1/XAR/BCAX emitted as raw
//                    .inst words, mnemonic kept in the trailing comment)
//
// Scalar register map for the 25 state lanes, in memory order:
//   lanes 0..17 in x0..x17, lane 18 in x25, lanes 19..24 in x19..x24
//   (x18 is skipped; presumably because it is the platform register).
// SHA3-extension register map: lane i lives in the low 64 bits of v<i>.

.text

.align	8	// Strategic alignment and padding: 256-byte alignment plus
		// 64 bytes of zeros puts the END of the 192-byte iotas table
		// on a 256-byte boundary, so the low 8 bits of the running
		// table pointer serve as the loop termination condition.
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:						// 24 Keccak round constants
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas

//----------------------------------------------------------------------
// KeccakF1600_int -- internal scalar permutation, private convention.
// In/Out: state lanes in x0-x17,x25,x19-x24 (see register map above).
// Stack:  uses the 32 bytes at [sp,#0..#31] of the CALLER's frame:
//         [sp,#0]/[sp,#8] spill x4/x9, [sp,#16] running iotas pointer,
//         [sp,#24] saved link register.
// Clobb:  x26, x27, x28, x30 (reloaded before return), flags.
// One pass of .Loop is one round; the loop ends when the post-incremented
// iotas pointer has its low 8 bits clear (end of table).
// Instruction order is hand-scheduled; do not reorder.
//----------------------------------------------------------------------
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	x28,iotas
.inst	0xd503233f			// paciasp
	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	x26,x0,x5
	stp	x4,x9,[sp,#0]			// offload pair...
	eor	x27,x1,x6
	eor	x28,x2,x7
	eor	x30,x3,x8
	eor	x4,x4,x9
	eor	x26,x26,x10
	eor	x27,x27,x11
	eor	x28,x28,x12
	eor	x30,x30,x13
	eor	x4,x4,x14
	eor	x26,x26,x15
	eor	x27,x27,x16
	eor	x28,x28,x17
	eor	x30,x30,x25
	eor	x4,x4,x19
	eor	x26,x26,x20
	eor	x28,x28,x22
	eor	x27,x27,x21
	eor	x30,x30,x23
	eor	x4,x4,x24

	eor	x9,x26,x28,ror#63

	eor	x1,x1,x9
	eor	x6,x6,x9
	eor	x11,x11,x9
	eor	x16,x16,x9
	eor	x21,x21,x9

	eor	x9,x27,x30,ror#63
	eor	x28,x28,x4,ror#63
	eor	x30,x30,x26,ror#63
	eor	x4,x4,x27,ror#63

	eor	x27,   x2,x9			// mov	x27,x2
	eor	x7,x7,x9
	eor	x12,x12,x9
	eor	x17,x17,x9
	eor	x22,x22,x9

	eor	x0,x0,x4
	eor	x5,x5,x4
	eor	x10,x10,x4
	eor	x15,x15,x4
	eor	x20,x20,x4
	ldp	x4,x9,[sp,#0]			// re-load offloaded data
	eor	x26,   x3,x28			// mov	x26,x3
	eor	x8,x8,x28
	eor	x13,x13,x28
	eor	x25,x25,x28
	eor	x23,x23,x28

	eor	x28,   x4,x30			// mov	x28,x4
	eor	x9,x9,x30
	eor	x14,x14,x30
	eor	x19,x19,x30
	eor	x24,x24,x30

	////////////////////////////////////////// Rho+Pi
	mov	x30,x1
	ror	x1,x6,#64-44
	//mov	x27,x2
	ror	x2,x12,#64-43
	//mov	x26,x3
	ror	x3,x25,#64-21
	//mov	x28,x4
	ror	x4,x24,#64-14

	ror	x6,x9,#64-20
	ror	x12,x13,#64-25
	ror	x25,x17,#64-15
	ror	x24,x21,#64-2

	ror	x9,x22,#64-61
	ror	x13,x19,#64-8
	ror	x17,x11,#64-10
	ror	x21,x8,#64-55

	ror	x22,x14,#64-39
	ror	x19,x23,#64-56
	ror	x11,x7,#64-6
	ror	x8,x16,#64-45

	ror	x14,x20,#64-18
	ror	x23,x15,#64-41
	ror	x7,x10,#64-3
	ror	x16,x5,#64-36

	ror	x5,x26,#64-28
	ror	x10,x30,#64-1
	ror	x15,x28,#64-27
	ror	x20,x27,#64-62

	////////////////////////////////////////// Chi+Iota
	bic	x26,x2,x1
	bic	x27,x3,x2
	bic	x28,x0,x4
	bic	x30,x1,x0
	eor	x0,x0,x26
	bic	x26,x4,x3
	eor	x1,x1,x27
	ldr	x27,[sp,#16]
	eor	x3,x3,x28
	eor	x4,x4,x30
	eor	x2,x2,x26
	ldr	x30,[x27],#8			// Iota[i++]

	bic	x26,x7,x6
	tst	x27,#255			// are we done?
	str	x27,[sp,#16]
	bic	x27,x8,x7
	bic	x28,x5,x9
	eor	x0,x0,x30			// A[0][0] ^= Iota
	bic	x30,x6,x5
	eor	x5,x5,x26
	bic	x26,x9,x8
	eor	x6,x6,x27
	eor	x8,x8,x28
	eor	x9,x9,x30
	eor	x7,x7,x26

	bic	x26,x12,x11
	bic	x27,x13,x12
	bic	x28,x10,x14
	bic	x30,x11,x10
	eor	x10,x10,x26
	bic	x26,x14,x13
	eor	x11,x11,x27
	eor	x13,x13,x28
	eor	x14,x14,x30
	eor	x12,x12,x26

	bic	x26,x17,x16
	bic	x27,x25,x17
	bic	x28,x15,x19
	bic	x30,x16,x15
	eor	x15,x15,x26
	bic	x26,x19,x25
	eor	x16,x16,x27
	eor	x25,x25,x28
	eor	x19,x19,x30
	eor	x17,x17,x26

	bic	x26,x22,x21
	bic	x27,x23,x22
	bic	x28,x20,x24
	bic	x30,x21,x20
	eor	x20,x20,x26
	bic	x26,x24,x23
	eor	x21,x21,x27
	eor	x23,x23,x28
	eor	x24,x24,x30
	eor	x22,x22,x26

	bne	.Loop				// flags still from "tst x27,#255"

	ldr	x30,[sp,#24]
.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

//----------------------------------------------------------------------
// KeccakF1600 -- apply the scalar permutation to a state held in memory.
// In:    x0 = pointer to 25 64-bit lanes (200 bytes), updated in place.
// Clobb: x0-x17, flags. x19-x28 are saved and restored here.
// Stack: 128-byte register save area plus 48 bytes; the low 32 bytes are
//        scratch for KeccakF1600_int, [sp,#32] keeps the state pointer.
//----------------------------------------------------------------------
.type	KeccakF1600,%function
.align	5
KeccakF1600:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	x26,x0
	ldp	x0,x1,[x0,#16*0]
	ldp	x2,x3,[x26,#16*1]
	ldp	x4,x5,[x26,#16*2]
	ldp	x6,x7,[x26,#16*3]
	ldp	x8,x9,[x26,#16*4]
	ldp	x10,x11,[x26,#16*5]
	ldp	x12,x13,[x26,#16*6]
	ldp	x14,x15,[x26,#16*7]
	ldp	x16,x17,[x26,#16*8]
	ldp	x25,x19,[x26,#16*9]
	ldp	x20,x21,[x26,#16*10]
	ldp	x22,x23,[x26,#16*11]
	ldr	x24,[x26,#16*12]

	bl	KeccakF1600_int

	ldr	x26,[sp,#32]
	stp	x0,x1,[x26,#16*0]
	stp	x2,x3,[x26,#16*1]
	stp	x4,x5,[x26,#16*2]
	stp	x6,x7,[x26,#16*3]
	stp	x8,x9,[x26,#16*4]
	stp	x10,x11,[x26,#16*5]
	stp	x12,x13,[x26,#16*6]
	stp	x14,x15,[x26,#16*7]
	stp	x16,x17,[x26,#16*8]
	stp	x25,x19,[x26,#16*9]
	stp	x20,x21,[x26,#16*10]
	stp	x22,x23,[x26,#16*11]
	str	x24,[x26,#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600

//----------------------------------------------------------------------
// SHA3_absorb -- XOR whole input blocks into the state, permuting after
// each one (scalar variant).
// In:    x0 = uint64_t A[5][5], x1 = const void *inp,
//        x2 = size_t len,       x3 = size_t bsz (block size in bytes)
// Out:   x0 = number of input bytes left unprocessed (less than bsz).
// Input is consumed 8 bytes at a time and byte-swapped on big-endian
// builds; the compare chain stops after bsz/8 lanes.
// NOTE(review): bsz is assumed to be a multiple of 8 and at most 200;
// nothing here checks it.
// Stack: [sp,#0..#31] scratch for KeccakF1600_int, [sp,#32] A,
//        [sp,#40] inp, [sp,#48] len, [sp,#56] bsz.
//----------------------------------------------------------------------
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	x26,x0				// uint64_t A[5][5]
	mov	x27,x1				// const void *inp
	mov	x28,x2				// size_t len
	mov	x30,x3				// size_t bsz
	ldp	x0,x1,[x26,#16*0]
	ldp	x2,x3,[x26,#16*1]
	ldp	x4,x5,[x26,#16*2]
	ldp	x6,x7,[x26,#16*3]
	ldp	x8,x9,[x26,#16*4]
	ldp	x10,x11,[x26,#16*5]
	ldp	x12,x13,[x26,#16*6]
	ldp	x14,x15,[x26,#16*7]
	ldp	x16,x17,[x26,#16*8]
	ldp	x25,x19,[x26,#16*9]
	ldp	x20,x21,[x26,#16*10]
	ldp	x22,x23,[x26,#16*11]
	ldr	x24,[x26,#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	x26,x28,x30			// len - bsz
	blo	.Labsorbed

	str	x26,[sp,#48]			// save len - bsz
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x0,x0,x26
	cmp	x30,#8*(0+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x1,x1,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x2,x2,x26
	cmp	x30,#8*(2+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x3,x3,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x4,x4,x26
	cmp	x30,#8*(4+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x5,x5,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x6,x6,x26
	cmp	x30,#8*(6+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x7,x7,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x8,x8,x26
	cmp	x30,#8*(8+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x9,x9,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x10,x10,x26
	cmp	x30,#8*(10+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x11,x11,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x12,x12,x26
	cmp	x30,#8*(12+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x13,x13,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x14,x14,x26
	cmp	x30,#8*(14+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x15,x15,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x16,x16,x26
	cmp	x30,#8*(16+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x17,x17,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x25,x25,x26
	cmp	x30,#8*(18+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x19,x19,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x20,x20,x26
	cmp	x30,#8*(20+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x21,x21,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x22,x22,x26
	cmp	x30,#8*(22+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x23,x23,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8			// *inp++
#ifdef __AARCH64EB__
	rev	x26,x26
#endif
	eor	x24,x24,x26

.Lprocess_block:
	str	x27,[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	x27,[sp,#40]			// restore arguments
	ldp	x28,x30,[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	x27,[sp,#32]
	stp	x0,x1,[x27,#16*0]
	stp	x2,x3,[x27,#16*1]
	stp	x4,x5,[x27,#16*2]
	stp	x6,x7,[x27,#16*3]
	stp	x8,x9,[x27,#16*4]
	stp	x10,x11,[x27,#16*5]
	stp	x12,x13,[x27,#16*6]
	stp	x14,x15,[x27,#16*7]
	stp	x16,x17,[x27,#16*8]
	stp	x25,x19,[x27,#16*9]
	stp	x20,x21,[x27,#16*10]
	stp	x22,x23,[x27,#16*11]
	str	x24,[x27,#16*12]

	mov	x0,x28				// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb

//----------------------------------------------------------------------
// SHA3_squeeze -- copy len bytes of output from the state, calling
// KeccakF1600 each time bsz bytes have been produced (scalar variant).
// In:    x0 = state (25 lanes), x1 = output pointer, x2 = len in bytes,
//        x3 = bsz (block size in bytes)
//        (argument roles read off the register use below; presumably
//        the C prototype is (uint64_t A[5][5], unsigned char *out,
//        size_t len, size_t bsz) -- confirm against the caller.)
// Out:   nothing meaningful in x0.
// Lanes are emitted least-significant byte first on either byte order.
// len == 0 returns immediately without touching the output buffer.
//----------------------------------------------------------------------
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	x19,x0				// put aside arguments
	mov	x20,x1
	mov	x21,x2
	mov	x22,x3

	// Without this guard len == 0 falls into .Lsqueeze_tail, whose
	// first store precedes its first length check: the count wraps and
	// 7 bytes are written past the (empty) output buffer.
	cbz	x21,.Lsqueeze_done

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	x21,#8
	blo	.Lsqueeze_tail
#ifdef __AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,x19
	bl	KeccakF1600
	mov	x0,x19				// KeccakF1600 clobbers x0-x17
	mov	x3,x22
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:					// 1..7 bytes left, low byte first
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

//----------------------------------------------------------------------
// KeccakF1600_ce -- internal permutation using the SHA3 extension,
// private convention.
// In/Out: lane i in v<i> for i = 0..24.
// Clobb:  x9 (iteration counter), x10 (iotas pointer), x11 (round
//         constant), v25-v31, flags. Nothing is saved here; callers
//         must preserve d8-d15 themselves.
// The loop body holds two rounds (two iotas loads per pass) and runs
// 12 times. The second round reads its inputs from the registers the
// first round left them in, so the .inst stream is order- and
// register-critical; do not reorder or "tidy".
//----------------------------------------------------------------------
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	////////////////////////////////////////////////// Theta
.inst	0xce052819	//eor3 v25.16b,v0.16b,v5.16b,v10.16b
.inst	0xce062c3a	//eor3 v26.16b,v1.16b,v6.16b,v11.16b
.inst	0xce07305b	//eor3 v27.16b,v2.16b,v7.16b,v12.16b
.inst	0xce08347c	//eor3 v28.16b,v3.16b,v8.16b,v13.16b
.inst	0xce09389d	//eor3 v29.16b,v4.16b,v9.16b,v14.16b
.inst	0xce0f5339	//eor3 v25.16b,v25.16b, v15.16b,v20.16b
.inst	0xce10575a	//eor3 v26.16b,v26.16b, v16.16b,v21.16b
.inst	0xce115b7b	//eor3 v27.16b,v27.16b, v17.16b,v22.16b
.inst	0xce125f9c	//eor3 v28.16b,v28.16b, v18.16b,v23.16b
.inst	0xce1363bd	//eor3 v29.16b,v29.16b, v19.16b,v24.16b

.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b // D[1]
.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b // D[2]
.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b // D[3]
.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b // D[4]
.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b // D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
.inst	0xce9e50d9	//xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18

.inst	0xce9f0854	//xar v20.16b,v2.16b,v31.16b,#64-62

.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41

.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27

	eor	v0.16b,v0.16b,v29.16b
	ldr	x11,[x10],#8

.inst	0xce9bae5a	//xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
.inst	0xce9fc632	//xar v18.16b,v17.16b,v31.16b,#64-15
.inst	0xce9ed971	//xar v17.16b,v11.16b,v30.16b,#64-10
.inst	0xce9fe8eb	//xar v11.16b,v7.16b,v31.16b,#64-6
.inst	0xce9df547	//xar v7.16b,v10.16b,v29.16b,#64-3

.inst	0xce9efc2a	//xar v10.16b,v1.16b,v30.16b,#64-1 // *

.inst	0xce9ccb04	//xar v4.16b,v24.16b,v28.16b,#64-14
.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
.inst	0xce9b2515	//xar v21.16b,v8.16b,v27.16b,#64-55
.inst	0xce9e4e08	//xar v8.16b,v16.16b,v30.16b,#64-45
.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36

.inst	0xce9b907b	//xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	v31.2d,x11			// borrow C[6]
.inst	0xce22641c	//bcax v28.16b, v0.16b,v2.16b,v25.16b // *
.inst	0xce3a0b21	//bcax v1.16b,v25.16b, v26.16b, v2.16b // *
.inst	0xce246842	//bcax v2.16b,v2.16b,v4.16b,v26.16b
.inst	0xce201343	//bcax v3.16b,v26.16b, v0.16b,v4.16b
.inst	0xce390084	//bcax v4.16b,v4.16b,v25.16b, v0.16b

.inst	0xce271b65	//bcax v5.16b,v27.16b, v7.16b,v6.16b // *
.inst	0xce281cd9	//bcax v25.16b, v6.16b,v8.16b,v7.16b // *
.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
.inst	0xce3b2508	//bcax v8.16b,v8.16b,v27.16b, v9.16b
.inst	0xce266d29	//bcax v9.16b,v9.16b,v6.16b,v27.16b

	eor	v0.16b,v28.16b,v31.16b		// Iota

.inst	0xce2c2d5a	//bcax v26.16b, v10.16b,v12.16b,v11.16b // *
.inst	0xce2d317b	//bcax v27.16b, v11.16b,v13.16b,v12.16b // *
.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
.inst	0xce2a39ad	//bcax v13.16b,v13.16b,v10.16b,v14.16b
.inst	0xce2b29ce	//bcax v14.16b,v14.16b,v11.16b,v10.16b

.inst	0xce3141fc	//bcax v28.16b, v15.16b,v17.16b,v16.16b // *
.inst	0xce32461d	//bcax v29.16b, v16.16b,v18.16b,v17.16b // *
.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
.inst	0xce2f4e52	//bcax v18.16b,v18.16b,v15.16b,v19.16b
.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b

.inst	0xce36569e	//bcax v30.16b, v20.16b,v22.16b,v21.16b // *
.inst	0xce375abf	//bcax v31.16b, v21.16b,v23.16b,v22.16b // *
.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
.inst	0xce3462f7	//bcax v23.16b,v23.16b,v20.16b,v24.16b
.inst	0xce355318	//bcax v24.16b,v24.16b,v21.16b,v20.16b
	////////////////////////////////////////////////// Theta
.inst	0xce056806	//eor3 v6.16b,v0.16b,v5.16b,v26.16b
.inst	0xce196c2a	//eor3 v10.16b,v1.16b,v25.16b,v27.16b
.inst	0xce07304b	//eor3 v11.16b,v2.16b,v7.16b,v12.16b
.inst	0xce08346f	//eor3 v15.16b,v3.16b,v8.16b,v13.16b
.inst	0xce093890	//eor3 v16.16b,v4.16b,v9.16b,v14.16b
.inst	0xce1c78c6	//eor3 v6.16b,v6.16b, v28.16b,v30.16b
.inst	0xce1d7d4a	//eor3 v10.16b,v10.16b, v29.16b,v31.16b
.inst	0xce11596b	//eor3 v11.16b,v11.16b, v17.16b,v22.16b
.inst	0xce125def	//eor3 v15.16b,v15.16b, v18.16b,v23.16b
.inst	0xce136210	//eor3 v16.16b,v16.16b, v19.16b,v24.16b

.inst	0xce6b8cd4	//rax1 v20.16b,v6.16b,v11.16b // D[1]
.inst	0xce6f8d55	//rax1 v21.16b,v10.16b,v15.16b // D[2]
.inst	0xce708d6b	//rax1 v11.16b,v11.16b,v16.16b // D[3]
.inst	0xce668def	//rax1 v15.16b,v15.16b,v6.16b // D[4]
.inst	0xce6a8e10	//rax1 v16.16b,v16.16b,v10.16b // D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
.inst	0xce945326	//xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
.inst	0xce8fb139	//xar v25.16b,v9.16b,v15.16b,#64-20
.inst	0xce950ec9	//xar v9.16b,v22.16b,v21.16b,#64-61
.inst	0xce8f65d6	//xar v22.16b,v14.16b,v15.16b,#64-39
.inst	0xce90bbce	//xar v14.16b,v30.16b,v16.16b,#64-18

.inst	0xce95085e	//xar v30.16b,v2.16b,v21.16b,#64-62

.inst	0xce955582	//xar v2.16b,v12.16b,v21.16b,#64-43
.inst	0xce8b9dac	//xar v12.16b,v13.16b,v11.16b,#64-25
.inst	0xce8fe26d	//xar v13.16b,v19.16b,v15.16b,#64-8
.inst	0xce8b22f3	//xar v19.16b,v23.16b,v11.16b,#64-56
.inst	0xce905f97	//xar v23.16b,v28.16b,v16.16b,#64-41

.inst	0xce8f949c	//xar v28.16b,v4.16b,v15.16b,#64-27

	eor	v0.16b,v0.16b,v16.16b
	ldr	x11,[x10],#8

.inst	0xce8bae4a	//xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
.inst	0xce95c632	//xar v18.16b,v17.16b,v21.16b,#64-15
.inst	0xce94db71	//xar v17.16b,v27.16b,v20.16b,#64-10
.inst	0xce95e8fb	//xar v27.16b,v7.16b,v21.16b,#64-6
.inst	0xce90f747	//xar v7.16b,v26.16b,v16.16b,#64-3

.inst	0xce94fc3a	//xar v26.16b,v1.16b,v20.16b,#64-1 // *

.inst	0xce8fcb04	//xar v4.16b,v24.16b,v15.16b,#64-14
.inst	0xce94fbf8	//xar v24.16b,v31.16b,v20.16b,#64-2
.inst	0xce8b251f	//xar v31.16b,v8.16b,v11.16b,#64-55
.inst	0xce944fa8	//xar v8.16b,v29.16b,v20.16b,#64-45
.inst	0xce9070bd	//xar v29.16b,v5.16b,v16.16b,#64-36

.inst	0xce8b906b	//xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	v21.2d,x11			// borrow C[6]
.inst	0xce22180f	//bcax v15.16b, v0.16b,v2.16b,v6.16b // *
.inst	0xce2a08c1	//bcax v1.16b,v6.16b, v10.16b, v2.16b // *
.inst	0xce242842	//bcax v2.16b,v2.16b,v4.16b,v10.16b
.inst	0xce201143	//bcax v3.16b,v10.16b, v0.16b,v4.16b
.inst	0xce260084	//bcax v4.16b,v4.16b,v6.16b, v0.16b

.inst	0xce276565	//bcax v5.16b,v11.16b, v7.16b,v25.16b // *
.inst	0xce281f26	//bcax v6.16b, v25.16b,v8.16b,v7.16b // *
.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
.inst	0xce2b2508	//bcax v8.16b,v8.16b,v11.16b, v9.16b
.inst	0xce392d29	//bcax v9.16b,v9.16b,v25.16b,v11.16b

	eor	v0.16b,v15.16b,v21.16b		// Iota

.inst	0xce2c6f4a	//bcax v10.16b, v26.16b,v12.16b,v27.16b // *
.inst	0xce2d336b	//bcax v11.16b, v27.16b,v13.16b,v12.16b // *
.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
.inst	0xce3a39ad	//bcax v13.16b,v13.16b,v26.16b,v14.16b
.inst	0xce3b69ce	//bcax v14.16b,v14.16b,v27.16b,v26.16b

.inst	0xce31778f	//bcax v15.16b, v28.16b,v17.16b,v29.16b // *
.inst	0xce3247b0	//bcax v16.16b, v29.16b,v18.16b,v17.16b // *
.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
.inst	0xce3c4e52	//bcax v18.16b,v18.16b,v28.16b,v19.16b
.inst	0xce3d7273	//bcax v19.16b,v19.16b,v29.16b,v28.16b

.inst	0xce367fd4	//bcax v20.16b, v30.16b,v22.16b,v31.16b // *
.inst	0xce375bf5	//bcax v21.16b, v31.16b,v23.16b,v22.16b // *
.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
.inst	0xce3e62f7	//bcax v23.16b,v23.16b,v30.16b,v24.16b
.inst	0xce3f7b18	//bcax v24.16b,v24.16b,v31.16b,v30.16b
	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

//----------------------------------------------------------------------
// KeccakF1600_cext -- apply the SHA3-extension permutation to a state
// held in memory.
// In:    x0 = pointer to 25 64-bit lanes, updated in place.
// Out:   x0 unchanged (SHA3_squeeze_cext relies on that).
// Clobb: x9-x11, v0-v7, v16-v31, flags; d8-d15 are saved and restored.
// x30 is reloaded right after the inner call, which is why the final
// pop restores x29 only.
//----------------------------------------------------------------------
.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]			// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext

//----------------------------------------------------------------------
// SHA3_absorb_cext -- SHA3-extension counterpart of SHA3_absorb.
// In:    x0 = state (25 lanes), x1 = inp, x2 = len, x3 = bsz
// Out:   x0 = number of input bytes left unprocessed.
// x2 is decremented by bsz before each block; when that borrows the loop
// exits and "add x0,x2,x3" undoes the last subtraction.
// Arguments stay in x0-x3 across the inner call because KeccakF1600_ce
// only touches x9-x11 (and x30 via bl, restored in the epilogue).
// NOTE(review): bsz is assumed to be a multiple of 8 and at most 200;
// nothing here checks it.
//----------------------------------------------------------------------
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]			// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	x2,x2,x3			// len - bsz
	blo	.Labsorbed_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v0.16b,v0.16b,v31.16b
	cmp	x3,#8*(0+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v1.16b,v1.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v2.16b,v2.16b,v31.16b
	cmp	x3,#8*(2+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v3.16b,v3.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v4.16b,v4.16b,v31.16b
	cmp	x3,#8*(4+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v5.16b,v5.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v6.16b,v6.16b,v31.16b
	cmp	x3,#8*(6+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v7.16b,v7.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v8.16b,v8.16b,v31.16b
	cmp	x3,#8*(8+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v9.16b,v9.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v10.16b,v10.16b,v31.16b
	cmp	x3,#8*(10+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v11.16b,v11.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v12.16b,v12.16b,v31.16b
	cmp	x3,#8*(12+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v13.16b,v13.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v14.16b,v14.16b,v31.16b
	cmp	x3,#8*(14+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v15.16b,v15.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v16.16b,v16.16b,v31.16b
	cmp	x3,#8*(16+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v17.16b,v17.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v18.16b,v18.16b,v31.16b
	cmp	x3,#8*(18+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v19.16b,v19.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v20.16b,v20.16b,v31.16b
	cmp	x3,#8*(20+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v21.16b,v21.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v22.16b,v22.16b,v31.16b
	cmp	x3,#8*(22+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v23.16b,v23.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8			// *inp++
#ifdef __AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v24.16b,v24.16b,v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]
	add	x0,x2,x3			// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext

//----------------------------------------------------------------------
// SHA3_squeeze_cext -- SHA3-extension counterpart of SHA3_squeeze.
// In:    x0 = state (25 lanes), x1 = output pointer, x2 = len in bytes,
//        x3 = bsz (block size in bytes)
// x9 walks the state and x10 counts down the current block; both are
// reloaded after each KeccakF1600_cext call because the permutation
// clobbers x9-x11. x0-x3 survive that call.
// The "beq" after the 8-byte store consumes the flags of "cmp x2,#8";
// neither rev nor str changes them.
// len == 0 returns immediately without touching the output buffer.
//----------------------------------------------------------------------
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,x0
	mov	x10,x3

	// Without this guard len == 0 falls into .Lsqueeze_tail_ce, whose
	// first store precedes its first length check: the count wraps and
	// 7 bytes are written past the (empty) output buffer.
	cbz	x2,.Lsqueeze_done_ce

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce
#ifdef __AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce

	sub	x2,x2,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,x0
	mov	x10,x3
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:				// 1..7 bytes left, low byte first
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII ident string:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by" + author e-mail
.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2