// Keccak-f[1600] permutation plus SHA-3 absorb/squeeze for AArch64.
//
// Syntax: GNU as, AArch64, ELF. The file must go through the C
// preprocessor because of the __AARCH64EB__ conditionals.
//
// Exported entry points (arguments follow the register use in the code):
//   SHA3_absorb / SHA3_absorb_cext    x0 = state (25 x 64-bit lanes),
//                                     x1 = input, x2 = length in bytes,
//                                     x3 = block size in bytes;
//                                     returns the unabsorbed byte count.
//   SHA3_squeeze / SHA3_squeeze_cext  x0 = state, x1 = output,
//                                     x2 = length in bytes,
//                                     x3 = block size in bytes.
// The *_cext variants use EOR3/RAX1/XAR/BCAX, emitted through .inst.
//
// Comments deliberately contain no apostrophes or quote characters,
// which the preprocessor would try to pair up.

.text

.align 8                        // strategic alignment and padding that allows to use
                                // address value as loop termination condition...
        .quad   0,0,0,0,0,0,0,0
// 64 bytes of padding after a 256-byte boundary followed by 24 * 8 = 192
// bytes of round constants: the address just past the table has its low
// eight bits clear, which is what KeccakF1600_int tests for.
.type iotas,%object
iotas:
        .quad   0x0000000000000001
        .quad   0x0000000000008082
        .quad   0x800000000000808a
        .quad   0x8000000080008000
        .quad   0x000000000000808b
        .quad   0x0000000080000001
        .quad   0x8000000080008081
        .quad   0x8000000000008009
        .quad   0x000000000000008a
        .quad   0x0000000000000088
        .quad   0x0000000080008009
        .quad   0x000000008000000a
        .quad   0x000000008000808b
        .quad   0x800000000000008b
        .quad   0x8000000000008089
        .quad   0x8000000000008003
        .quad   0x8000000000008002
        .quad   0x8000000000000080
        .quad   0x000000000000800a
        .quad   0x800000008000000a
        .quad   0x8000000080008081
        .quad   0x8000000000008080
        .quad   0x0000000080000001
        .quad   0x8000000080008008
.size iotas,.-iotas

//-----------------------------------------------------------------------
// KeccakF1600_int - all rounds of the permutation on a state in registers.
// Private calling convention:
//   In/Out: lanes 0..24 in x0-x17, x25, x19-x24 (lane 18 lives in x25,
//           x18 is never touched).
//   Stack:  the caller must own 32 bytes at [sp]:
//             [sp,#0]  lanes 4 and 9 parked during Theta
//             [sp,#16] current round-constant pointer
//             [sp,#24] return address
//   Clobbers: x26, x27, x28, flags. x30 is used as scratch inside the
//           loop and reloaded from [sp,#24] before returning.
//-----------------------------------------------------------------------
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
        adr     x28,iotas
.inst   0xd503233f              // paciasp
        stp     x28,x30,[sp,#16]        // 32 bytes on top are mine
        b       .Loop
.align 4
.Loop:
        ////////////////////////////////////////// Theta
        eor     x26,x0,x5
        stp     x4,x9,[sp,#0]   // offload pair...
        eor     x27,x1,x6
        eor     x28,x2,x7
        eor     x30,x3,x8
        eor     x4,x4,x9
        eor     x26,x26,x10
        eor     x27,x27,x11
        eor     x28,x28,x12
        eor     x30,x30,x13
        eor     x4,x4,x14
        eor     x26,x26,x15
        eor     x27,x27,x16
        eor     x28,x28,x17
        eor     x30,x30,x25
        eor     x4,x4,x19
        eor     x26,x26,x20
        eor     x28,x28,x22
        eor     x27,x27,x21
        eor     x30,x30,x23
        eor     x4,x4,x24

        eor     x9,x26,x28,ror#63

        eor     x1,x1,x9
        eor     x6,x6,x9
        eor     x11,x11,x9
        eor     x16,x16,x9
        eor     x21,x21,x9

        eor     x9,x27,x30,ror#63
        eor     x28,x28,x4,ror#63
        eor     x30,x30,x26,ror#63
        eor     x4,x4,x27,ror#63

        eor     x27,   x2,x9    // mov x27,x2
        eor     x7,x7,x9
        eor     x12,x12,x9
        eor     x17,x17,x9
        eor     x22,x22,x9

        eor     x0,x0,x4
        eor     x5,x5,x4
        eor     x10,x10,x4
        eor     x15,x15,x4
        eor     x20,x20,x4
        ldp     x4,x9,[sp,#0]   // re-load offloaded data
        eor     x26,   x3,x28   // mov x26,x3
        eor     x8,x8,x28
        eor     x13,x13,x28
        eor     x25,x25,x28
        eor     x23,x23,x28

        eor     x28,   x4,x30   // mov x28,x4
        eor     x9,x9,x30
        eor     x14,x14,x30
        eor     x19,x19,x30
        eor     x24,x24,x30

        ////////////////////////////////////////// Rho+Pi
        mov     x30,x1
        ror     x1,x6,#64-44
        //mov   x27,x2
        ror     x2,x12,#64-43
        //mov   x26,x3
        ror     x3,x25,#64-21
        //mov   x28,x4
        ror     x4,x24,#64-14

        ror     x6,x9,#64-20
        ror     x12,x13,#64-25
        ror     x25,x17,#64-15
        ror     x24,x21,#64-2

        ror     x9,x22,#64-61
        ror     x13,x19,#64-8
        ror     x17,x11,#64-10
        ror     x21,x8,#64-55

        ror     x22,x14,#64-39
        ror     x19,x23,#64-56
        ror     x11,x7,#64-6
        ror     x8,x16,#64-45

        ror     x14,x20,#64-18
        ror     x23,x15,#64-41
        ror     x7,x10,#64-3
        ror     x16,x5,#64-36

        ror     x5,x26,#64-28
        ror     x10,x30,#64-1
        ror     x15,x28,#64-27
        ror     x20,x27,#64-62

        ////////////////////////////////////////// Chi+Iota
        bic     x26,x2,x1
        bic     x27,x3,x2
        bic     x28,x0,x4
        bic     x30,x1,x0
        eor     x0,x0,x26
        bic     x26,x4,x3
        eor     x1,x1,x27
        ldr     x27,[sp,#16]
        eor     x3,x3,x28
        eor     x4,x4,x30
        eor     x2,x2,x26
        ldr     x30,[x27],#8    // Iota[i++]

        bic     x26,x7,x6
        tst     x27,#255        // are we done?
        str     x27,[sp,#16]
        bic     x27,x8,x7
        bic     x28,x5,x9
        eor     x0,x0,x30       // A[0][0] ^= Iota
        bic     x30,x6,x5
        eor     x5,x5,x26
        bic     x26,x9,x8
        eor     x6,x6,x27
        eor     x8,x8,x28
        eor     x9,x9,x30
        eor     x7,x7,x26

        bic     x26,x12,x11
        bic     x27,x13,x12
        bic     x28,x10,x14
        bic     x30,x11,x10
        eor     x10,x10,x26
        bic     x26,x14,x13
        eor     x11,x11,x27
        eor     x13,x13,x28
        eor     x14,x14,x30
        eor     x12,x12,x26

        bic     x26,x17,x16
        bic     x27,x25,x17
        bic     x28,x15,x19
        bic     x30,x16,x15
        eor     x15,x15,x26
        bic     x26,x19,x25
        eor     x16,x16,x27
        eor     x25,x25,x28
        eor     x19,x19,x30
        eor     x17,x17,x26

        bic     x26,x22,x21
        bic     x27,x23,x22
        bic     x28,x20,x24
        bic     x30,x21,x20
        eor     x20,x20,x26
        bic     x26,x24,x23
        eor     x21,x21,x27
        eor     x23,x23,x28
        eor     x24,x24,x30
        eor     x22,x22,x26

        // Nothing since the tst above writes the flags, so this still
        // tests the round-constant pointer.
        bne     .Loop

        ldr     x30,[sp,#24]
.inst   0xd50323bf              // autiasp
        ret
.size KeccakF1600_int,.-KeccakF1600_int

//-----------------------------------------------------------------------
// KeccakF1600 - permute a state held in memory (file-local symbol).
//   In:    x0 = pointer to the 25 x 64-bit state (200 bytes, native
//          endian lanes)
//   Out:   state updated in place
//   Saves and restores x19-x28, x29 and x30; x0-x17 are overwritten.
//   Frame: 128 bytes of saved registers plus 48 bytes of locals:
//          [sp,#0..31] scratch owned by KeccakF1600_int, [sp,#32] = x0.
//-----------------------------------------------------------------------
.type KeccakF1600,%function
.align 5
KeccakF1600:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        sub     sp,sp,#48

        str     x0,[sp,#32]     // offload argument
        mov     x26,x0
        ldp     x0,x1,[x0,#16*0]
        ldp     x2,x3,[x26,#16*1]
        ldp     x4,x5,[x26,#16*2]
        ldp     x6,x7,[x26,#16*3]
        ldp     x8,x9,[x26,#16*4]
        ldp     x10,x11,[x26,#16*5]
        ldp     x12,x13,[x26,#16*6]
        ldp     x14,x15,[x26,#16*7]
        ldp     x16,x17,[x26,#16*8]
        ldp     x25,x19,[x26,#16*9]     // lane 18 goes to x25, not x18
        ldp     x20,x21,[x26,#16*10]
        ldp     x22,x23,[x26,#16*11]
        ldr     x24,[x26,#16*12]

        bl      KeccakF1600_int

        ldr     x26,[sp,#32]
        stp     x0,x1,[x26,#16*0]
        stp     x2,x3,[x26,#16*1]
        stp     x4,x5,[x26,#16*2]
        stp     x6,x7,[x26,#16*3]
        stp     x8,x9,[x26,#16*4]
        stp     x10,x11,[x26,#16*5]
        stp     x12,x13,[x26,#16*6]
        stp     x14,x15,[x26,#16*7]
        stp     x16,x17,[x26,#16*8]
        stp     x25,x19,[x26,#16*9]
        stp     x20,x21,[x26,#16*10]
        stp     x22,x23,[x26,#16*11]
        str     x24,[x26,#16*12]

        ldp     x19,x20,[x29,#16]
        add     sp,sp,#48
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#128
.inst   0xd50323bf              // autiasp
        ret
.size KeccakF1600,.-KeccakF1600

// absorb_lane: XOR the next 8 input bytes, read as a little-endian
// 64-bit word, into one state register.
//   In:       x27 = input pointer (post-incremented by 8)
//   Clobbers: x26. Flags are left alone, so a beq that follows still
//             sees the result of the preceding cmp.
.macro absorb_lane lane
        ldr     x26,[x27],#8    // *inp++
#ifdef __AARCH64EB__
        rev     x26,x26
#endif
        eor     \lane,\lane,x26
.endm

//-----------------------------------------------------------------------
// SHA3_absorb - XOR whole blocks of input into the state and permute.
//   In:    x0 = uint64_t A[5][5], x1 = const void *inp,
//          x2 = size_t len, x3 = size_t bsz
//   Out:   x0 = number of trailing bytes (less than bsz) not absorbed
//   Saves and restores x19-x28, x29 and x30.
//   Frame: 128 bytes of saved registers plus 64 bytes of locals:
//          [sp,#0..31] scratch for KeccakF1600_int,
//          [sp,#32] = A, [sp,#40] = inp, [sp,#48] = len, [sp,#56] = bsz.
//   The first lane is loaded unconditionally and the exit checks
//   compare bsz against multiples of 8, so the code presumably expects
//   bsz to be a non-zero multiple of 8 no larger than 200 - confirm
//   against the callers.
//-----------------------------------------------------------------------
.globl SHA3_absorb
.type SHA3_absorb,%function
.align 5
SHA3_absorb:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        sub     sp,sp,#64

        stp     x0,x1,[sp,#32]  // offload arguments
        stp     x2,x3,[sp,#48]

        mov     x26,x0          // uint64_t A[5][5]
        mov     x27,x1          // const void *inp
        mov     x28,x2          // size_t len
        mov     x30,x3          // size_t bsz
        ldp     x0,x1,[x26,#16*0]
        ldp     x2,x3,[x26,#16*1]
        ldp     x4,x5,[x26,#16*2]
        ldp     x6,x7,[x26,#16*3]
        ldp     x8,x9,[x26,#16*4]
        ldp     x10,x11,[x26,#16*5]
        ldp     x12,x13,[x26,#16*6]
        ldp     x14,x15,[x26,#16*7]
        ldp     x16,x17,[x26,#16*8]
        ldp     x25,x19,[x26,#16*9]
        ldp     x20,x21,[x26,#16*10]
        ldp     x22,x23,[x26,#16*11]
        ldr     x24,[x26,#16*12]
        b       .Loop_absorb

.align 4
.Loop_absorb:
        subs    x26,x28,x30     // len - bsz
        blo     .Labsorbed

        str     x26,[sp,#48]    // save len - bsz
        // Lanes are consumed in pairs. After lane 2k, blo leaves when
        // bsz < 8*(2k+2); after lane 2k+1, beq leaves when
        // bsz == 8*(2k+2).
        absorb_lane x0
        cmp     x30,#8*(0+2)
        blo     .Lprocess_block
        absorb_lane x1
        beq     .Lprocess_block
        absorb_lane x2
        cmp     x30,#8*(2+2)
        blo     .Lprocess_block
        absorb_lane x3
        beq     .Lprocess_block
        absorb_lane x4
        cmp     x30,#8*(4+2)
        blo     .Lprocess_block
        absorb_lane x5
        beq     .Lprocess_block
        absorb_lane x6
        cmp     x30,#8*(6+2)
        blo     .Lprocess_block
        absorb_lane x7
        beq     .Lprocess_block
        absorb_lane x8
        cmp     x30,#8*(8+2)
        blo     .Lprocess_block
        absorb_lane x9
        beq     .Lprocess_block
        absorb_lane x10
        cmp     x30,#8*(10+2)
        blo     .Lprocess_block
        absorb_lane x11
        beq     .Lprocess_block
        absorb_lane x12
        cmp     x30,#8*(12+2)
        blo     .Lprocess_block
        absorb_lane x13
        beq     .Lprocess_block
        absorb_lane x14
        cmp     x30,#8*(14+2)
        blo     .Lprocess_block
        absorb_lane x15
        beq     .Lprocess_block
        absorb_lane x16
        cmp     x30,#8*(16+2)
        blo     .Lprocess_block
        absorb_lane x17
        beq     .Lprocess_block
        absorb_lane x25         // lane 18
        cmp     x30,#8*(18+2)
        blo     .Lprocess_block
        absorb_lane x19
        beq     .Lprocess_block
        absorb_lane x20
        cmp     x30,#8*(20+2)
        blo     .Lprocess_block
        absorb_lane x21
        beq     .Lprocess_block
        absorb_lane x22
        cmp     x30,#8*(22+2)
        blo     .Lprocess_block
        absorb_lane x23
        beq     .Lprocess_block
        absorb_lane x24

.Lprocess_block:
        str     x27,[sp,#40]    // save inp

        bl      KeccakF1600_int

        ldr     x27,[sp,#40]    // restore arguments
        ldp     x28,x30,[sp,#48]
        b       .Loop_absorb

.align 4
.Labsorbed:
        ldr     x27,[sp,#32]
        stp     x0,x1,[x27,#16*0]
        stp     x2,x3,[x27,#16*1]
        stp     x4,x5,[x27,#16*2]
        stp     x6,x7,[x27,#16*3]
        stp     x8,x9,[x27,#16*4]
        stp     x10,x11,[x27,#16*5]
        stp     x12,x13,[x27,#16*6]
        stp     x14,x15,[x27,#16*7]
        stp     x16,x17,[x27,#16*8]
        stp     x25,x19,[x27,#16*9]
        stp     x20,x21,[x27,#16*10]
        stp     x22,x23,[x27,#16*11]
        str     x24,[x27,#16*12]

        mov     x0,x28          // return value
        ldp     x19,x20,[x29,#16]
        add     sp,sp,#64
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#128
.inst   0xd50323bf              // autiasp
        ret
.size SHA3_absorb,.-SHA3_absorb

//-----------------------------------------------------------------------
// SHA3_squeeze - copy state bytes to the output, permuting the state
// each time a whole block has been emitted and more output is needed.
//   In:    x0 = state, x1 = output pointer, x2 = length in bytes,
//          x3 = block size in bytes
//   Out:   nothing defined in x0
//   Saves and restores x19-x22, x29 and x30; x0, x3 and x4 are used as
//   working registers and KeccakF1600 overwrites x0-x17.
//   Output bytes are the lanes in little-endian order on both
//   endiannesses: whole lanes are byte-swapped on big-endian before
//   the store, the tail peels bytes from the low end of the lane.
//   A zero length returns immediately without storing anything.
//-----------------------------------------------------------------------
.globl SHA3_squeeze
.type SHA3_squeeze,%function
.align 5
SHA3_squeeze:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-48]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]

        mov     x19,x0          // put aside arguments
        mov     x20,x1
        mov     x21,x2
        mov     x22,x3

        // Without this guard a zero length falls into the tail path,
        // where the first decrement wraps past zero and seven bytes
        // are stored.
        cbz     x21,.Lsqueeze_done

.Loop_squeeze:
        ldr     x4,[x0],#8
        cmp     x21,#8
        blo     .Lsqueeze_tail
#ifdef __AARCH64EB__
        rev     x4,x4
#endif
        str     x4,[x20],#8
        subs    x21,x21,#8
        beq     .Lsqueeze_done

        subs    x3,x3,#8        // bytes left in the current block
        bhi     .Loop_squeeze

        mov     x0,x19
        bl      KeccakF1600
        mov     x0,x19
        mov     x3,x22
        b       .Loop_squeeze

.align 4
.Lsqueeze_tail:
        // 1 to 7 bytes remain here.
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1

.Lsqueeze_done:
        ldp     x19,x20,[sp,#16]
        ldp     x21,x22,[sp,#32]
        ldp     x29,x30,[sp],#48
.inst   0xd50323bf              // autiasp
        ret
.size SHA3_squeeze,.-SHA3_squeeze

//-----------------------------------------------------------------------
// KeccakF1600_ce - 24 rounds on a state held in v0-v24, using the
// EOR3/RAX1/XAR/BCAX instructions (emitted through .inst).
// Private calling convention:
//   In/Out:   lanes 0..24 in v0-v24
//   Clobbers: x9 (round counter), x10 (round-constant pointer),
//             v25-v31, flags
//   No stack use, no pointer authentication; returns through x30 as
//   left by the caller.
//-----------------------------------------------------------------------
.type KeccakF1600_ce,%function
.align 5
KeccakF1600_ce:
        mov     x9,#24
        adr     x10,iotas
        b       .Loop_ce
.align 4
.Loop_ce:
        ////////////////////////////////////////////////// Theta
.inst   0xce0f2a99      //eor3 v25.16b,v20.16b,v15.16b,v10.16b
.inst   0xce102eba      //eor3 v26.16b,v21.16b,v16.16b,v11.16b
.inst   0xce1132db      //eor3 v27.16b,v22.16b,v17.16b,v12.16b
.inst   0xce1236fc      //eor3 v28.16b,v23.16b,v18.16b,v13.16b
.inst   0xce133b1d      //eor3 v29.16b,v24.16b,v19.16b,v14.16b
.inst   0xce050339      //eor3 v25.16b,v25.16b, v5.16b,v0.16b
.inst   0xce06075a      //eor3 v26.16b,v26.16b, v6.16b,v1.16b
.inst   0xce070b7b      //eor3 v27.16b,v27.16b, v7.16b,v2.16b
.inst   0xce080f9c      //eor3 v28.16b,v28.16b, v8.16b,v3.16b
.inst   0xce0913bd      //eor3 v29.16b,v29.16b, v9.16b,v4.16b

.inst   0xce7b8f3e      //rax1 v30.16b,v25.16b,v27.16b // D[1]
.inst   0xce7c8f5f      //rax1 v31.16b,v26.16b,v28.16b // D[2]
.inst   0xce7d8f7b      //rax1 v27.16b,v27.16b,v29.16b // D[3]
.inst   0xce798f9c      //rax1 v28.16b,v28.16b,v25.16b // D[4]
.inst   0xce7a8fbd      //rax1 v29.16b,v29.16b,v26.16b // D[0]

        ////////////////////////////////////////////////// Theta+Rho+Pi
.inst   0xce9efc39      //xar v25.16b, v1.16b,v30.16b,#64-1 // C[0]=A[2][0]

.inst   0xce9e50c1      //xar v1.16b,v6.16b,v30.16b,#64-44
.inst   0xce9cb126      //xar v6.16b,v9.16b,v28.16b,#64-20
.inst   0xce9f0ec9      //xar v9.16b,v22.16b,v31.16b,#64-61
.inst   0xce9c65d6      //xar v22.16b,v14.16b,v28.16b,#64-39
.inst   0xce9dba8e      //xar v14.16b,v20.16b,v29.16b,#64-18

.inst   0xce9f085a      //xar v26.16b, v2.16b,v31.16b,#64-62 // C[1]=A[4][0]

.inst   0xce9f5582      //xar v2.16b,v12.16b,v31.16b,#64-43
.inst   0xce9b9dac      //xar v12.16b,v13.16b,v27.16b,#64-25
.inst   0xce9ce26d      //xar v13.16b,v19.16b,v28.16b,#64-8
.inst   0xce9b22f3      //xar v19.16b,v23.16b,v27.16b,#64-56
.inst   0xce9d5df7      //xar v23.16b,v15.16b,v29.16b,#64-41

.inst   0xce9c948f      //xar v15.16b,v4.16b,v28.16b,#64-27

.inst   0xce9ccb1c      //xar v28.16b, v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
.inst   0xce9efab8      //xar v24.16b,v21.16b,v30.16b,#64-2
.inst   0xce9b2508      //xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
.inst   0xce9e4e04      //xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
.inst   0xce9d70b0      //xar v16.16b,v5.16b,v29.16b,#64-36

.inst   0xce9b9065      //xar v5.16b,v3.16b,v27.16b,#64-28

        eor     v0.16b,v0.16b,v29.16b

.inst   0xce9bae5b      //xar v27.16b, v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
.inst   0xce9fc623      //xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
.inst   0xce9ed97e      //xar v30.16b, v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
.inst   0xce9fe8ff      //xar v31.16b, v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
.inst   0xce9df55d      //xar v29.16b, v10.16b,v29.16b,#64-3 // D[0]=A[1][2]

        ////////////////////////////////////////////////// Chi+Iota
.inst   0xce362354      //bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1]
.inst   0xce375915      //bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1]
.inst   0xce385ed6      //bcax v22.16b,v22.16b,v24.16b,v23.16b
.inst   0xce3a62f7      //bcax v23.16b,v23.16b,v26.16b, v24.16b
.inst   0xce286b18      //bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1]

        ld1r    {v26.2d},[x10],#8       // next round constant

.inst   0xce330fd1      //bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3]
.inst   0xce2f4c72      //bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3]
.inst   0xce303e73      //bcax v19.16b,v19.16b,v16.16b,v15.16b
.inst   0xce3e41ef      //bcax v15.16b,v15.16b,v30.16b, v16.16b
.inst   0xce237a10      //bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3]

.inst   0xce2c7f2a      //bcax v10.16b,v25.16b, v12.16b,v31.16b
.inst   0xce2d33eb      //bcax v11.16b,v31.16b, v13.16b,v12.16b
.inst   0xce2e358c      //bcax v12.16b,v12.16b,v14.16b,v13.16b
.inst   0xce3939ad      //bcax v13.16b,v13.16b,v25.16b, v14.16b
.inst   0xce3f65ce      //bcax v14.16b,v14.16b,v31.16b, v25.16b

.inst   0xce2913a7      //bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3]
.inst   0xce252488      //bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3]
.inst   0xce261529      //bcax v9.16b,v9.16b,v6.16b,v5.16b
.inst   0xce3d18a5      //bcax v5.16b,v5.16b,v29.16b, v6.16b
.inst   0xce2474c6      //bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3]

.inst   0xce207363      //bcax v3.16b,v27.16b, v0.16b,v28.16b
.inst   0xce210384      //bcax v4.16b,v28.16b, v1.16b,v0.16b
.inst   0xce220400      //bcax v0.16b,v0.16b,v2.16b,v1.16b
.inst   0xce3b0821      //bcax v1.16b,v1.16b,v27.16b, v2.16b
.inst   0xce3c6c42      //bcax v2.16b,v2.16b,v28.16b, v27.16b

        eor     v0.16b,v0.16b,v26.16b   // lane 0 ^= round constant

        subs    x9,x9,#1
        bne     .Loop_ce

        ret
.size KeccakF1600_ce,.-KeccakF1600_ce

//-----------------------------------------------------------------------
// KeccakF1600_cext - permute a state held in memory using the
// KeccakF1600_ce core (file-local symbol).
//   In:    x0 = pointer to the 25 x 64-bit state
//   Out:   state updated in place; x0 is unchanged
//   Saves and restores d8-d15 (per ABI requirement), x29 and x30.
//   Clobbers x9, x10, v0-v7, v16-v31, flags.
//-----------------------------------------------------------------------
.type KeccakF1600_cext,%function
.align 5
KeccakF1600_cext:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0
        stp     d8,d9,[sp,#16]          // per ABI requirement
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]
        ldp     d0,d1,[x0,#8*0]
        ldp     d2,d3,[x0,#8*2]
        ldp     d4,d5,[x0,#8*4]
        ldp     d6,d7,[x0,#8*6]
        ldp     d8,d9,[x0,#8*8]
        ldp     d10,d11,[x0,#8*10]
        ldp     d12,d13,[x0,#8*12]
        ldp     d14,d15,[x0,#8*14]
        ldp     d16,d17,[x0,#8*16]
        ldp     d18,d19,[x0,#8*18]
        ldp     d20,d21,[x0,#8*20]
        ldp     d22,d23,[x0,#8*22]
        ldr     d24,[x0,#8*24]
        bl      KeccakF1600_ce
        ldr     x30,[sp,#8]             // bl overwrote the return address
        stp     d0,d1,[x0,#8*0]
        stp     d2,d3,[x0,#8*2]
        stp     d4,d5,[x0,#8*4]
        stp     d6,d7,[x0,#8*6]
        stp     d8,d9,[x0,#8*8]
        stp     d10,d11,[x0,#8*10]
        stp     d12,d13,[x0,#8*12]
        stp     d14,d15,[x0,#8*14]
        stp     d16,d17,[x0,#8*16]
        stp     d18,d19,[x0,#8*18]
        stp     d20,d21,[x0,#8*20]
        stp     d22,d23,[x0,#8*22]
        str     d24,[x0,#8*24]

        ldp     d8,d9,[sp,#16]
        ldp     d10,d11,[sp,#32]
        ldp     d12,d13,[sp,#48]
        ldp     d14,d15,[sp,#64]
        ldr     x29,[sp],#80
.inst   0xd50323bf              // autiasp
        ret
.size KeccakF1600_cext,.-KeccakF1600_cext

// absorb_lane_ce: XOR the next 8 input bytes, read as a little-endian
// 64-bit word, into one state vector. The argument is the full vector
// operand, for example v0.16b.
//   In:       x1 = input pointer (post-incremented by 8)
//   Clobbers: v31. Flags are left alone, so a beq that follows still
//             sees the result of the preceding cmp.
.macro absorb_lane_ce lane
        ldr     d31,[x1],#8     // *inp++
#ifdef __AARCH64EB__
        rev64   v31.16b,v31.16b
#endif
        eor     \lane,\lane,v31.16b
.endm

//-----------------------------------------------------------------------
// SHA3_absorb_cext - SHA3_absorb built on the KeccakF1600_ce core.
//   In:    x0 = state, x1 = input pointer, x2 = length in bytes,
//          x3 = block size in bytes
//   Out:   x0 = number of trailing bytes (less than bsz) not absorbed
//   Saves and restores d8-d15 (per ABI requirement), x29 and x30.
//   Clobbers x1, x2, x9, x10, v0-v7, v16-v31, flags.
//   Same expectation on bsz as SHA3_absorb (presumably a non-zero
//   multiple of 8 no larger than 200 - confirm against the callers).
//-----------------------------------------------------------------------
.globl SHA3_absorb_cext
.type SHA3_absorb_cext,%function
.align 5
SHA3_absorb_cext:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0
        stp     d8,d9,[sp,#16]          // per ABI requirement
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]
        ldp     d0,d1,[x0,#8*0]
        ldp     d2,d3,[x0,#8*2]
        ldp     d4,d5,[x0,#8*4]
        ldp     d6,d7,[x0,#8*6]
        ldp     d8,d9,[x0,#8*8]
        ldp     d10,d11,[x0,#8*10]
        ldp     d12,d13,[x0,#8*12]
        ldp     d14,d15,[x0,#8*14]
        ldp     d16,d17,[x0,#8*16]
        ldp     d18,d19,[x0,#8*18]
        ldp     d20,d21,[x0,#8*20]
        ldp     d22,d23,[x0,#8*22]
        ldr     d24,[x0,#8*24]
        b       .Loop_absorb_ce

.align 4
.Loop_absorb_ce:
        subs    x2,x2,x3        // len - bsz
        blo     .Labsorbed_ce
        // Same pairwise exit scheme as in SHA3_absorb.
        absorb_lane_ce v0.16b
        cmp     x3,#8*(0+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v1.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v2.16b
        cmp     x3,#8*(2+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v3.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v4.16b
        cmp     x3,#8*(4+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v5.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v6.16b
        cmp     x3,#8*(6+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v7.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v8.16b
        cmp     x3,#8*(8+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v9.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v10.16b
        cmp     x3,#8*(10+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v11.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v12.16b
        cmp     x3,#8*(12+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v13.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v14.16b
        cmp     x3,#8*(14+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v15.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v16.16b
        cmp     x3,#8*(16+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v17.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v18.16b
        cmp     x3,#8*(18+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v19.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v20.16b
        cmp     x3,#8*(20+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v21.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v22.16b
        cmp     x3,#8*(22+2)
        blo     .Lprocess_block_ce
        absorb_lane_ce v23.16b
        beq     .Lprocess_block_ce
        absorb_lane_ce v24.16b

.Lprocess_block_ce:

        bl      KeccakF1600_ce

        b       .Loop_absorb_ce

.align 4
.Labsorbed_ce:
        stp     d0,d1,[x0,#8*0]
        stp     d2,d3,[x0,#8*2]
        stp     d4,d5,[x0,#8*4]
        stp     d6,d7,[x0,#8*6]
        stp     d8,d9,[x0,#8*8]
        stp     d10,d11,[x0,#8*10]
        stp     d12,d13,[x0,#8*12]
        stp     d14,d15,[x0,#8*14]
        stp     d16,d17,[x0,#8*16]
        stp     d18,d19,[x0,#8*18]
        stp     d20,d21,[x0,#8*20]
        stp     d22,d23,[x0,#8*22]
        str     d24,[x0,#8*24]
        add     x0,x2,x3        // return value

        ldp     d8,d9,[sp,#16]
        ldp     d10,d11,[sp,#32]
        ldp     d12,d13,[sp,#48]
        ldp     d14,d15,[sp,#64]
        ldp     x29,x30,[sp],#80        // also restores x30 lost to bl
.inst   0xd50323bf              // autiasp
        ret
.size SHA3_absorb_cext,.-SHA3_absorb_cext

//-----------------------------------------------------------------------
// SHA3_squeeze_cext - SHA3_squeeze built on KeccakF1600_cext.
//   In:    x0 = state, x1 = output pointer, x2 = length in bytes,
//          x3 = block size in bytes
//   Out:   nothing defined in x0
//   Saves and restores x29 and x30.
//   Clobbers x1, x2, x4, x9, x10, flags, plus whatever
//   KeccakF1600_cext clobbers when it is called.
//   A zero length returns immediately without storing anything.
//-----------------------------------------------------------------------
.globl SHA3_squeeze_cext
.type SHA3_squeeze_cext,%function
.align 5
SHA3_squeeze_cext:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        mov     x9,x0
        mov     x10,x3

        // Without this guard a zero length falls into the tail path,
        // where the first decrement wraps past zero and seven bytes
        // are stored. x30 is still intact at this point.
        cbz     x2,.Lsqueeze_done_ce

.Loop_squeeze_ce:
        ldr     x4,[x9],#8
        cmp     x2,#8
        blo     .Lsqueeze_tail_ce
#ifdef __AARCH64EB__
        rev     x4,x4
#endif
        str     x4,[x1],#8
        beq     .Lsqueeze_done_ce       // flags still from cmp x2,#8

        sub     x2,x2,#8
        subs    x10,x10,#8      // bytes left in the current block
        bhi     .Loop_squeeze_ce

        bl      KeccakF1600_cext
        ldr     x30,[sp,#8]
        mov     x9,x0
        mov     x10,x3
        b       .Loop_squeeze_ce

.align 4
.Lsqueeze_tail_ce:
        // 1 to 7 bytes remain here.
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1

.Lsqueeze_done_ce:
        ldr     x29,[sp],#16
.inst   0xd50323bf              // autiasp
        ret
.size SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string.
.byte   75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2