/*	$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $")

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 * Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 * Such elements of GF(2^8) need only eight bits to be represented,
 * but we store them in 4-byte units so we can copy one into all
 * four 4-byte lanes of a vector register with a single LD1R.  The
 * access pattern is fixed, so indices into this table are never
 * secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
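
/*
 * Reference sketch (C, not assembled into this file): the rcon table
 * above is the sequence x^0, x^1, ..., x^9 under GF(2^8) doubling
 * ("xtime"), reduced mod x^8 + x^4 + x^3 + x + 1 (0x11b).  The helper
 * name gen_rcon is illustrative only.
 *
 *	#include <stdint.h>
 *
 *	static void
 *	gen_rcon(uint32_t rcon[10])
 *	{
 *		uint8_t x = 0x01;
 *		unsigned i;
 *
 *		for (i = 0; i < 10; i++) {
 *			rcon[i] = x;
 *			// double, folding the carry back in as 0x1b
 *			x = (x << 1) ^ ((x & 0x80) ? 0x1b : 0);
 *		}
 *	}
 */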

/*
 * uint128_t unshiftrows_rotword_1
 *
 * Table for TBL instruction to undo ShiftRows, and then do
 * RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 * Table for TBL instruction to undo ShiftRows, and then copy word
 * 3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 * Table for TBL instruction to undo ShiftRows, and then do
 * RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 * Expand a 16-byte AES-128 key into 10 round keys.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ld1	{v1.16b}, [x1]		/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]		/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10		/* store master key as first round key */
	mov	x2, #10			/* round count */
	adrl	x3, rcon		/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1		/* count down rounds */
	str	q1, [x0], #0x10		/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
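
/*
 * Reference sketch (C, not assembled into this file): the prefix-XOR
 * computation performed by the EXT/EOR sequence in the loop above.
 * Given t = RotWord(SubWord(prk[3])) ^ rcon (produced by the
 * AESE/TBL/EOR steps), each word of the new round key is t XORed with
 * a prefix of the previous round key.  expand128_round is an
 * illustrative helper name, not a function in this file.
 *
 *	#include <stdint.h>
 *
 *	static void
 *	expand128_round(uint32_t rk[4], const uint32_t prk[4], uint32_t t)
 *	{
 *		rk[0] = t ^ prk[0];
 *		rk[1] = t ^ prk[0] ^ prk[1];
 *		rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2];
 *		rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3];
 *	}
 */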

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 * Expand a 24-byte AES-192 key into 12 round keys.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
	ld1	{v2.8b}, [x1]		/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]		/* q16 := unshiftrows_rotword_1 */
	ld1	{v17.16b}, [x5]		/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10		/* store master key[0:128) as round key */
	mov	x2, #12			/* round count */
	adrl	x3, rcon		/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *	^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 * q0 = zero
	 * q2 = rk
	 * q3 = nrk
	 * v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 * q16 = unshiftrows_rotword_1
	 * q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *	^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := RotWords(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3		/* count down three rounds */
	str	q1, [x0], #0x10		/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t
 *	key[32] @x1)
 *
 * Expand a 32-byte AES-256 key into 14 round keys.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ld1	{v1.16b-v2.16b}, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]		/* q16 := unshiftrows_rotword_3 */
	ld1	{v17.16b}, [x5]		/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14			/* round count */
	adrl	x3, rcon		/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *	uint32_t nrounds@x2)
 *
 * Convert AES encryption round keys to AES decryption round keys.
 * `nrounds' must be between 10 and 14.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b		/* convert encryption to decryption */
2:	str	q0, [x1], #0x10		/* store round key */
	subs	x2, x2, #1		/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b			/* repeat if there's more */
	str	q0, [x1]		/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
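
/*
 * Reference sketch (C with ACLE intrinsics, not assembled into this
 * file): the "equivalent inverse cipher" conversion performed by
 * aesarmv8_enctodec above.  Decryption round keys are the encryption
 * round keys in reverse order, with InvMixColumns (AESIMC) applied to
 * every key except the first and last.  Struct layout is elided; the
 * raw rk byte pointers are illustrative.
 *
 *	#include <arm_neon.h>
 *
 *	static void
 *	enctodec(const uint8_t *encrk, uint8_t *decrk, unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i <= nrounds; i++) {
 *			uint8x16_t rk = vld1q_u8(encrk + 16*(nrounds - i));
 *
 *			// middle round keys pass through InvMixColumns
 *			if (i != 0 && i != nrounds)
 *				rk = vaesimcq_u8(rk);
 *			vst1q_u8(decrk + 16*i, rk);
 *		}
 *	}
 */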

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *	uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 * Encrypt a single block.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]		/* q0 := ptxt */
	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]		/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *	uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 * Decrypt a single block.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]		/* q0 := ctxt */
	bl	aesarmv8_dec1		/* q0 := ptxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]		/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 * nbytes must be an integral multiple of 16.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v0.16b}, [x4]		/* q0 := chaining value */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	st1	{v0.16b}, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
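
/*
 * Reference sketch (C, not assembled into this file): the chaining
 * structure of aesarmv8_cbc_enc above.  Each plaintext block is XORed
 * with the previous ciphertext block (initially the IV) before being
 * encrypted, and the last ciphertext block is written back as the new
 * IV.  aes_enc_block() is a stand-in for the single-block encryption
 * done by aesarmv8_enc1, not a real function in this file.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void aes_enc_block(const void *key, const uint8_t in[16],
 *	    uint8_t out[16], uint32_t nrounds);	// stand-in
 *
 *	static void
 *	cbc_enc(const void *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t cv[16];
 *		unsigned i;
 *
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				cv[i] ^= in[i];		// cv ^ ptxt
 *			aes_enc_block(key, cv, cv, nrounds);
 *			memcpy(out, cv, 16);		// ctxt is next cv
 *		}
 *		memcpy(iv, cv, 16);			// updated IV
 *	}
 */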

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 * nbytes must be a positive integral multiple of 16.  This routine
 * is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	sub	x1, x1, #0x10
	ld1	{v0.16b}, [x1]		/* q0 := last ciphertext block */
	st1	{v0.16b}, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x10
	ld1	{v31.16b}, [x1]		/* q31 := chaining value */
	sub	x2, x2, #0x10
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	st1	{v0.16b}, [x2]		/* store plaintext block */
	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
2:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	sub	x2, x2, #0x10		/* store first plaintext block */
	st1	{v0.16b}, [x2]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 * nbytes must be a positive integral multiple of 128.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
	st1	{v7.16b}, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]
2:	sub	x1, x1, #0x20
	ld1	{v4.16b-v5.16b}, [x1]
	sub	x1, x1, #0x40
	ld1	{v0.16b-v3.16b}, [x1]

	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
					 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	sub	x2, x2, #0x20		/* store plaintext blocks */
	st1	{v6.16b-v7.16b}, [x2]
	sub	x2, x2, #0x40
	st1	{v2.16b-v5.16b}, [x2]
	b.ne	1b			/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)
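
/*
 * Reference sketch (C, not assembled into this file): why
 * aesarmv8_cbc_dec1 and aesarmv8_cbc_dec8 above walk the buffer
 * backwards.  CBC decryption needs the previous ciphertext block as
 * the chaining value, so going from the end lets the routine read each
 * chaining value directly from the input and store the new IV (the
 * last ciphertext block) up front.  aes_dec_block() is a stand-in for
 * aesarmv8_dec1.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void aes_dec_block(const void *key, const uint8_t in[16],
 *	    uint8_t out[16], uint32_t nrounds);	// stand-in
 *
 *	static void
 *	cbc_dec(const void *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t block[16], cv[16];
 *		unsigned i;
 *
 *		memcpy(cv, iv, 16);			// IV for first block
 *		memcpy(iv, in + nbytes - 16, 16);	// new IV := last ctxt
 *		while (nbytes) {
 *			aes_dec_block(key, in + nbytes - 16, block, nrounds);
 *			nbytes -= 16;
 *			// chaining value: previous ctxt block, or the
 *			// original IV for the first block of the buffer
 *			for (i = 0; i < 16; i++)
 *				block[i] ^= (nbytes ?
 *				    in[nbytes - 16 + i] : cv[i]);
 *			memcpy(out + nbytes, block, 16);
 *		}
 *	}
 */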

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 * nbytes must be a positive integral multiple of 16.  This routine
 * is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 * nbytes must be a positive integral multiple of 128.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)
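
/*
 * Reference sketch (C, not assembled into this file): one block of the
 * XTS flow implemented by aesarmv8_xts_enc1/aesarmv8_xts_enc8 above:
 * XOR with the tweak, encrypt, XOR with the same tweak, then advance
 * the tweak by multiplying it by x in GF(2^128).  aes_enc_block() and
 * xts_mulx() are stand-ins for aesarmv8_enc1 and aesarmv8_xts_mulx.
 *
 *	#include <stdint.h>
 *
 *	void aes_enc_block(const void *key, const uint8_t in[16],
 *	    uint8_t out[16], uint32_t nrounds);	// stand-in
 *	void xts_mulx(uint8_t tweak[16]);		// stand-in
 *
 *	static void
 *	xts_enc_block(const void *key, const uint8_t in[16],
 *	    uint8_t out[16], uint8_t tweak[16], uint32_t nrounds)
 *	{
 *		uint8_t buf[16];
 *		unsigned i;
 *
 *		for (i = 0; i < 16; i++)
 *			buf[i] = in[i] ^ tweak[i];	// ptxt ^ tweak
 *		aes_enc_block(key, buf, buf, nrounds);
 *		for (i = 0; i < 16; i++)
 *			out[i] = buf[i] ^ tweak[i];	// ctxt
 *		xts_mulx(tweak);			// next block's tweak
 *	}
 */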

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 * nbytes must be a positive integral multiple of 16.  This routine
 * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 * Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 * nbytes must be a positive integral multiple of 128.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 * Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ld1	{v0.16b}, [x0]		/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
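
/*
 * Reference sketch (C, not assembled into this file): the tweak update
 * done by aesarmv8_xts_mulx above, on two 64-bit halves instead of one
 * 128-bit vector.  The tweak is treated as a little-endian polynomial;
 * doubling shifts both halves left by one, carries the top bit of the
 * low half into the high half, and folds the top bit of the high half
 * back in as x^7 + x^2 + x + 1 (0x87), exactly the xtscarry constant
 * above.  Little-endian byte order is assumed.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	xts_mulx(uint8_t tweak[16])
 *	{
 *		uint64_t lo, hi, carry;
 *
 *		memcpy(&lo, tweak, 8);
 *		memcpy(&hi, tweak + 8, 8);
 *		carry = hi >> 63;		// wraps around mod p(x)
 *		hi = (hi << 1) | (lo >> 63);	// carry low half into high
 *		lo = (lo << 1) ^ (carry ? 0x87 : 0);
 *		memcpy(tweak, &lo, 8);
 *		memcpy(tweak + 8, &hi, 8);
 *	}
 */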

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 * Update an AES-XTS tweak.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v31.16b}, [x0]		/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	st1	{v31.16b}, [x1]		/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
 *	const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
 *	uint32_t nrounds@x4)
 *
 * Update CBC-MAC.
 *
 * nbytes must be a positive integral multiple of 16.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbcmac_update1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x3]		/* q0 := initial authenticator */
	mov	x9, x0			/* x9 := enckey */
	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x4			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
	subs	x2, x2, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if x2 is nonzero */
	st1	{v0.16b}, [x5]		/* store updated authenticator */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbcmac_update1)

/*
 * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *	uint32_t nrounds@x5)
 *
 * Update CCM encryption.
 *
 * nbytes must be a positive integral multiple of 16.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b-v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	rev32	v2.16b, v1.16b		/* q2 := ctr (host-endian) */
	_ALIGN_TEXT
1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
					 * trash x0/x3/q16 */
	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
	subs	x10, x10, #0x10		/* count down bytes */
	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b			/* repeat if more blocks */
	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
	st1	{v0.16b-v1.16b}, [x4]	/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_enc1)
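
/*
 * Reference sketch (C, not assembled into this file): one block of the
 * CCM update implemented by aesarmv8_ccm_enc1 above.  The 32-byte
 * authctr buffer holds the CBC-MAC authenticator followed by the
 * big-endian counter block; each step advances the CBC-MAC over the
 * plaintext and encrypts the plaintext with the next counter pad.  The
 * assembly does both AES calls in one pass via aesarmv8_enc2;
 * aes_enc_block() is a stand-in for a single-block encryption, and the
 * byte-level counter increment mirrors the rev32/add/rev32 dance.
 *
 *	#include <stdint.h>
 *
 *	void aes_enc_block(const void *key, const uint8_t in[16],
 *	    uint8_t out[16], uint32_t nrounds);	// stand-in
 *
 *	static void
 *	ccm_enc_block(const void *key, const uint8_t in[16], uint8_t out[16],
 *	    uint8_t authctr[32], uint32_t nrounds)
 *	{
 *		uint8_t *auth = authctr, *ctr = authctr + 16;
 *		uint8_t pad[16];
 *		uint32_t c;
 *		unsigned i;
 *
 *		// increment the low (big-endian) 32 bits of the counter
 *		c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
 *		    ((uint32_t)ctr[14] << 8) | ctr[15];
 *		c++;
 *		ctr[12] = c >> 24; ctr[13] = c >> 16;
 *		ctr[14] = c >> 8;  ctr[15] = c;
 *
 *		for (i = 0; i < 16; i++)
 *			auth[i] ^= in[i];		// CBC-MAC absorb
 *		aes_enc_block(key, auth, auth, nrounds);
 *		aes_enc_block(key, ctr, pad, nrounds);
 *		for (i = 0; i < 16; i++)
 *			out[i] = in[i] ^ pad[i];	// CTR encrypt
 *	}
 */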

/*
 * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *	uint32_t nrounds@x5)
 *
 * Update CCM decryption.
 *
 * nbytes must be a positive integral multiple of 16.
 *
 * Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */

	/* Decrypt the first block. */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x3, x5			/* x3 := nrounds */
	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
	b	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 * q1 = auth ^ ptxt[-1]
	 * q2 = ctr[-1] (host-endian)
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
					 * trash x0/x3/q16 */
2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
	subs	x10, x10, #0x10
	st1	{v3.16b}, [x2], #0x10	/* store plaintext */
	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
	b.ne	1b

	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */

	/* Authenticate the last block. */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */

	mov	v1.16b, v2.16b		/* store updated auth/ctr */
	st1	{v0.16b-v1.16b}, [x4]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.int	0, 0, 0, 1
END(ctr32_inc)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *	uint128_t block@q0, uint32_t nrounds@x3)
 *
 * Encrypt a single AES block in q0.
 *
 * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0]		/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)
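
/*
 * Reference sketch (C with ACLE intrinsics, not assembled into this
 * file): the round structure used by aesarmv8_enc1/enc2/enc8 above.
 * AESE folds AddRoundKey, SubBytes, and ShiftRows into one
 * instruction, so the loop runs nrounds - 1 full rounds of AESE+AESMC,
 * then a final AESE, and finishes with a plain XOR of the last round
 * key.  The rk pointer is the raw round-key array; struct layout is
 * elided.
 *
 *	#include <arm_neon.h>
 *
 *	static uint8x16_t
 *	aes_enc_block(const uint8_t *rk, uint8x16_t block, uint32_t nrounds)
 *	{
 *		uint32_t i;
 *
 *		for (i = 0; i < nrounds - 1; i++) {
 *			block = vaeseq_u8(block, vld1q_u8(rk + 16*i));
 *			block = vaesmcq_u8(block);	// MixColumns
 *		}
 *		block = vaeseq_u8(block, vld1q_u8(rk + 16*(nrounds - 1)));
 *		return veorq_u8(block, vld1q_u8(rk + 16*nrounds));
 *	}
 */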

/*
 * aesarmv8_enc2(const struct aesenc *enckey@x0,
 *	uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
 *
 * Encrypt two AES blocks in q0 and q1.
 *
 * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc2,@function
aesarmv8_enc2:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	ldr	q16, [x0]		/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	ret
END(aesarmv8_enc2)

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *	uint128_t block0@q0, ..., uint128_t block7@q7,
 *	uint32_t nrounds@x3)
 *
 * Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v16.16b
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v16.16b
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v16.16b
	aesmc	v4.16b, v4.16b
	aese	v5.16b, v16.16b
	aesmc	v5.16b, v5.16b
	aese	v6.16b, v16.16b
	aesmc	v6.16b, v6.16b
	aese	v7.16b, v16.16b
	aesmc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0]		/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *	uint128_t block@q0, uint32_t nrounds@x3)
 *
 * Decrypt a single AES block in q0.
 *
 * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0]		/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *	uint128_t block0@q0, ..., uint128_t block7@q7,
 *	uint32_t nrounds@x3)
 *
 * Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 * Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesd	v1.16b, v16.16b
	aesimc	v1.16b, v1.16b
	aesd	v2.16b, v16.16b
	aesimc	v2.16b, v2.16b
	aesd	v3.16b, v16.16b
	aesimc	v3.16b, v3.16b
	aesd	v4.16b, v16.16b
	aesimc	v4.16b, v4.16b
	aesd	v5.16b, v16.16b
	aesimc	v5.16b, v5.16b
	aesd	v6.16b, v16.16b
	aesimc	v6.16b, v6.16b
	aesd	v7.16b, v16.16b
	aesimc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0]		/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)