/*	$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $")

/*
 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
 * Packed Single, defined to operate on binary32 floats.  They have
 * exactly the same architectural effects (move a 128-bit quantity from
 * memory into an xmm register).
 *
 * In principle, they might have different microarchitectural effects
 * so that MOVAPS/MOVUPS might incur a penalty when the register is
 * later used for integer paths, but in practice they don't.  So we use
 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
 */
#define	movdqa	movaps
#define	movdqu	movups

/*
 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
 *
 *	Expand a 16-byte AES-128 key into 11 round keys for 10 rounds.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey128)
	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x40,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x1b,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x36,%xmm0,%xmm2
	call	aesni_expand128
	ret
END(aesni_setenckey128)
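
/*
 * For reference, each aeskeygenassist/aesni_expand128 pair above
 * computes one round key from the previous one.  Roughly, in C
 * (illustrative sketch only -- subword() and rotword() stand for the
 * usual AES S-box and byte-rotation helpers and are not defined here):
 *
 *	static void
 *	aes128_expand_round(uint32_t rk[4], const uint32_t prk[4],
 *	    uint32_t rcon)
 *	{
 *		uint32_t t = rotword(subword(prk[3])) ^ rcon;
 *
 *		rk[0] = t ^ prk[0];
 *		rk[1] = rk[0] ^ prk[1];
 *		rk[2] = rk[1] ^ prk[2];
 *		rk[3] = rk[2] ^ prk[3];
 *	}
 *
 * aesni_expand128 computes the same prefix XOR with byte shifts and
 * PXOR so that all four words come out in one 128-bit register.
 */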

/*
 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
 *
 *	Expand a 24-byte AES-192 key into 13 round keys for 12 rounds.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey192)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movq	0x10(%rsi),%xmm1	/* load master key [128:192) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand192b
	ret
END(aesni_setenckey192)

/*
 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
 *
 *	Expand a 32-byte AES-256 key into 15 round keys for 14 rounds.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey256)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movdqu	0x10(%rsi),%xmm1	/* load master key [128:256) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	movdqa	%xmm1,0x10(%rdi)	/* store master key [128:256) as round key */
	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x2,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x8,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x20,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand256a
	ret
END(aesni_setenckey256)
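
/*
 * All of the key-expansion helpers below consume the output of
 * AESKEYGENASSIST.  With the word-numbering convention used in this
 * file (word 0 = least significant), AESKEYGENASSIST with round
 * constant rcon maps a source x[0..3] to (C sketch; subword() and
 * rotword() are assumed helpers, not real functions here):
 *
 *	out[0] = subword(x[1]);
 *	out[1] = rotword(subword(x[1])) ^ rcon;
 *	out[2] = subword(x[3]);
 *	out[3] = rotword(subword(x[3])) ^ rcon;
 *
 * Hence aesni_expand128 picks word 3, aesni_expand192a picks word 1,
 * and aesni_expand256b picks word 2 (where no round constant is
 * wanted).
 */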

/*
 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *	uint128_t keygenassist@xmm2)
 *
 *	1. Compute the AES-128 round key using the previous round key.
 *	2. Store it at *rkp.
 *	3. Set %xmm0 to it.
 *	4. Advance %rdi to point at the next round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
 *	and all other registers).
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand128,@function
aesni_expand128:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, prk[0], prk[1], prk[2])
	 * %xmm5 := (0, 0, prk[0], prk[1])
	 * %xmm6 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm0 := (rk[0] = t ^ prk[0],
	 *	rk[1] = t ^ prk[0] ^ prk[1],
	 *	rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
	 *	rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
	 */
	pxor	%xmm2,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm0
	pxor	%xmm6,%xmm0

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand128)
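
/*
 * The AES-192 and AES-256 paths below are easier to follow next to
 * the generic FIPS-197 key schedule, sketched here in C (illustrative
 * only; Nk is the key length in 32-bit words -- 4, 6, or 8 -- and
 * subword()/rotword()/rcon[] are assumed helpers):
 *
 *	// w[0..Nk-1] is the cipher key; 4*(nrounds + 1) words total.
 *	for (i = Nk; i < 4*(nrounds + 1); i++) {
 *		uint32_t tmp = w[i - 1];
 *
 *		if (i % Nk == 0)
 *			tmp = rotword(subword(tmp)) ^ rcon[i/Nk - 1];
 *		else if (Nk > 6 && i % Nk == 4)
 *			tmp = subword(tmp);
 *		w[i] = w[i - Nk] ^ tmp;
 *	}
 *
 * For AES-192, Nk = 6, so the six-word schedule does not line up with
 * the four-word round keys; aesni_expand192a and aesni_expand192b
 * below alternate to re-pack it, between them emitting three round
 * keys for every twelve new schedule words.
 */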

/*
 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *	uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to two round keys to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (rklo[0], rklo[1], xxx, xxx)
 *		%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[2], rkp advanced by two round keys
 *		%xmm0 = nrk, second round key we just computed
 *		%xmm1 = rk, first round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192a,@function
aesni_expand192a:
	/*
	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
	 */
	pshufd	$0b01010101,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *	^ rklo[1]
	 */

	/*
	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
	 * %xmm5 := (0, prk[0], prk[1], prk[2])
	 * %xmm6 := (0, 0, prk[0], prk[1])
	 * %xmm7 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	movdqa	%xmm0,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
	pxor	%xmm2,%xmm4
	pxor	%xmm5,%xmm4
	pxor	%xmm6,%xmm4
	pxor	%xmm7,%xmm4

	/*
	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
	 * and we have yet to compute nrk[2] or nrk[3], which requires
	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...).  We need
	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
	 * nrk into %xmm0.
	 */

	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	pshufd	$0b11111110,%xmm4,%xmm0

	/*
	 * %xmm6 := (0, 0, rklo[0], rklo[1])
	 * %xmm7 := (0, 0, 0, rklo[0])
	 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/*
	 * %xmm0 := (nrk[0],
	 *	nrk[1],
	 *	nrk[2] = nrk[1] ^ rklo[0],
	 *	nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
	 */
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm0

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	shufps	$0b01000100,%xmm4,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
	lea	0x20(%rdi),%rdi	/* advance two round keys */
	ret
END(aesni_expand192a)

/*
 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
 *	uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (xxx, xxx, pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192b,@function
aesni_expand192b:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
	 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
	 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	^ prk[1]
	 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	^ prk[1] ^ prk[2] ^ prk[3]
	 */

	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
	shufps	$0b01001110,%xmm0,%xmm1

	/*
	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
	 * %xmm6 := (0, 0, pprk[2], pprk[3])
	 * %xmm7 := (0, 0, 0, pprk[2])
	 */
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	pxor	%xmm2,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1
	pxor	%xmm7,%xmm1

	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
	pshufd	$0b00001110,%xmm0,%xmm4

	/* %xmm5 := (0, prk[2], xxx, xxx) */
	movdqa	%xmm4,%xmm5
	pslldq	$4,%xmm5

	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
	movdqa	%xmm1,%xmm0

	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
	shufps	$0b00001111,%xmm1,%xmm1

	/*
	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
	 *	nrk[1] = rk[3] ^ prk[2] ^ prk[3],
	 *	xxx,
	 *	xxx)
	 */
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand192b)

/*
 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
 *	uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm1 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = prk, previous round key, preserved from entry
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	The computation turns out to be the same as for AES-128; the
 *	previous round key does not figure into it, only the
 *	previous-previous round key.
 */
	aesni_expand256a = aesni_expand128

/*
 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *	uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = prk, previous round key, preserved from entry
 *		%xmm1 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand256b,@function
aesni_expand256b:
	/*
	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
	 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
	 */
	pshufd	$0b10101010,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
	 * %xmm5 := (0, 0, pprk[0], pprk[1])
	 * %xmm6 := (0, 0, 0, pprk[0])
	 */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm1 := (rk[0] = t ^ pprk[0],
	 *	rk[1] = t ^ pprk[0] ^ pprk[1],
	 *	rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
	 *	rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
	 */
	pxor	%xmm2,%xmm1
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand256b)

/*
 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
 *	uint32_t nrounds@rdx)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enctodec)
	shl	$4,%edx		/* rdx := byte offset of last round key */
	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
	jmp	2f
	_ALIGN_TEXT
1:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
	movdqa	%xmm0,(%rsi)	/* store round key */
2:	sub	$0x10,%rdx	/* advance to next round key */
	lea	0x10(%rsi),%rsi
	jnz	1b		/* repeat if more rounds */
	movdqa	(%rdi),%xmm0	/* load first round key */
	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
	ret
END(aesni_enctodec)
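
/*
 * aesni_enctodec implements the usual "equivalent inverse cipher" key
 * conversion: decryption with AESDEC walks the round keys in reverse
 * order, and every key except the first and last must be run through
 * InvMixColumns, which is what AESIMC does.  A C sketch of the same
 * loop (illustrative only; aesimc() stands for the AESIMC operation
 * on one 128-bit round key):
 *
 *	// enc[0..nrounds] -> dec[0..nrounds]
 *	dec[0] = enc[nrounds];
 *	for (i = 1; i < nrounds; i++)
 *		dec[i] = aesimc(enc[nrounds - i]);
 *	dec[nrounds] = enc[0];
 */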

/*
 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
 *	uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enc)
	movdqu	(%rsi),%xmm0
	call	aesni_enc1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_enc)

/*
 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
 *	uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_dec)
	movdqu	(%rsi),%xmm0
	call	aesni_dec1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_dec)

/*
 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *	uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_enc)
	cmp	$0,%rcx
	jz	2f
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm0	/* xmm0 := chaining value */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm1	/* xmm1 := plaintext block */
	lea	0x10(%rsi),%rsi
	pxor	%xmm1,%xmm0	/* xmm0 := cv ^ ptxt */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := ciphertext block */
	movdqu	%xmm0,(%rdx)
	lea	0x10(%rdx),%rdx
	sub	$0x10,%r10
	jnz	1b		/* repeat if r10 is nonzero */
	movdqu	%xmm0,(%r8)	/* store chaining value */
2:	ret
END(aesni_cbc_enc)

/*
 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *	uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec1)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8	/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)	/* save iv */
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
	movdqu	%xmm0,(%r8)	/* update iv */
	jmp	2f
	_ALIGN_TEXT
1:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
	pxor	%xmm8,%xmm0	/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
	movdqa	%xmm8,%xmm0	/* move cv = ciphertext block */
2:	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec1	/* xmm0 := cv ^ ptxt */
	sub	$0x10,%r10
	jnz	1b		/* repeat if more blocks */
	pxor	(%rsp),%xmm0	/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx)	/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec1)
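
/*
 * aesni_cbc_dec1 above and aesni_cbc_dec8 below both walk the buffer
 * from the end toward the beginning.  Unlike CBC encryption, CBC
 * decryption has no serial dependency between blocks -- each block
 * needs only ciphertext as input -- so the 8-block routine can
 * decrypt eight blocks in parallel, and going backwards lets both
 * routines store the caller's updated IV (the last ciphertext block)
 * once up front.  Schematically, in C (sketch only; aes_decrypt()
 * stands for one-block AES decryption and is not a real function
 * here):
 *
 *	// n blocks; iv is the chaining value from the previous call
 *	for (i = n - 1; i >= 0; i--)
 *		ptxt[i] = aes_decrypt(key, ctxt[i]) ^ (i ? ctxt[i-1] : iv);
 */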

/*
 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *	uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec8)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8	/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)	/* save iv */
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
	movdqu	%xmm7,(%r8)	/* update iv */
	jmp	2f
	_ALIGN_TEXT
1:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
	pxor	%xmm7,%xmm0	/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
2:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
	movdqa	%xmm6,%xmm15	/* xmm[8+i] := cv[i], 0<i<8 */
	movdqa	%xmm5,%xmm14
	movdqa	%xmm4,%xmm13
	movdqa	%xmm3,%xmm12
	movdqa	%xmm2,%xmm11
	movdqa	%xmm1,%xmm10
	movdqa	%xmm0,%xmm9
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec8	/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
	pxor	%xmm15,%xmm7	/* xmm[i] := ptxt[i], 0<i<8 */
	pxor	%xmm14,%xmm6
	pxor	%xmm13,%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm11,%xmm3
	pxor	%xmm10,%xmm2
	pxor	%xmm9,%xmm1
	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
	movdqu	%xmm6,-0x20(%rdx,%r10)
	movdqu	%xmm5,-0x30(%rdx,%r10)
	movdqu	%xmm4,-0x40(%rdx,%r10)
	movdqu	%xmm3,-0x50(%rdx,%r10)
	movdqu	%xmm2,-0x60(%rdx,%r10)
	movdqu	%xmm1,-0x70(%rdx,%r10)
	sub	$0x80,%r10
	jnz	1b		/* repeat if more blocks */
	pxor	(%rsp),%xmm0	/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx)	/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec8)

/*
 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *	uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc1)
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm0	/* xmm0 := ptxt */
	lea	0x10(%rsi),%rsi	/* advance %rsi to next block */
	pxor	%xmm15,%xmm0	/* xmm0 := ptxt ^ tweak */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := AES(ptxt ^ tweak) */
	pxor	%xmm15,%xmm0	/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)	/* store ciphertext block */
	lea	0x10(%rdx),%rdx	/* advance %rdx to next block */
	call	aesni_xts_mulx	/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b		/* repeat if more blocks */
	movdqu	%xmm15,(%r8)	/* update tweak */
	ret
END(aesni_xts_enc1)
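
/*
 * The XTS routines here all follow the same per-block pattern.  The
 * initial tweak (in XTS, typically the encryption of the block/sector
 * number under the second key) is expected to be computed by the
 * caller; these routines only consume it from r8 and advance it by
 * multiplication by x in GF(2^128) after every block.  In C (sketch
 * only; aes_encrypt() and xts_mulx() stand for one-block AES and the
 * tweak update implemented by aesni_xts_mulx below):
 *
 *	for (i = 0; i < n; i++) {
 *		ctxt[i] = aes_encrypt(key, ptxt[i] ^ tweak) ^ tweak;
 *		tweak = xts_mulx(tweak);
 *	}
 */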

/*
 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *	uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc8)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak[0] */
	_ALIGN_TEXT
1:	movdqa	%xmm15,%xmm8	/* xmm8 := tweak[0] */
	call	aesni_xts_mulx	/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9	/* xmm9 := tweak[1] */
	call	aesni_xts_mulx	/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10	/* xmm10 := tweak[2] */
	call	aesni_xts_mulx	/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11	/* xmm11 := tweak[3] */
	call	aesni_xts_mulx	/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12	/* xmm12 := tweak[4] */
	call	aesni_xts_mulx	/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13	/* xmm13 := tweak[5] */
	call	aesni_xts_mulx	/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14	/* xmm14 := tweak[6] */
	call	aesni_xts_mulx	/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0	/* xmm[i] := ptxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi	/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)	/* save tweak[0] */
	pxor	%xmm8,%xmm0	/* xmm[i] := ptxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc8	/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0	/* xmm[i] := AES(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)	/* store ciphertext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx	/* advance rdx to next block group */
	call	aesni_xts_mulx	/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b		/* repeat if more block groups */
	movdqu	%xmm15,(%r8)	/* update tweak */
	leave
	ret
END(aesni_xts_enc8)
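
/*
 * Because the 8-block and 1-block routines both keep the running
 * tweak in the caller-supplied buffer, they compose directly.  A
 * hypothetical caller (sketch only, not code from this file) handling
 * an arbitrary multiple of 16 bytes might do:
 *
 *	while (nbytes >= 128) {
 *		aesni_xts_enc8(enc, in, out, 128, tweak, nrounds);
 *		in += 128; out += 128; nbytes -= 128;
 *	}
 *	if (nbytes)
 *		aesni_xts_enc1(enc, in, out, nbytes, tweak, nrounds);
 */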

/*
 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *	uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec1)
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm0	/* xmm0 := ctxt */
	lea	0x10(%rsi),%rsi	/* advance %rsi to next block */
	pxor	%xmm15,%xmm0	/* xmm0 := ctxt ^ tweak */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec1	/* xmm0 := AES^-1(ctxt ^ tweak) */
	pxor	%xmm15,%xmm0	/* xmm0 := AES^-1(ctxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)	/* store plaintext block */
	lea	0x10(%rdx),%rdx	/* advance %rdx to next block */
	call	aesni_xts_mulx	/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b		/* repeat if more blocks */
	movdqu	%xmm15,(%r8)	/* update tweak */
	ret
END(aesni_xts_dec1)

/*
 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *	uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec8)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak[0] */
	_ALIGN_TEXT
1:	movdqa	%xmm15,%xmm8	/* xmm8 := tweak[0] */
	call	aesni_xts_mulx	/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9	/* xmm9 := tweak[1] */
	call	aesni_xts_mulx	/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10	/* xmm10 := tweak[2] */
	call	aesni_xts_mulx	/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11	/* xmm11 := tweak[3] */
	call	aesni_xts_mulx	/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12	/* xmm12 := tweak[4] */
	call	aesni_xts_mulx	/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13	/* xmm13 := tweak[5] */
	call	aesni_xts_mulx	/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14	/* xmm14 := tweak[6] */
	call	aesni_xts_mulx	/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0	/* xmm[i] := ctxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi	/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)	/* save tweak[0] */
	pxor	%xmm8,%xmm0	/* xmm[i] := ctxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec8	/* xmm[i] := AES^-1(ctxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0	/* xmm[i] := AES^-1(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)	/* store plaintext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx	/* advance rdx to next block group */
	call	aesni_xts_mulx	/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b		/* repeat if more block groups */
	movdqu	%xmm15,(%r8)	/* update tweak */
	leave
	ret
END(aesni_xts_dec8)
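
/*
 * The tweak update below is multiplication by x in
 * GF(2^128) = GF(2)[x]/(x^128 + x^7 + x^2 + x + 1), i.e. a 128-bit
 * left shift with conditional reduction.  A plain C version of what
 * aesni_xts_mulx and the xtscarry constant implement (sketch only,
 * operating on the two 64-bit halves t[0] and t[1] in little-endian
 * order):
 *
 *	uint64_t lo = t[0], hi = t[1];
 *	uint64_t carry_hi = lo >> 63;	// bit crossing from lo into hi
 *	uint64_t carry_lo = hi >> 63;	// bit falling off x^127
 *
 *	t[0] = (lo << 1) ^ (carry_lo ? 0x87 : 0);
 *	t[1] = (hi << 1) ^ carry_hi;
 *
 * The PCMPGTQ/PSHUFD/PAND sequence computes both carries as masks at
 * once: PCMPGTQ against zero extracts the top bit of each quadword,
 * PSHUFD swaps the two halves so each carry lands next to the
 * quadword it must be folded into, and PAND with xtscarry turns the
 * masks into 0x87 and 1 respectively.
 */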

/*
 * aesni_xts_mulx(tweak@xmm15)
 *
 *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses %xmm0 as temporary.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_xts_mulx,@function
aesni_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low quadword must be
	 *     shifted into the low bit of the high quadword, and
	 * (b) whether the high bit of the high quadword must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
	psllq	$1,%xmm15	/* shift */
	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
	ret
END(aesni_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)

/*
 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_update)
	movdqu	(%rdi),%xmm15
	call	aesni_xts_mulx
	movdqu	%xmm15,(%rsi)
	ret
END(aesni_xts_update)

/*
 * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *	size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbcmac_update1)
	movdqu	(%rcx),%xmm0	/* xmm0 := auth */
	mov	%rdx,%r10	/* r10 := nbytes */
	mov	%rcx,%rdx	/* rdx := &auth */
	_ALIGN_TEXT
1:	pxor	(%rsi),%xmm0	/* xmm0 ^= plaintext block */
	lea	0x10(%rsi),%rsi
	mov	%r8d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := auth'; trash rax,rcx,xmm8 */
	sub	$0x10,%r10
	jnz	1b
	movdqu	%xmm0,(%rdx)	/* store auth' */
	ret
END(aesni_cbcmac_update1)
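
/*
 * aesni_cbcmac_update1 above and the CCM routines below share the
 * same authentication step: the running tag is XORed with each
 * plaintext block and re-encrypted.  CCM adds CTR-mode encryption of
 * the data, with the counter kept big-endian in authctr[16..31], so
 * each iteration of aesni_ccm_enc1 can hand the MAC block and the
 * counter block to aesni_enc2 and encrypt them in parallel.  In C
 * (sketch only; aes_encrypt() and ctr_inc32() are stand-ins for
 * one-block AES and the 32-bit big-endian counter bump):
 *
 *	for (i = 0; i < n; i++) {
 *		auth = aes_encrypt(key, auth ^ ptxt[i]);	// CBC-MAC half
 *		ctrblk = ctr_inc32(ctrblk);			// bump low 32 bits
 *		ctxt[i] = ptxt[i] ^ aes_encrypt(key, ctrblk);	// CTR half
 *	}
 */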

/*
 * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx,
 *	uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_enc1)
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	0x10(%r8),%xmm2	/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm0	/* xmm0 := auth */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (le) */
	_ALIGN_TEXT
1:	movdqu	(%rsi),%xmm3	/* xmm3 := plaintext block */
	paddd	%xmm5,%xmm2	/* increment ctr (32-bit) */
	lea	0x10(%rsi),%rsi
	movdqa	%xmm2,%xmm1	/* xmm1 := ctr (le) */
	mov	%r9d,%ecx	/* ecx := nrounds */
	pshufb	%xmm4,%xmm1	/* xmm1 := ctr (be) */
	pxor	%xmm3,%xmm0	/* xmm0 := auth ^ ptxt */
	call	aesni_enc2	/* xmm0 := auth', xmm1 := pad;
				 * trash rax/rcx/xmm8 */
	pxor	%xmm1,%xmm3	/* xmm3 := ciphertext block */
	sub	$0x10,%r10	/* count down bytes */
	movdqu	%xmm3,(%rdx)	/* store ciphertext block */
	lea	0x10(%rdx),%rdx
	jnz	1b		/* repeat if more blocks */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)	/* store updated auth */
	movdqu	%xmm2,0x10(%r8)	/* store updated ctr */
	ret
END(aesni_ccm_enc1)

/*
 * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *	uint8_t *out@rdx, size_t nbytes@rcx,
 *	uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_dec1)
	movdqu	0x10(%r8),%xmm2	/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm1	/* xmm1 := auth */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (le) */
	mov	%rcx,%r10	/* r10 := nbytes */

	/* Decrypt the first block. */
	paddd	%xmm5,%xmm2	/* increment ctr (32-bit) */
	mov	%r9d,%ecx	/* ecx := nrounds */
	movdqa	%xmm2,%xmm0	/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3	/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0	/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi
	call	aesni_enc1	/* xmm0 := pad; trash rax/rcx/xmm8 */
	jmp	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	xmm1 = auth ^ ptxt[-1]
	 *	xmm2 = ctr[-1] (le)
	 */
	paddd	%xmm5,%xmm2	/* increment ctr (32-bit) */
	mov	%r9d,%ecx	/* ecx := nrounds */
	movdqa	%xmm2,%xmm0	/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3	/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0	/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi
	call	aesni_enc2	/* xmm0 := pad, xmm1 := auth';
				 * trash rax/rcx/xmm8 */
2:	pxor	%xmm0,%xmm3	/* xmm3 := ptxt */
	sub	$0x10,%r10
	movdqu	%xmm3,(%rdx)	/* store plaintext */
	lea	0x10(%rdx),%rdx
	pxor	%xmm3,%xmm1	/* xmm1 := auth ^ ptxt */
	jnz	1b

	/* Authenticate the last block. */
	movdqa	%xmm1,%xmm0	/* xmm0 := auth ^ ptxt */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := auth' */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)	/* store updated auth */
	movdqu	%xmm2,0x10(%r8)	/* store updated ctr */
	ret
END(aesni_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	bswap32,@object
bswap32:
	.byte	3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
END(bswap32)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.byte	0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
END(ctr32_inc)
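
/*
 * The single-, two-, and eight-block primitives below all use the
 * same loop shape: XOR in round key 0, run nrounds - 1 AESENC/AESDEC
 * rounds, and finish with AESENCLAST/AESDECLAST and the final round
 * key.  The shl/lea/neg register dance just turns %rcx into a
 * negative byte offset that counts up to zero, so the loop needs no
 * separate counter.  Equivalent C for one block (sketch only;
 * aesenc() and aesenclast() stand for the corresponding instructions):
 *
 *	blk ^= rk[0];
 *	for (i = 1; i < nrounds; i++)
 *		blk = aesenc(blk, rk[i]);
 *	blk = aesenclast(blk, rk[nrounds]);
 */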

/*
 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
 *	uint32_t nrounds@ecx)
 *
 *	Encrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc1,@function
aesni_enc1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := byte offset of last round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesenc	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	ret
END(aesni_enc1)

/*
 * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
 *	uint128_t block1@xmm1, uint32_t nrounds@ecx)
 *
 *	Encrypt two AES blocks in %xmm0 and %xmm1.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc2,@function
aesni_enc2:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	shl	$4,%ecx		/* ecx := byte offset of last round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	jmp	2f
	_ALIGN_TEXT
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	ret
END(aesni_enc2)

/*
 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
 *	block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc8,@function
aesni_enc8:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := byte offset of last round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
	aesenc	%xmm8,%xmm2
	aesenc	%xmm8,%xmm3
	aesenc	%xmm8,%xmm4
	aesenc	%xmm8,%xmm5
	aesenc	%xmm8,%xmm6
	aesenc	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	aesenclast %xmm8,%xmm2
	aesenclast %xmm8,%xmm3
	aesenclast %xmm8,%xmm4
	aesenclast %xmm8,%xmm5
	aesenclast %xmm8,%xmm6
	aesenclast %xmm8,%xmm7
	ret
END(aesni_enc8)

/*
 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
 *	uint32_t nrounds@ecx)
 *
 *	Decrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec1,@function
aesni_dec1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := byte offset of last round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesdec	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	ret
END(aesni_dec1)

/*
 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
 *	block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec8,@function
aesni_dec8:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := byte offset of last round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
	_ALIGN_TEXT
1:	aesdec	%xmm8,%xmm0
	aesdec	%xmm8,%xmm1
	aesdec	%xmm8,%xmm2
	aesdec	%xmm8,%xmm3
	aesdec	%xmm8,%xmm4
	aesdec	%xmm8,%xmm5
	aesdec	%xmm8,%xmm6
	aesdec	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	aesdeclast %xmm8,%xmm1
	aesdeclast %xmm8,%xmm2
	aesdeclast %xmm8,%xmm3
	aesdeclast %xmm8,%xmm4
	aesdeclast %xmm8,%xmm5
	aesdeclast %xmm8,%xmm6
	aesdeclast %xmm8,%xmm7
	ret
END(aesni_dec8)