/*	$OpenBSD: aes_intel.S,v 1.14 2021/09/04 22:15:33 bluhm Exp $	*/

/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008-2010, Intel Corporation
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the
 *   distribution.
 *
 * - Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products
 *   derived from this software without specific prior written
 *   permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Changes to the original source code released by Intel:
 *
 * - assembler macros were converted to the actual instructions;
 * - aesni_ctr_enc was changed to be RFC 3686 compliant;
 * - aes-gcm mode added;
 * - aes-xts implementation added;
 *
 * Copyright (c) 2010,2011 Mike Belopuhov
 * Copyright (c) 2013 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <machine/param.h>
#include <machine/asm.h>

/*
 * Syntax: AT&T (GAS), preprocessed by cpp.  The exported entry points take
 * their arguments in %rdi, %rsi, %rdx, %rcx, %r8 (see KEYP..IVP below).
 *
 * Layout facts used throughout this file, as established by aesni_set_key:
 *   offset   0: encryption key schedule
 *   offset 240: decryption key schedule
 *   offset 480: key length in bytes as passed to aesni_set_key
 */

/* SSE register roles */
#define STATE1		%xmm0
#define STATE2		%xmm4
#define STATE3		%xmm5
#define STATE4		%xmm6
#define STATE		STATE1
#define IN1		%xmm1
#define IN2		%xmm7
#define IN3		%xmm8
#define IN4		%xmm9
#define IN		IN1
#define KEY		%xmm2
#define IV		%xmm3
#define BSWAP_MASK	%xmm10
#define CTR		%xmm11
#define INC		%xmm12

/* general purpose register roles */
#define KEYP		%rdi
#define OUTP		%rsi
#define INP		%rdx
#define LEN		%rcx
#define HSTATE		%rcx
#define IVP		%r8
#define ICBP		%r8
#define KLEN		%r9d
#define T1		%r10
#define TKEYP		T1
#define T2		%r11
#define TCTR_LOW	T2

	.section .rodata
.align 16
/* pshufb control mask that reverses the 16 bytes of an xmm register */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

	.text

/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	aeskeygenassist result
 *	%xmm4:	scratch, zeroed once by aesni_set_key before the first call
 *	%rcx:	where to store the new round key
 * output:
 *	%xmm0:	new round key, also stored at (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4, %rax (retguard cookie)
 */
_key_expansion_128:
_key_expansion_256a:
	RETGUARD_SETUP(_key_expansion_128, rax)
	pshufd	$0b11111111,%xmm1,%xmm1
	shufps	$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm0
	movaps	%xmm0,(%rcx)
	add	$0x10,%rcx
	RETGUARD_CHECK(_key_expansion_128, rax)
	ret
	lfence

/*
 * _key_expansion_192a: internal ABI
 * Same register roles as _key_expansion_128, with %xmm2 carrying the
 * remaining key words.  Stores 32 bytes at (%rcx) and advances %rcx by 32.
 * changed:
 *	%xmm0-%xmm6, %rax (retguard cookie)
 */
_key_expansion_192a:
	RETGUARD_SETUP(_key_expansion_192a, rax)
	pshufd	$0b01010101,%xmm1,%xmm1
	shufps	$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm0

	movaps	%xmm2,%xmm5
	movaps	%xmm2,%xmm6
	pslldq	$4,%xmm5
	pshufd	$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	pxor	%xmm5,%xmm2

	movaps	%xmm0,%xmm1
	shufps	$0b01000100,%xmm0,%xmm6
	movaps	%xmm6,(%rcx)
	shufps	$0b01001110,%xmm2,%xmm1
	movaps	%xmm1,16(%rcx)
	add	$0x20,%rcx
	RETGUARD_CHECK(_key_expansion_192a, rax)
	ret
	lfence

/*
 * _key_expansion_192b: internal ABI
 * Counterpart of _key_expansion_192a.  Stores 16 bytes at (%rcx) and
 * advances %rcx by 16.
 * changed:
 *	%xmm0-%xmm5, %rax (retguard cookie)
 */
_key_expansion_192b:
	RETGUARD_SETUP(_key_expansion_192b, rax)
	pshufd	$0b01010101,%xmm1,%xmm1
	shufps	$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm0

	movaps	%xmm2,%xmm5
	pslldq	$4,%xmm5
	pshufd	$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	pxor	%xmm5,%xmm2

	movaps	%xmm0,(%rcx)
	add	$0x10,%rcx
	RETGUARD_CHECK(_key_expansion_192b, rax)
	ret
	lfence

/*
 * _key_expansion_256b: internal ABI
 * Like _key_expansion_256a but updates the second key half kept in %xmm2.
 * Stores the new %xmm2 at (%rcx) and advances %rcx by 16.
 * changed:
 *	%xmm1, %xmm2, %xmm4, %rax (retguard cookie)
 */
_key_expansion_256b:
	RETGUARD_SETUP(_key_expansion_256b, rax)
	pshufd	$0b10101010,%xmm1,%xmm1
	shufps	$0b00010000,%xmm2,%xmm4
	pxor	%xmm4,%xmm2
	shufps	$0b10001100,%xmm2,%xmm4
	pxor	%xmm4,%xmm2
	pxor	%xmm1,%xmm2
	movaps	%xmm2,(%rcx)
	add	$0x10,%rcx
	RETGUARD_CHECK(_key_expansion_256b, rax)
	ret
	lfence

/*
 * void aesni_set_key(struct aesni_session *ses, uint8_t *key, size_t len)
 *
 * Expands the user key into the encryption schedule at ses+0, records len
 * at ses+480 and derives the decryption schedule at ses+240 with aesimc.
 * The key size is selected by comparing the low byte of len with 24:
 * below is the 128 bit path, equal the 192 bit path, above the 256 bit path.
 */
ENTRY(aesni_set_key)
	RETGUARD_SETUP(aesni_set_key, r11)
	movups	(%rsi),%xmm0		# user key (first 16 bytes)
	movaps	%xmm0,(%rdi)
	lea	0x10(%rdi),%rcx		# key addr
	movl	%edx,480(%rdi)		/* remember key length for later calls */
	pxor	%xmm4,%xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp	$24,%dl
	jb	2f
	je	1f
	/* 256 bit key */
	movups	0x10(%rsi),%xmm2	# other user key
	movaps	%xmm2,(%rcx)
	add	$0x10,%rcx
	aeskeygenassist $0x1,%xmm2,%xmm1	# round 1
	call	_key_expansion_256a
	aeskeygenassist $0x1,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x2,%xmm2,%xmm1	# round 2
	call	_key_expansion_256a
	aeskeygenassist $0x2,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x4,%xmm2,%xmm1	# round 3
	call	_key_expansion_256a
	aeskeygenassist $0x4,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x8,%xmm2,%xmm1	# round 4
	call	_key_expansion_256a
	aeskeygenassist $0x8,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x10,%xmm2,%xmm1	# round 5
	call	_key_expansion_256a
	aeskeygenassist $0x10,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x20,%xmm2,%xmm1	# round 6
	call	_key_expansion_256a
	aeskeygenassist $0x20,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x40,%xmm2,%xmm1	# round 7
	call	_key_expansion_256a
	jmp	3f
1:	/* 192 bit key */
	movq	0x10(%rsi),%xmm2	# other user key
	aeskeygenassist $0x1,%xmm2,%xmm1	# round 1
	call	_key_expansion_192a
	aeskeygenassist $0x2,%xmm2,%xmm1	# round 2
	call	_key_expansion_192b
	aeskeygenassist $0x4,%xmm2,%xmm1	# round 3
	call	_key_expansion_192a
	aeskeygenassist $0x8,%xmm2,%xmm1	# round 4
	call	_key_expansion_192b
	aeskeygenassist $0x10,%xmm2,%xmm1	# round 5
	call	_key_expansion_192a
	aeskeygenassist $0x20,%xmm2,%xmm1	# round 6
	call	_key_expansion_192b
	aeskeygenassist $0x40,%xmm2,%xmm1	# round 7
	call	_key_expansion_192a
	aeskeygenassist $0x80,%xmm2,%xmm1	# round 8
	call	_key_expansion_192b
	jmp	3f
2:	/* 128 bit key */
	aeskeygenassist $0x1,%xmm0,%xmm1	# round 1
	call	_key_expansion_128
	aeskeygenassist $0x2,%xmm0,%xmm1	# round 2
	call	_key_expansion_128
	aeskeygenassist $0x4,%xmm0,%xmm1	# round 3
	call	_key_expansion_128
	aeskeygenassist $0x8,%xmm0,%xmm1	# round 4
	call	_key_expansion_128
	aeskeygenassist $0x10,%xmm0,%xmm1	# round 5
	call	_key_expansion_128
	aeskeygenassist $0x20,%xmm0,%xmm1	# round 6
	call	_key_expansion_128
	aeskeygenassist $0x40,%xmm0,%xmm1	# round 7
	call	_key_expansion_128
	aeskeygenassist $0x80,%xmm0,%xmm1	# round 8
	call	_key_expansion_128
	aeskeygenassist $0x1b,%xmm0,%xmm1	# round 9
	call	_key_expansion_128
	aeskeygenassist $0x36,%xmm0,%xmm1	# round 10
	call	_key_expansion_128
3:
	/*
	 * Build the decryption schedule at +240: the first and last
	 * encryption round keys are copied swapped, every key in between
	 * goes through aesimc and is stored in reverse order.
	 */
	sub	$0x10,%rcx		/* %rcx = last encryption round key */
	movaps	(%rdi),%xmm0
	movaps	(%rcx),%xmm1
	movaps	%xmm0,240(%rcx)
	movaps	%xmm1,240(%rdi)
	add	$0x10,%rdi
	lea	240-16(%rcx),%rsi
.align 4
4:
	movaps	(%rdi),%xmm0
	aesimc	%xmm0,%xmm1
	movaps	%xmm1,(%rsi)
	add	$0x10,%rdi
	sub	$0x10,%rsi
	cmp	%rcx,%rdi
	jb	4b
	RETGUARD_CHECK(aesni_set_key, r11)
	ret
	lfence

/*
 * void aesni_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src)
 */
ENTRY(aesni_enc)
	RETGUARD_SETUP(aesni_enc, r11)
	movl	480(KEYP),KLEN		# key length
	movups	(INP),STATE		# input
	call	_aesni_enc1
	movups	STATE,(OUTP)		# output
	RETGUARD_CHECK(aesni_enc, r11)
	ret
	lfence

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length in bytes (compared with 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard cookie)
 */
_aesni_enc1:
	RETGUARD_SETUP(_aesni_enc1, rax)
	movaps	(KEYP),KEY		# key
	mov	KEYP,TKEYP
	pxor	KEY,STATE		# round 0
	add	$0x30,TKEYP
	cmp	$24,KLEN
	jb	2f
	lea	0x20(TKEYP),TKEYP	/* lea keeps the flags for the je below */
	je	1f
	add	$0x20,TKEYP
	movaps	-0x60(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	-0x50(TKEYP),KEY
	aesenc	KEY,STATE
.align 4
1:	/* 192 bit key */
	movaps	-0x40(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	-0x30(TKEYP),KEY
	aesenc	KEY,STATE
.align 4
2:	/* 128 bit key */
	movaps	-0x20(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	-0x10(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x10(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x20(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x30(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x40(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x50(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x60(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x70(TKEYP),KEY
	aesenclast KEY,STATE
	RETGUARD_CHECK(_aesni_enc1, rax)
	ret
	lfence

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length in bytes (compared with 24 below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard cookie)
 */
_aesni_enc4:
	RETGUARD_SETUP(_aesni_enc4, rax)
	movaps	(KEYP),KEY		# key
	mov	KEYP,TKEYP
	pxor	KEY,STATE1		# round 0
	pxor	KEY,STATE2
	pxor	KEY,STATE3
	pxor	KEY,STATE4
	add	$0x30,TKEYP
	cmp	$24,KLEN
	jb	2f
	lea	0x20(TKEYP),TKEYP	/* lea keeps the flags for the je below */
	je	1f
	add	$0x20,TKEYP
	movaps	-0x60(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	-0x50(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
/* .align 4 (left disabled, as in the original) */
1:	/* 192 bit key */
	movaps	-0x40(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	-0x30(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
/* .align 4 (left disabled, as in the original) */
2:	/* 128 bit key */
	movaps	-0x20(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	-0x10(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x10(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x20(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x30(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x40(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x50(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x60(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x70(TKEYP),KEY
	aesenclast KEY,STATE1		# last round
	aesenclast KEY,STATE2
	aesenclast KEY,STATE3
	aesenclast KEY,STATE4
	RETGUARD_CHECK(_aesni_enc4, rax)
	ret
	lfence

/*
 * void aesni_dec(struct aesni_session *ses, uint8_t *dst, uint8_t *src)
 */
ENTRY(aesni_dec)
	RETGUARD_SETUP(aesni_dec, r11)
	mov	480(KEYP),KLEN		# key length
	add	$240,KEYP		/* switch to the decryption schedule */
	movups	(INP),STATE		# input
	call	_aesni_dec1
	movups	STATE,(OUTP)		# output
	RETGUARD_CHECK(aesni_dec, r11)
	ret
	lfence

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard cookie)
 */
_aesni_dec1:
	RETGUARD_SETUP(_aesni_dec1, rax)
	movaps	(KEYP),KEY		# key
	mov	KEYP,TKEYP
	pxor	KEY,STATE		# round 0
	add	$0x30,TKEYP
	cmp	$24,KLEN
	jb	2f
	lea	0x20(TKEYP),TKEYP	/* lea keeps the flags for the je below */
	je	1f
	add	$0x20,TKEYP
	movaps	-0x60(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	-0x50(TKEYP),KEY
	aesdec	KEY,STATE
.align 4
1:	/* 192 bit key */
	movaps	-0x40(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	-0x30(TKEYP),KEY
	aesdec	KEY,STATE
.align 4
2:	/* 128 bit key */
	movaps	-0x20(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	-0x10(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x10(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x20(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x30(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x40(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x50(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x60(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x70(TKEYP),KEY
	aesdeclast KEY,STATE
	RETGUARD_CHECK(_aesni_dec1, rax)
	ret
	lfence

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard cookie)
 */
_aesni_dec4:
	RETGUARD_SETUP(_aesni_dec4, rax)
	movaps	(KEYP),KEY		# key
	mov	KEYP,TKEYP
	pxor	KEY,STATE1		# round 0
	pxor	KEY,STATE2
	pxor	KEY,STATE3
	pxor	KEY,STATE4
	add	$0x30,TKEYP
	cmp	$24,KLEN
	jb	2f
	lea	0x20(TKEYP),TKEYP	/* lea keeps the flags for the je below */
	je	1f
	add	$0x20,TKEYP
	movaps	-0x60(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	-0x50(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
.align 4
1:	/* 192 bit key */
	movaps	-0x40(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	-0x30(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
.align 4
2:	/* 128 bit key */
	movaps	-0x20(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	-0x10(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x10(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x20(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x30(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x40(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x50(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x60(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x70(TKEYP),KEY
	aesdeclast KEY,STATE1		# last round
	aesdeclast KEY,STATE2
	aesdeclast KEY,STATE3
	aesdeclast KEY,STATE4
	RETGUARD_CHECK(_aesni_dec4, rax)
	ret
	lfence

#if 0
/*
 * void aesni_ecb_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len)
 */
ENTRY(aesni_ecb_enc)
	RETGUARD_SETUP(aesni_ecb_enc, r11)
	test	LEN,LEN			# check length
	jz	3f
	mov	480(KEYP),KLEN
	cmp	$16,LEN
	jb	3f
	cmp	$64,LEN
	jb	2f
.align 4
1:
	movups	(INP),STATE1
	movups	0x10(INP),STATE2
	movups	0x20(INP),STATE3
	movups	0x30(INP),STATE4
	call	_aesni_enc4
	movups	STATE1,(OUTP)
	movups	STATE2,0x10(OUTP)
	movups	STATE3,0x20(OUTP)
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	1b			/* LEN is a size_t: unsigned compare */
	cmp	$16,LEN
	jb	3f
.align 4
2:
	movups	(INP),STATE1
	call	_aesni_enc1
	movups	STATE1,(OUTP)
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	2b			/* LEN is a size_t: unsigned compare */
3:
	RETGUARD_CHECK(aesni_ecb_enc, r11)
	ret
	lfence

/*
 * void aesni_ecb_dec(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len);
 */
ENTRY(aesni_ecb_dec)
	RETGUARD_SETUP(aesni_ecb_dec, r11)
	test	LEN,LEN
	jz	3f
	mov	480(KEYP),KLEN
	add	$240,KEYP
	cmp	$16,LEN
	jb	3f
	cmp	$64,LEN
	jb	2f
.align 4
1:
	movups	(INP),STATE1
	movups	0x10(INP),STATE2
	movups	0x20(INP),STATE3
	movups	0x30(INP),STATE4
	call	_aesni_dec4
	movups	STATE1,(OUTP)
	movups	STATE2,0x10(OUTP)
	movups	STATE3,0x20(OUTP)
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	1b			/* LEN is a size_t: unsigned compare */
	cmp	$16,LEN
	jb	3f
.align 4
2:
	movups	(INP),STATE1
	call	_aesni_dec1
	movups	STATE1,(OUTP)
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	2b			/* LEN is a size_t: unsigned compare */
3:
	RETGUARD_CHECK(aesni_ecb_dec, r11)
	ret
	lfence
#endif

/*
 * void aesni_cbc_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *iv)
 *
 * Processes whole 16 byte blocks only; returns without touching anything
 * when len < 16.  The last ciphertext block is written back to iv.
 */
ENTRY(aesni_cbc_enc)
	RETGUARD_SETUP(aesni_cbc_enc, r11)
	cmp	$16,LEN
	jb	2f
	mov	480(KEYP),KLEN
	movups	(IVP),STATE		# load iv as initial state
.align 4
1:
	movups	(INP),IN		# load input
	pxor	IN,STATE
	call	_aesni_enc1
	movups	STATE,(OUTP)		# store output
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	1b			/* LEN is a size_t: unsigned compare */
	movups	STATE,(IVP)
2:
	RETGUARD_CHECK(aesni_cbc_enc, r11)
	ret
	lfence

/*
 * void aesni_cbc_dec(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *iv)
 *
 * Processes whole 16 byte blocks only; returns without touching anything
 * when len < 16.  The last ciphertext block consumed is written back to iv.
 */
ENTRY(aesni_cbc_dec)
	RETGUARD_SETUP(aesni_cbc_dec, r11)
	cmp	$16,LEN
	jb	4f
	mov	480(KEYP),KLEN
	add	$240,KEYP		/* switch to the decryption schedule */
	movups	(IVP),IV
	cmp	$64,LEN
	jb	2f
.align 4
1:	/* pipeline 4 instructions when possible */
	movups	(INP),IN1
	movaps	IN1,STATE1
	movups	0x10(INP),IN2
	movaps	IN2,STATE2
	movups	0x20(INP),IN3
	movaps	IN3,STATE3
	movups	0x30(INP),IN4
	movaps	IN4,STATE4
	call	_aesni_dec4
	pxor	IV,STATE1
	pxor	IN1,STATE2
	pxor	IN2,STATE3
	pxor	IN3,STATE4
	movaps	IN4,IV			/* chain: next IV is the last ciphertext */
	movups	STATE1,(OUTP)
	movups	STATE2,0x10(OUTP)
	movups	STATE3,0x20(OUTP)
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	1b			/* LEN is a size_t: unsigned compare */
	cmp	$16,LEN
	jb	3f
.align 4
2:
	movups	(INP),IN
	movaps	IN,STATE
	call	_aesni_dec1
	pxor	IV,STATE
	movups	STATE,(OUTP)
	movaps	IN,IV
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	2b			/* LEN is a size_t: unsigned compare */
3:
	movups	IV,(IVP)
4:
	RETGUARD_CHECK(aesni_cbc_dec, r11)
	ret
	lfence

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	CTR:	ICB as loaded from memory by the caller
 * output:
 *	CTR:	== CTR, in little endian
 *	IV:	== IV, in big endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * changed:
 *	%rax (retguard cookie)
 */
_aesni_inc_init:
	RETGUARD_SETUP(_aesni_inc_init, rax)
	movdqa	CTR,IV
	pslldq	$8,IV
	movdqu	.Lbswap_mask(%rip),BSWAP_MASK
	pshufb	BSWAP_MASK,CTR
	mov	$1,TCTR_LOW
	movd	TCTR_LOW,INC
	movd	CTR,TCTR_LOW
	RETGUARD_CHECK(_aesni_inc_init, rax)
	ret
	lfence

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	%rax (retguard cookie)
 */
_aesni_inc:
	RETGUARD_SETUP(_aesni_inc, rax)
	paddq	INC,CTR
	add	$1,TCTR_LOW
	jnc	1f
	/* low qword wrapped: propagate the carry into the high qword */
	pslldq	$8,INC
	paddq	INC,CTR
	psrldq	$8,INC
1:
	movaps	CTR,IV
	pshufb	BSWAP_MASK,IV
	RETGUARD_CHECK(_aesni_inc, rax)
	ret
	lfence

/*
 * void aesni_ctr_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *icb)
 *
 * Processes whole 16 byte blocks only; returns without touching anything
 * when len < 16.  The retguard cookie is pushed because %r11 doubles as
 * TCTR_LOW.
 */
ENTRY(aesni_ctr_enc)
	RETGUARD_SETUP(aesni_ctr_enc, r11)
	RETGUARD_PUSH(r11)
	cmp	$16,LEN
	jb	4f
	mov	480(KEYP),KLEN
	movdqu	(ICBP),CTR
	call	_aesni_inc_init
	cmp	$64,LEN
	jb	2f
.align 4
1:	/* pipeline 4 instructions when possible */
	call	_aesni_inc
	movaps	IV,STATE1
	movups	(INP),IN1
	call	_aesni_inc
	movaps	IV,STATE2
	movups	0x10(INP),IN2
	call	_aesni_inc
	movaps	IV,STATE3
	movups	0x20(INP),IN3
	call	_aesni_inc
	movaps	IV,STATE4
	movups	0x30(INP),IN4
	call	_aesni_enc4
	pxor	IN1,STATE1
	movups	STATE1,(OUTP)
	pxor	IN2,STATE2
	movups	STATE2,0x10(OUTP)
	pxor	IN3,STATE3
	movups	STATE3,0x20(OUTP)
	pxor	IN4,STATE4
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	1b			/* LEN is a size_t: unsigned compare */
	cmp	$16,LEN
	jb	3f
.align 4
2:
	call	_aesni_inc
	movaps	IV,STATE
	movups	(INP),IN
	call	_aesni_enc1
	pxor	IN,STATE
	movups	STATE,(OUTP)
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	2b			/* LEN is a size_t: unsigned compare */
3:
	movq	IV,(IVP)		/* stores the low 8 bytes of IV only */
4:
	RETGUARD_POP(r11)
	RETGUARD_CHECK(aesni_ctr_enc, r11)
	ret
	lfence

/*
 * _aesni_gmac_gfmul:	internal ABI
 * input:
 *	%xmm0, %xmm1:	the two operands
 * output:
 *	%xmm6:		the reduced product
 * changed:
 *	%xmm2-%xmm9, %rax (retguard cookie)
 *	%xmm0 and %xmm1 are only read.
 */
_aesni_gmac_gfmul:
	RETGUARD_SETUP(_aesni_gmac_gfmul, rax)
	movdqa	%xmm0,%xmm3
	pclmulqdq $0x00,%xmm1,%xmm3	# xmm3 holds a0*b0
	movdqa	%xmm0,%xmm4
	pclmulqdq $0x10,%xmm1,%xmm4	# xmm4 holds a0*b1
	movdqa	%xmm0,%xmm5
	pclmulqdq $0x01,%xmm1,%xmm5	# xmm5 holds a1*b0
	movdqa	%xmm0,%xmm6
	pclmulqdq $0x11,%xmm1,%xmm6	# xmm6 holds a1*b1

	pxor	%xmm5,%xmm4		# xmm4 holds a0*b1 + a1*b0
	movdqa	%xmm4,%xmm5
	psrldq	$8,%xmm4
	pslldq	$8,%xmm5
	pxor	%xmm5,%xmm3
	pxor	%xmm4,%xmm6

	/*
	 * <xmm6:xmm3> holds the result of the carry-less
	 * multiplication of xmm0 by xmm1
	 *
	 * shift the result by one bit position to the left
	 * cope for the fact that bits are reversed
	 */
	movdqa	%xmm3,%xmm7
	movdqa	%xmm6,%xmm8
	pslld	$1,%xmm3
	pslld	$1,%xmm6
	psrld	$31,%xmm7
	psrld	$31,%xmm8
	movdqa	%xmm7,%xmm9
	pslldq	$4,%xmm8
	pslldq	$4,%xmm7
	psrldq	$12,%xmm9
	por	%xmm7,%xmm3
	por	%xmm8,%xmm6
	por	%xmm9,%xmm6

	/* first phase of the reduction */
	movdqa	%xmm3,%xmm7
	movdqa	%xmm3,%xmm8
	movdqa	%xmm3,%xmm9
	pslld	$31,%xmm7		# packed left shift << 31
	pslld	$30,%xmm8		# packed left shift << 30
	pslld	$25,%xmm9		# packed left shift << 25
	pxor	%xmm8,%xmm7		# xor the shifted versions
	pxor	%xmm9,%xmm7
	movdqa	%xmm7,%xmm8
	pslldq	$12,%xmm7
	psrldq	$4,%xmm8
	pxor	%xmm7,%xmm3

	/* second phase of the reduction */
	movdqa	%xmm3,%xmm2
	movdqa	%xmm3,%xmm4
	movdqa	%xmm3,%xmm5
	psrld	$1,%xmm2		# packed right shift >> 1
	psrld	$2,%xmm4		# packed right shift >> 2
	psrld	$7,%xmm5		# packed right shift >> 7
	pxor	%xmm4,%xmm2		# xor the shifted versions
	pxor	%xmm5,%xmm2
	pxor	%xmm8,%xmm2
	pxor	%xmm2,%xmm3
	pxor	%xmm3,%xmm6		# the result is in xmm6
	RETGUARD_CHECK(_aesni_gmac_gfmul, rax)
	ret
	lfence

/*
 * void aesni_gmac_update(GHASH_CTX *ghash, uint8_t *src, size_t len)
 */
/*
 * Register use: %rdi = ghash, %rsi = src, %rdx = len.
 * Reads the hash subkey from ghash+0 and the running state from ghash+32,
 * folds every whole 16 byte block of src into the state and stores the
 * result at both ghash+16 and ghash+32.  Returns without touching
 * anything when len < 16.
 */
ENTRY(aesni_gmac_update)
	RETGUARD_SETUP(aesni_gmac_update, r11)
	cmp	$16,%rdx
	jb	2f

	movdqu	.Lbswap_mask(%rip),BSWAP_MASK	# endianness swap mask

	movdqu	(%rdi),%xmm1		# hash subkey
	movdqu	32(%rdi),%xmm6		# initial state
	pshufb	BSWAP_MASK,%xmm1
	pshufb	BSWAP_MASK,%xmm6
1:
	movdqu	(%rsi),%xmm2
	pshufb	BSWAP_MASK,%xmm2
	movdqa	%xmm6,%xmm0
	pxor	%xmm2,%xmm0
	call	_aesni_gmac_gfmul	/* new state comes back in %xmm6 */

	sub	$16,%rdx
	add	$16,%rsi
	cmp	$16,%rdx
	jae	1b			/* len is a size_t: unsigned compare */

	pshufb	BSWAP_MASK,%xmm6
	movdqu	%xmm6,16(%rdi)
	movdqu	%xmm6,32(%rdi)
2:
	RETGUARD_CHECK(aesni_gmac_update, r11)
	ret
	lfence

/*
 * void aesni_gmac_final(struct aesni_session *ses, uint8_t *tag,
 *     uint8_t *icb, uint8_t *hashstate)
 *
 * tag = E_k(icb) xor hashstate
 */
ENTRY(aesni_gmac_final)
	RETGUARD_SETUP(aesni_gmac_final, r11)
	movl	480(KEYP),KLEN		# key length
	movdqu	(INP),STATE		# icb
	call	_aesni_enc1
	movdqu	(HSTATE),IN
	pxor	IN,STATE
	movdqu	STATE,(OUTP)		# output
	RETGUARD_CHECK(aesni_gmac_final, r11)
	ret
	lfence

/*
 * void aesni_xts_enc(struct aesni_xts_ctx *xts, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *iv)
 *
 * Processes whole 16 byte blocks only; returns without touching anything
 * when len < 16.  The retguard cookie is pushed because the tweak helpers
 * use %r11 as scratch.
 */
ENTRY(aesni_xts_enc)
	RETGUARD_SETUP(aesni_xts_enc, r11)
	RETGUARD_PUSH(r11)
	cmp	$16,%rcx
	jb	2f

	call	_aesni_xts_tweak

	movl	480(KEYP),KLEN		# key length
1:
	movups	(%rdx),%xmm0		# src
	pxor	%xmm3,%xmm0		# xor block with tweak
	call	_aesni_enc1
	pxor	%xmm3,%xmm0		# xor block with tweak
	movups	%xmm0,(%rsi)		# dst

	call	_aesni_xts_tweak_exp

	add	$16,%rsi
	add	$16,%rdx
	sub	$16,%rcx
	cmp	$16,%rcx
	jae	1b			/* len is a size_t: unsigned compare */
2:
	RETGUARD_POP(r11)
	RETGUARD_CHECK(aesni_xts_enc, r11)
	ret
	lfence

/*
 * void aesni_xts_dec(struct aesni_xts_ctx *xts, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *iv)
 *
 * Processes whole 16 byte blocks only; returns without touching anything
 * when len < 16.  The retguard cookie is pushed because the tweak helpers
 * use %r11 as scratch.
 */
ENTRY(aesni_xts_dec)
	RETGUARD_SETUP(aesni_xts_dec, r11)
	RETGUARD_PUSH(r11)
	cmp	$16,%rcx
	jb	2f

	call	_aesni_xts_tweak

	movl	480(KEYP),KLEN		# key length
	add	$240,KEYP		# decryption key
1:
	movups	(%rdx),%xmm0		# src
	pxor	%xmm3,%xmm0		# xor block with tweak
	call	_aesni_dec1
	pxor	%xmm3,%xmm0		# xor block with tweak
	movups	%xmm0,(%rsi)		# dst

	call	_aesni_xts_tweak_exp

	add	$16,%rsi
	add	$16,%rdx
	sub	$16,%rcx
	cmp	$16,%rcx
	jae	1b			/* len is a size_t: unsigned compare */
2:
	RETGUARD_POP(r11)
	RETGUARD_CHECK(aesni_xts_dec, r11)
	ret
	lfence

/*
 * Prepare tweak as E_k2(IV). IV is specified as LE representation of a
 * 64-bit block number which we allow to be passed in directly. Since
 * we're on a 64-bit LE host the representation is already correct.
 *
 * xts is in %rdi, iv is in %r8 and we return the tweak in %xmm3.
 *
 * The second key schedule is taken from xts+496.  KEYP is saved in %r11
 * around the call and restored afterwards; %r9d (KLEN), %r10, %r11,
 * %xmm0, %xmm2 and %xmm3 are changed.  The retguard cookie is pushed
 * because _aesni_enc1 keeps its own cookie in %rax.
 */
_aesni_xts_tweak:
	RETGUARD_SETUP(_aesni_xts_tweak, rax)
	RETGUARD_PUSH(rax)
	mov	(%r8),%r10
	movd	%r10,%xmm0		# Last 64-bits of IV are always zero.
	mov	KEYP,%r11
	lea	496(%rdi),KEYP
	movl	480(KEYP),KLEN
	call	_aesni_enc1
	movdqa	%xmm0,%xmm3
	mov	%r11,KEYP
	RETGUARD_POP(rax)
	RETGUARD_CHECK(_aesni_xts_tweak, rax)
	ret
	lfence

/*
 * Exponentiate AES XTS tweak (in %xmm3).
 *
 * Shifts the 128-bit tweak left by one bit: the two quadwords are shifted
 * separately, then bit 63 of the low quadword is carried into bit 64 and
 * bit 127 is folded back into the low word with the constant 0x87.
 * Changes %r10, %r11, %xmm0 and %rax (retguard cookie).
 */
_aesni_xts_tweak_exp:
	RETGUARD_SETUP(_aesni_xts_tweak_exp, rax)
	pextrw	$7,%xmm3,%r10		/* top word, holds bit 127 */
	pextrw	$3,%xmm3,%r11		/* top word of low quad, holds bit 63 */
	psllq	$1,%xmm3		# Left shift.

	and	$0x8000,%r11		# Carry between quads.
	jz	1f
	mov	$1,%r11
	pxor	%xmm0,%xmm0
	pinsrw	$4,%r11,%xmm0
	por	%xmm0,%xmm3
1:
	and	$0x8000,%r10
	jz	2f
	pextrw	$0,%xmm3,%r11
	xor	$0x87,%r11		# AES XTS alpha - GF(2^128).
	pinsrw	$0,%r11,%xmm3
2:
	RETGUARD_CHECK(_aesni_xts_tweak_exp, rax)
	ret
	lfence