/* $OpenBSD: gcm128.c,v 1.25 2023/07/08 14:56:54 beck Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_local.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	\
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)

/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes.  TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations.  Why?  In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it
 * would then be possible to attempt to deduce the secret parameter H
 * and, if successful, tamper with messages [which is trivial in CTR
 * mode].  In the "Shoup's" case it's not as easy, but there is no
 * reason to believe it's resistant to cache-timing attacks either.
 * The thing about the "8-bit" implementation is that it consumes 16
 * (sixteen) times more memory, 4KB per individual key + 1KB shared.
 * On the plus side it should be about twice as fast as the "4-bit"
 * version, and for gcc-generated x86[_64] code the "8-bit" version was
 * observed to run ~75% faster, closer to 100% for commercial
 * compilers...  Yet the "4-bit" procedure is preferred, because it's
 * believed to provide a better security/performance balance and
 * adequate all-round performance.  "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   the performance of other code paths (not necessarily even in the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate, for performance reasons.
 */
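
/*
 * Background for the tables below: GCM_MUL()/GHASH() implement the GHASH
 * update Xi = (Xi ^ block) * H in GF(2^128), where the field is defined
 * by the polynomial x^128 + x^7 + x^2 + x + 1 and elements are kept in
 * the bit-reflected order used by the GCM specification (NIST SP 800-38D).
 * That reflected convention is why REDUCE1BIT() above folds the carry
 * with the constant 0xe1 placed in the top byte.  The per-key table sizes
 * quoted above follow directly from the declarations below:
 * 16 * sizeof(u128) = 256 bytes for the "4-bit" Htable versus
 * 256 * sizeof(u128) = 4KB for the "8-bit" one.
 */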
#if	TABLE_BITS==8

static void
gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 256; i <<= 1) {
		u128 *Hi = Htable + i, H0 = *Hi;
		for (j = 1; j < i; ++j) {
			Hi[j].hi = H0.hi ^ Htable[j].hi;
			Hi[j].lo = H0.lo ^ Htable[j].lo;
		}
	}
}

static void
gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi + 15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi == xi)
			break;

		n = *(--xi);

		rem = (size_t)Z.lo & 0xff;
		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
		Z.hi = (Z.hi >> 8);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_8bit[rem];
#else
		Z.hi ^= (u64)rem_8bit[rem] << 32;
#endif
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	Xi[0] = BSWAP8(Z.hi);
	Xi[1] = BSWAP8(Z.lo);
#else
	u8 *p = (u8 *)Xi;
	u32 v;
	v = (u32)(Z.hi >> 32);
	PUTU32(p, v);
	v = (u32)(Z.hi);
	PUTU32(p + 4, v);
	v = (u32)(Z.lo >> 32);
	PUTU32(p + 8, v);
	v = (u32)(Z.lo);
	PUTU32(p + 12, v);
#endif
#else /* BIG_ENDIAN */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
#endif
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void
gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 16; i <<= 1) {
		u128 *Hi = Htable + i;
		int j;
		for (V = *Hi, j = 1; j < i; ++j) {
			Hi[j].hi = V.hi ^ Htable[j].hi;
			Hi[j].lo = V.lo ^ Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
	V = Htable[4];
	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
	V = Htable[8];
	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
	Htable[10].hi = V.hi ^ Htable[2].hi,
	    Htable[10].lo = V.lo ^ Htable[2].lo;
	Htable[11].hi = V.hi ^ Htable[3].hi,
	    Htable[11].lo = V.lo ^ Htable[3].lo;
	Htable[12].hi = V.hi ^ Htable[4].hi,
	    Htable[12].lo = V.lo ^ Htable[4].lo;
	Htable[13].hi = V.hi ^ Htable[5].hi,
	    Htable[13].lo = V.lo ^ Htable[5].lo;
	Htable[14].hi = V.hi ^ Htable[6].hi,
	    Htable[14].lo = V.lo ^ Htable[6].lo;
	Htable[15].hi = V.hi ^ Htable[7].hi,
	    Htable[15].lo = V.lo ^ Htable[7].lo;
#endif
#if defined(GHASH_ASM) && \
    (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
		int j;
#if BYTE_ORDER == LITTLE_ENDIAN
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
#else /* BIG_ENDIAN */
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo << 32|V.lo >> 32;
			Htable[j].lo = V.hi << 32|V.hi >> 32;
		}
#endif
	}
#endif
}

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void
gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo = ((const u8 *)Xi)[15];
	nhi = nlo >> 4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt < 0)
			break;

		nlo = ((const u8 *)Xi)[cnt];
		nhi = nlo >> 4;
		nlo &= 0xf;

		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	Xi[0] = BSWAP8(Z.hi);
	Xi[1] = BSWAP8(Z.lo);
#else
	u8 *p = (u8 *)Xi;
	u32 v;
	v = (u32)(Z.hi >> 32);
	PUTU32(p, v);
	v = (u32)(Z.hi);
	PUTU32(p + 4, v);
	v = (u32)(Z.lo >> 32);
	PUTU32(p + 8, v);
	v = (u32)(Z.lo);
	PUTU32(p + 12, v);
#endif
#else /* BIG_ENDIAN */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
#endif
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit (gcm_ghash_4bit below); see
 * CRYPTO_gcm128_[en|de]crypt for usage.  Compiler-generated code
 * doesn't seem to give any performance improvement, at least not on
 * x86[_64].  It's here mostly as a reference and a placeholder for
 * possible future non-trivial optimization[s]...
 */
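/*
 * gcm_ghash_4bit() folds len/16 complete blocks into Xi: for each
 * 16-byte block B it computes Xi = (Xi ^ B) * H.  Callers pass a
 * length that is a multiple of 16 bytes.
 */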
static void
gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
    const u8 *inp, size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
		cnt = 15;
		nlo = ((const u8 *)Xi)[15];
		nlo ^= inp[15];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi = Htable[nlo].hi;
		Z.lo = Htable[nlo].lo;

		while (1) {
			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;

			if (--cnt < 0)
				break;

			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;
		}
#else
	/*
	 * Extra 256+16 bytes per key plus 512 bytes of shared tables
	 * [should] give a ~50% improvement...  One could have PACK()-ed
	 * the rem_8bit even here, but the priority is to minimize the
	 * cache footprint...
	 */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8 Hshl4[16];	/* Htable shifted left by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
	/*
	 * This pre-processing phase slows the procedure down by
	 * approximately the same amount of time as it makes each loop
	 * spin faster.  In other words single-block performance is
	 * approximately the same as for the straightforward "4-bit"
	 * implementation, and it only gets faster from there...
	 */
	for (cnt = 0; cnt < 16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
		Hshr4[cnt].hi = (Z.hi >> 4);
		Hshl4[cnt] = (u8)(Z.lo << 4);
	}

	do {
		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo & 0xff;

			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
			Z.hi = (Z.hi >> 8);

			Z.hi ^= Hshr4[nhi].hi;
			Z.lo ^= Hshr4[nhi].lo;
			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
		}

		nlo = ((const u8 *)Xi)[0];
		nlo ^= inp[0];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo & 0xf;

		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;
		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi >> 32);
		PUTU32(p, v);
		v = (u32)(Z.hi);
		PUTU32(p + 4, v);
		v = (u32)(Z.lo >> 32);
		PUTU32(p + 8, v);
		v = (u32)(Z.lo);
		PUTU32(p + 12, v);
#endif
#else /* BIG_ENDIAN */
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
#endif
	} while (inp += 16, len -= 16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate cache-thrashing
 * effects.  The idea is to hash data while it's still in L1 cache after
 * the encryption pass...
 */
#define GHASH_CHUNK       (3*1024)
#endif

#else	/* TABLE_BITS */

static void
gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
	u128 V, Z = { 0,0 };
	long X;
	int i, j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j = 0; j < 16/sizeof(long); ++j) {
#if BYTE_ORDER == LITTLE_ENDIAN
#if SIZE_MAX == 0xffffffffffffffff
#ifdef BSWAP8
		X = (long)(BSWAP8(xi[j]));
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
#endif
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)GETU32(p);
#endif
#else /* BIG_ENDIAN */
		X = xi[j];
#endif

		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
			u64 M = (u64)(X >> (8*sizeof(long) - 1));
			Z.hi ^= V.hi & M;
			Z.lo ^= V.lo & M;

			REDUCE1BIT(V);
		}
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	Xi[0] = BSWAP8(Z.hi);
	Xi[1] = BSWAP8(Z.lo);
#else
	u8 *p = (u8 *)Xi;
	u32 v;
	v = (u32)(Z.hi >> 32);
	PUTU32(p, v);
	v = (u32)(Z.hi);
	PUTU32(p + 4, v);
	v = (u32)(Z.lo >> 32);
	PUTU32(p + 8, v);
	v = (u32)(Z.lo);
	PUTU32(p + 12, v);
#endif
#else /* BIG_ENDIAN */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
#endif
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if defined(GHASH_ASM) && \
    (defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#include "x86_arch.h"
#endif

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	(defined(__i386) || defined(__i386__) || \
	defined(__x86_64) || defined(__x86_64__) || \
	defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

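/*
 * The expected calling sequence for the functions below is:
 * CRYPTO_gcm128_init() once per key, CRYPTO_gcm128_setiv() once per
 * message, CRYPTO_gcm128_aad() before any en-/decryption, then
 * CRYPTO_gcm128_encrypt()/_decrypt() (or the _ctr32() variants) as many
 * times as needed, and finally CRYPTO_gcm128_tag() to extract the tag,
 * or CRYPTO_gcm128_finish() to verify it (0 means the tag matched).
 */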
void
CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
	memset(ctx, 0, sizeof(*ctx));
	ctx->block = block;
	ctx->key = key;

	(*block)(ctx->H.c, ctx->H.c, key);

#if BYTE_ORDER == LITTLE_ENDIAN
	/* H is stored in host byte order */
#ifdef BSWAP8
	ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
	ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
	u8 *p = ctx->H.c;
	u64 hi, lo;
	hi = (u64)GETU32(p) << 32|GETU32(p + 4);
	lo = (u64)GETU32(p + 8) << 32|GETU32(p + 12);
	ctx->H.u[0] = hi;
	ctx->H.u[1] = lo;
#endif
#endif

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* check FXSR and PCLMULQDQ bits */
	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
		gcm_init_clmul(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if	defined(GHASH_ASM_X86)		/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
#   else
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
#endif
}
LCRYPTO_ALIAS(CRYPTO_gcm128_init);

void
CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	ctx->Yi.u[0] = 0;
	ctx->Yi.u[1] = 0;
	ctx->Xi.u[0] = 0;
	ctx->Xi.u[1] = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len == 12) {
		memcpy(ctx->Yi.c, iv, 12);
		ctx->Yi.c[15] = 1;
		ctr = 1;
	} else {
		size_t i;
		u64 len0 = len;

		while (len >= 16) {
			for (i = 0; i < 16; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i = 0; i < len; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
		}
		len0 <<= 3;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
		ctx->Yi.u[1] ^= BSWAP8(len0);
#else
		ctx->Yi.c[8] ^= (u8)(len0 >> 56);
		ctx->Yi.c[9] ^= (u8)(len0 >> 48);
		ctx->Yi.c[10] ^= (u8)(len0 >> 40);
		ctx->Yi.c[11] ^= (u8)(len0 >> 32);
		ctx->Yi.c[12] ^= (u8)(len0 >> 24);
		ctx->Yi.c[13] ^= (u8)(len0 >> 16);
		ctx->Yi.c[14] ^= (u8)(len0 >> 8);
		ctx->Yi.c[15] ^= (u8)(len0);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.u[1] ^= len0;
#endif

		GCM_MUL(ctx, Yi);

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c + 12);
#endif
#else /* BIG_ENDIAN */
		ctr = ctx->Yi.d[3];
#endif
	}

	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
	++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctx->Yi.d[3] = BSWAP4(ctr);
#else
	PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
	ctx->Yi.d[3] = ctr;
#endif
}
LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);
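
/*
 * The length checks below enforce the GCM limits from NIST SP 800-38D:
 * the AAD may not exceed 2^61 bytes (2^64 bits) and a single message
 * may not exceed 2^36 - 32 bytes (2^39 - 256 bits).
 */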
int
CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	if (ctx->len.u[1])
		return -2;

	alen += len;
	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len & (size_t)-16))) {
		GHASH(ctx, aad, i);
		aad += i;
		len -= i;
	}
#else
	while (len >= 16) {
		for (i = 0; i < 16; ++i)
			ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx, Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i = 0; i < len; ++i)
			ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_aad);

int
CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c + 12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
					    ctx->EKi.c[n];
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
					ctx->Yi.d[3] = ctr;
#endif
					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				size_t j = i;

				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
					ctx->Yi.d[3] = ctr;
#endif
					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
				GHASH(ctx, out - j, j);
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				for (i = 0; i < 16/sizeof(size_t); ++i)
					ctx->Xi.t[i] ^=
					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				while (len--) {
					ctx->Xi.c[n] ^= out[n] = in[n] ^
					    ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
		}
		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);

int
CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c + 12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					u8 c = *(in++);
					*(out++) = c ^ ctx->EKi.c[n];
					ctx->Xi.c[n] ^= c;
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				GHASH(ctx, in, GHASH_CHUNK);
				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
					ctx->Yi.d[3] = ctr;
#endif
					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				GHASH(ctx, in, i);
				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
					ctx->Yi.d[3] = ctr;
#endif
					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				for (i = 0; i < 16/sizeof(size_t); ++i) {
					size_t c = in_t[i];
					out_t[i] = c ^ ctx->EKi.t[i];
					ctx->Xi.t[i] ^= c;
				}
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				while (len--) {
					u8 c = in[n];
					ctx->Xi.c[n] ^= c;
					out[n] = c ^ ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		u8 c;
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
		}
		c = in[i];
		out[i] = c ^ ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);
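
/*
 * The _ctr32() variants below take a ctr128_f `stream' routine that
 * CTR-encrypts several blocks per call (typically a hardware-assisted
 * AES-CTR path) and that, as the name implies, increments only the low
 * 32 bits of the counter block ctx->Yi.
 */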
int
CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c + 12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		GHASH(ctx, out, GHASH_CHUNK);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		in += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx, out, i);
		out += i;
#else
		while (j--) {
			for (i = 0; i < 16; ++i)
				ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx, Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);

int
CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c + 12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c ^ ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		GHASH(ctx, in, GHASH_CHUNK);
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

#if defined(GHASH)
		GHASH(ctx, in, i);
#else
		while (j--) {
			size_t k;
			for (k = 0; k < 16; ++k)
				ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx, Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		out += i;
		in += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c + 12, ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);

int
CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    size_t len)
{
	u64 alen = ctx->len.u[0] << 3;
	u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx, Xi);

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	alen = BSWAP8(alen);
	clen = BSWAP8(clen);
#else
	{
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p) << 32|GETU32(p + 4);
		clen = (u64)GETU32(p + 8) << 32|GETU32(p + 12);
	}
#endif
#endif

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx, Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len <= sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c, tag, len);
	else
		return -1;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_finish);

void
CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c,
	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_tag);

GCM128_CONTEXT *
CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret, key, block);

	return ret;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_new);

void
CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	freezero(ctx, sizeof(*ctx));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_release);
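
/*
 * Illustrative sketch only, kept out of the build with #if 0: one way a
 * caller might drive the API above for a one-shot AES-128-GCM encryption.
 * It assumes the AES primitives from <openssl/aes.h> (which this file does
 * not otherwise use), cast to block128_f the way existing callers pass a
 * block cipher; gcm128_example_seal() is a hypothetical helper name and
 * error handling is minimal.
 */
#if 0
#include <openssl/aes.h>	/* would normally go at the top of the file */

static int
gcm128_example_seal(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aad_len,
    const unsigned char *pt, size_t pt_len,
    unsigned char *ct, unsigned char tag[16])
{
	AES_KEY aes;
	GCM128_CONTEXT *gcm;

	if (AES_set_encrypt_key(key, 128, &aes) != 0)
		return -1;

	/* One-time key setup, then per-message IV, AAD, data and tag. */
	if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(gcm, iv, 12);
	if (CRYPTO_gcm128_aad(gcm, aad, aad_len) != 0)
		goto err;
	if (CRYPTO_gcm128_encrypt(gcm, pt, ct, pt_len) != 0)
		goto err;
	CRYPTO_gcm128_tag(gcm, tag, 16);

	CRYPTO_gcm128_release(gcm);
	return 0;

 err:
	CRYPTO_gcm128_release(gcm);
	return -1;
}
#endif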