/* $OpenBSD: gcm128.c,v 1.15 2016/11/04 17:30:30 miod Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef GETU32
#define GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef PUTU32
#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V) \
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)

/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are the lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words, OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations. Why?
 * In the non-"Shoup's" case the memory access pattern is segmented in such
 * a manner that it is trivial to see that cache-timing information can
 * reveal a fair portion of the intermediate hash value. Given that the
 * ciphertext is always available to an attacker, it is possible to attempt
 * to deduce the secret parameter H and, if successful, tamper with the
 * messages [which is trivial in CTR mode]. In the "Shoup's" case it is not
 * as easy, but there is no reason to believe it is resistant to
 * cache-timing attacks either. The thing about the "8-bit" implementation
 * is that it consumes 16 (sixteen) times more memory, 4KB per individual
 * key + 1KB shared. On the plus side it should be about twice as fast as
 * the "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security/performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc() would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
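 *
 * For reference: in the "4-bit" case Htable[] caches the products of H
 * with all 16 possible 4-bit multipliers (in GCM's bit-reflected
 * representation of GF(2^128)), so each input byte costs two table
 * lookups plus two 4-bit reduction steps through the shared rem_4bit[]
 * table below.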
 */
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi) break;

		n = *(--xi);

		rem = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
	Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
	Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
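	 * The loop below swaps the two 64-bit halves of every entry (and,
	 * on big-endian systems, also the 32-bit words within each half).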
	 */
	{
	int j;

	if (BYTE_ORDER == LITTLE_ENDIAN)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo = ((const u8 *)Xi)[15];
	nhi = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0) break;

		nlo = ((const u8 *)Xi)[cnt];
		nhi = nlo>>4;
		nlo &= 0xf;

		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
	cnt = 15;
	nlo = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0) break;

		nlo = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi = nlo>>4;
		nlo &= 0xf;

		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
	/*
	 * Extra 256+16 bytes per-key plus 512 bytes shared tables
	 * [should] give ~50% improvement... One could have PACK()-ed
	 * the rem_8bit even here, but the priority is to minimize
	 * cache footprint...
	 */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8 Hshl4[16];	/* Htable shifted left by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
	/*
	 * This pre-processing phase slows the procedure down by roughly the
	 * same amount of time as it makes each loop spin faster. In other
	 * words, single-block performance is approximately the same as for
	 * the straightforward "4-bit" implementation, and beyond that it
	 * only gets faster...
	 */
	for (cnt=0; cnt<16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
		Hshr4[cnt].hi = (Z.hi>>4);
		Hshl4[cnt] = (u8)(Z.lo<<4);
	}

	do {
		for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo>>4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo&0xff;

			Z.lo = (Z.hi<<56)|(Z.lo>>8);
			Z.hi = (Z.hi>>8);

			Z.hi ^= Hshr4[nhi].hi;
			Z.lo ^= Hshr4[nhi].lo;
			Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
		}

		nlo = ((const u8 *)Xi)[0];
		nlo ^= inp[0];
		nhi = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xf;

		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;
		Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
	} while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it is
 * still in the L1 cache after the encryption pass...
 */
#define GHASH_CHUNK       (3*1024)
#endif

#else	/* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int i,j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		if (BYTE_ORDER == LITTLE_ENDIAN) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if defined(GHASH_ASM) && \
    (defined(__i386) || defined(__i386__) || \
     defined(__x86_64) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#include "x86_arch.h"
#endif

#if TABLE_BITS==4 && defined(GHASH_ASM)
# if (defined(__i386) || defined(__i386__) || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key = key;

	(*block)(ctx->H.c,ctx->H.c,key);

	if (BYTE_ORDER == LITTLE_ENDIAN) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)<<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* check FXSR and PCLMULQDQ bits */
	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if defined(GHASH_ASM_X86)		/* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
#   else
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
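
/*
 * Processes the IV and resets the per-message state. A 96-bit IV takes the
 * fast path Y0 = IV || 0^31 || 1; any other length is hashed as
 * Y0 = GHASH(IV || 0* || [len(IV)]_64), as specified for GCM.
 */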
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
#endif

	ctx->Yi.u[0] = 0;
	ctx->Yi.u[1] = 0;
	ctx->Xi.u[0] = 0;
	ctx->Xi.u[1] = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
		if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
			ctx->Yi.u[1] ^= BSWAP8(len0);
#else
			ctx->Yi.c[8] ^= (u8)(len0>>56);
			ctx->Yi.c[9] ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1] ^= len0;

		GCM_MUL(ctx,Yi);

		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

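	/* AAD can only be supplied before any message data is processed. */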
	if (ctx->len.u[1])
		return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;

			while (j) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				out += 16;
				in += 16;
				j -= 16;
			}
			GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
			size_t j=i;

			while (len>=16) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				out += 16;
				in += 16;
				len -= 16;
			}
			GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				    out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;

			GHASH(ctx,in,GHASH_CHUNK);
			while (j) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i]^ctx->EKi.t[i];
				out += 16;
				in += 16;
				j -= 16;
			}
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
			GHASH(ctx,in,i);
			while (len>=16) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i]^ctx->EKi.t[i];
				out += 16;
				in += 16;
				len -= 16;
			}
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in[i];
				out[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
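
/*
 * The *_ctr32 variants below take a caller-supplied ctr128_f that encrypts
 * multiple counter blocks per call (with a 32-bit big-endian counter), so
 * hardware-assisted CTR routines can be combined with the GHASH code above.
 */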
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
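
/*
 * Completes the GHASH computation over len(AAD)||len(C), XORs in E_K(Y0)
 * and, if a tag is supplied, compares its first 'len' bytes against the
 * computed value, returning 0 on match and non-zero otherwise.
 */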
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len)
{
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)<<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	if (ctx) {
		explicit_bzero(ctx,sizeof(*ctx));
		free(ctx);
	}
}
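
/*
 * Usage sketch (illustrative only, not part of the library, hence kept
 * under "#if 0"): a hypothetical helper showing one way to AEAD-seal a
 * buffer with AES-GCM through this low-level API. It assumes an AES key
 * schedule `ks` prepared by the caller with AES_set_encrypt_key() and uses
 * AES_encrypt as the block128_f; error handling is minimal.
 */
#if 0
#include <openssl/aes.h>

static int
example_aes_gcm_seal(AES_KEY *ks,
    const unsigned char *iv, size_t iv_len,
    const unsigned char *aad, size_t aad_len,
    const unsigned char *in, unsigned char *out, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT *gcm;

	/* Bind the block cipher and derive the hash key H = E_K(0^128). */
	if ((gcm = CRYPTO_gcm128_new(ks, (block128_f)AES_encrypt)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(gcm, iv, iv_len);

	/* Authenticate the additional data, then encrypt-and-authenticate. */
	if (CRYPTO_gcm128_aad(gcm, aad, aad_len) ||
	    CRYPTO_gcm128_encrypt(gcm, in, out, len)) {
		CRYPTO_gcm128_release(gcm);
		return -1;
	}

	/* Emit the 16-byte authentication tag. */
	CRYPTO_gcm128_tag(gcm, tag, 16);
	CRYPTO_gcm128_release(gcm);
	return 0;
}
#endif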