/* $OpenBSD: gcm128.c,v 1.10 2014/07/09 16:06:13 miod Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	\
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo  = (V.hi<<63)|(V.lo>>1); \
			V.hi  = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo  = (V.hi<<63)|(V.lo>>1); \
			V.hi  = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)
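
/*
 * REDUCE1BIT is the shift-and-reduce step of a GF(2^128) multiplication:
 * shift the 128-bit value V right by one bit and fold the bit shifted
 * out back in via the GCM polynomial x^128 + x^7 + x^2 + x + 1, whose
 * bit-reflected image is the 0xE1 constant in the top byte.  A whole
 * product Z = X * H can be sketched on top of it as below; this mirrors
 * the structure of gcm_gmult_1bit further down and is illustrative
 * only, not compiled.
 */
#if 0
static void
gf128_mul_1bit_sketch(u128 *Z, const u128 *X, u128 V)
{
	int i;
	u64 word;

	Z->hi = Z->lo = 0;
	for (i = 0; i < 128; ++i) {
		/* bit i of X, MSB first; accumulate V when it is set */
		word = (i < 64) ? X->hi : X->lo;
		if ((word >> (63 - (i & 63))) & 1) {
			Z->hi ^= V.hi;
			Z->lo ^= V.lo;
		}
		REDUCE1BIT(V);	/* one shift-and-reduce step mod the GCM polynomial */
	}
}
#endif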
/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it's trivial to see that cache timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that ciphertext is always available to an attacker, it's
 * possible to attempt to deduce the secret parameter H and, if
 * successful, tamper with messages [which is nothing but trivial in
 * CTR mode]. In "Shoup's" case it's not as trivial, but there is no
 * reason to believe that it's resistant to cache-timing attacks. And
 * the thing about the "8-bit" implementation is that it consumes 16
 * (sixteen) times more memory, 4KB per individual key + 1KB shared. On
 * the pros side, it should be twice as fast as the "4-bit" version. And
 * for gcc-generated x86[_64] code, the "8-bit" version was observed to
 * run ~75% faster, closer to 100% for commercial compilers... Yet the
 * "4-bit" procedure is preferred, because it's believed to provide a
 * better security-performance balance and adequate all-round
 * performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
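
/*
 * To make the memory figures above concrete: one Htable entry is a
 * u128 (two u64 halves, 16 bytes), so the per-key table is 16*16 = 256
 * bytes with TABLE_BITS==4 and 256*16 = 4KB with TABLE_BITS==8.  A
 * sketch of the arithmetic, assuming C11 _Static_assert is available
 * (illustrative only, not compiled):
 */
#if 0
_Static_assert(sizeof(u128) == 2*sizeof(u64), "u128 is two 64-bit halves");
_Static_assert(sizeof(u128[16])  ==  256, "4-bit Shoup table: 256 bytes per key");
_Static_assert(sizeof(u128[256]) == 4096, "8-bit Shoup table: 4KB per key");
#endif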
#if	TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int  i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
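	/*
	 * The table just built exploits linearity of GF(2^128)
	 * multiplication: Htable[i] holds i*H with the 4-bit index i read
	 * as a polynomial, so every entry is the XOR of the entries for
	 * its set bits, e.g. Htable[13] == Htable[8]^Htable[4]^Htable[1].
	 * An illustrative self-check of that invariant (not compiled):
	 */
#if 0
	{
		int k;

		for (k = 0; k < 16; ++k) {
			u128 E = Htable[k & 8];	/* bit 3 contribution (Htable[0] is zero) */
			E.hi ^= Htable[k & 4].hi ^ Htable[k & 2].hi ^ Htable[k & 1].hi;
			E.lo ^= Htable[k & 4].lo ^ Htable[k & 2].lo ^ Htable[k & 1].lo;
			assert(E.hi == Htable[k].hi && E.lo == Htable[k].lo);
		}
	}
#endif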
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;

	if (BYTE_ORDER == LITTLE_ENDIAN)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)	break;

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
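
/*
 * The rem_4bit table folds the four bits shifted out of Z back in; each
 * entry is a 16-bit reduction constant, PACK()-ed to the top of a
 * size_t.  Entry r is exactly what four successive REDUCE1BIT steps
 * would XOR in for a 4-bit remainder r.  A sketch that rebuilds the
 * table the slow way on a 64-bit host (illustrative only, not
 * compiled):
 */
#if 0
static void
rem_4bit_rebuild_sketch(u64 out[16])
{
	int r, i;

	for (r = 0; r < 16; ++r) {
		u128 V;

		V.hi = 0;		/* start with remainder r in the low bits */
		V.lo = (u64)r;
		for (i = 0; i < 4; ++i)
			REDUCE1BIT(V);	/* fold each bit via the 0xE1... constant */
		out[r] = V.hi;		/* equals PACK()-ed rem_4bit[r] on 64-bit */
	}
}
#endif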
#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
		cnt  = 15;
		nlo  = ((const u8 *)Xi)[15];
		nlo ^= inp[15];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi = Htable[nlo].hi;
		Z.lo = Htable[nlo].lo;

		while (1) {
			rem  = (size_t)Z.lo&0xf;
			Z.lo = (Z.hi<<60)|(Z.lo>>4);
			Z.hi = (Z.hi>>4);
			if (sizeof(size_t)==8)
				Z.hi ^= rem_4bit[rem];
			else
				Z.hi ^= (u64)rem_4bit[rem]<<32;

			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;

			if (--cnt<0)	break;

			nlo  = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi  = nlo>>4;
			nlo &= 0xf;

			rem  = (size_t)Z.lo&0xf;
			Z.lo = (Z.hi<<60)|(Z.lo>>4);
			Z.hi = (Z.hi>>4);
			if (sizeof(size_t)==8)
				Z.hi ^= rem_4bit[rem];
			else
				Z.hi ^= (u64)rem_4bit[rem]<<32;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;
		}
#else
	/*
	 * Extra 256+16 bytes per key plus 512 bytes of shared tables
	 * [should] give ~50% improvement... One could have PACK()-ed
	 * rem_8bit even here, but the priority is to minimize the
	 * cache footprint...
	 */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
	/*
	 * This pre-processing phase slows the procedure down by
	 * approximately the same amount of time as it makes each loop
	 * spin faster. In other words, single-block performance is
	 * approximately the same as for the straightforward "4-bit"
	 * implementation, and from there it only gets faster...
	 */
	for (cnt=0; cnt<16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
		Hshr4[cnt].hi = (Z.hi>>4);
		Hshl4[cnt]    = (u8)(Z.lo<<4);
	}

	do {
		for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
			nlo  = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi  = nlo>>4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo&0xff;

			Z.lo = (Z.hi<<56)|(Z.lo>>8);
			Z.hi = (Z.hi>>8);

			Z.hi ^= Hshr4[nhi].hi;
			Z.lo ^= Hshr4[nhi].lo;
			Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
		}

		nlo  = ((const u8 *)Xi)[0];
		nlo ^= inp[0];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xf;

		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;
		Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

		if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
			Xi[0] = BSWAP8(Z.hi);
			Xi[1] = BSWAP8(Z.lo);
#else
			u8 *p = (u8 *)Xi;
			u32 v;
			v = (u32)(Z.hi>>32);	PUTU32(p,v);
			v = (u32)(Z.hi);	PUTU32(p+4,v);
			v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
			v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
		}
		else {
			Xi[0] = Z.hi;
			Xi[1] = Z.lo;
		}
	} while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride" parameter meant to mitigate cache-thrashing
 * effects. In other words, the idea is to hash data while it's still in
 * the L1 cache after the encryption pass...
 */
#define GHASH_CHUNK       (3*1024)
#endif
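
/*
 * Semantically, GHASH(ctx,in,len) is just GCM_MUL applied per block:
 * fold each 16-byte block into Xi, then multiply by H.  The streamed
 * implementation above computes exactly that, only faster.  A reference
 * loop with this meaning, assuming len is a multiple of 16
 * (illustrative only, not compiled):
 */
#if 0
static void
ghash_reference_sketch(u64 Xi[2], const u128 Htable[16],
    const u8 *inp, size_t len)
{
	size_t i;

	while (len >= 16) {
		for (i = 0; i < 16; ++i)
			((u8 *)Xi)[i] ^= inp[i];	/* Xi ^= block */
		gcm_gmult_4bit(Xi, Htable);		/* Xi *= H     */
		inp += 16;
		len -= 16;
	}
}
#endif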

#else	/* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		if (BYTE_ORDER == LITTLE_ENDIAN) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
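
/*
 * With GCM_FUNCREF_4BIT defined, GCM_MUL/GHASH expand to calls through
 * local function pointers, so every public entry point below loads
 * ctx->gmult (and, where used, ctx->ghash) into locals named
 * gcm_gmult_p/gcm_ghash_p before its first use of the macros.  Note
 * that the second macro argument names the union field to hash, which
 * is how CRYPTO_gcm128_setiv can say GCM_MUL(ctx,Yi).  Sketch of the
 * pattern (illustrative only, not compiled):
 */
#if 0
static void
gcm_mul_dispatch_sketch(GCM128_CONTEXT *ctx)
{
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
#endif
	GCM_MUL(ctx,Xi);	/* (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable) */
}
#endif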

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	(*block)(ctx->H.c,ctx->H.c,key);

	if (BYTE_ORDER == LITTLE_ENDIAN) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}

void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
		if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}
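
/*
 * A 96-bit IV takes the fast path above: Y0 = IV || 0^31 || 1, no GHASH
 * involved.  Any other IV length is hashed into Y0 together with the
 * 64-bit bit-length of the IV.  Either way, on return Yi holds the
 * counter block for the first data block (counter value 2 in the
 * 96-bit case), and EK0 holds E(K, Y0) for the final tag.  Sketch
 * (illustrative only, not compiled):
 */
#if 0
static void
gcm_setiv_usage_sketch(GCM128_CONTEXT *gcm)
{
	static const unsigned char iv12[12] = { 0 };	/* fast path         */
	static const unsigned char iv16[16] = { 0 };	/* GHASHed, slow path */

	CRYPTO_gcm128_setiv(gcm, iv12, sizeof(iv12));	/* Y0 = IV||0^31||1   */
	CRYPTO_gcm128_setiv(gcm, iv16, sizeof(iv16));	/* Y0 = GHASH(IV,...) */
}
#endif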

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
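
/*
 * AAD must be supplied after setiv and before any en/decrypt call
 * (CRYPTO_gcm128_aad returns -2 once message data has been seen).
 * ctx->ares buffers a partial block, so the AAD may arrive in
 * arbitrarily sized pieces with the same result as one call.  Sketch
 * of that equivalence, with a hypothetical 20-byte header as the AAD
 * (illustrative only, not compiled):
 */
#if 0
static int
gcm_aad_usage_sketch(GCM128_CONTEXT *gcm)
{
	static const unsigned char hdr[20] = { 0 };	/* hypothetical AAD */

	/* hashes exactly the same AAD as a single 20-byte call */
	if (CRYPTO_gcm128_aad(gcm, hdr, 7) != 0)	/* partial block buffered */
		return -1;
	return CRYPTO_gcm128_aad(gcm, hdr + 7, 13);	/* completes and continues */
}
#endif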

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;

			while (j) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				out += 16;
				in  += 16;
				j   -= 16;
			}
			GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
			size_t j=i;

			while (len>=16) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				out += 16;
				in  += 16;
				len -= 16;
			}
			GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
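
/*
 * Like the AAD path, ctx->mres carries a partial keystream block
 * between calls, so ciphertext produced in pieces matches one-shot
 * output byte for byte.  Sketch of that equivalence (illustrative
 * only, not compiled):
 */
#if 0
static int
gcm_encrypt_streaming_sketch(GCM128_CONTEXT *gcm,
    const unsigned char pt[40], unsigned char ct[40])
{
	/* same keystream and same GHASH as a single call with len 40 */
	if (CRYPTO_gcm128_encrypt(gcm, pt, ct, 5) != 0)
		return -1;
	return CRYPTO_gcm128_encrypt(gcm, pt + 5, ct + 5, 35);
}
#endif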

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;

			GHASH(ctx,in,GHASH_CHUNK);
			while (j) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i]^ctx->EKi.t[i];
				out += 16;
				in  += 16;
				j   -= 16;
			}
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
			GHASH(ctx,in,i);
			while (len>=16) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
				if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
					ctx->Yi.d[3] = BSWAP4(ctr);
#else
					PUTU32(ctx->Yi.c+12,ctr);
#endif
				else
					ctx->Yi.d[3] = ctr;
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i]^ctx->EKi.t[i];
				out += 16;
				in  += 16;
				len -= 16;
			}
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in_t[i];
				out_t[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
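
/*
 * On the decrypt side GHASH absorbs the ciphertext (before it is XORed
 * with the keystream), and CRYPTO_gcm128_finish then compares the
 * computed tag, returning 0 iff it matches.  Typical verification
 * sketch (illustrative only, not compiled):
 */
#if 0
static int
gcm_decrypt_and_verify_sketch(GCM128_CONTEXT *gcm,
    const unsigned char *ct, unsigned char *pt, size_t len,
    const unsigned char tag[16])
{
	if (CRYPTO_gcm128_decrypt(gcm, ct, pt, len) != 0)
		return -1;				/* length overflow */
	return CRYPTO_gcm128_finish(gcm, tag, 16);	/* 0 == tag OK */
}
#endif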

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
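
/*
 * The _ctr32 entry points delegate bulk keystream generation to a
 * ctr128_f, which processes `blocks` 16-byte blocks while only the
 * last 32 bits of the counter block tick; the callers above update
 * Yi themselves afterwards, so the stream function need not write the
 * counter back.  A portable ctr128_f built from a block128_f might
 * look like this sketch, with a hypothetical wrapper struct
 * ctr_sketch_key (illustrative only, not compiled):
 */
#if 0
struct ctr_sketch_key { block128_f block; const void *key; };

static void
ctr32_sketch(const unsigned char *in, unsigned char *out,
    size_t blocks, const void *key, const unsigned char ivec[16])
{
	const struct ctr_sketch_key *ck = key;
	unsigned char Y[16], EY[16];
	size_t i;

	memcpy(Y, ivec, 16);		/* local counter block; ivec untouched */
	while (blocks--) {
		(*ck->block)(Y, EY, ck->key);
		for (i = 0; i < 16; ++i)
			*(out++) = *(in++) ^ EY[i];
		/* increment only the last (big-endian) 32 bits of Y */
		for (i = 16; i > 12; --i)
			if (++Y[i-1] != 0)
				break;
	}
}
#endif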

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len)
{
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	if (ctx) {
		OPENSSL_cleanse(ctx,sizeof(*ctx));
		free(ctx);
	}
}
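
/*
 * End-to-end usage sketch, assuming an AES-128 key schedule and zeroed
 * demo key/IV/AAD; the block cipher is passed as a block128_f the same
 * way the EVP layer does it.  Illustrative only, not compiled:
 */
#if 0
#include <openssl/aes.h>

static int
gcm128_roundtrip_sketch(void)
{
	static const unsigned char K[16]  = { 0 };	/* demo key  */
	static const unsigned char IV[12] = { 0 };	/* 96-bit IV */
	static const unsigned char A[8]   = { 0 };	/* demo AAD  */
	unsigned char pt[32] = { 0 }, ct[32], tag[16];
	AES_KEY ks;
	GCM128_CONTEXT ctx;

	if (AES_set_encrypt_key(K, 128, &ks) != 0)
		return -1;
	CRYPTO_gcm128_init(&ctx, &ks, (block128_f)AES_encrypt);

	/* encrypt and produce the tag */
	CRYPTO_gcm128_setiv(&ctx, IV, sizeof(IV));
	if (CRYPTO_gcm128_aad(&ctx, A, sizeof(A)) != 0 ||
	    CRYPTO_gcm128_encrypt(&ctx, pt, ct, sizeof(pt)) != 0)
		return -1;
	CRYPTO_gcm128_tag(&ctx, tag, sizeof(tag));

	/* decrypt side: same key/IV/AAD, then verify the tag */
	CRYPTO_gcm128_setiv(&ctx, IV, sizeof(IV));
	if (CRYPTO_gcm128_aad(&ctx, A, sizeof(A)) != 0 ||
	    CRYPTO_gcm128_decrypt(&ctx, ct, pt, sizeof(ct)) != 0)
		return -1;
	return CRYPTO_gcm128_finish(&ctx, tag, sizeof(tag));	/* 0 == OK */
}
#endif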