1 /*
2 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include <string.h>
11 #include <openssl/crypto.h>
12 #include "internal/cryptlib.h"
13 #include "internal/endian.h"
14 #include "crypto/modes.h"
15
16 #if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
17 typedef size_t size_t_aX __attribute((__aligned__(1)));
18 #else
19 typedef size_t size_t_aX;
20 #endif
21
22 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
23 /* redefine, because alignment is ensured */
24 # undef GETU32
25 # define GETU32(p) BSWAP4(*(const u32 *)(p))
26 # undef PUTU32
27 # define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
28 #endif
29
30 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
31 #define REDUCE1BIT(V) do { \
32 if (sizeof(size_t)==8) { \
33 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
34 V.lo = (V.hi<<63)|(V.lo>>1); \
35 V.hi = (V.hi>>1 )^T; \
36 } \
37 else { \
38 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
39 V.lo = (V.hi<<63)|(V.lo>>1); \
40 V.hi = (V.hi>>1 )^((u64)T<<32); \
41 } \
42 } while(0)
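
/*
 * Added note: PACK(s) places a 16-bit constant in the most significant
 * bits of a size_t, which is where the rem_4bit/rem_8bit corrections
 * below are XORed into Z.hi. REDUCE1BIT(V) multiplies the 128-bit value
 * V by x in GF(2^128) using GCM's bit-reflected representation: shift
 * right by one bit and, if a bit falls off the low end, fold it back in
 * via the reduction polynomial x^128 + x^7 + x^2 + x + 1, whose
 * reflected image is the 0xE1 constant in the top byte.
 */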
43
44 /*-
45 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
46 * should never be set to 8; 8 is effectively reserved for testing
47 * purposes. TABLE_BITS>1 selects the lookup-table-driven implementations
48 * referred to as "Shoup's" in the GCM specification; in other words,
49 * OpenSSL does not cover the whole spectrum of possible table-driven
50 * implementations. Why? In the non-"Shoup's" case the memory access
51 * pattern is segmented in such a manner that cache-timing information
52 * can trivially reveal a fair portion of the intermediate hash value.
53 * Given that the ciphertext is always available to an attacker, it is
54 * possible to attempt to deduce the secret parameter H and, if
55 * successful, tamper with messages [which is trivial in CTR mode]. In
56 * the "Shoup's" case it is not as easy, but there is no reason to
57 * believe that it is resistant to cache-timing attacks either. As for
58 * the "8-bit" implementation, it consumes 16 (sixteen) times more
59 * memory, 4KB per individual key + 1KB shared. On the plus side, it
60 * should be about twice as fast as the "4-bit" version; for
61 * gcc-generated x86[_64] code the "8-bit" version was observed to run
62 * ~75% faster, closer to 100% with commercial compilers... Yet the
63 * "4-bit" procedure is preferred, because it is believed to provide a
64 * better security-performance balance and adequate all-round
65 * performance. "All-round" refers to things like:
66 *
67 * - shorter setup time effectively improves overall timing for
68 *   handling of short messages;
69 * - larger table allocations can become unbearable because of VM
70 *   subsystem penalties (for example, on Windows a large enough free
71 *   results in VM working-set trimming, meaning that a subsequent
72 *   malloc would immediately incur working-set expansion);
73 * - a larger table has a larger cache footprint, which can affect
74 *   performance of other code paths (not necessarily even of the
75 *   same thread in a Hyper-Threading world);
76 * A value of 1 is not appropriate for performance reasons.
77 */
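
/*
 * For reference (added note): with TABLE_BITS==4 the per-key Htable holds
 * 16 u128 entries (256 bytes); the shared rem_4bit table used by the C
 * fallback adds 16 size_t entries (128 bytes on 64-bit builds).
 */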
78 #if TABLE_BITS==8
79
80 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
81 {
82 int i, j;
83 u128 V;
84
85 Htable[0].hi = 0;
86 Htable[0].lo = 0;
87 V.hi = H[0];
88 V.lo = H[1];
89
90 for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
91 REDUCE1BIT(V);
92 Htable[i] = V;
93 }
94
95 for (i = 2; i < 256; i <<= 1) {
96 u128 *Hi = Htable + i, H0 = *Hi;
97 for (j = 1; j < i; ++j) {
98 Hi[j].hi = H0.hi ^ Htable[j].hi;
99 Hi[j].lo = H0.lo ^ Htable[j].lo;
100 }
101 }
102 }
103
104 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
105 {
106 u128 Z = { 0, 0 };
107 const u8 *xi = (const u8 *)Xi + 15;
108 size_t rem, n = *xi;
109 DECLARE_IS_ENDIAN;
110 static const size_t rem_8bit[256] = {
111 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
112 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
113 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
114 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
115 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
116 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
117 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
118 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
119 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
120 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
121 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
122 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
123 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
124 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
125 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
126 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
127 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
128 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
129 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
130 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
131 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
132 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
133 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
134 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
135 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
136 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
137 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
138 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
139 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
140 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
141 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
142 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
143 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
144 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
145 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
146 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
147 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
148 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
149 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
150 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
151 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
152 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
153 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
154 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
155 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
156 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
157 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
158 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
159 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
160 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
161 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
162 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
163 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
164 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
165 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
166 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
167 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
168 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
169 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
170 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
171 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
172 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
173 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
174 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
175 };
176
177 while (1) {
178 Z.hi ^= Htable[n].hi;
179 Z.lo ^= Htable[n].lo;
180
181 if ((u8 *)Xi == xi)
182 break;
183
184 n = *(--xi);
185
186 rem = (size_t)Z.lo & 0xff;
187 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
188 Z.hi = (Z.hi >> 8);
189 if (sizeof(size_t) == 8)
190 Z.hi ^= rem_8bit[rem];
191 else
192 Z.hi ^= (u64)rem_8bit[rem] << 32;
193 }
194
195 if (IS_LITTLE_ENDIAN) {
196 # ifdef BSWAP8
197 Xi[0] = BSWAP8(Z.hi);
198 Xi[1] = BSWAP8(Z.lo);
199 # else
200 u8 *p = (u8 *)Xi;
201 u32 v;
202 v = (u32)(Z.hi >> 32);
203 PUTU32(p, v);
204 v = (u32)(Z.hi);
205 PUTU32(p + 4, v);
206 v = (u32)(Z.lo >> 32);
207 PUTU32(p + 8, v);
208 v = (u32)(Z.lo);
209 PUTU32(p + 12, v);
210 # endif
211 } else {
212 Xi[0] = Z.hi;
213 Xi[1] = Z.lo;
214 }
215 }
216
217 # define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
218
219 #elif TABLE_BITS==4
220
221 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
222 {
223 u128 V;
224 # if defined(OPENSSL_SMALL_FOOTPRINT)
225 int i;
226 # endif
227
228 Htable[0].hi = 0;
229 Htable[0].lo = 0;
230 V.hi = H[0];
231 V.lo = H[1];
232
233 # if defined(OPENSSL_SMALL_FOOTPRINT)
234 for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
235 REDUCE1BIT(V);
236 Htable[i] = V;
237 }
238
239 for (i = 2; i < 16; i <<= 1) {
240 u128 *Hi = Htable + i;
241 int j;
242 for (V = *Hi, j = 1; j < i; ++j) {
243 Hi[j].hi = V.hi ^ Htable[j].hi;
244 Hi[j].lo = V.lo ^ Htable[j].lo;
245 }
246 }
247 # else
248 Htable[8] = V;
249 REDUCE1BIT(V);
250 Htable[4] = V;
251 REDUCE1BIT(V);
252 Htable[2] = V;
253 REDUCE1BIT(V);
254 Htable[1] = V;
255 Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
256 V = Htable[4];
257 Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
258 Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
259 Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
260 V = Htable[8];
261 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
262 Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
263 Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
264 Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
265 Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
266 Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
267 Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
268 # endif
269 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
270 /*
271 * The ARM assembly module expects a specific dword order in Htable.
272 */
273 {
274 int j;
275 DECLARE_IS_ENDIAN;
276
277 if (IS_LITTLE_ENDIAN)
278 for (j = 0; j < 16; ++j) {
279 V = Htable[j];
280 Htable[j].hi = V.lo;
281 Htable[j].lo = V.hi;
282 } else
283 for (j = 0; j < 16; ++j) {
284 V = Htable[j];
285 Htable[j].hi = V.lo << 32 | V.lo >> 32;
286 Htable[j].lo = V.hi << 32 | V.hi >> 32;
287 }
288 }
289 # endif
290 }
291
292 # ifndef GHASH_ASM
293 static const size_t rem_4bit[16] = {
294 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
295 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
296 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
297 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
298 };
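
/*
 * Added note: rem_4bit[i] is the reduction contribution of the 4 bits
 * shifted out of the low end of Z during a 4-bit shift. Each set bit of
 * i contributes a suitably shifted copy of the reflected reduction
 * polynomial (the 0xE1 pattern of x^128 + x^7 + x^2 + x + 1), and PACK()
 * positions the result in the top 16 bits, where it is XORed into Z.hi
 * after every shift in the loops below.
 */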
299
300 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
301 {
302 u128 Z;
303 int cnt = 15;
304 size_t rem, nlo, nhi;
305 DECLARE_IS_ENDIAN;
306
307 nlo = ((const u8 *)Xi)[15];
308 nhi = nlo >> 4;
309 nlo &= 0xf;
310
311 Z.hi = Htable[nlo].hi;
312 Z.lo = Htable[nlo].lo;
313
314 while (1) {
315 rem = (size_t)Z.lo & 0xf;
316 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
317 Z.hi = (Z.hi >> 4);
318 if (sizeof(size_t) == 8)
319 Z.hi ^= rem_4bit[rem];
320 else
321 Z.hi ^= (u64)rem_4bit[rem] << 32;
322
323 Z.hi ^= Htable[nhi].hi;
324 Z.lo ^= Htable[nhi].lo;
325
326 if (--cnt < 0)
327 break;
328
329 nlo = ((const u8 *)Xi)[cnt];
330 nhi = nlo >> 4;
331 nlo &= 0xf;
332
333 rem = (size_t)Z.lo & 0xf;
334 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
335 Z.hi = (Z.hi >> 4);
336 if (sizeof(size_t) == 8)
337 Z.hi ^= rem_4bit[rem];
338 else
339 Z.hi ^= (u64)rem_4bit[rem] << 32;
340
341 Z.hi ^= Htable[nlo].hi;
342 Z.lo ^= Htable[nlo].lo;
343 }
344
345 if (IS_LITTLE_ENDIAN) {
346 # ifdef BSWAP8
347 Xi[0] = BSWAP8(Z.hi);
348 Xi[1] = BSWAP8(Z.lo);
349 # else
350 u8 *p = (u8 *)Xi;
351 u32 v;
352 v = (u32)(Z.hi >> 32);
353 PUTU32(p, v);
354 v = (u32)(Z.hi);
355 PUTU32(p + 4, v);
356 v = (u32)(Z.lo >> 32);
357 PUTU32(p + 8, v);
358 v = (u32)(Z.lo);
359 PUTU32(p + 12, v);
360 # endif
361 } else {
362 Xi[0] = Z.hi;
363 Xi[1] = Z.lo;
364 }
365 }
366
367 # if !defined(OPENSSL_SMALL_FOOTPRINT)
368 /*
369 * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
370 * for details... Compiler-generated code doesn't seem to give any
371 * performance improvement, at least not on x86[_64]. It's here
372 * mostly as a reference and a placeholder for possible future
373 * non-trivial optimization[s]...
374 */
375 static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
376 const u8 *inp, size_t len)
377 {
378 u128 Z;
379 int cnt;
380 size_t rem, nlo, nhi;
381 DECLARE_IS_ENDIAN;
382
383 # if 1
384 do {
385 cnt = 15;
386 nlo = ((const u8 *)Xi)[15];
387 nlo ^= inp[15];
388 nhi = nlo >> 4;
389 nlo &= 0xf;
390
391 Z.hi = Htable[nlo].hi;
392 Z.lo = Htable[nlo].lo;
393
394 while (1) {
395 rem = (size_t)Z.lo & 0xf;
396 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
397 Z.hi = (Z.hi >> 4);
398 if (sizeof(size_t) == 8)
399 Z.hi ^= rem_4bit[rem];
400 else
401 Z.hi ^= (u64)rem_4bit[rem] << 32;
402
403 Z.hi ^= Htable[nhi].hi;
404 Z.lo ^= Htable[nhi].lo;
405
406 if (--cnt < 0)
407 break;
408
409 nlo = ((const u8 *)Xi)[cnt];
410 nlo ^= inp[cnt];
411 nhi = nlo >> 4;
412 nlo &= 0xf;
413
414 rem = (size_t)Z.lo & 0xf;
415 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
416 Z.hi = (Z.hi >> 4);
417 if (sizeof(size_t) == 8)
418 Z.hi ^= rem_4bit[rem];
419 else
420 Z.hi ^= (u64)rem_4bit[rem] << 32;
421
422 Z.hi ^= Htable[nlo].hi;
423 Z.lo ^= Htable[nlo].lo;
424 }
425 # else
426 /*
427 * An extra 256+16 bytes per key plus a 512-byte shared table
428 * [should] give a ~50% improvement... One could have PACK()-ed
429 * the rem_8bit even here, but the priority is to minimize
430 * cache footprint...
431 */
432 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
433 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
434 static const unsigned short rem_8bit[256] = {
435 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
436 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
437 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
438 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
439 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
440 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
441 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
442 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
443 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
444 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
445 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
446 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
447 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
448 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
449 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
450 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
451 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
452 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
453 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
454 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
455 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
456 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
457 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
458 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
459 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
460 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
461 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
462 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
463 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
464 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
465 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
466 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
467 };
468 /*
469 * This pre-processing phase slows down the procedure by approximately
470 * the same amount of time as it makes each loop spin faster. In other
471 * words, single-block performance is roughly that of the straightforward
472 * "4-bit" implementation, and longer inputs only get faster...
473 */
474 for (cnt = 0; cnt < 16; ++cnt) {
475 Z.hi = Htable[cnt].hi;
476 Z.lo = Htable[cnt].lo;
477 Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
478 Hshr4[cnt].hi = (Z.hi >> 4);
479 Hshl4[cnt] = (u8)(Z.lo << 4);
480 }
481
482 do {
483 for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
484 nlo = ((const u8 *)Xi)[cnt];
485 nlo ^= inp[cnt];
486 nhi = nlo >> 4;
487 nlo &= 0xf;
488
489 Z.hi ^= Htable[nlo].hi;
490 Z.lo ^= Htable[nlo].lo;
491
492 rem = (size_t)Z.lo & 0xff;
493
494 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
495 Z.hi = (Z.hi >> 8);
496
497 Z.hi ^= Hshr4[nhi].hi;
498 Z.lo ^= Hshr4[nhi].lo;
499 Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
500 }
501
502 nlo = ((const u8 *)Xi)[0];
503 nlo ^= inp[0];
504 nhi = nlo >> 4;
505 nlo &= 0xf;
506
507 Z.hi ^= Htable[nlo].hi;
508 Z.lo ^= Htable[nlo].lo;
509
510 rem = (size_t)Z.lo & 0xf;
511
512 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
513 Z.hi = (Z.hi >> 4);
514
515 Z.hi ^= Htable[nhi].hi;
516 Z.lo ^= Htable[nhi].lo;
517 Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
518 # endif
519
520 if (IS_LITTLE_ENDIAN) {
521 # ifdef BSWAP8
522 Xi[0] = BSWAP8(Z.hi);
523 Xi[1] = BSWAP8(Z.lo);
524 # else
525 u8 *p = (u8 *)Xi;
526 u32 v;
527 v = (u32)(Z.hi >> 32);
528 PUTU32(p, v);
529 v = (u32)(Z.hi);
530 PUTU32(p + 4, v);
531 v = (u32)(Z.lo >> 32);
532 PUTU32(p + 8, v);
533 v = (u32)(Z.lo);
534 PUTU32(p + 12, v);
535 # endif
536 } else {
537 Xi[0] = Z.hi;
538 Xi[1] = Z.lo;
539 }
540 } while (inp += 16, len -= 16);
541 }
542 # endif
543 # else
544 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
545 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
546 size_t len);
547 # endif
548
549 # define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
550 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
551 # define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
552 /*
553 * GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
554 * thrashing effect. In other words, the idea is to hash data while it is
555 * still in the L1 cache after the encryption pass...
556 */
557 # define GHASH_CHUNK (3*1024)
558 # endif
559
560 #else /* TABLE_BITS */
561
562 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
563 {
564 u128 V, Z = { 0, 0 };
565 long X;
566 int i, j;
567 const long *xi = (const long *)Xi;
568 DECLARE_IS_ENDIAN;
569
570 V.hi = H[0]; /* H is in host byte order, no byte swapping */
571 V.lo = H[1];
572
573 for (j = 0; j < 16 / sizeof(long); ++j) {
574 if (IS_LITTLE_ENDIAN) {
575 if (sizeof(long) == 8) {
576 # ifdef BSWAP8
577 X = (long)(BSWAP8(xi[j]));
578 # else
579 const u8 *p = (const u8 *)(xi + j);
580 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
581 # endif
582 } else {
583 const u8 *p = (const u8 *)(xi + j);
584 X = (long)GETU32(p);
585 }
586 } else
587 X = xi[j];
588
589 for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
590 u64 M = (u64)(X >> (8 * sizeof(long) - 1));
591 Z.hi ^= V.hi & M;
592 Z.lo ^= V.lo & M;
593
594 REDUCE1BIT(V);
595 }
596 }
597
598 if (IS_LITTLE_ENDIAN) {
599 # ifdef BSWAP8
600 Xi[0] = BSWAP8(Z.hi);
601 Xi[1] = BSWAP8(Z.lo);
602 # else
603 u8 *p = (u8 *)Xi;
604 u32 v;
605 v = (u32)(Z.hi >> 32);
606 PUTU32(p, v);
607 v = (u32)(Z.hi);
608 PUTU32(p + 4, v);
609 v = (u32)(Z.lo >> 32);
610 PUTU32(p + 8, v);
611 v = (u32)(Z.lo);
612 PUTU32(p + 12, v);
613 # endif
614 } else {
615 Xi[0] = Z.hi;
616 Xi[1] = Z.lo;
617 }
618 }
619
620 # define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
621
622 #endif
623
624 #if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
625 # if !defined(I386_ONLY) && \
626 (defined(__i386) || defined(__i386__) || \
627 defined(__x86_64) || defined(__x86_64__) || \
628 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
629 # define GHASH_ASM_X86_OR_64
630 # define GCM_FUNCREF_4BIT
631
632 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
633 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
634 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
635 size_t len);
636
637 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
638 # define gcm_init_avx gcm_init_clmul
639 # define gcm_gmult_avx gcm_gmult_clmul
640 # define gcm_ghash_avx gcm_ghash_clmul
641 # else
642 void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
643 void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
644 void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
645 size_t len);
646 # endif
647
648 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
649 # define GHASH_ASM_X86
650 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
651 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
652 size_t len);
653
654 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
655 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
656 size_t len);
657 # endif
658 # elif (defined(__arm__) || defined(__arm) || defined(__aarch64__)) && defined(GHASH_ASM)
659 # include "arm_arch.h"
660 # if __ARM_MAX_ARCH__>=7
661 # define GHASH_ASM_ARM
662 # define GCM_FUNCREF_4BIT
663 # define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
664 # if defined(__arm__) || defined(__arm)
665 # define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
666 # endif
667 void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
668 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
669 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
670 size_t len);
671 void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
672 void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
673 void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
674 size_t len);
675 # endif
676 # elif defined(__sparc__) || defined(__sparc)
677 # include "crypto/sparc_arch.h"
678 # if defined(__arch64__)
679 # define GHASH_ASM_SPARC
680 # define GCM_FUNCREF_4BIT
681 extern unsigned int OPENSSL_sparcv9cap_P[];
682 void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
683 void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
684 void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
685 size_t len);
686 # endif /* __arch64__ */
687 # elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
688 # include "crypto/ppc_arch.h"
689 # define GHASH_ASM_PPC
690 # define GCM_FUNCREF_4BIT
691 void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
692 void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
693 void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
694 size_t len);
695 # endif
696 #endif
697
698 #ifdef GCM_FUNCREF_4BIT
699 # undef GCM_MUL
700 # define GCM_MUL(ctx) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
701 # ifdef GHASH
702 # undef GHASH
703 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
704 # endif
705 #endif
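
/*
 * Added note: when GCM_FUNCREF_4BIT is defined, the GHASH primitives are
 * reached through the ctx->gmult/ctx->ghash function pointers selected in
 * CRYPTO_gcm128_init(); each routine below captures them into local
 * gcm_gmult_p/gcm_ghash_p variables, which the redefined GCM_MUL/GHASH
 * macros reference.
 */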
706
707 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
708 {
709 DECLARE_IS_ENDIAN;
710
711 memset(ctx, 0, sizeof(*ctx));
712 ctx->block = block;
713 ctx->key = key;
714
715 (*block) (ctx->H.c, ctx->H.c, key);
716
717 if (IS_LITTLE_ENDIAN) {
718 /* H is stored in host byte order */
719 #ifdef BSWAP8
720 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
721 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
722 #else
723 u8 *p = ctx->H.c;
724 u64 hi, lo;
725 hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
726 lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
727 ctx->H.u[0] = hi;
728 ctx->H.u[1] = lo;
729 #endif
730 }
731 #if TABLE_BITS==8
732 gcm_init_8bit(ctx->Htable, ctx->H.u);
733 #elif TABLE_BITS==4
734 # if defined(GHASH)
735 # define CTX__GHASH(f) (ctx->ghash = (f))
736 # else
737 # define CTX__GHASH(f) (ctx->ghash = NULL)
738 # endif
739 # if defined(GHASH_ASM_X86_OR_64)
740 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
741 if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
742 if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
743 gcm_init_avx(ctx->Htable, ctx->H.u);
744 ctx->gmult = gcm_gmult_avx;
745 CTX__GHASH(gcm_ghash_avx);
746 } else {
747 gcm_init_clmul(ctx->Htable, ctx->H.u);
748 ctx->gmult = gcm_gmult_clmul;
749 CTX__GHASH(gcm_ghash_clmul);
750 }
751 return;
752 }
753 # endif
754 gcm_init_4bit(ctx->Htable, ctx->H.u);
755 # if defined(GHASH_ASM_X86) /* x86 only */
756 # if defined(OPENSSL_IA32_SSE2)
757 if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
758 # else
759 if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
760 # endif
761 ctx->gmult = gcm_gmult_4bit_mmx;
762 CTX__GHASH(gcm_ghash_4bit_mmx);
763 } else {
764 ctx->gmult = gcm_gmult_4bit_x86;
765 CTX__GHASH(gcm_ghash_4bit_x86);
766 }
767 # else
768 ctx->gmult = gcm_gmult_4bit;
769 CTX__GHASH(gcm_ghash_4bit);
770 # endif
771 # elif defined(GHASH_ASM_ARM)
772 # ifdef PMULL_CAPABLE
773 if (PMULL_CAPABLE) {
774 gcm_init_v8(ctx->Htable, ctx->H.u);
775 ctx->gmult = gcm_gmult_v8;
776 CTX__GHASH(gcm_ghash_v8);
777 } else
778 # endif
779 # ifdef NEON_CAPABLE
780 if (NEON_CAPABLE) {
781 gcm_init_neon(ctx->Htable, ctx->H.u);
782 ctx->gmult = gcm_gmult_neon;
783 CTX__GHASH(gcm_ghash_neon);
784 } else
785 # endif
786 {
787 gcm_init_4bit(ctx->Htable, ctx->H.u);
788 ctx->gmult = gcm_gmult_4bit;
789 CTX__GHASH(gcm_ghash_4bit);
790 }
791 # elif defined(GHASH_ASM_SPARC)
792 if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
793 gcm_init_vis3(ctx->Htable, ctx->H.u);
794 ctx->gmult = gcm_gmult_vis3;
795 CTX__GHASH(gcm_ghash_vis3);
796 } else {
797 gcm_init_4bit(ctx->Htable, ctx->H.u);
798 ctx->gmult = gcm_gmult_4bit;
799 CTX__GHASH(gcm_ghash_4bit);
800 }
801 # elif defined(GHASH_ASM_PPC)
802 if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
803 gcm_init_p8(ctx->Htable, ctx->H.u);
804 ctx->gmult = gcm_gmult_p8;
805 CTX__GHASH(gcm_ghash_p8);
806 } else {
807 gcm_init_4bit(ctx->Htable, ctx->H.u);
808 ctx->gmult = gcm_gmult_4bit;
809 CTX__GHASH(gcm_ghash_4bit);
810 }
811 # else
812 gcm_init_4bit(ctx->Htable, ctx->H.u);
813 # endif
814 # undef CTX__GHASH
815 #endif
816 }
817
818 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
819 size_t len)
820 {
821 DECLARE_IS_ENDIAN;
822 unsigned int ctr;
823 #ifdef GCM_FUNCREF_4BIT
824 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
825 #endif
826
827 ctx->len.u[0] = 0; /* AAD length */
828 ctx->len.u[1] = 0; /* message length */
829 ctx->ares = 0;
830 ctx->mres = 0;
831
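    /*
     * Added note: per NIST SP 800-38D, the pre-counter block J0 is
     * IV || 0^31 || 1 when the IV is exactly 96 bits long; otherwise it
     * is GHASH over the zero-padded IV followed by the 64-bit IV bit
     * length, which the else-branch below computes using ctx->Xi.
     */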
832 if (len == 12) {
833 memcpy(ctx->Yi.c, iv, 12);
834 ctx->Yi.c[12] = 0;
835 ctx->Yi.c[13] = 0;
836 ctx->Yi.c[14] = 0;
837 ctx->Yi.c[15] = 1;
838 ctr = 1;
839 } else {
840 size_t i;
841 u64 len0 = len;
842
843 /* Borrow ctx->Xi to calculate initial Yi */
844 ctx->Xi.u[0] = 0;
845 ctx->Xi.u[1] = 0;
846
847 while (len >= 16) {
848 for (i = 0; i < 16; ++i)
849 ctx->Xi.c[i] ^= iv[i];
850 GCM_MUL(ctx);
851 iv += 16;
852 len -= 16;
853 }
854 if (len) {
855 for (i = 0; i < len; ++i)
856 ctx->Xi.c[i] ^= iv[i];
857 GCM_MUL(ctx);
858 }
859 len0 <<= 3;
860 if (IS_LITTLE_ENDIAN) {
861 #ifdef BSWAP8
862 ctx->Xi.u[1] ^= BSWAP8(len0);
863 #else
864 ctx->Xi.c[8] ^= (u8)(len0 >> 56);
865 ctx->Xi.c[9] ^= (u8)(len0 >> 48);
866 ctx->Xi.c[10] ^= (u8)(len0 >> 40);
867 ctx->Xi.c[11] ^= (u8)(len0 >> 32);
868 ctx->Xi.c[12] ^= (u8)(len0 >> 24);
869 ctx->Xi.c[13] ^= (u8)(len0 >> 16);
870 ctx->Xi.c[14] ^= (u8)(len0 >> 8);
871 ctx->Xi.c[15] ^= (u8)(len0);
872 #endif
873 } else {
874 ctx->Xi.u[1] ^= len0;
875 }
876
877 GCM_MUL(ctx);
878
879 if (IS_LITTLE_ENDIAN)
880 #ifdef BSWAP4
881 ctr = BSWAP4(ctx->Xi.d[3]);
882 #else
883 ctr = GETU32(ctx->Xi.c + 12);
884 #endif
885 else
886 ctr = ctx->Xi.d[3];
887
888 /* Copy borrowed Xi to Yi */
889 ctx->Yi.u[0] = ctx->Xi.u[0];
890 ctx->Yi.u[1] = ctx->Xi.u[1];
891 }
892
893 ctx->Xi.u[0] = 0;
894 ctx->Xi.u[1] = 0;
895
896 (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
897 ++ctr;
898 if (IS_LITTLE_ENDIAN)
899 #ifdef BSWAP4
900 ctx->Yi.d[3] = BSWAP4(ctr);
901 #else
902 PUTU32(ctx->Yi.c + 12, ctr);
903 #endif
904 else
905 ctx->Yi.d[3] = ctr;
906 }
907
908 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
909 size_t len)
910 {
911 size_t i;
912 unsigned int n;
913 u64 alen = ctx->len.u[0];
914 #ifdef GCM_FUNCREF_4BIT
915 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
916 # ifdef GHASH
917 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
918 const u8 *inp, size_t len) = ctx->ghash;
919 # endif
920 #endif
921
922 if (ctx->len.u[1])
923 return -2;
924
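    /*
     * Added note: SP 800-38D caps the AAD at 2^64 - 1 bits, i.e. at most
     * 2^61 bytes, hence the limit checked below.
     */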
925 alen += len;
926 if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
927 return -1;
928 ctx->len.u[0] = alen;
929
930 n = ctx->ares;
931 if (n) {
932 while (n && len) {
933 ctx->Xi.c[n] ^= *(aad++);
934 --len;
935 n = (n + 1) % 16;
936 }
937 if (n == 0)
938 GCM_MUL(ctx);
939 else {
940 ctx->ares = n;
941 return 0;
942 }
943 }
944 #ifdef GHASH
945 if ((i = (len & (size_t)-16))) {
946 GHASH(ctx, aad, i);
947 aad += i;
948 len -= i;
949 }
950 #else
951 while (len >= 16) {
952 for (i = 0; i < 16; ++i)
953 ctx->Xi.c[i] ^= aad[i];
954 GCM_MUL(ctx);
955 aad += 16;
956 len -= 16;
957 }
958 #endif
959 if (len) {
960 n = (unsigned int)len;
961 for (i = 0; i < len; ++i)
962 ctx->Xi.c[i] ^= aad[i];
963 }
964
965 ctx->ares = n;
966 return 0;
967 }
968
969 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
970 const unsigned char *in, unsigned char *out,
971 size_t len)
972 {
973 DECLARE_IS_ENDIAN;
974 unsigned int n, ctr, mres;
975 size_t i;
976 u64 mlen = ctx->len.u[1];
977 block128_f block = ctx->block;
978 void *key = ctx->key;
979 #ifdef GCM_FUNCREF_4BIT
980 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
981 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
982 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
983 const u8 *inp, size_t len) = ctx->ghash;
984 # endif
985 #endif
986
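    /*
     * Added note: SP 800-38D caps the plaintext at 2^39 - 256 bits, i.e.
     * 2^36 - 32 bytes, hence the limit checked below.
     */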
987 mlen += len;
988 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
989 return -1;
990 ctx->len.u[1] = mlen;
991
992 mres = ctx->mres;
993
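    /*
     * Added note: in the batched-GHASH configuration, a pending AAD
     * residue is handled by copying Xi to the head of ctx->Xn and zeroing
     * Xi, so that later GHASH calls over Xn fold it in correctly; mres
     * then counts the bytes buffered in Xn rather than the residue within
     * a single block.
     */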
994 if (ctx->ares) {
995 /* First call to encrypt finalizes GHASH(AAD) */
996 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
997 if (len == 0) {
998 GCM_MUL(ctx);
999 ctx->ares = 0;
1000 return 0;
1001 }
1002 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1003 ctx->Xi.u[0] = 0;
1004 ctx->Xi.u[1] = 0;
1005 mres = sizeof(ctx->Xi);
1006 #else
1007 GCM_MUL(ctx);
1008 #endif
1009 ctx->ares = 0;
1010 }
1011
1012 if (IS_LITTLE_ENDIAN)
1013 #ifdef BSWAP4
1014 ctr = BSWAP4(ctx->Yi.d[3]);
1015 #else
1016 ctr = GETU32(ctx->Yi.c + 12);
1017 #endif
1018 else
1019 ctr = ctx->Yi.d[3];
1020
1021 n = mres % 16;
1022 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1023 if (16 % sizeof(size_t) == 0) { /* always true actually */
1024 do {
1025 if (n) {
1026 # if defined(GHASH)
1027 while (n && len) {
1028 ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1029 --len;
1030 n = (n + 1) % 16;
1031 }
1032 if (n == 0) {
1033 GHASH(ctx, ctx->Xn, mres);
1034 mres = 0;
1035 } else {
1036 ctx->mres = mres;
1037 return 0;
1038 }
1039 # else
1040 while (n && len) {
1041 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1042 --len;
1043 n = (n + 1) % 16;
1044 }
1045 if (n == 0) {
1046 GCM_MUL(ctx);
1047 mres = 0;
1048 } else {
1049 ctx->mres = n;
1050 return 0;
1051 }
1052 # endif
1053 }
1054 # if defined(STRICT_ALIGNMENT)
1055 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1056 break;
1057 # endif
1058 # if defined(GHASH)
1059 if (len >= 16 && mres) {
1060 GHASH(ctx, ctx->Xn, mres);
1061 mres = 0;
1062 }
1063 # if defined(GHASH_CHUNK)
1064 while (len >= GHASH_CHUNK) {
1065 size_t j = GHASH_CHUNK;
1066
1067 while (j) {
1068 size_t_aX *out_t = (size_t_aX *)out;
1069 const size_t_aX *in_t = (const size_t_aX *)in;
1070
1071 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1072 ++ctr;
1073 if (IS_LITTLE_ENDIAN)
1074 # ifdef BSWAP4
1075 ctx->Yi.d[3] = BSWAP4(ctr);
1076 # else
1077 PUTU32(ctx->Yi.c + 12, ctr);
1078 # endif
1079 else
1080 ctx->Yi.d[3] = ctr;
1081 for (i = 0; i < 16 / sizeof(size_t); ++i)
1082 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1083 out += 16;
1084 in += 16;
1085 j -= 16;
1086 }
1087 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1088 len -= GHASH_CHUNK;
1089 }
1090 # endif
1091 if ((i = (len & (size_t)-16))) {
1092 size_t j = i;
1093
1094 while (len >= 16) {
1095 size_t_aX *out_t = (size_t_aX *)out;
1096 const size_t_aX *in_t = (const size_t_aX *)in;
1097
1098 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1099 ++ctr;
1100 if (IS_LITTLE_ENDIAN)
1101 # ifdef BSWAP4
1102 ctx->Yi.d[3] = BSWAP4(ctr);
1103 # else
1104 PUTU32(ctx->Yi.c + 12, ctr);
1105 # endif
1106 else
1107 ctx->Yi.d[3] = ctr;
1108 for (i = 0; i < 16 / sizeof(size_t); ++i)
1109 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1110 out += 16;
1111 in += 16;
1112 len -= 16;
1113 }
1114 GHASH(ctx, out - j, j);
1115 }
1116 # else
1117 while (len >= 16) {
1118 size_t *out_t = (size_t *)out;
1119 const size_t *in_t = (const size_t *)in;
1120
1121 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1122 ++ctr;
1123 if (IS_LITTLE_ENDIAN)
1124 # ifdef BSWAP4
1125 ctx->Yi.d[3] = BSWAP4(ctr);
1126 # else
1127 PUTU32(ctx->Yi.c + 12, ctr);
1128 # endif
1129 else
1130 ctx->Yi.d[3] = ctr;
1131 for (i = 0; i < 16 / sizeof(size_t); ++i)
1132 ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1133 GCM_MUL(ctx);
1134 out += 16;
1135 in += 16;
1136 len -= 16;
1137 }
1138 # endif
1139 if (len) {
1140 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1141 ++ctr;
1142 if (IS_LITTLE_ENDIAN)
1143 # ifdef BSWAP4
1144 ctx->Yi.d[3] = BSWAP4(ctr);
1145 # else
1146 PUTU32(ctx->Yi.c + 12, ctr);
1147 # endif
1148 else
1149 ctx->Yi.d[3] = ctr;
1150 # if defined(GHASH)
1151 while (len--) {
1152 ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1153 ++n;
1154 }
1155 # else
1156 while (len--) {
1157 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1158 ++n;
1159 }
1160 mres = n;
1161 # endif
1162 }
1163
1164 ctx->mres = mres;
1165 return 0;
1166 } while (0);
1167 }
1168 #endif
1169 for (i = 0; i < len; ++i) {
1170 if (n == 0) {
1171 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1172 ++ctr;
1173 if (IS_LITTLE_ENDIAN)
1174 #ifdef BSWAP4
1175 ctx->Yi.d[3] = BSWAP4(ctr);
1176 #else
1177 PUTU32(ctx->Yi.c + 12, ctr);
1178 #endif
1179 else
1180 ctx->Yi.d[3] = ctr;
1181 }
1182 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1183 ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
1184 n = (n + 1) % 16;
1185 if (mres == sizeof(ctx->Xn)) {
1186 GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
1187 mres = 0;
1188 }
1189 #else
1190 ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1191 mres = n = (n + 1) % 16;
1192 if (n == 0)
1193 GCM_MUL(ctx);
1194 #endif
1195 }
1196
1197 ctx->mres = mres;
1198 return 0;
1199 }
1200
1201 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1202 const unsigned char *in, unsigned char *out,
1203 size_t len)
1204 {
1205 DECLARE_IS_ENDIAN;
1206 unsigned int n, ctr, mres;
1207 size_t i;
1208 u64 mlen = ctx->len.u[1];
1209 block128_f block = ctx->block;
1210 void *key = ctx->key;
1211 #ifdef GCM_FUNCREF_4BIT
1212 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1213 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1214 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1215 const u8 *inp, size_t len) = ctx->ghash;
1216 # endif
1217 #endif
1218
1219 mlen += len;
1220 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1221 return -1;
1222 ctx->len.u[1] = mlen;
1223
1224 mres = ctx->mres;
1225
1226 if (ctx->ares) {
1227 /* First call to decrypt finalizes GHASH(AAD) */
1228 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1229 if (len == 0) {
1230 GCM_MUL(ctx);
1231 ctx->ares = 0;
1232 return 0;
1233 }
1234 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1235 ctx->Xi.u[0] = 0;
1236 ctx->Xi.u[1] = 0;
1237 mres = sizeof(ctx->Xi);
1238 #else
1239 GCM_MUL(ctx);
1240 #endif
1241 ctx->ares = 0;
1242 }
1243
1244 if (IS_LITTLE_ENDIAN)
1245 #ifdef BSWAP4
1246 ctr = BSWAP4(ctx->Yi.d[3]);
1247 #else
1248 ctr = GETU32(ctx->Yi.c + 12);
1249 #endif
1250 else
1251 ctr = ctx->Yi.d[3];
1252
1253 n = mres % 16;
1254 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1255 if (16 % sizeof(size_t) == 0) { /* always true actually */
1256 do {
1257 if (n) {
1258 # if defined(GHASH)
1259 while (n && len) {
1260 *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1261 --len;
1262 n = (n + 1) % 16;
1263 }
1264 if (n == 0) {
1265 GHASH(ctx, ctx->Xn, mres);
1266 mres = 0;
1267 } else {
1268 ctx->mres = mres;
1269 return 0;
1270 }
1271 # else
1272 while (n && len) {
1273 u8 c = *(in++);
1274 *(out++) = c ^ ctx->EKi.c[n];
1275 ctx->Xi.c[n] ^= c;
1276 --len;
1277 n = (n + 1) % 16;
1278 }
1279 if (n == 0) {
1280 GCM_MUL(ctx);
1281 mres = 0;
1282 } else {
1283 ctx->mres = n;
1284 return 0;
1285 }
1286 # endif
1287 }
1288 # if defined(STRICT_ALIGNMENT)
1289 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1290 break;
1291 # endif
1292 # if defined(GHASH)
1293 if (len >= 16 && mres) {
1294 GHASH(ctx, ctx->Xn, mres);
1295 mres = 0;
1296 }
1297 # if defined(GHASH_CHUNK)
1298 while (len >= GHASH_CHUNK) {
1299 size_t j = GHASH_CHUNK;
1300
1301 GHASH(ctx, in, GHASH_CHUNK);
1302 while (j) {
1303 size_t_aX *out_t = (size_t_aX *)out;
1304 const size_t_aX *in_t = (const size_t_aX *)in;
1305
1306 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1307 ++ctr;
1308 if (IS_LITTLE_ENDIAN)
1309 # ifdef BSWAP4
1310 ctx->Yi.d[3] = BSWAP4(ctr);
1311 # else
1312 PUTU32(ctx->Yi.c + 12, ctr);
1313 # endif
1314 else
1315 ctx->Yi.d[3] = ctr;
1316 for (i = 0; i < 16 / sizeof(size_t); ++i)
1317 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1318 out += 16;
1319 in += 16;
1320 j -= 16;
1321 }
1322 len -= GHASH_CHUNK;
1323 }
1324 # endif
1325 if ((i = (len & (size_t)-16))) {
1326 GHASH(ctx, in, i);
1327 while (len >= 16) {
1328 size_t_aX *out_t = (size_t_aX *)out;
1329 const size_t_aX *in_t = (const size_t_aX *)in;
1330
1331 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1332 ++ctr;
1333 if (IS_LITTLE_ENDIAN)
1334 # ifdef BSWAP4
1335 ctx->Yi.d[3] = BSWAP4(ctr);
1336 # else
1337 PUTU32(ctx->Yi.c + 12, ctr);
1338 # endif
1339 else
1340 ctx->Yi.d[3] = ctr;
1341 for (i = 0; i < 16 / sizeof(size_t); ++i)
1342 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1343 out += 16;
1344 in += 16;
1345 len -= 16;
1346 }
1347 }
1348 # else
1349 while (len >= 16) {
1350 size_t *out_t = (size_t *)out;
1351 const size_t *in_t = (const size_t *)in;
1352
1353 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1354 ++ctr;
1355 if (IS_LITTLE_ENDIAN)
1356 # ifdef BSWAP4
1357 ctx->Yi.d[3] = BSWAP4(ctr);
1358 # else
1359 PUTU32(ctx->Yi.c + 12, ctr);
1360 # endif
1361 else
1362 ctx->Yi.d[3] = ctr;
1363 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1364 size_t c = in_t[i];
1365 out_t[i] = c ^ ctx->EKi.t[i];
1366 ctx->Xi.t[i] ^= c;
1367 }
1368 GCM_MUL(ctx);
1369 out += 16;
1370 in += 16;
1371 len -= 16;
1372 }
1373 # endif
1374 if (len) {
1375 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1376 ++ctr;
1377 if (IS_LITTLE_ENDIAN)
1378 # ifdef BSWAP4
1379 ctx->Yi.d[3] = BSWAP4(ctr);
1380 # else
1381 PUTU32(ctx->Yi.c + 12, ctr);
1382 # endif
1383 else
1384 ctx->Yi.d[3] = ctr;
1385 # if defined(GHASH)
1386 while (len--) {
1387 out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1388 ++n;
1389 }
1390 # else
1391 while (len--) {
1392 u8 c = in[n];
1393 ctx->Xi.c[n] ^= c;
1394 out[n] = c ^ ctx->EKi.c[n];
1395 ++n;
1396 }
1397 mres = n;
1398 # endif
1399 }
1400
1401 ctx->mres = mres;
1402 return 0;
1403 } while (0);
1404 }
1405 #endif
1406 for (i = 0; i < len; ++i) {
1407 u8 c;
1408 if (n == 0) {
1409 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1410 ++ctr;
1411 if (IS_LITTLE_ENDIAN)
1412 #ifdef BSWAP4
1413 ctx->Yi.d[3] = BSWAP4(ctr);
1414 #else
1415 PUTU32(ctx->Yi.c + 12, ctr);
1416 #endif
1417 else
1418 ctx->Yi.d[3] = ctr;
1419 }
1420 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1421 out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
1422 n = (n + 1) % 16;
1423 if (mres == sizeof(ctx->Xn)) {
1424 GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
1425 mres = 0;
1426 }
1427 #else
1428 c = in[i];
1429 out[i] = c ^ ctx->EKi.c[n];
1430 ctx->Xi.c[n] ^= c;
1431 mres = n = (n + 1) % 16;
1432 if (n == 0)
1433 GCM_MUL(ctx);
1434 #endif
1435 }
1436
1437 ctx->mres = mres;
1438 return 0;
1439 }
1440
1441 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1442 const unsigned char *in, unsigned char *out,
1443 size_t len, ctr128_f stream)
1444 {
1445 #if defined(OPENSSL_SMALL_FOOTPRINT)
1446 return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1447 #else
1448 DECLARE_IS_ENDIAN;
1449 unsigned int n, ctr, mres;
1450 size_t i;
1451 u64 mlen = ctx->len.u[1];
1452 void *key = ctx->key;
1453 # ifdef GCM_FUNCREF_4BIT
1454 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1455 # ifdef GHASH
1456 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1457 const u8 *inp, size_t len) = ctx->ghash;
1458 # endif
1459 # endif
1460
1461 mlen += len;
1462 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1463 return -1;
1464 ctx->len.u[1] = mlen;
1465
1466 mres = ctx->mres;
1467
1468 if (ctx->ares) {
1469 /* First call to encrypt finalizes GHASH(AAD) */
1470 #if defined(GHASH)
1471 if (len == 0) {
1472 GCM_MUL(ctx);
1473 ctx->ares = 0;
1474 return 0;
1475 }
1476 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1477 ctx->Xi.u[0] = 0;
1478 ctx->Xi.u[1] = 0;
1479 mres = sizeof(ctx->Xi);
1480 #else
1481 GCM_MUL(ctx);
1482 #endif
1483 ctx->ares = 0;
1484 }
1485
1486 if (IS_LITTLE_ENDIAN)
1487 # ifdef BSWAP4
1488 ctr = BSWAP4(ctx->Yi.d[3]);
1489 # else
1490 ctr = GETU32(ctx->Yi.c + 12);
1491 # endif
1492 else
1493 ctr = ctx->Yi.d[3];
1494
1495 n = mres % 16;
1496 if (n) {
1497 # if defined(GHASH)
1498 while (n && len) {
1499 ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1500 --len;
1501 n = (n + 1) % 16;
1502 }
1503 if (n == 0) {
1504 GHASH(ctx, ctx->Xn, mres);
1505 mres = 0;
1506 } else {
1507 ctx->mres = mres;
1508 return 0;
1509 }
1510 # else
1511 while (n && len) {
1512 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1513 --len;
1514 n = (n + 1) % 16;
1515 }
1516 if (n == 0) {
1517 GCM_MUL(ctx);
1518 mres = 0;
1519 } else {
1520 ctx->mres = n;
1521 return 0;
1522 }
1523 # endif
1524 }
1525 # if defined(GHASH)
1526 if (len >= 16 && mres) {
1527 GHASH(ctx, ctx->Xn, mres);
1528 mres = 0;
1529 }
1530 # if defined(GHASH_CHUNK)
1531 while (len >= GHASH_CHUNK) {
1532 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1533 ctr += GHASH_CHUNK / 16;
1534 if (IS_LITTLE_ENDIAN)
1535 # ifdef BSWAP4
1536 ctx->Yi.d[3] = BSWAP4(ctr);
1537 # else
1538 PUTU32(ctx->Yi.c + 12, ctr);
1539 # endif
1540 else
1541 ctx->Yi.d[3] = ctr;
1542 GHASH(ctx, out, GHASH_CHUNK);
1543 out += GHASH_CHUNK;
1544 in += GHASH_CHUNK;
1545 len -= GHASH_CHUNK;
1546 }
1547 # endif
1548 # endif
1549 if ((i = (len & (size_t)-16))) {
1550 size_t j = i / 16;
1551
1552 (*stream) (in, out, j, key, ctx->Yi.c);
1553 ctr += (unsigned int)j;
1554 if (IS_LITTLE_ENDIAN)
1555 # ifdef BSWAP4
1556 ctx->Yi.d[3] = BSWAP4(ctr);
1557 # else
1558 PUTU32(ctx->Yi.c + 12, ctr);
1559 # endif
1560 else
1561 ctx->Yi.d[3] = ctr;
1562 in += i;
1563 len -= i;
1564 # if defined(GHASH)
1565 GHASH(ctx, out, i);
1566 out += i;
1567 # else
1568 while (j--) {
1569 for (i = 0; i < 16; ++i)
1570 ctx->Xi.c[i] ^= out[i];
1571 GCM_MUL(ctx);
1572 out += 16;
1573 }
1574 # endif
1575 }
1576 if (len) {
1577 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1578 ++ctr;
1579 if (IS_LITTLE_ENDIAN)
1580 # ifdef BSWAP4
1581 ctx->Yi.d[3] = BSWAP4(ctr);
1582 # else
1583 PUTU32(ctx->Yi.c + 12, ctr);
1584 # endif
1585 else
1586 ctx->Yi.d[3] = ctr;
1587 while (len--) {
1588 # if defined(GHASH)
1589 ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1590 # else
1591 ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1592 # endif
1593 ++n;
1594 }
1595 }
1596
1597 ctx->mres = mres;
1598 return 0;
1599 #endif
1600 }
1601
1602 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1603 const unsigned char *in, unsigned char *out,
1604 size_t len, ctr128_f stream)
1605 {
1606 #if defined(OPENSSL_SMALL_FOOTPRINT)
1607 return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1608 #else
1609 DECLARE_IS_ENDIAN;
1610 unsigned int n, ctr, mres;
1611 size_t i;
1612 u64 mlen = ctx->len.u[1];
1613 void *key = ctx->key;
1614 # ifdef GCM_FUNCREF_4BIT
1615 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1616 # ifdef GHASH
1617 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1618 const u8 *inp, size_t len) = ctx->ghash;
1619 # endif
1620 # endif
1621
1622 mlen += len;
1623 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1624 return -1;
1625 ctx->len.u[1] = mlen;
1626
1627 mres = ctx->mres;
1628
1629 if (ctx->ares) {
1630 /* First call to decrypt finalizes GHASH(AAD) */
1631 # if defined(GHASH)
1632 if (len == 0) {
1633 GCM_MUL(ctx);
1634 ctx->ares = 0;
1635 return 0;
1636 }
1637 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1638 ctx->Xi.u[0] = 0;
1639 ctx->Xi.u[1] = 0;
1640 mres = sizeof(ctx->Xi);
1641 # else
1642 GCM_MUL(ctx);
1643 # endif
1644 ctx->ares = 0;
1645 }
1646
1647 if (IS_LITTLE_ENDIAN)
1648 # ifdef BSWAP4
1649 ctr = BSWAP4(ctx->Yi.d[3]);
1650 # else
1651 ctr = GETU32(ctx->Yi.c + 12);
1652 # endif
1653 else
1654 ctr = ctx->Yi.d[3];
1655
1656 n = mres % 16;
1657 if (n) {
1658 # if defined(GHASH)
1659 while (n && len) {
1660 *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1661 --len;
1662 n = (n + 1) % 16;
1663 }
1664 if (n == 0) {
1665 GHASH(ctx, ctx->Xn, mres);
1666 mres = 0;
1667 } else {
1668 ctx->mres = mres;
1669 return 0;
1670 }
1671 # else
1672 while (n && len) {
1673 u8 c = *(in++);
1674 *(out++) = c ^ ctx->EKi.c[n];
1675 ctx->Xi.c[n] ^= c;
1676 --len;
1677 n = (n + 1) % 16;
1678 }
1679 if (n == 0) {
1680 GCM_MUL(ctx);
1681 mres = 0;
1682 } else {
1683 ctx->mres = n;
1684 return 0;
1685 }
1686 # endif
1687 }
1688 # if defined(GHASH)
1689 if (len >= 16 && mres) {
1690 GHASH(ctx, ctx->Xn, mres);
1691 mres = 0;
1692 }
1693 # if defined(GHASH_CHUNK)
1694 while (len >= GHASH_CHUNK) {
1695 GHASH(ctx, in, GHASH_CHUNK);
1696 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1697 ctr += GHASH_CHUNK / 16;
1698 if (IS_LITTLE_ENDIAN)
1699 # ifdef BSWAP4
1700 ctx->Yi.d[3] = BSWAP4(ctr);
1701 # else
1702 PUTU32(ctx->Yi.c + 12, ctr);
1703 # endif
1704 else
1705 ctx->Yi.d[3] = ctr;
1706 out += GHASH_CHUNK;
1707 in += GHASH_CHUNK;
1708 len -= GHASH_CHUNK;
1709 }
1710 # endif
1711 # endif
1712 if ((i = (len & (size_t)-16))) {
1713 size_t j = i / 16;
1714
1715 # if defined(GHASH)
1716 GHASH(ctx, in, i);
1717 # else
1718 while (j--) {
1719 size_t k;
1720 for (k = 0; k < 16; ++k)
1721 ctx->Xi.c[k] ^= in[k];
1722 GCM_MUL(ctx);
1723 in += 16;
1724 }
1725 j = i / 16;
1726 in -= i;
1727 # endif
1728 (*stream) (in, out, j, key, ctx->Yi.c);
1729 ctr += (unsigned int)j;
1730 if (IS_LITTLE_ENDIAN)
1731 # ifdef BSWAP4
1732 ctx->Yi.d[3] = BSWAP4(ctr);
1733 # else
1734 PUTU32(ctx->Yi.c + 12, ctr);
1735 # endif
1736 else
1737 ctx->Yi.d[3] = ctr;
1738 out += i;
1739 in += i;
1740 len -= i;
1741 }
1742 if (len) {
1743 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1744 ++ctr;
1745 if (IS_LITTLE_ENDIAN)
1746 # ifdef BSWAP4
1747 ctx->Yi.d[3] = BSWAP4(ctr);
1748 # else
1749 PUTU32(ctx->Yi.c + 12, ctr);
1750 # endif
1751 else
1752 ctx->Yi.d[3] = ctr;
1753 while (len--) {
1754 # if defined(GHASH)
1755 out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1756 # else
1757 u8 c = in[n];
1758 ctx->Xi.c[mres++] ^= c;
1759 out[n] = c ^ ctx->EKi.c[n];
1760 # endif
1761 ++n;
1762 }
1763 }
1764
1765 ctx->mres = mres;
1766 return 0;
1767 #endif
1768 }
1769
1770 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1771 size_t len)
1772 {
1773 DECLARE_IS_ENDIAN;
1774 u64 alen = ctx->len.u[0] << 3;
1775 u64 clen = ctx->len.u[1] << 3;
1776 #ifdef GCM_FUNCREF_4BIT
1777 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1778 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1779 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1780 const u8 *inp, size_t len) = ctx->ghash;
1781 # endif
1782 #endif
1783
1784 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1785 u128 bitlen;
1786 unsigned int mres = ctx->mres;
1787
1788 if (mres) {
1789 unsigned blocks = (mres + 15) & -16;
1790
1791 memset(ctx->Xn + mres, 0, blocks - mres);
1792 mres = blocks;
1793 if (mres == sizeof(ctx->Xn)) {
1794 GHASH(ctx, ctx->Xn, mres);
1795 mres = 0;
1796 }
1797 } else if (ctx->ares) {
1798 GCM_MUL(ctx);
1799 }
1800 #else
1801 if (ctx->mres || ctx->ares)
1802 GCM_MUL(ctx);
1803 #endif
1804
1805 if (IS_LITTLE_ENDIAN) {
1806 #ifdef BSWAP8
1807 alen = BSWAP8(alen);
1808 clen = BSWAP8(clen);
1809 #else
1810 u8 *p = ctx->len.c;
1811
1812 ctx->len.u[0] = alen;
1813 ctx->len.u[1] = clen;
1814
1815 alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1816 clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1817 #endif
1818 }
1819
1820 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1821 bitlen.hi = alen;
1822 bitlen.lo = clen;
1823 memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1824 mres += sizeof(bitlen);
1825 GHASH(ctx, ctx->Xn, mres);
1826 #else
1827 ctx->Xi.u[0] ^= alen;
1828 ctx->Xi.u[1] ^= clen;
1829 GCM_MUL(ctx);
1830 #endif
1831
1832 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1833 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1834
1835 if (tag && len <= sizeof(ctx->Xi))
1836 return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1837 else
1838 return -1;
1839 }
1840
1841 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1842 {
1843 CRYPTO_gcm128_finish(ctx, NULL, 0);
1844 memcpy(tag, ctx->Xi.c,
1845 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1846 }
1847
1848 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1849 {
1850 GCM128_CONTEXT *ret;
1851
1852 if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1853 CRYPTO_gcm128_init(ret, key, block);
1854
1855 return ret;
1856 }
1857
1858 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1859 {
1860 OPENSSL_clear_free(ctx, sizeof(*ctx));
1861 }
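
/*-
 * Usage sketch (added illustration only, kept under "#if 0" so nothing
 * here is compiled): a minimal call sequence assuming AES through the
 * legacy AES_* API from <openssl/aes.h> as the block128_f, showing how
 * the public entry points above are meant to be chained:
 * init -> setiv -> aad -> encrypt -> tag (or finish to verify).
 */
#if 0
# include <openssl/aes.h>

static void gcm128_usage_sketch(void)
{
    AES_KEY aes;
    GCM128_CONTEXT gcm;
    static const unsigned char key[16] = { 0 };          /* placeholder key */
    static const unsigned char iv[12] = { 0 };           /* 96-bit IV */
    static const unsigned char aad[4] = { 1, 2, 3, 4 };  /* example AAD */
    unsigned char pt[32] = { 0 }, ct[32], tag[16];

    AES_set_encrypt_key(key, 128, &aes);
    CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&gcm, iv, sizeof(iv));
    CRYPTO_gcm128_aad(&gcm, aad, sizeof(aad));
    CRYPTO_gcm128_encrypt(&gcm, pt, ct, sizeof(pt));
    CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
    /*
     * Decryption would call CRYPTO_gcm128_decrypt() instead and check the
     * tag with CRYPTO_gcm128_finish(), which returns 0 on match.
     */
}
#endif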
1862