/*
 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <openssl/crypto.h>
#include "modes_local.h"
#include <string.h>

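/*
 * size_t_aX is a word-sized type whose alignment requirement is relaxed
 * to 1 byte, so that the bulk-processing loops below can load and store
 * through word-sized pointers even when the buffers are not naturally
 * aligned; on strict-alignment targets plain size_t is kept and the
 * fast paths bail out to byte-wise processing for misaligned buffers.
 */
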
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
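
/*
 * REDUCE1BIT(V) performs one multiply-by-x step of GF(2^128) arithmetic
 * in GCM's reflected bit order: V is shifted right by one bit, and if
 * the bit shifted out was set, the reduction constant 0xE1 || 0^120
 * (encoding the polynomial x^128 + x^7 + x^2 + x + 1) is folded back
 * into the top. A bit-serial multiplication built on it would look
 * roughly like this (illustrative sketch only; gcm_gmult_1bit below is
 * the real thing):
 *
 *      Z = 0;
 *      for each bit b of X, most significant first:
 *          if (b) Z ^= V;      XOR in the current multiple of H
 *          REDUCE1BIT(V);      V = V * x (mod P)
 */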

/*-
 * The permitted values of TABLE_BITS are 8, 4 and 1, but 8 should never
 * be used in production; it is effectively reserved for testing. The
 * TABLE_BITS>1 variants are the lookup-table-driven implementations
 * referred to as "Shoup's" in the GCM specification, so OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations. Why?
 * In the non-"Shoup's" case the memory access pattern is segmented in
 * such a manner that it is trivial to see that cache-timing information
 * can reveal a fair portion of the intermediate hash value. Given that
 * the ciphertext is always available to an attacker, this would let him
 * attempt to deduce the secret parameter H and, if successful, tamper
 * with messages [which is nothing but trivial in CTR mode]. In the
 * "Shoup's" case it is not as trivial, but there is no reason to believe
 * that it is resistant to cache-timing attacks either. The catch with
 * the "8-bit" implementation is that it consumes 16 (sixteen) times more
 * memory, 4KB per individual key + 1KB shared. On the pro side it should
 * be about twice as fast as the "4-bit" version; for gcc-generated
 * x86[_64] code the "8-bit" version was observed to run ~75% faster,
 * closer to 100% for commercial compilers... Yet the "4-bit" procedure
 * is preferred, because it is believed to provide a better
 * security-performance balance and adequate all-round performance.
 * "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread, in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
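/*
 * For reference, with 16-byte u128 entries the per-key table sizes
 * implied above work out as follows: TABLE_BITS==8 needs 256 entries
 * (4KB) plus the shared rem_8bit table, TABLE_BITS==4 needs 16 entries
 * (256 bytes) plus the shared rem_4bit table, and TABLE_BITS==1 needs
 * no table at all, only H itself.
 */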
#if     TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif   TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}

# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};
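
/*
 * rem_4bit[n] is the reduction constant for a nibble n shifted out at
 * the low end of Z: it is the XOR, over every set bit b of n, of
 * 0xE100 >> (3 - b), e.g. rem_4bit[8] = 0xE100 and
 * rem_4bit[12] = 0xE100 ^ 0x7080 = 0x9180. PACK() places the constant
 * in the top 16 bits of a size_t so it can be XORed straight into Z.hi.
 */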

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
#  ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#  else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
#  endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

#  if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

#   if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
#   else
    /*
     * An extra 256+16 bytes per key plus 512 bytes of shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * rem_8bit even here, but the priority is to minimize the cache
     * footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows the procedure down by approximately
     * the same amount of time as it makes each loop iteration faster. In
     * other words single-block performance is approximately the same as
     * for the straightforward "4-bit" implementation, and from there on
     * it only gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#   endif

        if (is_endian.little) {
#   ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
#   else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
#   endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
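
/*
 * Functionally, gcm_ghash_4bit(Xi, Htable, inp, len) is equivalent to
 * the following per-block loop (illustrative sketch only):
 *
 *      while (len >= 16) {
 *          for (i = 0; i < 16; ++i)
 *              ((u8 *)Xi)[i] ^= inp[i];
 *          gcm_gmult_4bit(Xi, Htable);
 *          inp += 16, len -= 16;
 *      }
 *
 * i.e. it folds each 16-byte block into Xi and multiplies by H, merely
 * fusing the XOR into the table lookups to save a pass over the data.
 */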
#  endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif

# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose mission is to mitigate the
 * cache-thrashing effect; the idea is to hash data while it is still in
 * L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif

#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

# define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif (defined(__arm__) || defined(__arm) || defined(__aarch64__)) && defined(GHASH_ASM)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  if defined(__arch64__)
#   define GHASH_ASM_SPARC
#   define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
#  endif /* __arch64__ */
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }
#if     TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if   defined(GHASH_ASM_X86)  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif  defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}
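
/*
 * Typical use of this low-level API (a minimal sketch, assuming an AES
 * key schedule from <openssl/aes.h>; most callers should go through
 * EVP instead):
 *
 *      GCM128_CONTEXT gcm;
 *      AES_KEY aes;
 *
 *      AES_set_encrypt_key(key, 128, &aes);
 *      CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *      CRYPTO_gcm128_aad(&gcm, aad, aad_len);    (all AAD must precede
 *                                                 the message data)
 *      CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len);
 *      CRYPTO_gcm128_tag(&gcm, tag, 16);
 */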

void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);

        if (is_endian.little)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }

    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (is_endian.little)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}
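
/*
 * The above implements the pre-counter block derivation from NIST SP
 * 800-38D: for a 96-bit IV, Y0 = IV || 0^31 || 1; for any other length,
 * Y0 = GHASH(IV zero-padded to a block boundary || 0^64 || [len(IV)]_64).
 */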

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}
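
/*
 * The checks above reflect NIST SP 800-38D limits: the AAD length cap
 * of 2^61 bytes corresponds to the spec's 2^64-bit bound, and no AAD
 * may be supplied once message data has been processed (the -2 return).
 */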

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
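
/*
 * The ctr128_f callback processes whole blocks in counter mode, which is
 * what lets hardware-assisted CTR implementations be dropped in here. A
 * trivial stream function built from a block cipher would look roughly
 * like this (illustrative, unoptimized sketch only):
 *
 *      static void ctr32_from_block(const unsigned char *in,
 *                                   unsigned char *out, size_t blocks,
 *                                   const void *key,
 *                                   const unsigned char ivec[16],
 *                                   block128_f block)
 *      {
 *          unsigned char yi[16], eki[16];
 *          u32 ctr;
 *          size_t i;
 *
 *          memcpy(yi, ivec, 16);
 *          ctr = GETU32(yi + 12);          big-endian 32-bit counter
 *          while (blocks--) {
 *              (*block) (yi, eki, key);
 *              for (i = 0; i < 16; ++i)
 *                  out[i] = in[i] ^ eki[i];
 *              PUTU32(yi + 12, ++ctr);     only low 32 bits increment
 *              in += 16, out += 16;
 *          }
 *      }
 *
 * Note that the callers above update ctx->Yi themselves after each
 * stream call, so a stream function need not write the counter back.
 */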
1630 
1631 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1632                                 const unsigned char *in, unsigned char *out,
1633                                 size_t len, ctr128_f stream)
1634 {
1635 #if defined(OPENSSL_SMALL_FOOTPRINT)
1636     return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1637 #else
1638     const union {
1639         long one;
1640         char little;
1641     } is_endian = { 1 };
1642     unsigned int n, ctr, mres;
1643     size_t i;
1644     u64 mlen = ctx->len.u[1];
1645     void *key = ctx->key;
1646 # ifdef GCM_FUNCREF_4BIT
1647     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1648 #  ifdef GHASH
1649     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1650                          const u8 *inp, size_t len) = ctx->ghash;
1651 #  endif
1652 # endif
1653 
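    /*
     * NIST SP 800-38D caps GCM plaintext at 2^39-256 bits, i.e.
     * 2^36-32 bytes; the check below also rejects size_t overflow.
     */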
1654     mlen += len;
1655     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1656         return -1;
1657     ctx->len.u[1] = mlen;
1658 
1659     mres = ctx->mres;
1660 
1661     if (ctx->ares) {
1662         /* First call to decrypt finalizes GHASH(AAD) */
1663 # if defined(GHASH)
1664         if (len == 0) {
1665             GCM_MUL(ctx);
1666             ctx->ares = 0;
1667             return 0;
1668         }
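        /*
         * Seed the deferred-hash buffer with the current hash value:
         * hashing (Xi || data) from a zero state first computes Xi*H,
         * which is exactly GCM_MUL, so the multiplication by H gets
         * batched with the first blocks of ciphertext.
         */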
1669         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1670         ctx->Xi.u[0] = 0;
1671         ctx->Xi.u[1] = 0;
1672         mres = sizeof(ctx->Xi);
1673 # else
1674         GCM_MUL(ctx);
1675 # endif
1676         ctx->ares = 0;
1677     }
1678 
1679     if (is_endian.little)
1680 # ifdef BSWAP4
1681         ctr = BSWAP4(ctx->Yi.d[3]);
1682 # else
1683         ctr = GETU32(ctx->Yi.c + 12);
1684 # endif
1685     else
1686         ctr = ctx->Yi.d[3];
1687 
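    /* Drain the keystream left over from a previous partial block */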
1688     n = mres % 16;
1689     if (n) {
1690 # if defined(GHASH)
1691         while (n && len) {
1692             *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1693             --len;
1694             n = (n + 1) % 16;
1695         }
1696         if (n == 0) {
1697             GHASH(ctx, ctx->Xn, mres);
1698             mres = 0;
1699         } else {
1700             ctx->mres = mres;
1701             return 0;
1702         }
1703 # else
1704         while (n && len) {
1705             u8 c = *(in++);
1706             *(out++) = c ^ ctx->EKi.c[n];
1707             ctx->Xi.c[n] ^= c;
1708             --len;
1709             n = (n + 1) % 16;
1710         }
1711         if (n == 0) {
1712             GCM_MUL(ctx);
1713             mres = 0;
1714         } else {
1715             ctx->mres = n;
1716             return 0;
1717         }
1718 # endif
1719     }
1720 # if defined(GHASH)
1721     if (len >= 16 && mres) {
1722         GHASH(ctx, ctx->Xn, mres);
1723         mres = 0;
1724     }
1725 #  if defined(GHASH_CHUNK)
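    /*
     * Decryption authenticates the ciphertext, so GHASH must read
     * |in| before the CTR pass overwrites it when in == out.
     */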
1726     while (len >= GHASH_CHUNK) {
1727         GHASH(ctx, in, GHASH_CHUNK);
1728         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1729         ctr += GHASH_CHUNK / 16;
1730         if (is_endian.little)
1731 #   ifdef BSWAP4
1732             ctx->Yi.d[3] = BSWAP4(ctr);
1733 #   else
1734             PUTU32(ctx->Yi.c + 12, ctr);
1735 #   endif
1736         else
1737             ctx->Yi.d[3] = ctr;
1738         out += GHASH_CHUNK;
1739         in += GHASH_CHUNK;
1740         len -= GHASH_CHUNK;
1741     }
1742 #  endif
1743 # endif
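    /* Remaining whole blocks, hashed before the CTR pass as above */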
1744     if ((i = (len & (size_t)-16))) {
1745         size_t j = i / 16;
1746 
1747 # if defined(GHASH)
1748         GHASH(ctx, in, i);
1749 # else
1750         while (j--) {
1751             size_t k;
1752             for (k = 0; k < 16; ++k)
1753                 ctx->Xi.c[k] ^= in[k];
1754             GCM_MUL(ctx);
1755             in += 16;
1756         }
1757         j = i / 16;
1758         in -= i;
1759 # endif
1760         (*stream) (in, out, j, key, ctx->Yi.c);
1761         ctr += (unsigned int)j;
1762         if (is_endian.little)
1763 # ifdef BSWAP4
1764             ctx->Yi.d[3] = BSWAP4(ctr);
1765 # else
1766             PUTU32(ctx->Yi.c + 12, ctr);
1767 # endif
1768         else
1769             ctx->Yi.d[3] = ctr;
1770         out += i;
1771         in += i;
1772         len -= i;
1773     }
1774     if (len) {
1775         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1776         ++ctr;
1777         if (is_endian.little)
1778 # ifdef BSWAP4
1779             ctx->Yi.d[3] = BSWAP4(ctr);
1780 # else
1781             PUTU32(ctx->Yi.c + 12, ctr);
1782 # endif
1783         else
1784             ctx->Yi.d[3] = ctr;
1785         while (len--) {
1786 # if defined(GHASH)
1787             out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1788 # else
1789             u8 c = in[n];
1790             ctx->Xi.c[mres++] ^= c;
1791             out[n] = c ^ ctx->EKi.c[n];
1792 # endif
1793             ++n;
1794         }
1795     }
1796 
1797     ctx->mres = mres;
1798     return 0;
1799 #endif
1800 }
1801 
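/*-
 * CRYPTO_gcm128_finish completes the tag as specified for GCM:
 *
 *     S = GHASH_H(A || 0^v || C || 0^u || [len(A)]_64 || [len(C)]_64)
 *     T = E(K, Y_0) ^ S
 *
 * and, when |tag| is non-NULL, compares |len| bytes of T against it
 * in constant time, returning 0 on a match.
 */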
1802 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1803                          size_t len)
1804 {
1805     const union {
1806         long one;
1807         char little;
1808     } is_endian = { 1 };
1809     u64 alen = ctx->len.u[0] << 3;
1810     u64 clen = ctx->len.u[1] << 3;
1811 #ifdef GCM_FUNCREF_4BIT
1812     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1813 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1814     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1815                          const u8 *inp, size_t len) = ctx->ghash;
1816 # endif
1817 #endif
1818 
1819 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1820     u128 bitlen;
1821     unsigned int mres = ctx->mres;
1822 
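    /* Zero-pad buffered input to a whole block (the 0^u/0^v padding) */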
1823     if (mres) {
1824         unsigned blocks = (mres + 15) & -16;
1825 
1826         memset(ctx->Xn + mres, 0, blocks - mres);
1827         mres = blocks;
1828         if (mres == sizeof(ctx->Xn)) {
1829             GHASH(ctx, ctx->Xn, mres);
1830             mres = 0;
1831         }
1832     } else if (ctx->ares) {
1833         GCM_MUL(ctx);
1834     }
1835 #else
1836     if (ctx->mres || ctx->ares)
1837         GCM_MUL(ctx);
1838 #endif
1839 
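    /* The length block is big-endian; swap the bit counts on LE hosts */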
1840     if (is_endian.little) {
1841 #ifdef BSWAP8
1842         alen = BSWAP8(alen);
1843         clen = BSWAP8(clen);
1844 #else
1845         u8 *p = ctx->len.c;
1846 
1847         ctx->len.u[0] = alen;
1848         ctx->len.u[1] = clen;
1849 
1850         alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1851         clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1852 #endif
1853     }
1854 
1855 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1856     bitlen.hi = alen;
1857     bitlen.lo = clen;
1858     memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1859     mres += sizeof(bitlen);
1860     GHASH(ctx, ctx->Xn, mres);
1861 #else
1862     ctx->Xi.u[0] ^= alen;
1863     ctx->Xi.u[1] ^= clen;
1864     GCM_MUL(ctx);
1865 #endif
1866 
1867     ctx->Xi.u[0] ^= ctx->EK0.u[0];
1868     ctx->Xi.u[1] ^= ctx->EK0.u[1];
1869 
1870     if (tag && len <= sizeof(ctx->Xi))
1871         return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1872     else
1873         return -1;
1874 }
1875 
1876 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1877 {
1878     CRYPTO_gcm128_finish(ctx, NULL, 0);
1879     memcpy(tag, ctx->Xi.c,
1880            len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1881 }
1882 
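/*-
 * Typical one-shot usage, as a sketch only (error handling omitted;
 * AES via <openssl/aes.h> is merely an example block cipher):
 *
 *     AES_KEY aes;
 *     GCM128_CONTEXT gcm;
 *     unsigned char tag[16];
 *
 *     AES_set_encrypt_key(key, 128, &aes);
 *     CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *     CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *     CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *     CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len);
 *     CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
 *
 * Decryption mirrors the above but verifies rather than emits the
 * tag; CRYPTO_gcm128_finish() returns 0 only when it matches:
 *
 *     CRYPTO_gcm128_decrypt(&gcm, ct, pt, ct_len);
 *     if (CRYPTO_gcm128_finish(&gcm, tag, sizeof(tag)) != 0)
 *         ...reject, authentication failed...
 */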
1883 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1884 {
1885     GCM128_CONTEXT *ret;
1886 
1887     if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1888         CRYPTO_gcm128_init(ret, key, block);
1889 
1890     return ret;
1891 }
1892 
1893 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1894 {
1895     OPENSSL_clear_free(ctx, sizeof(*ctx));
1896 }
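/*-
 * Heap lifecycle sketch for the allocating variant; release zeroizes
 * the context before freeing, though the caller still owns the key
 * schedule it passed in:
 *
 *     GCM128_CONTEXT *ctx = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt);
 *     if (ctx != NULL) {
 *         ...setiv/aad/encrypt/tag as above...
 *         CRYPTO_gcm128_release(ctx);
 *     }
 */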
1897