xref: /netbsd-src/crypto/external/bsd/openssl/dist/crypto/modes/gcm128.c (revision 8fbed61efdd901c0e09614c9f45356aeeab23fe3)
1 /*
2  * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include <string.h>
11 #include <openssl/crypto.h>
12 #include "internal/cryptlib.h"
13 #include "internal/endian.h"
14 #include "crypto/modes.h"
15 
16 #if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
17 typedef size_t size_t_aX __attribute((__aligned__(1)));
18 #else
19 typedef size_t size_t_aX;
20 #endif
21 
22 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
23 /* redefine, because alignment is ensured */
24 # undef  GETU32
25 # define GETU32(p)       BSWAP4(*(const u32 *)(p))
26 # undef  PUTU32
27 # define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
28 #endif
29 
30 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
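/*
 * Note (added): REDUCE1BIT(V) multiplies the 128-bit field element V by x
 * in GF(2^128).  In GCM's bit-reflected representation this is a one-bit
 * right shift, with the bit that drops off folded back in via the
 * reduction constant 0xE1000000..00 (the polynomial x^128+x^7+x^2+x+1).
 */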
31 #define REDUCE1BIT(V)   do { \
32         if (sizeof(size_t)==8) { \
33                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
34                 V.lo  = (V.hi<<63)|(V.lo>>1); \
35                 V.hi  = (V.hi>>1 )^T; \
36         } \
37         else { \
38                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
39                 V.lo  = (V.hi<<63)|(V.lo>>1); \
40                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
41         } \
42 } while(0)
43 
44 /*-
45  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
46  * never be set to 8. 8 is effectively reserved for testing purposes.
47  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
48  * to as "Shoup's" in the GCM specification. In other words OpenSSL does
49  * not cover the whole spectrum of possible table-driven implementations.
50  * Why? In the non-"Shoup's" case the memory access pattern is segmented
51  * in such a manner that it's trivial to see that cache timing information
52  * can reveal a fair portion of the intermediate hash value. Given that
53  * the ciphertext is always available to an attacker, it is possible to
54  * attempt to deduce the secret parameter H and, if successful, tamper
55  * with messages [which is trivial in CTR mode]. In "Shoup's" case it's
56  * not as trivial, but there is no reason to believe that it's resistant
57  * to cache-timing attacks. The downside of the "8-bit" implementation is
58  * that it consumes 16 (sixteen) times more memory, 4KB per individual
59  * key + 1KB shared. On the plus side it should be twice as fast as the
60  * "4-bit" version. For gcc-generated x86[_64] code, the "8-bit" version
61  * was observed to run ~75% faster, closer to 100% for commercial
62  * compilers... Yet the "4-bit" procedure is preferred, because it's
63  * believed to provide better security-performance balance and adequate
64  * all-round performance. "All-round" refers to things like:
65  *
66  * - shorter setup time effectively improves overall timing for
67  *   handling short messages;
68  * - larger table allocation can become unbearable because of VM
69  *   subsystem penalties (for example on Windows a large enough free()
70  *   results in VM working-set trimming, meaning that a subsequent
71  *   malloc() would immediately incur working-set expansion);
72  * - larger table has larger cache footprint, which can affect
73  *   performance of other code paths (not necessarily even from same
74  *   thread in Hyper-Threading world);
75  *
76  * Value of 1 is not appropriate for performance reasons.
77  */
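/*-
 * Orientation note (added): conceptually, the "4-bit" routines below
 * (gcm_gmult_4bit/gcm_ghash_4bit) process Xi one nibble at a time,
 * roughly (a sketch, not the literal code):
 *
 *      Z = Htable[low nibble of Xi[15]];
 *      for each remaining nibble, working from byte 15 toward byte 0:
 *          rem = Z & 0xf;  Z >>= 4;        128-bit shift
 *          Z.hi ^= rem_4bit[rem];          fold the reduced bits back in
 *          Z    ^= Htable[nibble];         where Htable[i] = i*H
 *      Xi = Z (byte-swapped on little-endian hosts);
 */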
78 #if     TABLE_BITS==8
79 
80 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
81 {
82     int i, j;
83     u128 V;
84 
85     Htable[0].hi = 0;
86     Htable[0].lo = 0;
87     V.hi = H[0];
88     V.lo = H[1];
89 
90     for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
91         REDUCE1BIT(V);
92         Htable[i] = V;
93     }
94 
95     for (i = 2; i < 256; i <<= 1) {
96         u128 *Hi = Htable + i, H0 = *Hi;
97         for (j = 1; j < i; ++j) {
98             Hi[j].hi = H0.hi ^ Htable[j].hi;
99             Hi[j].lo = H0.lo ^ Htable[j].lo;
100         }
101     }
102 }
103 
104 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
105 {
106     u128 Z = { 0, 0 };
107     const u8 *xi = (const u8 *)Xi + 15;
108     size_t rem, n = *xi;
109     DECLARE_IS_ENDIAN;
110     static const size_t rem_8bit[256] = {
111         PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
112         PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
113         PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
114         PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
115         PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
116         PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
117         PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
118         PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
119         PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
120         PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
121         PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
122         PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
123         PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
124         PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
125         PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
126         PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
127         PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
128         PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
129         PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
130         PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
131         PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
132         PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
133         PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
134         PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
135         PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
136         PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
137         PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
138         PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
139         PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
140         PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
141         PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
142         PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
143         PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
144         PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
145         PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
146         PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
147         PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
148         PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
149         PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
150         PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
151         PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
152         PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
153         PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
154         PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
155         PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
156         PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
157         PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
158         PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
159         PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
160         PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
161         PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
162         PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
163         PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
164         PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
165         PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
166         PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
167         PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
168         PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
169         PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
170         PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
171         PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
172         PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
173         PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
174         PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
175     };
176 
177     while (1) {
178         Z.hi ^= Htable[n].hi;
179         Z.lo ^= Htable[n].lo;
180 
181         if ((u8 *)Xi == xi)
182             break;
183 
184         n = *(--xi);
185 
186         rem = (size_t)Z.lo & 0xff;
187         Z.lo = (Z.hi << 56) | (Z.lo >> 8);
188         Z.hi = (Z.hi >> 8);
189         if (sizeof(size_t) == 8)
190             Z.hi ^= rem_8bit[rem];
191         else
192             Z.hi ^= (u64)rem_8bit[rem] << 32;
193     }
194 
195     if (IS_LITTLE_ENDIAN) {
196 # ifdef BSWAP8
197         Xi[0] = BSWAP8(Z.hi);
198         Xi[1] = BSWAP8(Z.lo);
199 # else
200         u8 *p = (u8 *)Xi;
201         u32 v;
202         v = (u32)(Z.hi >> 32);
203         PUTU32(p, v);
204         v = (u32)(Z.hi);
205         PUTU32(p + 4, v);
206         v = (u32)(Z.lo >> 32);
207         PUTU32(p + 8, v);
208         v = (u32)(Z.lo);
209         PUTU32(p + 12, v);
210 # endif
211     } else {
212         Xi[0] = Z.hi;
213         Xi[1] = Z.lo;
214     }
215 }
216 
217 # define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
218 
219 #elif   TABLE_BITS==4
220 
221 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
222 {
223     u128 V;
224 # if defined(OPENSSL_SMALL_FOOTPRINT)
225     int i;
226 # endif
227 
228     Htable[0].hi = 0;
229     Htable[0].lo = 0;
230     V.hi = H[0];
231     V.lo = H[1];
232 
233 # if defined(OPENSSL_SMALL_FOOTPRINT)
234     for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
235         REDUCE1BIT(V);
236         Htable[i] = V;
237     }
238 
239     for (i = 2; i < 16; i <<= 1) {
240         u128 *Hi = Htable + i;
241         int j;
242         for (V = *Hi, j = 1; j < i; ++j) {
243             Hi[j].hi = V.hi ^ Htable[j].hi;
244             Hi[j].lo = V.lo ^ Htable[j].lo;
245         }
246     }
247 # else
248     Htable[8] = V;
249     REDUCE1BIT(V);
250     Htable[4] = V;
251     REDUCE1BIT(V);
252     Htable[2] = V;
253     REDUCE1BIT(V);
254     Htable[1] = V;
255     Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
256     V = Htable[4];
257     Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
258     Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
259     Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
260     V = Htable[8];
261     Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
262     Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
263     Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
264     Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
265     Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
266     Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
267     Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
268 # endif
269 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
270     /*
271      * ARM assembler expects specific dword order in Htable.
272      */
273     {
274         int j;
275         DECLARE_IS_ENDIAN;
276 
277         if (IS_LITTLE_ENDIAN)
278             for (j = 0; j < 16; ++j) {
279                 V = Htable[j];
280                 Htable[j].hi = V.lo;
281                 Htable[j].lo = V.hi;
282             } else
283             for (j = 0; j < 16; ++j) {
284                 V = Htable[j];
285                 Htable[j].hi = V.lo << 32 | V.lo >> 32;
286                 Htable[j].lo = V.hi << 32 | V.hi >> 32;
287             }
288     }
289 # endif
290 }
291 
292 # ifndef GHASH_ASM
293 static const size_t rem_4bit[16] = {
294     PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
295     PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
296     PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
297     PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
298 };
299 
300 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
301 {
302     u128 Z;
303     int cnt = 15;
304     size_t rem, nlo, nhi;
305     DECLARE_IS_ENDIAN;
306 
307     nlo = ((const u8 *)Xi)[15];
308     nhi = nlo >> 4;
309     nlo &= 0xf;
310 
311     Z.hi = Htable[nlo].hi;
312     Z.lo = Htable[nlo].lo;
313 
314     while (1) {
315         rem = (size_t)Z.lo & 0xf;
316         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
317         Z.hi = (Z.hi >> 4);
318         if (sizeof(size_t) == 8)
319             Z.hi ^= rem_4bit[rem];
320         else
321             Z.hi ^= (u64)rem_4bit[rem] << 32;
322 
323         Z.hi ^= Htable[nhi].hi;
324         Z.lo ^= Htable[nhi].lo;
325 
326         if (--cnt < 0)
327             break;
328 
329         nlo = ((const u8 *)Xi)[cnt];
330         nhi = nlo >> 4;
331         nlo &= 0xf;
332 
333         rem = (size_t)Z.lo & 0xf;
334         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
335         Z.hi = (Z.hi >> 4);
336         if (sizeof(size_t) == 8)
337             Z.hi ^= rem_4bit[rem];
338         else
339             Z.hi ^= (u64)rem_4bit[rem] << 32;
340 
341         Z.hi ^= Htable[nlo].hi;
342         Z.lo ^= Htable[nlo].lo;
343     }
344 
345     if (IS_LITTLE_ENDIAN) {
346 #  ifdef BSWAP8
347         Xi[0] = BSWAP8(Z.hi);
348         Xi[1] = BSWAP8(Z.lo);
349 #  else
350         u8 *p = (u8 *)Xi;
351         u32 v;
352         v = (u32)(Z.hi >> 32);
353         PUTU32(p, v);
354         v = (u32)(Z.hi);
355         PUTU32(p + 4, v);
356         v = (u32)(Z.lo >> 32);
357         PUTU32(p + 8, v);
358         v = (u32)(Z.lo);
359         PUTU32(p + 12, v);
360 #  endif
361     } else {
362         Xi[0] = Z.hi;
363         Xi[1] = Z.lo;
364     }
365 }
366 
367 #  if !defined(OPENSSL_SMALL_FOOTPRINT)
368 /*
369  * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
370  * details... Compiler-generated code doesn't seem to give any
371  * performance improvement, at least not on x86[_64]. It's here
372  * mostly as reference and a placeholder for possible future
373  * non-trivial optimization[s]...
374  */
375 static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
376                            const u8 *inp, size_t len)
377 {
378     u128 Z;
379     int cnt;
380     size_t rem, nlo, nhi;
381     DECLARE_IS_ENDIAN;
382 
383 #   if 1
384     do {
385         cnt = 15;
386         nlo = ((const u8 *)Xi)[15];
387         nlo ^= inp[15];
388         nhi = nlo >> 4;
389         nlo &= 0xf;
390 
391         Z.hi = Htable[nlo].hi;
392         Z.lo = Htable[nlo].lo;
393 
394         while (1) {
395             rem = (size_t)Z.lo & 0xf;
396             Z.lo = (Z.hi << 60) | (Z.lo >> 4);
397             Z.hi = (Z.hi >> 4);
398             if (sizeof(size_t) == 8)
399                 Z.hi ^= rem_4bit[rem];
400             else
401                 Z.hi ^= (u64)rem_4bit[rem] << 32;
402 
403             Z.hi ^= Htable[nhi].hi;
404             Z.lo ^= Htable[nhi].lo;
405 
406             if (--cnt < 0)
407                 break;
408 
409             nlo = ((const u8 *)Xi)[cnt];
410             nlo ^= inp[cnt];
411             nhi = nlo >> 4;
412             nlo &= 0xf;
413 
414             rem = (size_t)Z.lo & 0xf;
415             Z.lo = (Z.hi << 60) | (Z.lo >> 4);
416             Z.hi = (Z.hi >> 4);
417             if (sizeof(size_t) == 8)
418                 Z.hi ^= rem_4bit[rem];
419             else
420                 Z.hi ^= (u64)rem_4bit[rem] << 32;
421 
422             Z.hi ^= Htable[nlo].hi;
423             Z.lo ^= Htable[nlo].lo;
424         }
425 #   else
426     /*
427  * Extra 256+16 bytes per key plus 512 bytes of shared tables
428      * [should] give ~50% improvement... One could have PACK()-ed
429      * the rem_8bit even here, but the priority is to minimize
430      * cache footprint...
431      */
432     u128 Hshr4[16];             /* Htable shifted right by 4 bits */
433     u8 Hshl4[16];               /* Htable shifted left by 4 bits */
434     static const unsigned short rem_8bit[256] = {
435         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
436         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
437         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
438         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
439         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
440         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
441         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
442         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
443         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
444         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
445         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
446         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
447         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
448         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
449         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
450         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
451         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
452         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
453         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
454         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
455         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
456         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
457         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
458         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
459         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
460         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
461         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
462         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
463         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
464         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
465         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
466         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
467     };
468     /*
469  * This pre-processing phase slows down the procedure by approximately
470  * the same time as it saves in each loop iteration. In other words,
471  * single-block performance is approximately the same as with the
472  * straightforward "4-bit" implementation, and from there it only gets faster...
473      */
474     for (cnt = 0; cnt < 16; ++cnt) {
475         Z.hi = Htable[cnt].hi;
476         Z.lo = Htable[cnt].lo;
477         Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
478         Hshr4[cnt].hi = (Z.hi >> 4);
479         Hshl4[cnt] = (u8)(Z.lo << 4);
480     }
481 
482     do {
483         for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
484             nlo = ((const u8 *)Xi)[cnt];
485             nlo ^= inp[cnt];
486             nhi = nlo >> 4;
487             nlo &= 0xf;
488 
489             Z.hi ^= Htable[nlo].hi;
490             Z.lo ^= Htable[nlo].lo;
491 
492             rem = (size_t)Z.lo & 0xff;
493 
494             Z.lo = (Z.hi << 56) | (Z.lo >> 8);
495             Z.hi = (Z.hi >> 8);
496 
497             Z.hi ^= Hshr4[nhi].hi;
498             Z.lo ^= Hshr4[nhi].lo;
499             Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
500         }
501 
502         nlo = ((const u8 *)Xi)[0];
503         nlo ^= inp[0];
504         nhi = nlo >> 4;
505         nlo &= 0xf;
506 
507         Z.hi ^= Htable[nlo].hi;
508         Z.lo ^= Htable[nlo].lo;
509 
510         rem = (size_t)Z.lo & 0xf;
511 
512         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
513         Z.hi = (Z.hi >> 4);
514 
515         Z.hi ^= Htable[nhi].hi;
516         Z.lo ^= Htable[nhi].lo;
517         Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
518 #   endif
519 
520         if (IS_LITTLE_ENDIAN) {
521 #   ifdef BSWAP8
522             Xi[0] = BSWAP8(Z.hi);
523             Xi[1] = BSWAP8(Z.lo);
524 #   else
525             u8 *p = (u8 *)Xi;
526             u32 v;
527             v = (u32)(Z.hi >> 32);
528             PUTU32(p, v);
529             v = (u32)(Z.hi);
530             PUTU32(p + 4, v);
531             v = (u32)(Z.lo >> 32);
532             PUTU32(p + 8, v);
533             v = (u32)(Z.lo);
534             PUTU32(p + 12, v);
535 #   endif
536         } else {
537             Xi[0] = Z.hi;
538             Xi[1] = Z.lo;
539         }
540     } while (inp += 16, len -= 16);
541 }
542 #  endif
543 # else
544 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
545 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
546                     size_t len);
547 # endif
548 
549 # define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
550 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
551 #  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
552 /*
553  * GHASH_CHUNK is a "stride parameter" intended to mitigate cache-thrashing
554  * effects. In other words the idea is to hash data while it's still in the
555  * L1 cache after the encryption pass...
556  */
557 #  define GHASH_CHUNK       (3*1024)
558 # endif
559 
560 #else                           /* TABLE_BITS */
561 
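/*
 * Bit-serial fallback (TABLE_BITS==1): multiplies Xi by H one bit at a
 * time with no lookup table; see the note above on why this variant is
 * not recommended for performance.
 */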
562 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
563 {
564     u128 V, Z = { 0, 0 };
565     long X;
566     int i, j;
567     const long *xi = (const long *)Xi;
568     DECLARE_IS_ENDIAN;
569 
570     V.hi = H[0];                /* H is in host byte order, no byte swapping */
571     V.lo = H[1];
572 
573     for (j = 0; j < 16 / sizeof(long); ++j) {
574         if (IS_LITTLE_ENDIAN) {
575             if (sizeof(long) == 8) {
576 # ifdef BSWAP8
577                 X = (long)(BSWAP8(xi[j]));
578 # else
579                 const u8 *p = (const u8 *)(xi + j);
580                 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
581 # endif
582             } else {
583                 const u8 *p = (const u8 *)(xi + j);
584                 X = (long)GETU32(p);
585             }
586         } else
587             X = xi[j];
588 
589         for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
590             u64 M = (u64)(X >> (8 * sizeof(long) - 1));
591             Z.hi ^= V.hi & M;
592             Z.lo ^= V.lo & M;
593 
594             REDUCE1BIT(V);
595         }
596     }
597 
598     if (IS_LITTLE_ENDIAN) {
599 # ifdef BSWAP8
600         Xi[0] = BSWAP8(Z.hi);
601         Xi[1] = BSWAP8(Z.lo);
602 # else
603         u8 *p = (u8 *)Xi;
604         u32 v;
605         v = (u32)(Z.hi >> 32);
606         PUTU32(p, v);
607         v = (u32)(Z.hi);
608         PUTU32(p + 4, v);
609         v = (u32)(Z.lo >> 32);
610         PUTU32(p + 8, v);
611         v = (u32)(Z.lo);
612         PUTU32(p + 12, v);
613 # endif
614     } else {
615         Xi[0] = Z.hi;
616         Xi[1] = Z.lo;
617     }
618 }
619 
620 # define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
621 
622 #endif
623 
624 #if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
625 # if    !defined(I386_ONLY) && \
626         (defined(__i386)        || defined(__i386__)    || \
627          defined(__x86_64)      || defined(__x86_64__)  || \
628          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
629 #  define GHASH_ASM_X86_OR_64
630 #  define GCM_FUNCREF_4BIT
631 
632 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
633 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
634 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
635                      size_t len);
636 
637 #  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
638 #   define gcm_init_avx   gcm_init_clmul
639 #   define gcm_gmult_avx  gcm_gmult_clmul
640 #   define gcm_ghash_avx  gcm_ghash_clmul
641 #  else
642 void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
643 void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
644 void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
645                    size_t len);
646 #  endif
647 
648 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
649 #   define GHASH_ASM_X86
650 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
651 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
652                         size_t len);
653 
654 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
655 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
656                         size_t len);
657 #  endif
658 # elif (defined(__arm__) || defined(__arm) || defined(__aarch64__)) && defined(GHASH_ASM)
659 #  include "arm_arch.h"
660 #  if __ARM_MAX_ARCH__>=7
661 #   define GHASH_ASM_ARM
662 #   define GCM_FUNCREF_4BIT
663 #   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
664 #   if defined(__arm__) || defined(__arm)
665 #    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
666 #   endif
667 void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
668 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
669 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
670                     size_t len);
671 void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
672 void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
673 void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
674                   size_t len);
675 #  endif
676 # elif defined(__sparc__) || defined(__sparc)
677 #  include "crypto/sparc_arch.h"
678 #  if defined(__arch64__)
679 #   define GHASH_ASM_SPARC
680 #   define GCM_FUNCREF_4BIT
681 extern unsigned int OPENSSL_sparcv9cap_P[];
682 void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
683 void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
684 void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
685                     size_t len);
686 #  endif /* __arch64__ */
687 # elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
688 #  include "crypto/ppc_arch.h"
689 #  define GHASH_ASM_PPC
690 #  define GCM_FUNCREF_4BIT
691 void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
692 void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
693 void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
694                   size_t len);
695 # endif
696 #endif
697 
698 #ifdef GCM_FUNCREF_4BIT
699 # undef  GCM_MUL
700 # define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
701 # ifdef GHASH
702 #  undef  GHASH
703 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
704 # endif
705 #endif
706 
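/*
 * CRYPTO_gcm128_init computes the hash key H = E_K(0^128) with the
 * supplied block cipher, converts it to host byte order, precomputes the
 * GHASH table and selects an assembler implementation where available.
 */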
707 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
708 {
709     DECLARE_IS_ENDIAN;
710 
711     memset(ctx, 0, sizeof(*ctx));
712     ctx->block = block;
713     ctx->key = key;
714 
715     (*block) (ctx->H.c, ctx->H.c, key);
716 
717     if (IS_LITTLE_ENDIAN) {
718         /* H is stored in host byte order */
719 #ifdef BSWAP8
720         ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
721         ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
722 #else
723         u8 *p = ctx->H.c;
724         u64 hi, lo;
725         hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
726         lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
727         ctx->H.u[0] = hi;
728         ctx->H.u[1] = lo;
729 #endif
730     }
731 #if     TABLE_BITS==8
732     gcm_init_8bit(ctx->Htable, ctx->H.u);
733 #elif   TABLE_BITS==4
734 # if    defined(GHASH)
735 #  define CTX__GHASH(f) (ctx->ghash = (f))
736 # else
737 #  define CTX__GHASH(f) (ctx->ghash = NULL)
738 # endif
739 # if    defined(GHASH_ASM_X86_OR_64)
740 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
741     if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
742         if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
743             gcm_init_avx(ctx->Htable, ctx->H.u);
744             ctx->gmult = gcm_gmult_avx;
745             CTX__GHASH(gcm_ghash_avx);
746         } else {
747             gcm_init_clmul(ctx->Htable, ctx->H.u);
748             ctx->gmult = gcm_gmult_clmul;
749             CTX__GHASH(gcm_ghash_clmul);
750         }
751         return;
752     }
753 #  endif
754     gcm_init_4bit(ctx->Htable, ctx->H.u);
755 #  if   defined(GHASH_ASM_X86)  /* x86 only */
756 #   if  defined(OPENSSL_IA32_SSE2)
757     if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
758 #   else
759     if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
760 #   endif
761         ctx->gmult = gcm_gmult_4bit_mmx;
762         CTX__GHASH(gcm_ghash_4bit_mmx);
763     } else {
764         ctx->gmult = gcm_gmult_4bit_x86;
765         CTX__GHASH(gcm_ghash_4bit_x86);
766     }
767 #  else
768     ctx->gmult = gcm_gmult_4bit;
769     CTX__GHASH(gcm_ghash_4bit);
770 #  endif
771 # elif  defined(GHASH_ASM_ARM)
772 #  ifdef PMULL_CAPABLE
773     if (PMULL_CAPABLE) {
774         gcm_init_v8(ctx->Htable, ctx->H.u);
775         ctx->gmult = gcm_gmult_v8;
776         CTX__GHASH(gcm_ghash_v8);
777     } else
778 #  endif
779 #  ifdef NEON_CAPABLE
780     if (NEON_CAPABLE) {
781         gcm_init_neon(ctx->Htable, ctx->H.u);
782         ctx->gmult = gcm_gmult_neon;
783         CTX__GHASH(gcm_ghash_neon);
784     } else
785 #  endif
786     {
787         gcm_init_4bit(ctx->Htable, ctx->H.u);
788         ctx->gmult = gcm_gmult_4bit;
789         CTX__GHASH(gcm_ghash_4bit);
790     }
791 # elif  defined(GHASH_ASM_SPARC)
792     if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
793         gcm_init_vis3(ctx->Htable, ctx->H.u);
794         ctx->gmult = gcm_gmult_vis3;
795         CTX__GHASH(gcm_ghash_vis3);
796     } else {
797         gcm_init_4bit(ctx->Htable, ctx->H.u);
798         ctx->gmult = gcm_gmult_4bit;
799         CTX__GHASH(gcm_ghash_4bit);
800     }
801 # elif  defined(GHASH_ASM_PPC)
802     if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
803         gcm_init_p8(ctx->Htable, ctx->H.u);
804         ctx->gmult = gcm_gmult_p8;
805         CTX__GHASH(gcm_ghash_p8);
806     } else {
807         gcm_init_4bit(ctx->Htable, ctx->H.u);
808         ctx->gmult = gcm_gmult_4bit;
809         CTX__GHASH(gcm_ghash_4bit);
810     }
811 # else
812     gcm_init_4bit(ctx->Htable, ctx->H.u);
813 # endif
814 # undef CTX__GHASH
815 #endif
816 }
817 
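/*
 * CRYPTO_gcm128_setiv derives the pre-counter block Y0: a 96-bit IV is
 * used directly as IV||0^31||1, anything else is ghashed together with
 * its bit length.  It also precomputes EK0 = E_K(Y0), which later masks
 * the tag.
 */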
818 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
819                          size_t len)
820 {
821     DECLARE_IS_ENDIAN;
822     unsigned int ctr;
823 #ifdef GCM_FUNCREF_4BIT
824     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
825 #endif
826 
827     ctx->len.u[0] = 0;          /* AAD length */
828     ctx->len.u[1] = 0;          /* message length */
829     ctx->ares = 0;
830     ctx->mres = 0;
831 
832     if (len == 12) {
833         memcpy(ctx->Yi.c, iv, 12);
834         ctx->Yi.c[12] = 0;
835         ctx->Yi.c[13] = 0;
836         ctx->Yi.c[14] = 0;
837         ctx->Yi.c[15] = 1;
838         ctr = 1;
839     } else {
840         size_t i;
841         u64 len0 = len;
842 
843         /* Borrow ctx->Xi to calculate initial Yi */
844         ctx->Xi.u[0] = 0;
845         ctx->Xi.u[1] = 0;
846 
847         while (len >= 16) {
848             for (i = 0; i < 16; ++i)
849                 ctx->Xi.c[i] ^= iv[i];
850             GCM_MUL(ctx);
851             iv += 16;
852             len -= 16;
853         }
854         if (len) {
855             for (i = 0; i < len; ++i)
856                 ctx->Xi.c[i] ^= iv[i];
857             GCM_MUL(ctx);
858         }
859         len0 <<= 3;
860         if (IS_LITTLE_ENDIAN) {
861 #ifdef BSWAP8
862             ctx->Xi.u[1] ^= BSWAP8(len0);
863 #else
864             ctx->Xi.c[8] ^= (u8)(len0 >> 56);
865             ctx->Xi.c[9] ^= (u8)(len0 >> 48);
866             ctx->Xi.c[10] ^= (u8)(len0 >> 40);
867             ctx->Xi.c[11] ^= (u8)(len0 >> 32);
868             ctx->Xi.c[12] ^= (u8)(len0 >> 24);
869             ctx->Xi.c[13] ^= (u8)(len0 >> 16);
870             ctx->Xi.c[14] ^= (u8)(len0 >> 8);
871             ctx->Xi.c[15] ^= (u8)(len0);
872 #endif
873         } else {
874             ctx->Xi.u[1] ^= len0;
875         }
876 
877         GCM_MUL(ctx);
878 
879         if (IS_LITTLE_ENDIAN)
880 #ifdef BSWAP4
881             ctr = BSWAP4(ctx->Xi.d[3]);
882 #else
883             ctr = GETU32(ctx->Xi.c + 12);
884 #endif
885         else
886             ctr = ctx->Xi.d[3];
887 
888         /* Copy borrowed Xi to Yi */
889         ctx->Yi.u[0] = ctx->Xi.u[0];
890         ctx->Yi.u[1] = ctx->Xi.u[1];
891     }
892 
893     ctx->Xi.u[0] = 0;
894     ctx->Xi.u[1] = 0;
895 
896     (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
897     ++ctr;
898     if (IS_LITTLE_ENDIAN)
899 #ifdef BSWAP4
900         ctx->Yi.d[3] = BSWAP4(ctr);
901 #else
902         PUTU32(ctx->Yi.c + 12, ctr);
903 #endif
904     else
905         ctx->Yi.d[3] = ctr;
906 }
907 
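/*
 * CRYPTO_gcm128_aad folds additional authenticated data into the GHASH
 * accumulator Xi.  It must be called before any encrypt/decrypt call
 * (returns -2 otherwise) and returns -1 if the AAD length limit is
 * exceeded; ctx->ares carries a partial final block between calls.
 */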
908 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
909                       size_t len)
910 {
911     size_t i;
912     unsigned int n;
913     u64 alen = ctx->len.u[0];
914 #ifdef GCM_FUNCREF_4BIT
915     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
916 # ifdef GHASH
917     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
918                          const u8 *inp, size_t len) = ctx->ghash;
919 # endif
920 #endif
921 
922     if (ctx->len.u[1])
923         return -2;
924 
925     alen += len;
926     if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
927         return -1;
928     ctx->len.u[0] = alen;
929 
930     n = ctx->ares;
931     if (n) {
932         while (n && len) {
933             ctx->Xi.c[n] ^= *(aad++);
934             --len;
935             n = (n + 1) % 16;
936         }
937         if (n == 0)
938             GCM_MUL(ctx);
939         else {
940             ctx->ares = n;
941             return 0;
942         }
943     }
944 #ifdef GHASH
945     if ((i = (len & (size_t)-16))) {
946         GHASH(ctx, aad, i);
947         aad += i;
948         len -= i;
949     }
950 #else
951     while (len >= 16) {
952         for (i = 0; i < 16; ++i)
953             ctx->Xi.c[i] ^= aad[i];
954         GCM_MUL(ctx);
955         aad += 16;
956         len -= 16;
957     }
958 #endif
959     if (len) {
960         n = (unsigned int)len;
961         for (i = 0; i < len; ++i)
962             ctx->Xi.c[i] ^= aad[i];
963     }
964 
965     ctx->ares = n;
966     return 0;
967 }
968 
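/*
 * CRYPTO_gcm128_encrypt CTR-encrypts |len| bytes from |in| to |out| and
 * hashes the resulting ciphertext into Xi.  A partial final block is
 * carried in ctx->mres, so the function can be called incrementally.
 */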
969 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
970                           const unsigned char *in, unsigned char *out,
971                           size_t len)
972 {
973     DECLARE_IS_ENDIAN;
974     unsigned int n, ctr, mres;
975     size_t i;
976     u64 mlen = ctx->len.u[1];
977     block128_f block = ctx->block;
978     void *key = ctx->key;
979 #ifdef GCM_FUNCREF_4BIT
980     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
981 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
982     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
983                          const u8 *inp, size_t len) = ctx->ghash;
984 # endif
985 #endif
986 
987     mlen += len;
988     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
989         return -1;
990     ctx->len.u[1] = mlen;
991 
992     mres = ctx->mres;
993 
994     if (ctx->ares) {
995         /* First call to encrypt finalizes GHASH(AAD) */
996 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
997         if (len == 0) {
998             GCM_MUL(ctx);
999             ctx->ares = 0;
1000             return 0;
1001         }
1002         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1003         ctx->Xi.u[0] = 0;
1004         ctx->Xi.u[1] = 0;
1005         mres = sizeof(ctx->Xi);
1006 #else
1007         GCM_MUL(ctx);
1008 #endif
1009         ctx->ares = 0;
1010     }
1011 
1012     if (IS_LITTLE_ENDIAN)
1013 #ifdef BSWAP4
1014         ctr = BSWAP4(ctx->Yi.d[3]);
1015 #else
1016         ctr = GETU32(ctx->Yi.c + 12);
1017 #endif
1018     else
1019         ctr = ctx->Yi.d[3];
1020 
1021     n = mres % 16;
1022 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1023     if (16 % sizeof(size_t) == 0) { /* always true actually */
1024         do {
1025             if (n) {
1026 # if defined(GHASH)
1027                 while (n && len) {
1028                     ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1029                     --len;
1030                     n = (n + 1) % 16;
1031                 }
1032                 if (n == 0) {
1033                     GHASH(ctx, ctx->Xn, mres);
1034                     mres = 0;
1035                 } else {
1036                     ctx->mres = mres;
1037                     return 0;
1038                 }
1039 # else
1040                 while (n && len) {
1041                     ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1042                     --len;
1043                     n = (n + 1) % 16;
1044                 }
1045                 if (n == 0) {
1046                     GCM_MUL(ctx);
1047                     mres = 0;
1048                 } else {
1049                     ctx->mres = n;
1050                     return 0;
1051                 }
1052 # endif
1053             }
1054 # if defined(STRICT_ALIGNMENT)
1055             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1056                 break;
1057 # endif
1058 # if defined(GHASH)
1059             if (len >= 16 && mres) {
1060                 GHASH(ctx, ctx->Xn, mres);
1061                 mres = 0;
1062             }
1063 #  if defined(GHASH_CHUNK)
1064             while (len >= GHASH_CHUNK) {
1065                 size_t j = GHASH_CHUNK;
1066 
1067                 while (j) {
1068                     size_t_aX *out_t = (size_t_aX *)out;
1069                     const size_t_aX *in_t = (const size_t_aX *)in;
1070 
1071                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1072                     ++ctr;
1073                     if (IS_LITTLE_ENDIAN)
1074 #   ifdef BSWAP4
1075                         ctx->Yi.d[3] = BSWAP4(ctr);
1076 #   else
1077                         PUTU32(ctx->Yi.c + 12, ctr);
1078 #   endif
1079                     else
1080                         ctx->Yi.d[3] = ctr;
1081                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1082                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1083                     out += 16;
1084                     in += 16;
1085                     j -= 16;
1086                 }
1087                 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1088                 len -= GHASH_CHUNK;
1089             }
1090 #  endif
1091             if ((i = (len & (size_t)-16))) {
1092                 size_t j = i;
1093 
1094                 while (len >= 16) {
1095                     size_t_aX *out_t = (size_t_aX *)out;
1096                     const size_t_aX *in_t = (const size_t_aX *)in;
1097 
1098                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1099                     ++ctr;
1100                     if (IS_LITTLE_ENDIAN)
1101 #  ifdef BSWAP4
1102                         ctx->Yi.d[3] = BSWAP4(ctr);
1103 #  else
1104                         PUTU32(ctx->Yi.c + 12, ctr);
1105 #  endif
1106                     else
1107                         ctx->Yi.d[3] = ctr;
1108                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1109                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1110                     out += 16;
1111                     in += 16;
1112                     len -= 16;
1113                 }
1114                 GHASH(ctx, out - j, j);
1115             }
1116 # else
1117             while (len >= 16) {
1118                 size_t *out_t = (size_t *)out;
1119                 const size_t *in_t = (const size_t *)in;
1120 
1121                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1122                 ++ctr;
1123                 if (IS_LITTLE_ENDIAN)
1124 #  ifdef BSWAP4
1125                     ctx->Yi.d[3] = BSWAP4(ctr);
1126 #  else
1127                     PUTU32(ctx->Yi.c + 12, ctr);
1128 #  endif
1129                 else
1130                     ctx->Yi.d[3] = ctr;
1131                 for (i = 0; i < 16 / sizeof(size_t); ++i)
1132                     ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1133                 GCM_MUL(ctx);
1134                 out += 16;
1135                 in += 16;
1136                 len -= 16;
1137             }
1138 # endif
1139             if (len) {
1140                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1141                 ++ctr;
1142                 if (IS_LITTLE_ENDIAN)
1143 # ifdef BSWAP4
1144                     ctx->Yi.d[3] = BSWAP4(ctr);
1145 # else
1146                     PUTU32(ctx->Yi.c + 12, ctr);
1147 # endif
1148                 else
1149                     ctx->Yi.d[3] = ctr;
1150 # if defined(GHASH)
1151                 while (len--) {
1152                     ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1153                     ++n;
1154                 }
1155 # else
1156                 while (len--) {
1157                     ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1158                     ++n;
1159                 }
1160                 mres = n;
1161 # endif
1162             }
1163 
1164             ctx->mres = mres;
1165             return 0;
1166         } while (0);
1167     }
1168 #endif
1169     for (i = 0; i < len; ++i) {
1170         if (n == 0) {
1171             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1172             ++ctr;
1173             if (IS_LITTLE_ENDIAN)
1174 #ifdef BSWAP4
1175                 ctx->Yi.d[3] = BSWAP4(ctr);
1176 #else
1177                 PUTU32(ctx->Yi.c + 12, ctr);
1178 #endif
1179             else
1180                 ctx->Yi.d[3] = ctr;
1181         }
1182 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1183         ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
1184         n = (n + 1) % 16;
1185         if (mres == sizeof(ctx->Xn)) {
1186             GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
1187             mres = 0;
1188         }
1189 #else
1190         ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1191         mres = n = (n + 1) % 16;
1192         if (n == 0)
1193             GCM_MUL(ctx);
1194 #endif
1195     }
1196 
1197     ctx->mres = mres;
1198     return 0;
1199 }
1200 
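/*
 * CRYPTO_gcm128_decrypt is the mirror image of CRYPTO_gcm128_encrypt:
 * the GHASH input is the ciphertext (|in|), which is then CTR-decrypted
 * into |out|.
 */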
1201 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1202                           const unsigned char *in, unsigned char *out,
1203                           size_t len)
1204 {
1205     DECLARE_IS_ENDIAN;
1206     unsigned int n, ctr, mres;
1207     size_t i;
1208     u64 mlen = ctx->len.u[1];
1209     block128_f block = ctx->block;
1210     void *key = ctx->key;
1211 #ifdef GCM_FUNCREF_4BIT
1212     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1213 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1214     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1215                          const u8 *inp, size_t len) = ctx->ghash;
1216 # endif
1217 #endif
1218 
1219     mlen += len;
1220     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1221         return -1;
1222     ctx->len.u[1] = mlen;
1223 
1224     mres = ctx->mres;
1225 
1226     if (ctx->ares) {
1227         /* First call to decrypt finalizes GHASH(AAD) */
1228 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1229         if (len == 0) {
1230             GCM_MUL(ctx);
1231             ctx->ares = 0;
1232             return 0;
1233         }
1234         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1235         ctx->Xi.u[0] = 0;
1236         ctx->Xi.u[1] = 0;
1237         mres = sizeof(ctx->Xi);
1238 #else
1239         GCM_MUL(ctx);
1240 #endif
1241         ctx->ares = 0;
1242     }
1243 
1244     if (IS_LITTLE_ENDIAN)
1245 #ifdef BSWAP4
1246         ctr = BSWAP4(ctx->Yi.d[3]);
1247 #else
1248         ctr = GETU32(ctx->Yi.c + 12);
1249 #endif
1250     else
1251         ctr = ctx->Yi.d[3];
1252 
1253     n = mres % 16;
1254 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1255     if (16 % sizeof(size_t) == 0) { /* always true actually */
1256         do {
1257             if (n) {
1258 # if defined(GHASH)
1259                 while (n && len) {
1260                     *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1261                     --len;
1262                     n = (n + 1) % 16;
1263                 }
1264                 if (n == 0) {
1265                     GHASH(ctx, ctx->Xn, mres);
1266                     mres = 0;
1267                 } else {
1268                     ctx->mres = mres;
1269                     return 0;
1270                 }
1271 # else
1272                 while (n && len) {
1273                     u8 c = *(in++);
1274                     *(out++) = c ^ ctx->EKi.c[n];
1275                     ctx->Xi.c[n] ^= c;
1276                     --len;
1277                     n = (n + 1) % 16;
1278                 }
1279                 if (n == 0) {
1280                     GCM_MUL(ctx);
1281                     mres = 0;
1282                 } else {
1283                     ctx->mres = n;
1284                     return 0;
1285                 }
1286 # endif
1287             }
1288 # if defined(STRICT_ALIGNMENT)
1289             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1290                 break;
1291 # endif
1292 # if defined(GHASH)
1293             if (len >= 16 && mres) {
1294                 GHASH(ctx, ctx->Xn, mres);
1295                 mres = 0;
1296             }
1297 #  if defined(GHASH_CHUNK)
1298             while (len >= GHASH_CHUNK) {
1299                 size_t j = GHASH_CHUNK;
1300 
1301                 GHASH(ctx, in, GHASH_CHUNK);
1302                 while (j) {
1303                     size_t_aX *out_t = (size_t_aX *)out;
1304                     const size_t_aX *in_t = (const size_t_aX *)in;
1305 
1306                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1307                     ++ctr;
1308                     if (IS_LITTLE_ENDIAN)
1309 #   ifdef BSWAP4
1310                         ctx->Yi.d[3] = BSWAP4(ctr);
1311 #   else
1312                         PUTU32(ctx->Yi.c + 12, ctr);
1313 #   endif
1314                     else
1315                         ctx->Yi.d[3] = ctr;
1316                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1317                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1318                     out += 16;
1319                     in += 16;
1320                     j -= 16;
1321                 }
1322                 len -= GHASH_CHUNK;
1323             }
1324 #  endif
1325             if ((i = (len & (size_t)-16))) {
1326                 GHASH(ctx, in, i);
1327                 while (len >= 16) {
1328                     size_t_aX *out_t = (size_t_aX *)out;
1329                     const size_t_aX *in_t = (const size_t_aX *)in;
1330 
1331                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1332                     ++ctr;
1333                     if (IS_LITTLE_ENDIAN)
1334 #  ifdef BSWAP4
1335                         ctx->Yi.d[3] = BSWAP4(ctr);
1336 #  else
1337                         PUTU32(ctx->Yi.c + 12, ctr);
1338 #  endif
1339                     else
1340                         ctx->Yi.d[3] = ctr;
1341                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1342                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1343                     out += 16;
1344                     in += 16;
1345                     len -= 16;
1346                 }
1347             }
1348 # else
1349             while (len >= 16) {
1350                 size_t *out_t = (size_t *)out;
1351                 const size_t *in_t = (const size_t *)in;
1352 
1353                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1354                 ++ctr;
1355                 if (IS_LITTLE_ENDIAN)
1356 #  ifdef BSWAP4
1357                     ctx->Yi.d[3] = BSWAP4(ctr);
1358 #  else
1359                     PUTU32(ctx->Yi.c + 12, ctr);
1360 #  endif
1361                 else
1362                     ctx->Yi.d[3] = ctr;
1363                 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1364                     size_t c = in_t[i];
1365                     out_t[i] = c ^ ctx->EKi.t[i];
1366                     ctx->Xi.t[i] ^= c;
1367                 }
1368                 GCM_MUL(ctx);
1369                 out += 16;
1370                 in += 16;
1371                 len -= 16;
1372             }
1373 # endif
1374             if (len) {
1375                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1376                 ++ctr;
1377                 if (IS_LITTLE_ENDIAN)
1378 # ifdef BSWAP4
1379                     ctx->Yi.d[3] = BSWAP4(ctr);
1380 # else
1381                     PUTU32(ctx->Yi.c + 12, ctr);
1382 # endif
1383                 else
1384                     ctx->Yi.d[3] = ctr;
1385 # if defined(GHASH)
1386                 while (len--) {
1387                     out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1388                     ++n;
1389                 }
1390 # else
1391                 while (len--) {
1392                     u8 c = in[n];
1393                     ctx->Xi.c[n] ^= c;
1394                     out[n] = c ^ ctx->EKi.c[n];
1395                     ++n;
1396                 }
1397                 mres = n;
1398 # endif
1399             }
1400 
1401             ctx->mres = mres;
1402             return 0;
1403         } while (0);
1404     }
1405 #endif
1406     for (i = 0; i < len; ++i) {
1407         u8 c;
1408         if (n == 0) {
1409             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1410             ++ctr;
1411             if (IS_LITTLE_ENDIAN)
1412 #ifdef BSWAP4
1413                 ctx->Yi.d[3] = BSWAP4(ctr);
1414 #else
1415                 PUTU32(ctx->Yi.c + 12, ctr);
1416 #endif
1417             else
1418                 ctx->Yi.d[3] = ctr;
1419         }
1420 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1421         out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
1422         n = (n + 1) % 16;
1423         if (mres == sizeof(ctx->Xn)) {
1424             GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
1425             mres = 0;
1426         }
1427 #else
1428         c = in[i];
1429         out[i] = c ^ ctx->EKi.c[n];
1430         ctx->Xi.c[n] ^= c;
1431         mres = n = (n + 1) % 16;
1432         if (n == 0)
1433             GCM_MUL(ctx);
1434 #endif
1435     }
1436 
1437     ctx->mres = mres;
1438     return 0;
1439 }
1440 
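/*
 * CRYPTO_gcm128_encrypt_ctr32 is the bulk variant: |stream| is a
 * ctr128_f routine (typically hardware-accelerated) that encrypts whole
 * blocks with a 32-bit big-endian counter, so only leftover bytes go
 * through ctx->block.
 */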
1441 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1442                                 const unsigned char *in, unsigned char *out,
1443                                 size_t len, ctr128_f stream)
1444 {
1445 #if defined(OPENSSL_SMALL_FOOTPRINT)
1446     return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1447 #else
1448     DECLARE_IS_ENDIAN;
1449     unsigned int n, ctr, mres;
1450     size_t i;
1451     u64 mlen = ctx->len.u[1];
1452     void *key = ctx->key;
1453 # ifdef GCM_FUNCREF_4BIT
1454     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1455 #  ifdef GHASH
1456     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1457                          const u8 *inp, size_t len) = ctx->ghash;
1458 #  endif
1459 # endif
1460 
1461     mlen += len;
1462     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1463         return -1;
1464     ctx->len.u[1] = mlen;
1465 
1466     mres = ctx->mres;
1467 
1468     if (ctx->ares) {
1469         /* First call to encrypt finalizes GHASH(AAD) */
1470 #if defined(GHASH)
1471         if (len == 0) {
1472             GCM_MUL(ctx);
1473             ctx->ares = 0;
1474             return 0;
1475         }
1476         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1477         ctx->Xi.u[0] = 0;
1478         ctx->Xi.u[1] = 0;
1479         mres = sizeof(ctx->Xi);
1480 #else
1481         GCM_MUL(ctx);
1482 #endif
1483         ctx->ares = 0;
1484     }
1485 
1486     if (IS_LITTLE_ENDIAN)
1487 # ifdef BSWAP4
1488         ctr = BSWAP4(ctx->Yi.d[3]);
1489 # else
1490         ctr = GETU32(ctx->Yi.c + 12);
1491 # endif
1492     else
1493         ctr = ctx->Yi.d[3];
1494 
1495     n = mres % 16;
1496     if (n) {
1497 # if defined(GHASH)
1498         while (n && len) {
1499             ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1500             --len;
1501             n = (n + 1) % 16;
1502         }
1503         if (n == 0) {
1504             GHASH(ctx, ctx->Xn, mres);
1505             mres = 0;
1506         } else {
1507             ctx->mres = mres;
1508             return 0;
1509         }
1510 # else
1511         while (n && len) {
1512             ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1513             --len;
1514             n = (n + 1) % 16;
1515         }
1516         if (n == 0) {
1517             GCM_MUL(ctx);
1518             mres = 0;
1519         } else {
1520             ctx->mres = n;
1521             return 0;
1522         }
1523 # endif
1524     }
1525 # if defined(GHASH)
1526     if (len >= 16 && mres) {
1527         GHASH(ctx, ctx->Xn, mres);
1528         mres = 0;
1529     }
1530 #  if defined(GHASH_CHUNK)
1531     while (len >= GHASH_CHUNK) {
1532         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1533         ctr += GHASH_CHUNK / 16;
1534         if (IS_LITTLE_ENDIAN)
1535 #   ifdef BSWAP4
1536             ctx->Yi.d[3] = BSWAP4(ctr);
1537 #   else
1538             PUTU32(ctx->Yi.c + 12, ctr);
1539 #   endif
1540         else
1541             ctx->Yi.d[3] = ctr;
1542         GHASH(ctx, out, GHASH_CHUNK);
1543         out += GHASH_CHUNK;
1544         in += GHASH_CHUNK;
1545         len -= GHASH_CHUNK;
1546     }
1547 #  endif
1548 # endif
1549     if ((i = (len & (size_t)-16))) {
1550         size_t j = i / 16;
1551 
1552         (*stream) (in, out, j, key, ctx->Yi.c);
1553         ctr += (unsigned int)j;
1554         if (IS_LITTLE_ENDIAN)
1555 # ifdef BSWAP4
1556             ctx->Yi.d[3] = BSWAP4(ctr);
1557 # else
1558             PUTU32(ctx->Yi.c + 12, ctr);
1559 # endif
1560         else
1561             ctx->Yi.d[3] = ctr;
1562         in += i;
1563         len -= i;
1564 # if defined(GHASH)
1565         GHASH(ctx, out, i);
1566         out += i;
1567 # else
1568         while (j--) {
1569             for (i = 0; i < 16; ++i)
1570                 ctx->Xi.c[i] ^= out[i];
1571             GCM_MUL(ctx);
1572             out += 16;
1573         }
1574 # endif
1575     }
1576     if (len) {
1577         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1578         ++ctr;
1579         if (IS_LITTLE_ENDIAN)
1580 # ifdef BSWAP4
1581             ctx->Yi.d[3] = BSWAP4(ctr);
1582 # else
1583             PUTU32(ctx->Yi.c + 12, ctr);
1584 # endif
1585         else
1586             ctx->Yi.d[3] = ctr;
1587         while (len--) {
1588 # if defined(GHASH)
1589             ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1590 # else
1591             ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1592 # endif
1593             ++n;
1594         }
1595     }
1596 
1597     ctx->mres = mres;
1598     return 0;
1599 #endif
1600 }
1601 
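/*
 * CRYPTO_gcm128_decrypt_ctr32 mirrors CRYPTO_gcm128_encrypt_ctr32 for
 * decryption: the ciphertext is ghashed first, then decrypted through
 * the ctr128_f routine.
 */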
1602 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1603                                 const unsigned char *in, unsigned char *out,
1604                                 size_t len, ctr128_f stream)
1605 {
1606 #if defined(OPENSSL_SMALL_FOOTPRINT)
1607     return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1608 #else
1609     DECLARE_IS_ENDIAN;
1610     unsigned int n, ctr, mres;
1611     size_t i;
1612     u64 mlen = ctx->len.u[1];
1613     void *key = ctx->key;
1614 # ifdef GCM_FUNCREF_4BIT
1615     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1616 #  ifdef GHASH
1617     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1618                          const u8 *inp, size_t len) = ctx->ghash;
1619 #  endif
1620 # endif
1621 
1622     mlen += len;
1623     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1624         return -1;
1625     ctx->len.u[1] = mlen;
1626 
1627     mres = ctx->mres;
1628 
1629     if (ctx->ares) {
1630         /* First call to decrypt finalizes GHASH(AAD) */
1631 # if defined(GHASH)
1632         if (len == 0) {
1633             GCM_MUL(ctx);
1634             ctx->ares = 0;
1635             return 0;
1636         }
1637         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1638         ctx->Xi.u[0] = 0;
1639         ctx->Xi.u[1] = 0;
1640         mres = sizeof(ctx->Xi);
1641 # else
1642         GCM_MUL(ctx);
1643 # endif
1644         ctx->ares = 0;
1645     }
1646 
1647     if (IS_LITTLE_ENDIAN)
1648 # ifdef BSWAP4
1649         ctr = BSWAP4(ctx->Yi.d[3]);
1650 # else
1651         ctr = GETU32(ctx->Yi.c + 12);
1652 # endif
1653     else
1654         ctr = ctx->Yi.d[3];
1655 
1656     n = mres % 16;
1657     if (n) {
1658 # if defined(GHASH)
1659         while (n && len) {
1660             *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1661             --len;
1662             n = (n + 1) % 16;
1663         }
1664         if (n == 0) {
1665             GHASH(ctx, ctx->Xn, mres);
1666             mres = 0;
1667         } else {
1668             ctx->mres = mres;
1669             return 0;
1670         }
1671 # else
1672         while (n && len) {
1673             u8 c = *(in++);
1674             *(out++) = c ^ ctx->EKi.c[n];
1675             ctx->Xi.c[n] ^= c;
1676             --len;
1677             n = (n + 1) % 16;
1678         }
1679         if (n == 0) {
1680             GCM_MUL(ctx);
1681             mres = 0;
1682         } else {
1683             ctx->mres = n;
1684             return 0;
1685         }
1686 # endif
1687     }
1688 # if defined(GHASH)
1689     if (len >= 16 && mres) {
1690         GHASH(ctx, ctx->Xn, mres);
1691         mres = 0;
1692     }
1693 #  if defined(GHASH_CHUNK)
1694     while (len >= GHASH_CHUNK) {
1695         GHASH(ctx, in, GHASH_CHUNK);
1696         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1697         ctr += GHASH_CHUNK / 16;
1698         if (IS_LITTLE_ENDIAN)
1699 #   ifdef BSWAP4
1700             ctx->Yi.d[3] = BSWAP4(ctr);
1701 #   else
1702             PUTU32(ctx->Yi.c + 12, ctr);
1703 #   endif
1704         else
1705             ctx->Yi.d[3] = ctr;
1706         out += GHASH_CHUNK;
1707         in += GHASH_CHUNK;
1708         len -= GHASH_CHUNK;
1709     }
1710 #  endif
1711 # endif
1712     if ((i = (len & (size_t)-16))) {
1713         size_t j = i / 16;
1714 
1715 # if defined(GHASH)
1716         GHASH(ctx, in, i);
1717 # else
1718         while (j--) {
1719             size_t k;
1720             for (k = 0; k < 16; ++k)
1721                 ctx->Xi.c[k] ^= in[k];
1722             GCM_MUL(ctx);
1723             in += 16;
1724         }
1725         j = i / 16;
1726         in -= i;
1727 # endif
1728         (*stream) (in, out, j, key, ctx->Yi.c);
1729         ctr += (unsigned int)j;
1730         if (IS_LITTLE_ENDIAN)
1731 # ifdef BSWAP4
1732             ctx->Yi.d[3] = BSWAP4(ctr);
1733 # else
1734             PUTU32(ctx->Yi.c + 12, ctr);
1735 # endif
1736         else
1737             ctx->Yi.d[3] = ctr;
1738         out += i;
1739         in += i;
1740         len -= i;
1741     }
1742     if (len) {
1743         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1744         ++ctr;
1745         if (IS_LITTLE_ENDIAN)
1746 # ifdef BSWAP4
1747             ctx->Yi.d[3] = BSWAP4(ctr);
1748 # else
1749             PUTU32(ctx->Yi.c + 12, ctr);
1750 # endif
1751         else
1752             ctx->Yi.d[3] = ctr;
1753         while (len--) {
1754 # if defined(GHASH)
1755             out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1756 # else
1757             u8 c = in[n];
1758             ctx->Xi.c[mres++] ^= c;
1759             out[n] = c ^ ctx->EKi.c[n];
1760 # endif
1761             ++n;
1762         }
1763     }
1764 
1765     ctx->mres = mres;
1766     return 0;
1767 #endif
1768 }
1769 
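/*
 * CRYPTO_gcm128_finish flushes any buffered GHASH input, folds in the
 * bit lengths of the AAD and the ciphertext, and masks the result with
 * EK0 to form the tag.  If |tag| is non-NULL (and len <= 16) it is
 * compared in constant time; a return value of 0 means the tag matches.
 */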
1770 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1771                          size_t len)
1772 {
1773     DECLARE_IS_ENDIAN;
1774     u64 alen = ctx->len.u[0] << 3;
1775     u64 clen = ctx->len.u[1] << 3;
1776 #ifdef GCM_FUNCREF_4BIT
1777     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1778 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1779     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1780                          const u8 *inp, size_t len) = ctx->ghash;
1781 # endif
1782 #endif
1783 
1784 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1785     u128 bitlen;
1786     unsigned int mres = ctx->mres;
1787 
1788     if (mres) {
1789         unsigned blocks = (mres + 15) & -16;
1790 
1791         memset(ctx->Xn + mres, 0, blocks - mres);
1792         mres = blocks;
1793         if (mres == sizeof(ctx->Xn)) {
1794             GHASH(ctx, ctx->Xn, mres);
1795             mres = 0;
1796         }
1797     } else if (ctx->ares) {
1798         GCM_MUL(ctx);
1799     }
1800 #else
1801     if (ctx->mres || ctx->ares)
1802         GCM_MUL(ctx);
1803 #endif
1804 
1805     if (IS_LITTLE_ENDIAN) {
1806 #ifdef BSWAP8
1807         alen = BSWAP8(alen);
1808         clen = BSWAP8(clen);
1809 #else
1810         u8 *p = ctx->len.c;
1811 
1812         ctx->len.u[0] = alen;
1813         ctx->len.u[1] = clen;
1814 
1815         alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1816         clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1817 #endif
1818     }
1819 
1820 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1821     bitlen.hi = alen;
1822     bitlen.lo = clen;
1823     memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1824     mres += sizeof(bitlen);
1825     GHASH(ctx, ctx->Xn, mres);
1826 #else
1827     ctx->Xi.u[0] ^= alen;
1828     ctx->Xi.u[1] ^= clen;
1829     GCM_MUL(ctx);
1830 #endif
1831 
1832     ctx->Xi.u[0] ^= ctx->EK0.u[0];
1833     ctx->Xi.u[1] ^= ctx->EK0.u[1];
1834 
1835     if (tag && len <= sizeof(ctx->Xi))
1836         return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1837     else
1838         return -1;
1839 }
1840 
1841 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1842 {
1843     CRYPTO_gcm128_finish(ctx, NULL, 0);
1844     memcpy(tag, ctx->Xi.c,
1845            len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1846 }
1847 
1848 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1849 {
1850     GCM128_CONTEXT *ret;
1851 
1852     if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1853         CRYPTO_gcm128_init(ret, key, block);
1854 
1855     return ret;
1856 }
1857 
1858 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1859 {
1860     OPENSSL_clear_free(ctx, sizeof(*ctx));
1861 }
1862
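/*-
 * A minimal usage sketch of this API (added; illustrative only, not
 * compiled here).  It assumes <openssl/aes.h> for the block cipher;
 * key, iv, aad, plaintext, ciphertext and the lengths are the caller's:
 *
 *      AES_KEY ks;
 *      GCM128_CONTEXT gcm;
 *      unsigned char tag[16];
 *
 *      AES_set_encrypt_key(key, 128, &ks);
 *      CRYPTO_gcm128_init(&gcm, &ks, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *      CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *      CRYPTO_gcm128_encrypt(&gcm, plaintext, ciphertext, len);
 *      CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
 *
 * Decryption mirrors this with CRYPTO_gcm128_decrypt() followed by a
 * CRYPTO_gcm128_finish(&gcm, expected_tag, 16) == 0 check of the tag.
 */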