1 /* $OpenBSD: gcm128.c,v 1.25 2023/07/08 14:56:54 beck Exp $ */
2 /* ====================================================================
3  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * 3. All advertising materials mentioning features or use of this
18  *    software must display the following acknowledgment:
19  *    "This product includes software developed by the OpenSSL Project
20  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21  *
22  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23  *    endorse or promote products derived from this software without
24  *    prior written permission. For written permission, please contact
25  *    openssl-core@openssl.org.
26  *
27  * 5. Products derived from this software may not be called "OpenSSL"
28  *    nor may "OpenSSL" appear in their names without prior written
29  *    permission of the OpenSSL Project.
30  *
31  * 6. Redistributions of any form whatsoever must retain the following
32  *    acknowledgment:
33  *    "This product includes software developed by the OpenSSL Project
34  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35  *
36  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47  * OF THE POSSIBILITY OF SUCH DAMAGE.
48  * ====================================================================
49  */
50 
51 #define OPENSSL_FIPSAPI
52 
53 #include <openssl/crypto.h>
54 #include "modes_local.h"
55 #include <string.h>
56 
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 #  define NDEBUG
60 # endif
61 #endif
62 
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef	GETU32
66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67 #undef	PUTU32
68 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
69 #endif
70 
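/*
 * Descriptive note (added; derived from the code below): PACK() places a
 * 16-bit reduction constant in the top 16 bits of a size_t, so that the
 * rem_4bit/rem_8bit table entries land in the top 16 bits of Z.hi on both
 * 32- and 64-bit builds.  REDUCE1BIT() multiplies the field element V by x
 * in GF(2^128): in GHASH's bit-reflected representation this is a one-bit
 * right shift of the 128-bit value, with the bit that is shifted out folded
 * back in as 0xe1 in the top byte (the reflected form of the reduction
 * polynomial x^128 + x^7 + x^2 + x + 1).
 */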
71 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)							\
73 	do {								\
74 		if (sizeof(size_t)==8) {				\
75 			u64 T = U64(0xe100000000000000) & (0-(V.lo&1));	\
76 			V.lo  = (V.hi<<63)|(V.lo>>1);			\
77 			V.hi  = (V.hi>>1 )^T;				\
78 		} else {						\
79 			u32 T = 0xe1000000U & (0-(u32)(V.lo&1));	\
80 			V.lo  = (V.hi<<63)|(V.lo>>1);			\
81 			V.hi  = (V.hi>>1 )^((u64)T<<32);		\
82 		}							\
83 	} while(0)
84 
85 /*
86  * The permitted values for TABLE_BITS are 8, 4 and 1, but it should
87  * never be set to 8: that value is effectively reserved for testing.
88  * TABLE_BITS>1 selects the lookup-table-driven implementations that
89  * the GCM specification refers to as "Shoup's"; OpenSSL does not
90  * cover the whole spectrum of possible table-driven implementations.
91  * Why? In the non-"Shoup's" case the memory access pattern is
92  * segmented in such a way that cache-timing information can reveal a
93  * fair portion of the intermediate hash value. Given that the
94  * ciphertext is always available to an attacker, this can be used to
95  * deduce the secret parameter H and, if successful, to tamper with
96  * messages [which is trivial in CTR mode]. In the "Shoup's" case the
97  * attack is not as easy, but there is no reason to believe it is
98  * resistant to cache-timing attacks either. The "8-bit" variant also
99  * consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. On the plus side it should be twice as fast as
101  * the "4-bit" version; for gcc-generated x86[_64] code the "8-bit"
102  * version was observed to run ~75% faster, closer to 100% for
103  * commercial compilers. The "4-bit" procedure is still preferred,
104  * because it is believed to provide a better security/performance
105  * balance and adequate all-round performance. "All-round" refers to:
106  *
107  * - shorter setup time, which improves overall timing for short
108  *   messages;
109  * - larger table allocations can become unbearable because of VM
110  *   subsystem penalties (for example, on Windows a large enough
111  *   free() triggers VM working set trimming, so that a subsequent
112  *   malloc() immediately incurs working set expansion);
113  * - a larger table has a larger cache footprint, which can affect
114  *   the performance of other code paths (not necessarily even in the
115  *   same thread, in a Hyper-Threading world);
116  *
117  * A value of 1 is not appropriate for performance reasons.
118  */
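/*
 * Descriptive note (added; derived from the code below): for comparison,
 * the "4-bit" tables are small: Htable is 16 u128 entries, i.e. 256 bytes
 * per key, plus (in the generic C code) a 16-entry rem_4bit table shared
 * by all keys.
 */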
119 #if	TABLE_BITS==8
120 
121 static void
122 gcm_init_8bit(u128 Htable[256], u64 H[2])
123 {
124 	int  i, j;
125 	u128 V;
126 
127 	Htable[0].hi = 0;
128 	Htable[0].lo = 0;
129 	V.hi = H[0];
130 	V.lo = H[1];
131 
132 	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
133 		REDUCE1BIT(V);
134 		Htable[i] = V;
135 	}
136 
137 	for (i = 2; i < 256; i <<= 1) {
138 		u128 *Hi = Htable + i, H0 = *Hi;
139 		for (j = 1; j < i; ++j) {
140 			Hi[j].hi = H0.hi ^ Htable[j].hi;
141 			Hi[j].lo = H0.lo ^ Htable[j].lo;
142 		}
143 	}
144 }
145 
146 static void
147 gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
148 {
149 	u128 Z = { 0, 0};
150 	const u8 *xi = (const u8 *)Xi + 15;
151 	size_t rem, n = *xi;
152 	static const size_t rem_8bit[256] = {
153 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
154 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
155 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
156 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
157 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
158 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
159 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
160 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
161 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
162 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
163 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
164 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
165 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
166 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
167 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
168 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
169 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
170 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
171 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
172 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
173 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
174 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
175 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
176 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
177 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
178 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
179 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
180 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
181 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
182 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
183 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
184 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
185 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
186 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
187 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
188 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
189 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
190 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
191 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
192 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
193 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
194 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
195 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
196 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
197 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
198 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
199 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
200 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
201 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
202 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
203 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
204 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
205 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
206 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
207 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
208 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
209 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
210 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
211 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
212 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
213 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
214 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
215 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
216 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
217 
218 	while (1) {
219 		Z.hi ^= Htable[n].hi;
220 		Z.lo ^= Htable[n].lo;
221 
222 		if ((u8 *)Xi == xi)
223 			break;
224 
225 		n = *(--xi);
226 
227 		rem = (size_t)Z.lo & 0xff;
228 		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
229 		Z.hi = (Z.hi >> 8);
230 #if SIZE_MAX == 0xffffffffffffffff
231 		Z.hi ^= rem_8bit[rem];
232 #else
233 		Z.hi ^= (u64)rem_8bit[rem] << 32;
234 #endif
235 	}
236 
237 #if BYTE_ORDER == LITTLE_ENDIAN
238 #ifdef BSWAP8
239 	Xi[0] = BSWAP8(Z.hi);
240 	Xi[1] = BSWAP8(Z.lo);
241 #else
242 	u8 *p = (u8 *)Xi;
243 	u32 v;
244 	v = (u32)(Z.hi >> 32);
245 	PUTU32(p, v);
246 	v = (u32)(Z.hi);
247 	PUTU32(p + 4, v);
248 	v = (u32)(Z.lo >> 32);
249 	PUTU32(p + 8, v);
250 	v = (u32)(Z.lo);
251 	PUTU32(p + 12, v);
252 #endif
253 #else /* BIG_ENDIAN */
254 	Xi[0] = Z.hi;
255 	Xi[1] = Z.lo;
256 #endif
257 }
258 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
259 
260 #elif	TABLE_BITS==4
261 
262 static void
263 gcm_init_4bit(u128 Htable[16], u64 H[2])
264 {
265 	u128 V;
266 #if defined(OPENSSL_SMALL_FOOTPRINT)
267 	int  i;
268 #endif
269 
270 	Htable[0].hi = 0;
271 	Htable[0].lo = 0;
272 	V.hi = H[0];
273 	V.lo = H[1];
274 
275 #if defined(OPENSSL_SMALL_FOOTPRINT)
276 	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
277 		REDUCE1BIT(V);
278 		Htable[i] = V;
279 	}
280 
281 	for (i = 2; i < 16; i <<= 1) {
282 		u128 *Hi = Htable + i;
283 		int   j;
284 		for (V = *Hi, j = 1; j < i; ++j) {
285 			Hi[j].hi = V.hi ^ Htable[j].hi;
286 			Hi[j].lo = V.lo ^ Htable[j].lo;
287 		}
288 	}
289 #else
290 	Htable[8] = V;
291 	REDUCE1BIT(V);
292 	Htable[4] = V;
293 	REDUCE1BIT(V);
294 	Htable[2] = V;
295 	REDUCE1BIT(V);
296 	Htable[1] = V;
297 	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
298 	V = Htable[4];
299 	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
300 	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
301 	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
302 	V = Htable[8];
303 	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
304 	Htable[10].hi = V.hi ^ Htable[2].hi,
305 	    Htable[10].lo = V.lo ^ Htable[2].lo;
306 	Htable[11].hi = V.hi ^ Htable[3].hi,
307 	    Htable[11].lo = V.lo ^ Htable[3].lo;
308 	Htable[12].hi = V.hi ^ Htable[4].hi,
309 	    Htable[12].lo = V.lo ^ Htable[4].lo;
310 	Htable[13].hi = V.hi ^ Htable[5].hi,
311 	    Htable[13].lo = V.lo ^ Htable[5].lo;
312 	Htable[14].hi = V.hi ^ Htable[6].hi,
313 	    Htable[14].lo = V.lo ^ Htable[6].lo;
314 	Htable[15].hi = V.hi ^ Htable[7].hi,
315 	    Htable[15].lo = V.lo ^ Htable[7].lo;
316 #endif
317 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
318 	/*
319 	 * ARM assembler expects specific dword order in Htable.
320 	 */
321 	{
322 		int j;
323 #if BYTE_ORDER == LITTLE_ENDIAN
324 		for (j = 0; j < 16; ++j) {
325 			V = Htable[j];
326 			Htable[j].hi = V.lo;
327 			Htable[j].lo = V.hi;
328 		}
329 #else /* BIG_ENDIAN */
330 		for (j = 0; j < 16; ++j) {
331 			V = Htable[j];
332 			Htable[j].hi = V.lo << 32|V.lo >> 32;
333 			Htable[j].lo = V.hi << 32|V.hi >> 32;
334 		}
335 #endif
336 	}
337 #endif
338 }
339 
340 #ifndef GHASH_ASM
341 static const size_t rem_4bit[16] = {
342 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
343 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
344 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
345 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
346 
347 static void
348 gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
349 {
350 	u128 Z;
351 	int cnt = 15;
352 	size_t rem, nlo, nhi;
353 
354 	nlo = ((const u8 *)Xi)[15];
355 	nhi = nlo >> 4;
356 	nlo &= 0xf;
357 
358 	Z.hi = Htable[nlo].hi;
359 	Z.lo = Htable[nlo].lo;
360 
361 	while (1) {
362 		rem = (size_t)Z.lo & 0xf;
363 		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
364 		Z.hi = (Z.hi >> 4);
365 #if SIZE_MAX == 0xffffffffffffffff
366 		Z.hi ^= rem_4bit[rem];
367 #else
368 		Z.hi ^= (u64)rem_4bit[rem] << 32;
369 #endif
370 		Z.hi ^= Htable[nhi].hi;
371 		Z.lo ^= Htable[nhi].lo;
372 
373 		if (--cnt < 0)
374 			break;
375 
376 		nlo = ((const u8 *)Xi)[cnt];
377 		nhi = nlo >> 4;
378 		nlo &= 0xf;
379 
380 		rem = (size_t)Z.lo & 0xf;
381 		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
382 		Z.hi = (Z.hi >> 4);
383 #if SIZE_MAX == 0xffffffffffffffff
384 		Z.hi ^= rem_4bit[rem];
385 #else
386 		Z.hi ^= (u64)rem_4bit[rem] << 32;
387 #endif
388 		Z.hi ^= Htable[nlo].hi;
389 		Z.lo ^= Htable[nlo].lo;
390 	}
391 
392 #if BYTE_ORDER == LITTLE_ENDIAN
393 #ifdef BSWAP8
394 	Xi[0] = BSWAP8(Z.hi);
395 	Xi[1] = BSWAP8(Z.lo);
396 #else
397 	u8 *p = (u8 *)Xi;
398 	u32 v;
399 	v = (u32)(Z.hi >> 32);
400 	PUTU32(p, v);
401 	v = (u32)(Z.hi);
402 	PUTU32(p + 4, v);
403 	v = (u32)(Z.lo >> 32);
404 	PUTU32(p + 8, v);
405 	v = (u32)(Z.lo);
406 	PUTU32(p + 12, v);
407 #endif
408 #else /* BIG_ENDIAN */
409 	Xi[0] = Z.hi;
410 	Xi[1] = Z.lo;
411 #endif
412 }
413 
414 #if !defined(OPENSSL_SMALL_FOOTPRINT)
415 /*
416  * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
417  * for how it is used. Compiler-generated code doesn't seem to give
418  * any performance improvement, at least not on x86[_64]. It's here
419  * mostly as a reference and as a placeholder for possible future
420  * non-trivial optimization[s]...
421  */
422 static void
423 gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
424     const u8 *inp, size_t len)
425 {
426 	u128 Z;
427 	int cnt;
428 	size_t rem, nlo, nhi;
429 
430 #if 1
431 	do {
432 		cnt = 15;
433 		nlo = ((const u8 *)Xi)[15];
434 		nlo ^= inp[15];
435 		nhi = nlo >> 4;
436 		nlo &= 0xf;
437 
438 		Z.hi = Htable[nlo].hi;
439 		Z.lo = Htable[nlo].lo;
440 
441 		while (1) {
442 			rem = (size_t)Z.lo & 0xf;
443 			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
444 			Z.hi = (Z.hi >> 4);
445 #if SIZE_MAX == 0xffffffffffffffff
446 			Z.hi ^= rem_4bit[rem];
447 #else
448 			Z.hi ^= (u64)rem_4bit[rem] << 32;
449 #endif
450 			Z.hi ^= Htable[nhi].hi;
451 			Z.lo ^= Htable[nhi].lo;
452 
453 			if (--cnt < 0)
454 				break;
455 
456 			nlo = ((const u8 *)Xi)[cnt];
457 			nlo ^= inp[cnt];
458 			nhi = nlo >> 4;
459 			nlo &= 0xf;
460 
461 			rem = (size_t)Z.lo & 0xf;
462 			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
463 			Z.hi = (Z.hi >> 4);
464 #if SIZE_MAX == 0xffffffffffffffff
465 			Z.hi ^= rem_4bit[rem];
466 #else
467 			Z.hi ^= (u64)rem_4bit[rem] << 32;
468 #endif
469 			Z.hi ^= Htable[nlo].hi;
470 			Z.lo ^= Htable[nlo].lo;
471 		}
472 #else
473     /*
474      * An extra 256+16 bytes of per-key tables plus 512 bytes of
475      * shared tables [should] give a ~50% improvement... One could
476      * have PACK()-ed rem_8bit even here, but the priority is to
477      * minimize the cache footprint...
478      */
479 	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
480 	u8 Hshl4[16];	/* Htable shifted left  by 4 bits */
481 	static const unsigned short rem_8bit[256] = {
482 		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
483 		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
484 		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
485 		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
486 		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
487 		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
488 		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
489 		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
490 		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
491 		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
492 		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
493 		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
494 		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
495 		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
496 		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
497 		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
498 		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
499 		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
500 		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
501 		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
502 		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
503 		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
504 		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
505 		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
506 		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
507 		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
508 		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
509 		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
510 		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
511 		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
512 		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
513 		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
514     /*
515      * This pre-processing phase slows the procedure down by roughly
516      * as much as it speeds up each loop iteration. In other words,
517      * single-block performance is about the same as the plain
518      * "4-bit" implementation's, and from there on it only gets faster...
519      */
520 	for (cnt = 0; cnt < 16; ++cnt) {
521 		Z.hi = Htable[cnt].hi;
522 		Z.lo = Htable[cnt].lo;
523 		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
524 		Hshr4[cnt].hi = (Z.hi >> 4);
525 		Hshl4[cnt] = (u8)(Z.lo << 4);
526 	}
527 
528 	do {
529 		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
530 			nlo = ((const u8 *)Xi)[cnt];
531 			nlo ^= inp[cnt];
532 			nhi = nlo >> 4;
533 			nlo &= 0xf;
534 
535 			Z.hi ^= Htable[nlo].hi;
536 			Z.lo ^= Htable[nlo].lo;
537 
538 			rem = (size_t)Z.lo & 0xff;
539 
540 			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
541 			Z.hi = (Z.hi >> 8);
542 
543 			Z.hi ^= Hshr4[nhi].hi;
544 			Z.lo ^= Hshr4[nhi].lo;
545 			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
546 		}
547 
548 		nlo = ((const u8 *)Xi)[0];
549 		nlo ^= inp[0];
550 		nhi = nlo >> 4;
551 		nlo &= 0xf;
552 
553 		Z.hi ^= Htable[nlo].hi;
554 		Z.lo ^= Htable[nlo].lo;
555 
556 		rem = (size_t)Z.lo & 0xf;
557 
558 		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
559 		Z.hi = (Z.hi >> 4);
560 
561 		Z.hi ^= Htable[nhi].hi;
562 		Z.lo ^= Htable[nhi].lo;
563 		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
564 #endif
565 
566 #if BYTE_ORDER == LITTLE_ENDIAN
567 #ifdef BSWAP8
568 		Xi[0] = BSWAP8(Z.hi);
569 		Xi[1] = BSWAP8(Z.lo);
570 #else
571 		u8 *p = (u8 *)Xi;
572 		u32 v;
573 		v = (u32)(Z.hi >> 32);
574 		PUTU32(p, v);
575 		v = (u32)(Z.hi);
576 		PUTU32(p + 4, v);
577 		v = (u32)(Z.lo >> 32);
578 		PUTU32(p + 8, v);
579 		v = (u32)(Z.lo);
580 		PUTU32(p + 12, v);
581 #endif
582 #else /* BIG_ENDIAN */
583 		Xi[0] = Z.hi;
584 		Xi[1] = Z.lo;
585 #endif
586 	} while (inp += 16, len -= 16);
587 }
588 #endif
589 #else
590 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
591 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
592     size_t len);
593 #endif
594 
595 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
596 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
597 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
598 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
599  * thrashing effect. In other words, the idea is to hash data while
600  * it is still in the L1 cache after the encryption pass... */
601 #define GHASH_CHUNK       (3*1024)
602 #endif
603 
604 #else	/* TABLE_BITS */
605 
606 static void
607 gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
608 {
609 	u128 V, Z = { 0,0 };
610 	long X;
611 	int i, j;
612 	const long *xi = (const long *)Xi;
613 
614 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
615 	V.lo = H[1];
616 
617 	for (j = 0; j < 16/sizeof(long); ++j) {
618 #if BYTE_ORDER == LITTLE_ENDIAN
619 #if SIZE_MAX == 0xffffffffffffffff
620 #ifdef BSWAP8
621 		X = (long)(BSWAP8(xi[j]));
622 #else
623 		const u8 *p = (const u8 *)(xi + j);
624 		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
625 #endif
626 #else
627 		const u8 *p = (const u8 *)(xi + j);
628 		X = (long)GETU32(p);
629 #endif
630 #else /* BIG_ENDIAN */
631 		X = xi[j];
632 #endif
633 
634 		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
635 			u64 M = (u64)(X >> (8*sizeof(long) - 1));
636 			Z.hi ^= V.hi & M;
637 			Z.lo ^= V.lo & M;
638 
639 			REDUCE1BIT(V);
640 		}
641 	}
642 
643 #if BYTE_ORDER == LITTLE_ENDIAN
644 #ifdef BSWAP8
645 	Xi[0] = BSWAP8(Z.hi);
646 	Xi[1] = BSWAP8(Z.lo);
647 #else
648 	u8 *p = (u8 *)Xi;
649 	u32 v;
650 	v = (u32)(Z.hi >> 32);
651 	PUTU32(p, v);
652 	v = (u32)(Z.hi);
653 	PUTU32(p + 4, v);
654 	v = (u32)(Z.lo >> 32);
655 	PUTU32(p + 8, v);
656 	v = (u32)(Z.lo);
657 	PUTU32(p + 12, v);
658 #endif
659 #else /* BIG_ENDIAN */
660 	Xi[0] = Z.hi;
661 	Xi[1] = Z.lo;
662 #endif
663 }
664 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
665 
666 #endif
667 
668 #if	defined(GHASH_ASM) &&						\
669 	(defined(__i386)	|| defined(__i386__)	||		\
670 	 defined(__x86_64)	|| defined(__x86_64__)	||		\
671 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
672 #include "x86_arch.h"
673 #endif
674 
675 #if	TABLE_BITS==4 && defined(GHASH_ASM)
676 # if	(defined(__i386)	|| defined(__i386__)	||		\
677 	 defined(__x86_64)	|| defined(__x86_64__)	||		\
678 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
679 #  define GHASH_ASM_X86_OR_64
680 #  define GCM_FUNCREF_4BIT
681 
682 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
683 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
684 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
685     size_t len);
686 
687 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
688 #   define GHASH_ASM_X86
689 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
690 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
691     size_t len);
692 
693 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
694 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
695     size_t len);
696 #  endif
697 # elif defined(__arm__) || defined(__arm)
698 #  include "arm_arch.h"
699 #  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
700 #   define GHASH_ASM_ARM
701 #   define GCM_FUNCREF_4BIT
702 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
703 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
704     size_t len);
705 #  endif
706 # endif
707 #endif
708 
709 #ifdef GCM_FUNCREF_4BIT
710 # undef  GCM_MUL
711 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
712 # ifdef GHASH
713 #  undef  GHASH
714 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
715 # endif
716 #endif
717 
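/*
 * CRYPTO_gcm128_init() binds a block cipher to the context, derives
 * the hash key H = E(K, 0^128) and precomputes the multiplication
 * table(s) used by GCM_MUL()/GHASH(), selecting a CLMUL/NEON or other
 * assembly implementation where available.
 */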
718 void
719 CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
720 {
721 	memset(ctx, 0, sizeof(*ctx));
722 	ctx->block = block;
723 	ctx->key = key;
724 
725 	(*block)(ctx->H.c, ctx->H.c, key);
726 
727 #if BYTE_ORDER == LITTLE_ENDIAN
728 	/* H is stored in host byte order */
729 #ifdef BSWAP8
730 	ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
731 	ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
732 #else
733 	u8 *p = ctx->H.c;
734 	u64 hi, lo;
735 	hi = (u64)GETU32(p) << 32|GETU32(p + 4);
736 	lo = (u64)GETU32(p + 8) << 32|GETU32(p + 12);
737 	ctx->H.u[0] = hi;
738 	ctx->H.u[1] = lo;
739 #endif
740 #endif
741 
742 #if	TABLE_BITS==8
743 	gcm_init_8bit(ctx->Htable, ctx->H.u);
744 #elif	TABLE_BITS==4
745 # if	defined(GHASH_ASM_X86_OR_64)
746 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
747 	/* check FXSR and PCLMULQDQ bits */
748 	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
749 	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
750 		gcm_init_clmul(ctx->Htable, ctx->H.u);
751 		ctx->gmult = gcm_gmult_clmul;
752 		ctx->ghash = gcm_ghash_clmul;
753 		return;
754 	}
755 #  endif
756 	gcm_init_4bit(ctx->Htable, ctx->H.u);
757 #  if	defined(GHASH_ASM_X86)			/* x86 only */
758 #   if	defined(OPENSSL_IA32_SSE2)
759 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
760 #   else
761 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
762 #   endif
763 		ctx->gmult = gcm_gmult_4bit_mmx;
764 		ctx->ghash = gcm_ghash_4bit_mmx;
765 	} else {
766 		ctx->gmult = gcm_gmult_4bit_x86;
767 		ctx->ghash = gcm_ghash_4bit_x86;
768 	}
769 #  else
770 	ctx->gmult = gcm_gmult_4bit;
771 	ctx->ghash = gcm_ghash_4bit;
772 #  endif
773 # elif	defined(GHASH_ASM_ARM)
774 	if (OPENSSL_armcap_P & ARMV7_NEON) {
775 		ctx->gmult = gcm_gmult_neon;
776 		ctx->ghash = gcm_ghash_neon;
777 	} else {
778 		gcm_init_4bit(ctx->Htable, ctx->H.u);
779 		ctx->gmult = gcm_gmult_4bit;
780 		ctx->ghash = gcm_ghash_4bit;
781 	}
782 # else
783 	gcm_init_4bit(ctx->Htable, ctx->H.u);
784 # endif
785 #endif
786 }
787 LCRYPTO_ALIAS(CRYPTO_gcm128_init);
788 
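/*
 * CRYPTO_gcm128_setiv() resets the context for a new message and
 * derives the initial counter block Y0 from the IV: a 96-bit IV is
 * used directly as IV || 0^31 || 1 (the fast path), while any other
 * length is run through GHASH together with its bit length, as the
 * GCM specification requires.
 */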
789 void
790 CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
791 {
792 	unsigned int ctr;
793 #ifdef GCM_FUNCREF_4BIT
794 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
795 #endif
796 
797 	ctx->Yi.u[0] = 0;
798 	ctx->Yi.u[1] = 0;
799 	ctx->Xi.u[0] = 0;
800 	ctx->Xi.u[1] = 0;
801 	ctx->len.u[0] = 0;	/* AAD length */
802 	ctx->len.u[1] = 0;	/* message length */
803 	ctx->ares = 0;
804 	ctx->mres = 0;
805 
806 	if (len == 12) {
807 		memcpy(ctx->Yi.c, iv, 12);
808 		ctx->Yi.c[15] = 1;
809 		ctr = 1;
810 	} else {
811 		size_t i;
812 		u64 len0 = len;
813 
814 		while (len >= 16) {
815 			for (i = 0; i < 16; ++i)
816 				ctx->Yi.c[i] ^= iv[i];
817 			GCM_MUL(ctx, Yi);
818 			iv += 16;
819 			len -= 16;
820 		}
821 		if (len) {
822 			for (i = 0; i < len; ++i)
823 				ctx->Yi.c[i] ^= iv[i];
824 			GCM_MUL(ctx, Yi);
825 		}
826 		len0 <<= 3;
827 #if BYTE_ORDER == LITTLE_ENDIAN
828 #ifdef BSWAP8
829 		ctx->Yi.u[1] ^= BSWAP8(len0);
830 #else
831 		ctx->Yi.c[8] ^= (u8)(len0 >> 56);
832 		ctx->Yi.c[9] ^= (u8)(len0 >> 48);
833 		ctx->Yi.c[10] ^= (u8)(len0 >> 40);
834 		ctx->Yi.c[11] ^= (u8)(len0 >> 32);
835 		ctx->Yi.c[12] ^= (u8)(len0 >> 24);
836 		ctx->Yi.c[13] ^= (u8)(len0 >> 16);
837 		ctx->Yi.c[14] ^= (u8)(len0 >> 8);
838 		ctx->Yi.c[15] ^= (u8)(len0);
839 #endif
840 #else /* BIG_ENDIAN */
841 		ctx->Yi.u[1] ^= len0;
842 #endif
843 
844 		GCM_MUL(ctx, Yi);
845 
846 #if BYTE_ORDER == LITTLE_ENDIAN
847 #ifdef BSWAP4
848 		ctr = BSWAP4(ctx->Yi.d[3]);
849 #else
850 		ctr = GETU32(ctx->Yi.c + 12);
851 #endif
852 #else /* BIG_ENDIAN */
853 		ctr = ctx->Yi.d[3];
854 #endif
855 	}
856 
857 	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
858 	++ctr;
859 #if BYTE_ORDER == LITTLE_ENDIAN
860 #ifdef BSWAP4
861 	ctx->Yi.d[3] = BSWAP4(ctr);
862 #else
863 	PUTU32(ctx->Yi.c + 12, ctr);
864 #endif
865 #else /* BIG_ENDIAN */
866 	ctx->Yi.d[3] = ctr;
867 #endif
868 }
869 LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);
870 
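/*
 * CRYPTO_gcm128_aad() absorbs additional authenticated data into the
 * GHASH state. It must be called before any encrypt/decrypt call for
 * the current message (returning -2 otherwise), and returns -1 if the
 * total AAD length exceeds 2^61 bytes.
 */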
871 int
872 CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
873 {
874 	size_t i;
875 	unsigned int n;
876 	u64 alen = ctx->len.u[0];
877 #ifdef GCM_FUNCREF_4BIT
878 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
879 # ifdef GHASH
880 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
881 	    const u8 *inp, size_t len) = ctx->ghash;
882 # endif
883 #endif
884 
885 	if (ctx->len.u[1])
886 		return -2;
887 
888 	alen += len;
889 	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
890 		return -1;
891 	ctx->len.u[0] = alen;
892 
893 	n = ctx->ares;
894 	if (n) {
895 		while (n && len) {
896 			ctx->Xi.c[n] ^= *(aad++);
897 			--len;
898 			n = (n + 1) % 16;
899 		}
900 		if (n == 0)
901 			GCM_MUL(ctx, Xi);
902 		else {
903 			ctx->ares = n;
904 			return 0;
905 		}
906 	}
907 
908 #ifdef GHASH
909 	if ((i = (len & (size_t)-16))) {
910 		GHASH(ctx, aad, i);
911 		aad += i;
912 		len -= i;
913 	}
914 #else
915 	while (len >= 16) {
916 		for (i = 0; i < 16; ++i)
917 			ctx->Xi.c[i] ^= aad[i];
918 		GCM_MUL(ctx, Xi);
919 		aad += 16;
920 		len -= 16;
921 	}
922 #endif
923 	if (len) {
924 		n = (unsigned int)len;
925 		for (i = 0; i < len; ++i)
926 			ctx->Xi.c[i] ^= aad[i];
927 	}
928 
929 	ctx->ares = n;
930 	return 0;
931 }
932 LCRYPTO_ALIAS(CRYPTO_gcm128_aad);
933 
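/*
 * CRYPTO_gcm128_encrypt() CTR-encrypts len bytes and folds the
 * resulting ciphertext into the GHASH state. It may be called
 * repeatedly to process a message in pieces; -1 is returned once the
 * total plaintext length would exceed GCM's limit of 2^36 - 32 bytes.
 */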
934 int
935 CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
936     const unsigned char *in, unsigned char *out,
937     size_t len)
938 {
939 	unsigned int n, ctr;
940 	size_t i;
941 	u64 mlen = ctx->len.u[1];
942 	block128_f block = ctx->block;
943 	void *key = ctx->key;
944 #ifdef GCM_FUNCREF_4BIT
945 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
946 # ifdef GHASH
947 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
948 	    const u8 *inp, size_t len) = ctx->ghash;
949 # endif
950 #endif
951 
952 	mlen += len;
953 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
954 		return -1;
955 	ctx->len.u[1] = mlen;
956 
957 	if (ctx->ares) {
958 		/* First call to encrypt finalizes GHASH(AAD) */
959 		GCM_MUL(ctx, Xi);
960 		ctx->ares = 0;
961 	}
962 
963 #if BYTE_ORDER == LITTLE_ENDIAN
964 #ifdef BSWAP4
965 	ctr = BSWAP4(ctx->Yi.d[3]);
966 #else
967 	ctr = GETU32(ctx->Yi.c + 12);
968 #endif
969 #else /* BIG_ENDIAN */
970 	ctr = ctx->Yi.d[3];
971 #endif
972 
973 	n = ctx->mres;
974 #if !defined(OPENSSL_SMALL_FOOTPRINT)
975 	if (16 % sizeof(size_t) == 0)
976 		do {	/* always true actually */
977 			if (n) {
978 				while (n && len) {
979 					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
980 					    ctx->EKi.c[n];
981 					--len;
982 					n = (n + 1) % 16;
983 				}
984 				if (n == 0)
985 					GCM_MUL(ctx, Xi);
986 				else {
987 					ctx->mres = n;
988 					return 0;
989 				}
990 			}
991 #ifdef __STRICT_ALIGNMENT
992 			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
993 				break;
994 #endif
995 #if defined(GHASH) && defined(GHASH_CHUNK)
996 			while (len >= GHASH_CHUNK) {
997 				size_t j = GHASH_CHUNK;
998 
999 				while (j) {
1000 					size_t *out_t = (size_t *)out;
1001 					const size_t *in_t = (const size_t *)in;
1002 
1003 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
1004 					++ctr;
1005 #if BYTE_ORDER == LITTLE_ENDIAN
1006 #ifdef BSWAP4
1007 					ctx->Yi.d[3] = BSWAP4(ctr);
1008 #else
1009 					PUTU32(ctx->Yi.c + 12, ctr);
1010 #endif
1011 #else /* BIG_ENDIAN */
1012 					ctx->Yi.d[3] = ctr;
1013 #endif
1014 					for (i = 0; i < 16/sizeof(size_t); ++i)
1015 						out_t[i] = in_t[i] ^
1016 						    ctx->EKi.t[i];
1017 					out += 16;
1018 					in += 16;
1019 					j -= 16;
1020 				}
1021 				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1022 				len -= GHASH_CHUNK;
1023 			}
1024 			if ((i = (len & (size_t)-16))) {
1025 				size_t j = i;
1026 
1027 				while (len >= 16) {
1028 					size_t *out_t = (size_t *)out;
1029 					const size_t *in_t = (const size_t *)in;
1030 
1031 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
1032 					++ctr;
1033 #if BYTE_ORDER == LITTLE_ENDIAN
1034 #ifdef BSWAP4
1035 					ctx->Yi.d[3] = BSWAP4(ctr);
1036 #else
1037 					PUTU32(ctx->Yi.c + 12, ctr);
1038 #endif
1039 #else /* BIG_ENDIAN */
1040 					ctx->Yi.d[3] = ctr;
1041 #endif
1042 					for (i = 0; i < 16/sizeof(size_t); ++i)
1043 						out_t[i] = in_t[i] ^
1044 						    ctx->EKi.t[i];
1045 					out += 16;
1046 					in += 16;
1047 					len -= 16;
1048 				}
1049 				GHASH(ctx, out - j, j);
1050 			}
1051 #else
1052 			while (len >= 16) {
1053 				size_t *out_t = (size_t *)out;
1054 				const size_t *in_t = (const size_t *)in;
1055 
1056 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
1057 				++ctr;
1058 #if BYTE_ORDER == LITTLE_ENDIAN
1059 #ifdef BSWAP4
1060 				ctx->Yi.d[3] = BSWAP4(ctr);
1061 #else
1062 				PUTU32(ctx->Yi.c + 12, ctr);
1063 #endif
1064 #else /* BIG_ENDIAN */
1065 				ctx->Yi.d[3] = ctr;
1066 #endif
1067 				for (i = 0; i < 16/sizeof(size_t); ++i)
1068 					ctx->Xi.t[i] ^=
1069 					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1070 				GCM_MUL(ctx, Xi);
1071 				out += 16;
1072 				in += 16;
1073 				len -= 16;
1074 			}
1075 #endif
1076 			if (len) {
1077 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
1078 				++ctr;
1079 #if BYTE_ORDER == LITTLE_ENDIAN
1080 #ifdef BSWAP4
1081 				ctx->Yi.d[3] = BSWAP4(ctr);
1082 #else
1083 				PUTU32(ctx->Yi.c + 12, ctr);
1084 #endif
1085 #else /* BIG_ENDIAN */
1086 				ctx->Yi.d[3] = ctr;
1087 #endif
1088 				while (len--) {
1089 					ctx->Xi.c[n] ^= out[n] = in[n] ^
1090 					    ctx->EKi.c[n];
1091 					++n;
1092 				}
1093 			}
1094 
1095 			ctx->mres = n;
1096 			return 0;
1097 		} while (0);
1098 #endif
1099 	for (i = 0; i < len; ++i) {
1100 		if (n == 0) {
1101 			(*block)(ctx->Yi.c, ctx->EKi.c, key);
1102 			++ctr;
1103 #if BYTE_ORDER == LITTLE_ENDIAN
1104 #ifdef BSWAP4
1105 			ctx->Yi.d[3] = BSWAP4(ctr);
1106 #else
1107 			PUTU32(ctx->Yi.c + 12, ctr);
1108 #endif
1109 #else /* BIG_ENDIAN */
1110 			ctx->Yi.d[3] = ctr;
1111 #endif
1112 		}
1113 		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1114 		n = (n + 1) % 16;
1115 		if (n == 0)
1116 			GCM_MUL(ctx, Xi);
1117 	}
1118 
1119 	ctx->mres = n;
1120 	return 0;
1121 }
1122 LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);
1123 
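/*
 * CRYPTO_gcm128_decrypt() is the mirror image of the above: the
 * ciphertext is folded into the GHASH state and CTR-decrypted, subject
 * to the same length limit.
 */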
1124 int
1125 CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1126     const unsigned char *in, unsigned char *out,
1127     size_t len)
1128 {
1129 	unsigned int n, ctr;
1130 	size_t i;
1131 	u64 mlen = ctx->len.u[1];
1132 	block128_f block = ctx->block;
1133 	void *key = ctx->key;
1134 #ifdef GCM_FUNCREF_4BIT
1135 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1136 # ifdef GHASH
1137 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1138 	    const u8 *inp, size_t len) = ctx->ghash;
1139 # endif
1140 #endif
1141 
1142 	mlen += len;
1143 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1144 		return -1;
1145 	ctx->len.u[1] = mlen;
1146 
1147 	if (ctx->ares) {
1148 		/* First call to decrypt finalizes GHASH(AAD) */
1149 		GCM_MUL(ctx, Xi);
1150 		ctx->ares = 0;
1151 	}
1152 
1153 #if BYTE_ORDER == LITTLE_ENDIAN
1154 #ifdef BSWAP4
1155 	ctr = BSWAP4(ctx->Yi.d[3]);
1156 #else
1157 	ctr = GETU32(ctx->Yi.c + 12);
1158 #endif
1159 #else /* BIG_ENDIAN */
1160 	ctr = ctx->Yi.d[3];
1161 #endif
1162 
1163 	n = ctx->mres;
1164 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1165 	if (16 % sizeof(size_t) == 0)
1166 		do {	/* always true actually */
1167 			if (n) {
1168 				while (n && len) {
1169 					u8 c = *(in++);
1170 					*(out++) = c ^ ctx->EKi.c[n];
1171 					ctx->Xi.c[n] ^= c;
1172 					--len;
1173 					n = (n + 1) % 16;
1174 				}
1175 				if (n == 0)
1176 					GCM_MUL(ctx, Xi);
1177 				else {
1178 					ctx->mres = n;
1179 					return 0;
1180 				}
1181 			}
1182 #ifdef __STRICT_ALIGNMENT
1183 			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
1184 				break;
1185 #endif
1186 #if defined(GHASH) && defined(GHASH_CHUNK)
1187 			while (len >= GHASH_CHUNK) {
1188 				size_t j = GHASH_CHUNK;
1189 
1190 				GHASH(ctx, in, GHASH_CHUNK);
1191 				while (j) {
1192 					size_t *out_t = (size_t *)out;
1193 					const size_t *in_t = (const size_t *)in;
1194 
1195 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
1196 					++ctr;
1197 #if BYTE_ORDER == LITTLE_ENDIAN
1198 #ifdef BSWAP4
1199 					ctx->Yi.d[3] = BSWAP4(ctr);
1200 #else
1201 					PUTU32(ctx->Yi.c + 12, ctr);
1202 #endif
1203 #else /* BIG_ENDIAN */
1204 					ctx->Yi.d[3] = ctr;
1205 #endif
1206 					for (i = 0; i < 16/sizeof(size_t); ++i)
1207 						out_t[i] = in_t[i] ^
1208 						    ctx->EKi.t[i];
1209 					out += 16;
1210 					in += 16;
1211 					j -= 16;
1212 				}
1213 				len -= GHASH_CHUNK;
1214 			}
1215 			if ((i = (len & (size_t)-16))) {
1216 				GHASH(ctx, in, i);
1217 				while (len >= 16) {
1218 					size_t *out_t = (size_t *)out;
1219 					const size_t *in_t = (const size_t *)in;
1220 
1221 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
1222 					++ctr;
1223 #if BYTE_ORDER == LITTLE_ENDIAN
1224 #ifdef BSWAP4
1225 					ctx->Yi.d[3] = BSWAP4(ctr);
1226 #else
1227 					PUTU32(ctx->Yi.c + 12, ctr);
1228 #endif
1229 #else /* BIG_ENDIAN */
1230 					ctx->Yi.d[3] = ctr;
1231 #endif
1232 					for (i = 0; i < 16/sizeof(size_t); ++i)
1233 						out_t[i] = in_t[i] ^
1234 						    ctx->EKi.t[i];
1235 					out += 16;
1236 					in += 16;
1237 					len -= 16;
1238 				}
1239 			}
1240 #else
1241 			while (len >= 16) {
1242 				size_t *out_t = (size_t *)out;
1243 				const size_t *in_t = (const size_t *)in;
1244 
1245 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
1246 				++ctr;
1247 #if BYTE_ORDER == LITTLE_ENDIAN
1248 #ifdef BSWAP4
1249 				ctx->Yi.d[3] = BSWAP4(ctr);
1250 #else
1251 				PUTU32(ctx->Yi.c + 12, ctr);
1252 #endif
1253 #else /* BIG_ENDIAN */
1254 				ctx->Yi.d[3] = ctr;
1255 #endif
1256 				for (i = 0; i < 16/sizeof(size_t); ++i) {
1257 					size_t c = in[i];
1258 					out[i] = c ^ ctx->EKi.t[i];
1259 					ctx->Xi.t[i] ^= c;
1260 				}
1261 				GCM_MUL(ctx, Xi);
1262 				out += 16;
1263 				in += 16;
1264 				len -= 16;
1265 			}
1266 #endif
1267 			if (len) {
1268 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
1269 				++ctr;
1270 #if BYTE_ORDER == LITTLE_ENDIAN
1271 #ifdef BSWAP4
1272 				ctx->Yi.d[3] = BSWAP4(ctr);
1273 #else
1274 				PUTU32(ctx->Yi.c + 12, ctr);
1275 #endif
1276 #else /* BIG_ENDIAN */
1277 				ctx->Yi.d[3] = ctr;
1278 #endif
1279 				while (len--) {
1280 					u8 c = in[n];
1281 					ctx->Xi.c[n] ^= c;
1282 					out[n] = c ^ ctx->EKi.c[n];
1283 					++n;
1284 				}
1285 			}
1286 
1287 			ctx->mres = n;
1288 			return 0;
1289 		} while (0);
1290 #endif
1291 	for (i = 0; i < len; ++i) {
1292 		u8 c;
1293 		if (n == 0) {
1294 			(*block)(ctx->Yi.c, ctx->EKi.c, key);
1295 			++ctr;
1296 #if BYTE_ORDER == LITTLE_ENDIAN
1297 #ifdef BSWAP4
1298 			ctx->Yi.d[3] = BSWAP4(ctr);
1299 #else
1300 			PUTU32(ctx->Yi.c + 12, ctr);
1301 #endif
1302 #else /* BIG_ENDIAN */
1303 			ctx->Yi.d[3] = ctr;
1304 #endif
1305 		}
1306 		c = in[i];
1307 		out[i] = c ^ ctx->EKi.c[n];
1308 		ctx->Xi.c[n] ^= c;
1309 		n = (n + 1) % 16;
1310 		if (n == 0)
1311 			GCM_MUL(ctx, Xi);
1312 	}
1313 
1314 	ctx->mres = n;
1315 	return 0;
1316 }
1317 LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);
1318 
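/*
 * The *_ctr32() variants below take a ctr128_f stream routine that
 * processes multiple counter blocks per call (e.g. an accelerated
 * AES-CTR), falling back to ctx->block only for a trailing partial
 * block.
 */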
1319 int
1320 CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1321     const unsigned char *in, unsigned char *out,
1322     size_t len, ctr128_f stream)
1323 {
1324 	unsigned int n, ctr;
1325 	size_t i;
1326 	u64 mlen = ctx->len.u[1];
1327 	void *key = ctx->key;
1328 #ifdef GCM_FUNCREF_4BIT
1329 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1330 # ifdef GHASH
1331 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1332 	    const u8 *inp, size_t len) = ctx->ghash;
1333 # endif
1334 #endif
1335 
1336 	mlen += len;
1337 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1338 		return -1;
1339 	ctx->len.u[1] = mlen;
1340 
1341 	if (ctx->ares) {
1342 		/* First call to encrypt finalizes GHASH(AAD) */
1343 		GCM_MUL(ctx, Xi);
1344 		ctx->ares = 0;
1345 	}
1346 
1347 #if BYTE_ORDER == LITTLE_ENDIAN
1348 #ifdef BSWAP4
1349 	ctr = BSWAP4(ctx->Yi.d[3]);
1350 #else
1351 	ctr = GETU32(ctx->Yi.c + 12);
1352 #endif
1353 #else /* BIG_ENDIAN */
1354 	ctr = ctx->Yi.d[3];
1355 #endif
1356 
1357 	n = ctx->mres;
1358 	if (n) {
1359 		while (n && len) {
1360 			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1361 			--len;
1362 			n = (n + 1) % 16;
1363 		}
1364 		if (n == 0)
1365 			GCM_MUL(ctx, Xi);
1366 		else {
1367 			ctx->mres = n;
1368 			return 0;
1369 		}
1370 	}
1371 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1372 	while (len >= GHASH_CHUNK) {
1373 		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
1374 		ctr += GHASH_CHUNK/16;
1375 #if BYTE_ORDER == LITTLE_ENDIAN
1376 #ifdef BSWAP4
1377 		ctx->Yi.d[3] = BSWAP4(ctr);
1378 #else
1379 		PUTU32(ctx->Yi.c + 12, ctr);
1380 #endif
1381 #else /* BIG_ENDIAN */
1382 		ctx->Yi.d[3] = ctr;
1383 #endif
1384 		GHASH(ctx, out, GHASH_CHUNK);
1385 		out += GHASH_CHUNK;
1386 		in += GHASH_CHUNK;
1387 		len -= GHASH_CHUNK;
1388 	}
1389 #endif
1390 	if ((i = (len & (size_t)-16))) {
1391 		size_t j = i/16;
1392 
1393 		(*stream)(in, out, j, key, ctx->Yi.c);
1394 		ctr += (unsigned int)j;
1395 #if BYTE_ORDER == LITTLE_ENDIAN
1396 #ifdef BSWAP4
1397 		ctx->Yi.d[3] = BSWAP4(ctr);
1398 #else
1399 		PUTU32(ctx->Yi.c + 12, ctr);
1400 #endif
1401 #else /* BIG_ENDIAN */
1402 		ctx->Yi.d[3] = ctr;
1403 #endif
1404 		in += i;
1405 		len -= i;
1406 #if defined(GHASH)
1407 		GHASH(ctx, out, i);
1408 		out += i;
1409 #else
1410 		while (j--) {
1411 			for (i = 0; i < 16; ++i)
1412 				ctx->Xi.c[i] ^= out[i];
1413 			GCM_MUL(ctx, Xi);
1414 			out += 16;
1415 		}
1416 #endif
1417 	}
1418 	if (len) {
1419 		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
1420 		++ctr;
1421 #if BYTE_ORDER == LITTLE_ENDIAN
1422 #ifdef BSWAP4
1423 		ctx->Yi.d[3] = BSWAP4(ctr);
1424 #else
1425 		PUTU32(ctx->Yi.c + 12, ctr);
1426 #endif
1427 #else /* BIG_ENDIAN */
1428 		ctx->Yi.d[3] = ctr;
1429 #endif
1430 		while (len--) {
1431 			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1432 			++n;
1433 		}
1434 	}
1435 
1436 	ctx->mres = n;
1437 	return 0;
1438 }
1439 LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);
1440 
1441 int
1442 CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1443     const unsigned char *in, unsigned char *out,
1444     size_t len, ctr128_f stream)
1445 {
1446 	unsigned int n, ctr;
1447 	size_t i;
1448 	u64 mlen = ctx->len.u[1];
1449 	void *key = ctx->key;
1450 #ifdef GCM_FUNCREF_4BIT
1451 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1452 # ifdef GHASH
1453 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1454 	    const u8 *inp, size_t len) = ctx->ghash;
1455 # endif
1456 #endif
1457 
1458 	mlen += len;
1459 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1460 		return -1;
1461 	ctx->len.u[1] = mlen;
1462 
1463 	if (ctx->ares) {
1464 		/* First call to decrypt finalizes GHASH(AAD) */
1465 		GCM_MUL(ctx, Xi);
1466 		ctx->ares = 0;
1467 	}
1468 
1469 #if BYTE_ORDER == LITTLE_ENDIAN
1470 #ifdef BSWAP4
1471 	ctr = BSWAP4(ctx->Yi.d[3]);
1472 #else
1473 	ctr = GETU32(ctx->Yi.c + 12);
1474 #endif
1475 #else /* BIG_ENDIAN */
1476 	ctr = ctx->Yi.d[3];
1477 #endif
1478 
1479 	n = ctx->mres;
1480 	if (n) {
1481 		while (n && len) {
1482 			u8 c = *(in++);
1483 			*(out++) = c ^ ctx->EKi.c[n];
1484 			ctx->Xi.c[n] ^= c;
1485 			--len;
1486 			n = (n + 1) % 16;
1487 		}
1488 		if (n == 0)
1489 			GCM_MUL(ctx, Xi);
1490 		else {
1491 			ctx->mres = n;
1492 			return 0;
1493 		}
1494 	}
1495 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1496 	while (len >= GHASH_CHUNK) {
1497 		GHASH(ctx, in, GHASH_CHUNK);
1498 		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
1499 		ctr += GHASH_CHUNK/16;
1500 #if BYTE_ORDER == LITTLE_ENDIAN
1501 #ifdef BSWAP4
1502 		ctx->Yi.d[3] = BSWAP4(ctr);
1503 #else
1504 		PUTU32(ctx->Yi.c + 12, ctr);
1505 #endif
1506 #else /* BIG_ENDIAN */
1507 		ctx->Yi.d[3] = ctr;
1508 #endif
1509 		out += GHASH_CHUNK;
1510 		in += GHASH_CHUNK;
1511 		len -= GHASH_CHUNK;
1512 	}
1513 #endif
1514 	if ((i = (len & (size_t)-16))) {
1515 		size_t j = i/16;
1516 
1517 #if defined(GHASH)
1518 		GHASH(ctx, in, i);
1519 #else
1520 		while (j--) {
1521 			size_t k;
1522 			for (k = 0; k < 16; ++k)
1523 				ctx->Xi.c[k] ^= in[k];
1524 			GCM_MUL(ctx, Xi);
1525 			in += 16;
1526 		}
1527 		j = i/16;
1528 		in -= i;
1529 #endif
1530 		(*stream)(in, out, j, key, ctx->Yi.c);
1531 		ctr += (unsigned int)j;
1532 #if BYTE_ORDER == LITTLE_ENDIAN
1533 #ifdef BSWAP4
1534 		ctx->Yi.d[3] = BSWAP4(ctr);
1535 #else
1536 		PUTU32(ctx->Yi.c + 12, ctr);
1537 #endif
1538 #else /* BIG_ENDIAN */
1539 		ctx->Yi.d[3] = ctr;
1540 #endif
1541 		out += i;
1542 		in += i;
1543 		len -= i;
1544 	}
1545 	if (len) {
1546 		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
1547 		++ctr;
1548 #if BYTE_ORDER == LITTLE_ENDIAN
1549 #ifdef BSWAP4
1550 		ctx->Yi.d[3] = BSWAP4(ctr);
1551 #else
1552 		PUTU32(ctx->Yi.c + 12, ctr);
1553 #endif
1554 #else /* BIG_ENDIAN */
1555 		ctx->Yi.d[3] = ctr;
1556 #endif
1557 		while (len--) {
1558 			u8 c = in[n];
1559 			ctx->Xi.c[n] ^= c;
1560 			out[n] = c ^ ctx->EKi.c[n];
1561 			++n;
1562 		}
1563 	}
1564 
1565 	ctx->mres = n;
1566 	return 0;
1567 }
1568 LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);
1569 
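/*
 * CRYPTO_gcm128_finish() folds the AAD and ciphertext bit lengths into
 * GHASH and XORs in E(K, Y0) to form the tag in ctx->Xi, comparing it
 * against tag (0 means a match). CRYPTO_gcm128_tag() below instead
 * copies the computed tag out for the caller.
 */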
1570 int
1571 CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1572     size_t len)
1573 {
1574 	u64 alen = ctx->len.u[0] << 3;
1575 	u64 clen = ctx->len.u[1] << 3;
1576 #ifdef GCM_FUNCREF_4BIT
1577 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1578 #endif
1579 
1580 	if (ctx->mres || ctx->ares)
1581 		GCM_MUL(ctx, Xi);
1582 
1583 #if BYTE_ORDER == LITTLE_ENDIAN
1584 #ifdef BSWAP8
1585 	alen = BSWAP8(alen);
1586 	clen = BSWAP8(clen);
1587 #else
1588 	{
1589 		u8 *p = ctx->len.c;
1590 
1591 		ctx->len.u[0] = alen;
1592 		ctx->len.u[1] = clen;
1593 
1594 		alen = (u64)GETU32(p) << 32|GETU32(p + 4);
1595 		clen = (u64)GETU32(p + 8) << 32|GETU32(p + 12);
1596 	}
1597 #endif
1598 #endif
1599 
1600 	ctx->Xi.u[0] ^= alen;
1601 	ctx->Xi.u[1] ^= clen;
1602 	GCM_MUL(ctx, Xi);
1603 
1604 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1605 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1606 
1607 	if (tag && len <= sizeof(ctx->Xi))
1608 		return memcmp(ctx->Xi.c, tag, len);
1609 	else
1610 		return -1;
1611 }
1612 LCRYPTO_ALIAS(CRYPTO_gcm128_finish);
1613 
1614 void
1615 CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1616 {
1617 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1618 	memcpy(tag, ctx->Xi.c,
1619 	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1620 }
1621 LCRYPTO_ALIAS(CRYPTO_gcm128_tag);
1622 
1623 GCM128_CONTEXT *
1624 CRYPTO_gcm128_new(void *key, block128_f block)
1625 {
1626 	GCM128_CONTEXT *ret;
1627 
1628 	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1629 		CRYPTO_gcm128_init(ret, key, block);
1630 
1631 	return ret;
1632 }
1633 LCRYPTO_ALIAS(CRYPTO_gcm128_new);
1634 
1635 void
1636 CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1637 {
1638 	freezero(ctx, sizeof(*ctx));
1639 }
1640 LCRYPTO_ALIAS(CRYPTO_gcm128_release);
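/*
 * Usage sketch (illustrative only, not part of the library): AES-128-GCM
 * encryption of a short message through the CRYPTO_gcm128_* API above.
 * It assumes the caller can see the prototypes and GCM128_CONTEXT (as
 * this file can via modes_local.h) and uses the conventional
 * (block128_f)AES_encrypt cast. The guard macro GCM128_USAGE_EXAMPLE is
 * hypothetical and never defined, so this sketch is not compiled.
 */
#ifdef GCM128_USAGE_EXAMPLE
#include <openssl/aes.h>

static int
gcm128_usage_example(void)
{
	static const unsigned char key[16] = { 0 };	/* demo key */
	static const unsigned char iv[12] = { 0 };	/* demo 96-bit nonce */
	static const unsigned char aad[] = "header fields";
	unsigned char pt[32] = "attack at dawn";
	unsigned char ct[sizeof(pt)], tag[16];
	AES_KEY ks;
	GCM128_CONTEXT *gcm;

	if (AES_set_encrypt_key(key, 128, &ks) != 0)
		return -1;
	if ((gcm = CRYPTO_gcm128_new(&ks, (block128_f)AES_encrypt)) == NULL)
		return -1;

	CRYPTO_gcm128_setiv(gcm, iv, sizeof(iv));	/* 96-bit IV fast path */
	if (CRYPTO_gcm128_aad(gcm, aad, sizeof(aad)) != 0 ||	/* AAD first */
	    CRYPTO_gcm128_encrypt(gcm, pt, ct, sizeof(pt)) != 0) {
		CRYPTO_gcm128_release(gcm);
		return -1;
	}
	CRYPTO_gcm128_tag(gcm, tag, sizeof(tag));	/* emit 16-byte tag */

	/*
	 * Decryption mirrors the above: setiv(), aad(), then
	 * CRYPTO_gcm128_decrypt() followed by CRYPTO_gcm128_finish()
	 * with the received tag (0 means the tag verified).
	 */
	CRYPTO_gcm128_release(gcm);
	return 0;
}
#endif /* GCM128_USAGE_EXAMPLE */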
1641