1 /* $OpenBSD: gcm128.c,v 1.15 2016/11/04 17:30:30 miod Exp $ */
2 /* ====================================================================
3  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * 3. All advertising materials mentioning features or use of this
18  *    software must display the following acknowledgment:
19  *    "This product includes software developed by the OpenSSL Project
20  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21  *
22  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23  *    endorse or promote products derived from this software without
24  *    prior written permission. For written permission, please contact
25  *    openssl-core@openssl.org.
26  *
27  * 5. Products derived from this software may not be called "OpenSSL"
28  *    nor may "OpenSSL" appear in their names without prior written
29  *    permission of the OpenSSL Project.
30  *
31  * 6. Redistributions of any form whatsoever must retain the following
32  *    acknowledgment:
33  *    "This product includes software developed by the OpenSSL Project
34  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35  *
36  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47  * OF THE POSSIBILITY OF SUCH DAMAGE.
48  * ====================================================================
49  */
50 
51 #define OPENSSL_FIPSAPI
52 
53 #include <openssl/crypto.h>
54 #include "modes_lcl.h"
55 #include <string.h>
56 
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 #  define NDEBUG
60 # endif
61 #endif
62 
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef	GETU32
66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67 #undef	PUTU32
68 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
69 #endif
70 
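/*
 * PACK() places a 16-bit reduction constant in the most significant
 * 16 bits of a size_t, the form expected by the rem_4bit/rem_8bit
 * tables below.  REDUCE1BIT() multiplies V by x in GF(2^128) under
 * GCM's bit-reflected convention: V is shifted right by one bit and,
 * if the bit shifted out was set, the reduction constant derived from
 * the polynomial x^128 + x^7 + x^2 + x + 1 (0xE1 followed by 120 zero
 * bits) is folded back into the top of V.
 */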
71 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)	\
73 	do { \
74 		if (sizeof(size_t)==8) { \
75 			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 			V.lo  = (V.hi<<63)|(V.lo>>1); \
77 			V.hi  = (V.hi>>1 )^T; \
78 		} else { \
79 			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 			V.lo  = (V.hi<<63)|(V.lo>>1); \
81 			V.hi  = (V.hi>>1 )^((u64)T<<32); \
82 		} \
83 	} while(0)
84 
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
89  * to as "Shoup's" in the GCM specification; in other words OpenSSL does
90  * not cover the whole spectrum of possible table-driven implementations.
91  * Why? In the non-"Shoup's" case the memory access pattern is segmented
92  * in such a manner that cache timing information can reveal a fair
93  * portion of the intermediate hash value. Given that the ciphertext is
94  * always available to an attacker, it is possible to attempt to deduce
95  * the secret parameter H and, if successful, to tamper with messages
96  * [which is trivial in CTR mode]. In "Shoup's" case it is not as
97  * trivial, but there is no reason to believe that it is resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on the pro side it should be twice as fast
101  * as the "4-bit" version. And for gcc-generated x86[_64] code, the
102  * "8-bit" version was observed to run ~75% faster, closer to 100% for
103  * commercial compilers... Yet the "4-bit" procedure is preferred, as it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows a large enough free
111  *   results in VM working set trimming, meaning that a subsequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * A value of 1 is not appropriate for performance reasons.
118  */
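/*
 * For reference, the sizes implied above: the "8-bit" table is 256
 * u128 entries, i.e. 4KB per key, while the "4-bit" table is 16 u128
 * entries, i.e. 256 bytes per key plus the small shared rem_4bit[].
 */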
119 #if	TABLE_BITS==8
120 
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123 	int  i, j;
124 	u128 V;
125 
126 	Htable[0].hi = 0;
127 	Htable[0].lo = 0;
128 	V.hi = H[0];
129 	V.lo = H[1];
130 
131 	for (Htable[128]=V, i=64; i>0; i>>=1) {
132 		REDUCE1BIT(V);
133 		Htable[i] = V;
134 	}
135 
136 	for (i=2; i<256; i<<=1) {
137 		u128 *Hi = Htable+i, H0 = *Hi;
138 		for (j=1; j<i; ++j) {
139 			Hi[j].hi = H0.hi^Htable[j].hi;
140 			Hi[j].lo = H0.lo^Htable[j].lo;
141 		}
142 	}
143 }
144 
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147 	u128 Z = { 0, 0};
148 	const u8 *xi = (const u8 *)Xi+15;
149 	size_t rem, n = *xi;
150 	static const size_t rem_8bit[256] = {
151 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215 
216 	while (1) {
217 		Z.hi ^= Htable[n].hi;
218 		Z.lo ^= Htable[n].lo;
219 
220 		if ((u8 *)Xi==xi)	break;
221 
222 		n = *(--xi);
223 
224 		rem  = (size_t)Z.lo&0xff;
225 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 		Z.hi = (Z.hi>>8);
227 		if (sizeof(size_t)==8)
228 			Z.hi ^= rem_8bit[rem];
229 		else
230 			Z.hi ^= (u64)rem_8bit[rem]<<32;
231 	}
232 
233 	if (BYTE_ORDER == LITTLE_ENDIAN) {
234 #ifdef BSWAP8
235 		Xi[0] = BSWAP8(Z.hi);
236 		Xi[1] = BSWAP8(Z.lo);
237 #else
238 		u8 *p = (u8 *)Xi;
239 		u32 v;
240 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
241 		v = (u32)(Z.hi);	PUTU32(p+4,v);
242 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
243 		v = (u32)(Z.lo);	PUTU32(p+12,v);
244 #endif
245 	}
246 	else {
247 		Xi[0] = Z.hi;
248 		Xi[1] = Z.lo;
249 	}
250 }
251 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
252 
253 #elif	TABLE_BITS==4
254 
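/*
 * Build the "Shoup" 4-bit table: Htable[] caches the product of H with
 * every 4-bit multiplier (nibble bits taken in GHASH's reflected bit
 * order, so Htable[8] is H itself), which lets gcm_gmult_4bit process
 * Xi one nibble at a time with table lookups instead of a bit-by-bit
 * multiplication.
 */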
255 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
256 {
257 	u128 V;
258 #if defined(OPENSSL_SMALL_FOOTPRINT)
259 	int  i;
260 #endif
261 
262 	Htable[0].hi = 0;
263 	Htable[0].lo = 0;
264 	V.hi = H[0];
265 	V.lo = H[1];
266 
267 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 	for (Htable[8]=V, i=4; i>0; i>>=1) {
269 		REDUCE1BIT(V);
270 		Htable[i] = V;
271 	}
272 
273 	for (i=2; i<16; i<<=1) {
274 		u128 *Hi = Htable+i;
275 		int   j;
276 		for (V=*Hi, j=1; j<i; ++j) {
277 			Hi[j].hi = V.hi^Htable[j].hi;
278 			Hi[j].lo = V.lo^Htable[j].lo;
279 		}
280 	}
281 #else
282 	Htable[8] = V;
283 	REDUCE1BIT(V);
284 	Htable[4] = V;
285 	REDUCE1BIT(V);
286 	Htable[2] = V;
287 	REDUCE1BIT(V);
288 	Htable[1] = V;
289 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
290 	V=Htable[4];
291 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
292 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
293 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
294 	V=Htable[8];
295 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
296 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302 #endif
303 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
304 	/*
305 	 * ARM assembler expects specific dword order in Htable.
306 	 */
307 	{
308 	int j;
309 
310 	if (BYTE_ORDER == LITTLE_ENDIAN)
311 		for (j=0;j<16;++j) {
312 			V = Htable[j];
313 			Htable[j].hi = V.lo;
314 			Htable[j].lo = V.hi;
315 		}
316 	else
317 		for (j=0;j<16;++j) {
318 			V = Htable[j];
319 			Htable[j].hi = V.lo<<32|V.lo>>32;
320 			Htable[j].lo = V.hi<<32|V.hi>>32;
321 		}
322 	}
323 #endif
324 }
325 
326 #ifndef GHASH_ASM
327 static const size_t rem_4bit[16] = {
328 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
332 
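/*
 * Multiply Xi by H in place.  Xi is consumed four bits at a time from
 * its last byte towards its first (low nibble, then high nibble); each
 * step XORs in the precomputed Htable[] entry, shifts the accumulator
 * right by four bits and folds the bits that fell off back in via
 * rem_4bit[], which holds the corresponding pre-reduced constants.
 */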
333 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
334 {
335 	u128 Z;
336 	int cnt = 15;
337 	size_t rem, nlo, nhi;
338 
339 	nlo  = ((const u8 *)Xi)[15];
340 	nhi  = nlo>>4;
341 	nlo &= 0xf;
342 
343 	Z.hi = Htable[nlo].hi;
344 	Z.lo = Htable[nlo].lo;
345 
346 	while (1) {
347 		rem  = (size_t)Z.lo&0xf;
348 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 		Z.hi = (Z.hi>>4);
350 		if (sizeof(size_t)==8)
351 			Z.hi ^= rem_4bit[rem];
352 		else
353 			Z.hi ^= (u64)rem_4bit[rem]<<32;
354 
355 		Z.hi ^= Htable[nhi].hi;
356 		Z.lo ^= Htable[nhi].lo;
357 
358 		if (--cnt<0)		break;
359 
360 		nlo  = ((const u8 *)Xi)[cnt];
361 		nhi  = nlo>>4;
362 		nlo &= 0xf;
363 
364 		rem  = (size_t)Z.lo&0xf;
365 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 		Z.hi = (Z.hi>>4);
367 		if (sizeof(size_t)==8)
368 			Z.hi ^= rem_4bit[rem];
369 		else
370 			Z.hi ^= (u64)rem_4bit[rem]<<32;
371 
372 		Z.hi ^= Htable[nlo].hi;
373 		Z.lo ^= Htable[nlo].lo;
374 	}
375 
376 	if (BYTE_ORDER == LITTLE_ENDIAN) {
377 #ifdef BSWAP8
378 		Xi[0] = BSWAP8(Z.hi);
379 		Xi[1] = BSWAP8(Z.lo);
380 #else
381 		u8 *p = (u8 *)Xi;
382 		u32 v;
383 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
384 		v = (u32)(Z.hi);	PUTU32(p+4,v);
385 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
386 		v = (u32)(Z.lo);	PUTU32(p+12,v);
387 #endif
388 	}
389 	else {
390 		Xi[0] = Z.hi;
391 		Xi[1] = Z.lo;
392 	}
393 }
394 
395 #if !defined(OPENSSL_SMALL_FOOTPRINT)
396 /*
397  * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
398  * details... Compiler-generated code doesn't seem to give any
399  * performance improvement, at least not on x86[_64]. It's here
400  * mostly as reference and a placeholder for possible future
401  * non-trivial optimization[s]...
402  */
403 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
404 				const u8 *inp,size_t len)
405 {
406     u128 Z;
407     int cnt;
408     size_t rem, nlo, nhi;
409 
410 #if 1
411     do {
412 	cnt  = 15;
413 	nlo  = ((const u8 *)Xi)[15];
414 	nlo ^= inp[15];
415 	nhi  = nlo>>4;
416 	nlo &= 0xf;
417 
418 	Z.hi = Htable[nlo].hi;
419 	Z.lo = Htable[nlo].lo;
420 
421 	while (1) {
422 		rem  = (size_t)Z.lo&0xf;
423 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
424 		Z.hi = (Z.hi>>4);
425 		if (sizeof(size_t)==8)
426 			Z.hi ^= rem_4bit[rem];
427 		else
428 			Z.hi ^= (u64)rem_4bit[rem]<<32;
429 
430 		Z.hi ^= Htable[nhi].hi;
431 		Z.lo ^= Htable[nhi].lo;
432 
433 		if (--cnt<0)		break;
434 
435 		nlo  = ((const u8 *)Xi)[cnt];
436 		nlo ^= inp[cnt];
437 		nhi  = nlo>>4;
438 		nlo &= 0xf;
439 
440 		rem  = (size_t)Z.lo&0xf;
441 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
442 		Z.hi = (Z.hi>>4);
443 		if (sizeof(size_t)==8)
444 			Z.hi ^= rem_4bit[rem];
445 		else
446 			Z.hi ^= (u64)rem_4bit[rem]<<32;
447 
448 		Z.hi ^= Htable[nlo].hi;
449 		Z.lo ^= Htable[nlo].lo;
450 	}
451 #else
452     /*
453      * Extra 256+16 bytes per-key plus 512 bytes shared tables
454      * [should] give ~50% improvement... One could have PACK()-ed
455      * the rem_8bit even here, but the priority is to minimize
456      * cache footprint...
457      */
458     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
459     u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
460     static const unsigned short rem_8bit[256] = {
461 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
462 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
463 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
464 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
465 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
466 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
467 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
468 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
469 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
470 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
471 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
472 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
473 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
474 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
475 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
476 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
477 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
478 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
479 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
480 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
481 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
482 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
483 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
484 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
485 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
486 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
487 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
488 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
489 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
490 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
491 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
492 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
493     /*
494      * This pre-processing phase slows the procedure down by roughly as
495      * much time as it saves in each loop iteration. In other words,
496      * single-block performance is approximately the same as for the
497      * straightforward "4-bit" implementation; beyond that it only gets faster...
498      */
499     for (cnt=0; cnt<16; ++cnt) {
500 	Z.hi = Htable[cnt].hi;
501 	Z.lo = Htable[cnt].lo;
502 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
503 	Hshr4[cnt].hi = (Z.hi>>4);
504 	Hshl4[cnt]    = (u8)(Z.lo<<4);
505     }
506 
507     do {
508 	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
509 		nlo  = ((const u8 *)Xi)[cnt];
510 		nlo ^= inp[cnt];
511 		nhi  = nlo>>4;
512 		nlo &= 0xf;
513 
514 		Z.hi ^= Htable[nlo].hi;
515 		Z.lo ^= Htable[nlo].lo;
516 
517 		rem = (size_t)Z.lo&0xff;
518 
519 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
520 		Z.hi = (Z.hi>>8);
521 
522 		Z.hi ^= Hshr4[nhi].hi;
523 		Z.lo ^= Hshr4[nhi].lo;
524 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
525 	}
526 
527 	nlo  = ((const u8 *)Xi)[0];
528 	nlo ^= inp[0];
529 	nhi  = nlo>>4;
530 	nlo &= 0xf;
531 
532 	Z.hi ^= Htable[nlo].hi;
533 	Z.lo ^= Htable[nlo].lo;
534 
535 	rem = (size_t)Z.lo&0xf;
536 
537 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
538 	Z.hi = (Z.hi>>4);
539 
540 	Z.hi ^= Htable[nhi].hi;
541 	Z.lo ^= Htable[nhi].lo;
542 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
543 #endif
544 
545 	if (BYTE_ORDER == LITTLE_ENDIAN) {
546 #ifdef BSWAP8
547 		Xi[0] = BSWAP8(Z.hi);
548 		Xi[1] = BSWAP8(Z.lo);
549 #else
550 		u8 *p = (u8 *)Xi;
551 		u32 v;
552 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
553 		v = (u32)(Z.hi);	PUTU32(p+4,v);
554 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
555 		v = (u32)(Z.lo);	PUTU32(p+12,v);
556 #endif
557 	}
558 	else {
559 		Xi[0] = Z.hi;
560 		Xi[1] = Z.lo;
561 	}
562     } while (inp+=16, len-=16);
563 }
564 #endif
565 #else
566 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
567 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
568 #endif
569 
570 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
571 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
572 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
573 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
574  * thrashing effect. In other words the idea is to hash data while it's
575  * still in the L1 cache after the encryption pass... */
576 #define GHASH_CHUNK       (3*1024)
577 #endif
578 
579 #else	/* TABLE_BITS */
580 
581 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
582 {
583 	u128 V,Z = { 0,0 };
584 	long X;
585 	int  i,j;
586 	const long *xi = (const long *)Xi;
587 
588 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
589 	V.lo = H[1];
590 
591 	for (j=0; j<16/sizeof(long); ++j) {
592 		if (BYTE_ORDER == LITTLE_ENDIAN) {
593 			if (sizeof(long)==8) {
594 #ifdef BSWAP8
595 				X = (long)(BSWAP8(xi[j]));
596 #else
597 				const u8 *p = (const u8 *)(xi+j);
598 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
599 #endif
600 			}
601 			else {
602 				const u8 *p = (const u8 *)(xi+j);
603 				X = (long)GETU32(p);
604 			}
605 		}
606 		else
607 			X = xi[j];
608 
609 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
610 			u64 M = (u64)(X>>(8*sizeof(long)-1));
611 			Z.hi ^= V.hi&M;
612 			Z.lo ^= V.lo&M;
613 
614 			REDUCE1BIT(V);
615 		}
616 	}
617 
618 	if (BYTE_ORDER == LITTLE_ENDIAN) {
619 #ifdef BSWAP8
620 		Xi[0] = BSWAP8(Z.hi);
621 		Xi[1] = BSWAP8(Z.lo);
622 #else
623 		u8 *p = (u8 *)Xi;
624 		u32 v;
625 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
626 		v = (u32)(Z.hi);	PUTU32(p+4,v);
627 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
628 		v = (u32)(Z.lo);	PUTU32(p+12,v);
629 #endif
630 	}
631 	else {
632 		Xi[0] = Z.hi;
633 		Xi[1] = Z.lo;
634 	}
635 }
636 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
637 
638 #endif
639 
640 #if	defined(GHASH_ASM) && \
641 	(defined(__i386)	|| defined(__i386__)	|| \
642 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
643 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
644 #include "x86_arch.h"
645 #endif
646 
647 #if	TABLE_BITS==4 && defined(GHASH_ASM)
648 # if	(defined(__i386)	|| defined(__i386__)	|| \
649 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
650 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
651 #  define GHASH_ASM_X86_OR_64
652 #  define GCM_FUNCREF_4BIT
653 
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657 
658 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 #   define GHASH_ASM_X86
660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662 
663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665 #  endif
666 # elif defined(__arm__) || defined(__arm)
667 #  include "arm_arch.h"
668 #  if __ARM_ARCH__>=7
669 #   define GHASH_ASM_ARM
670 #   define GCM_FUNCREF_4BIT
671 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
672 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
673 #  endif
674 # endif
675 #endif
676 
677 #ifdef GCM_FUNCREF_4BIT
678 # undef  GCM_MUL
679 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
680 # ifdef GHASH
681 #  undef  GHASH
682 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
683 # endif
684 #endif
685 
686 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
687 {
688 	memset(ctx,0,sizeof(*ctx));
689 	ctx->block = block;
690 	ctx->key   = key;
691 
692 	(*block)(ctx->H.c,ctx->H.c,key);
693 
694 	if (BYTE_ORDER == LITTLE_ENDIAN) {
695 		/* H is stored in host byte order */
696 #ifdef BSWAP8
697 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
698 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
699 #else
700 		u8 *p = ctx->H.c;
701 		u64 hi,lo;
702 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
703 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
704 		ctx->H.u[0] = hi;
705 		ctx->H.u[1] = lo;
706 #endif
707 	}
708 
709 #if	TABLE_BITS==8
710 	gcm_init_8bit(ctx->Htable,ctx->H.u);
711 #elif	TABLE_BITS==4
712 # if	defined(GHASH_ASM_X86_OR_64)
713 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
714 	/* check FXSR and PCLMULQDQ bits */
715 	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
716 	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
717 		gcm_init_clmul(ctx->Htable,ctx->H.u);
718 		ctx->gmult = gcm_gmult_clmul;
719 		ctx->ghash = gcm_ghash_clmul;
720 		return;
721 	}
722 #  endif
723 	gcm_init_4bit(ctx->Htable,ctx->H.u);
724 #  if	defined(GHASH_ASM_X86)			/* x86 only */
725 #   if	defined(OPENSSL_IA32_SSE2)
726 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
727 #   else
728 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
729 #   endif
730 		ctx->gmult = gcm_gmult_4bit_mmx;
731 		ctx->ghash = gcm_ghash_4bit_mmx;
732 	} else {
733 		ctx->gmult = gcm_gmult_4bit_x86;
734 		ctx->ghash = gcm_ghash_4bit_x86;
735 	}
736 #  else
737 	ctx->gmult = gcm_gmult_4bit;
738 	ctx->ghash = gcm_ghash_4bit;
739 #  endif
740 # elif	defined(GHASH_ASM_ARM)
741 	if (OPENSSL_armcap_P & ARMV7_NEON) {
742 		ctx->gmult = gcm_gmult_neon;
743 		ctx->ghash = gcm_ghash_neon;
744 	} else {
745 		gcm_init_4bit(ctx->Htable,ctx->H.u);
746 		ctx->gmult = gcm_gmult_4bit;
747 		ctx->ghash = gcm_ghash_4bit;
748 	}
749 # else
750 	gcm_init_4bit(ctx->Htable,ctx->H.u);
751 # endif
752 #endif
753 }
754 
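/*
 * Set the IV.  As specified for GCM, a 96-bit IV is used directly as
 * the initial counter block with the 32-bit counter set to 1; IVs of
 * any other length are absorbed through GHASH together with their bit
 * length to derive the initial counter block.
 */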
755 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
756 {
757 	unsigned int ctr;
758 #ifdef GCM_FUNCREF_4BIT
759 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
760 #endif
761 
762 	ctx->Yi.u[0]  = 0;
763 	ctx->Yi.u[1]  = 0;
764 	ctx->Xi.u[0]  = 0;
765 	ctx->Xi.u[1]  = 0;
766 	ctx->len.u[0] = 0;	/* AAD length */
767 	ctx->len.u[1] = 0;	/* message length */
768 	ctx->ares = 0;
769 	ctx->mres = 0;
770 
771 	if (len==12) {
772 		memcpy(ctx->Yi.c,iv,12);
773 		ctx->Yi.c[15]=1;
774 		ctr=1;
775 	}
776 	else {
777 		size_t i;
778 		u64 len0 = len;
779 
780 		while (len>=16) {
781 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
782 			GCM_MUL(ctx,Yi);
783 			iv += 16;
784 			len -= 16;
785 		}
786 		if (len) {
787 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
788 			GCM_MUL(ctx,Yi);
789 		}
790 		len0 <<= 3;
791 		if (BYTE_ORDER == LITTLE_ENDIAN) {
792 #ifdef BSWAP8
793 			ctx->Yi.u[1]  ^= BSWAP8(len0);
794 #else
795 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
796 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
797 			ctx->Yi.c[10] ^= (u8)(len0>>40);
798 			ctx->Yi.c[11] ^= (u8)(len0>>32);
799 			ctx->Yi.c[12] ^= (u8)(len0>>24);
800 			ctx->Yi.c[13] ^= (u8)(len0>>16);
801 			ctx->Yi.c[14] ^= (u8)(len0>>8);
802 			ctx->Yi.c[15] ^= (u8)(len0);
803 #endif
804 		}
805 		else
806 			ctx->Yi.u[1]  ^= len0;
807 
808 		GCM_MUL(ctx,Yi);
809 
810 		if (BYTE_ORDER == LITTLE_ENDIAN)
811 #ifdef BSWAP4
812 			ctr = BSWAP4(ctx->Yi.d[3]);
813 #else
814 			ctr = GETU32(ctx->Yi.c+12);
815 #endif
816 		else
817 			ctr = ctx->Yi.d[3];
818 	}
819 
820 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
821 	++ctr;
822 	if (BYTE_ORDER == LITTLE_ENDIAN)
823 #ifdef BSWAP4
824 		ctx->Yi.d[3] = BSWAP4(ctr);
825 #else
826 		PUTU32(ctx->Yi.c+12,ctr);
827 #endif
828 	else
829 		ctx->Yi.d[3] = ctr;
830 }
831 
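/*
 * Absorb additional authenticated data.  All AAD must be supplied
 * before the first encrypt/decrypt call (-2 is returned otherwise) and
 * the total AAD may not exceed 2^61 bytes (2^64 bits), matching the
 * limit in the GCM specification.
 */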
832 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
833 {
834 	size_t i;
835 	unsigned int n;
836 	u64 alen = ctx->len.u[0];
837 #ifdef GCM_FUNCREF_4BIT
838 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
839 # ifdef GHASH
840 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
841 				const u8 *inp,size_t len)	= ctx->ghash;
842 # endif
843 #endif
844 
845 	if (ctx->len.u[1]) return -2;
846 
847 	alen += len;
848 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
849 		return -1;
850 	ctx->len.u[0] = alen;
851 
852 	n = ctx->ares;
853 	if (n) {
854 		while (n && len) {
855 			ctx->Xi.c[n] ^= *(aad++);
856 			--len;
857 			n = (n+1)%16;
858 		}
859 		if (n==0) GCM_MUL(ctx,Xi);
860 		else {
861 			ctx->ares = n;
862 			return 0;
863 		}
864 	}
865 
866 #ifdef GHASH
867 	if ((i = (len&(size_t)-16))) {
868 		GHASH(ctx,aad,i);
869 		aad += i;
870 		len -= i;
871 	}
872 #else
873 	while (len>=16) {
874 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
875 		GCM_MUL(ctx,Xi);
876 		aad += 16;
877 		len -= 16;
878 	}
879 #endif
880 	if (len) {
881 		n = (unsigned int)len;
882 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
883 	}
884 
885 	ctx->ares = n;
886 	return 0;
887 }
888 
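/*
 * Encrypt len bytes in CTR mode and fold the resulting ciphertext into
 * the GHASH state.  The total message length is capped at 2^36 - 32
 * bytes, the per-invocation plaintext limit of the GCM specification.
 */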
889 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
890 		const unsigned char *in, unsigned char *out,
891 		size_t len)
892 {
893 	unsigned int n, ctr;
894 	size_t i;
895 	u64        mlen  = ctx->len.u[1];
896 	block128_f block = ctx->block;
897 	void      *key   = ctx->key;
898 #ifdef GCM_FUNCREF_4BIT
899 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
900 # ifdef GHASH
901 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
902 				const u8 *inp,size_t len)	= ctx->ghash;
903 # endif
904 #endif
905 
906 	mlen += len;
907 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
908 		return -1;
909 	ctx->len.u[1] = mlen;
910 
911 	if (ctx->ares) {
912 		/* First call to encrypt finalizes GHASH(AAD) */
913 		GCM_MUL(ctx,Xi);
914 		ctx->ares = 0;
915 	}
916 
917 	if (BYTE_ORDER == LITTLE_ENDIAN)
918 #ifdef BSWAP4
919 		ctr = BSWAP4(ctx->Yi.d[3]);
920 #else
921 		ctr = GETU32(ctx->Yi.c+12);
922 #endif
923 	else
924 		ctr = ctx->Yi.d[3];
925 
926 	n = ctx->mres;
927 #if !defined(OPENSSL_SMALL_FOOTPRINT)
928 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
929 		if (n) {
930 			while (n && len) {
931 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
932 				--len;
933 				n = (n+1)%16;
934 			}
935 			if (n==0) GCM_MUL(ctx,Xi);
936 			else {
937 				ctx->mres = n;
938 				return 0;
939 			}
940 		}
941 #ifdef __STRICT_ALIGNMENT
942 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
943 			break;
944 #endif
945 #if defined(GHASH) && defined(GHASH_CHUNK)
946 		while (len>=GHASH_CHUNK) {
947 		    size_t j=GHASH_CHUNK;
948 
949 		    while (j) {
950 		    	size_t *out_t=(size_t *)out;
951 		    	const size_t *in_t=(const size_t *)in;
952 
953 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
954 			++ctr;
955 			if (BYTE_ORDER == LITTLE_ENDIAN)
956 #ifdef BSWAP4
957 				ctx->Yi.d[3] = BSWAP4(ctr);
958 #else
959 				PUTU32(ctx->Yi.c+12,ctr);
960 #endif
961 			else
962 				ctx->Yi.d[3] = ctr;
963 			for (i=0; i<16/sizeof(size_t); ++i)
964 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
965 			out += 16;
966 			in  += 16;
967 			j   -= 16;
968 		    }
969 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
970 		    len -= GHASH_CHUNK;
971 		}
972 		if ((i = (len&(size_t)-16))) {
973 		    size_t j=i;
974 
975 		    while (len>=16) {
976 		    	size_t *out_t=(size_t *)out;
977 		    	const size_t *in_t=(const size_t *)in;
978 
979 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
980 			++ctr;
981 			if (BYTE_ORDER == LITTLE_ENDIAN)
982 #ifdef BSWAP4
983 				ctx->Yi.d[3] = BSWAP4(ctr);
984 #else
985 				PUTU32(ctx->Yi.c+12,ctr);
986 #endif
987 			else
988 				ctx->Yi.d[3] = ctr;
989 			for (i=0; i<16/sizeof(size_t); ++i)
990 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
991 			out += 16;
992 			in  += 16;
993 			len -= 16;
994 		    }
995 		    GHASH(ctx,out-j,j);
996 		}
997 #else
998 		while (len>=16) {
999 		    	size_t *out_t=(size_t *)out;
1000 		    	const size_t *in_t=(const size_t *)in;
1001 
1002 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1003 			++ctr;
1004 			if (BYTE_ORDER == LITTLE_ENDIAN)
1005 #ifdef BSWAP4
1006 				ctx->Yi.d[3] = BSWAP4(ctr);
1007 #else
1008 				PUTU32(ctx->Yi.c+12,ctr);
1009 #endif
1010 			else
1011 				ctx->Yi.d[3] = ctr;
1012 			for (i=0; i<16/sizeof(size_t); ++i)
1013 				ctx->Xi.t[i] ^=
1014 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1015 			GCM_MUL(ctx,Xi);
1016 			out += 16;
1017 			in  += 16;
1018 			len -= 16;
1019 		}
1020 #endif
1021 		if (len) {
1022 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1023 			++ctr;
1024 			if (BYTE_ORDER == LITTLE_ENDIAN)
1025 #ifdef BSWAP4
1026 				ctx->Yi.d[3] = BSWAP4(ctr);
1027 #else
1028 				PUTU32(ctx->Yi.c+12,ctr);
1029 #endif
1030 			else
1031 				ctx->Yi.d[3] = ctr;
1032 			while (len--) {
1033 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1034 				++n;
1035 			}
1036 		}
1037 
1038 		ctx->mres = n;
1039 		return 0;
1040 	} while(0);
1041 #endif
1042 	for (i=0;i<len;++i) {
1043 		if (n==0) {
1044 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1045 			++ctr;
1046 			if (BYTE_ORDER == LITTLE_ENDIAN)
1047 #ifdef BSWAP4
1048 				ctx->Yi.d[3] = BSWAP4(ctr);
1049 #else
1050 				PUTU32(ctx->Yi.c+12,ctr);
1051 #endif
1052 			else
1053 				ctx->Yi.d[3] = ctr;
1054 		}
1055 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1056 		n = (n+1)%16;
1057 		if (n==0)
1058 			GCM_MUL(ctx,Xi);
1059 	}
1060 
1061 	ctx->mres = n;
1062 	return 0;
1063 }
1064 
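/*
 * Decrypt len bytes: the ciphertext is folded into the GHASH state and
 * XORed with the CTR keystream, subject to the same 2^36 - 32 byte
 * length limit as encryption.
 */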
1065 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1066 		const unsigned char *in, unsigned char *out,
1067 		size_t len)
1068 {
1069 	unsigned int n, ctr;
1070 	size_t i;
1071 	u64        mlen  = ctx->len.u[1];
1072 	block128_f block = ctx->block;
1073 	void      *key   = ctx->key;
1074 #ifdef GCM_FUNCREF_4BIT
1075 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1076 # ifdef GHASH
1077 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1078 				const u8 *inp,size_t len)	= ctx->ghash;
1079 # endif
1080 #endif
1081 
1082 	mlen += len;
1083 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1084 		return -1;
1085 	ctx->len.u[1] = mlen;
1086 
1087 	if (ctx->ares) {
1088 		/* First call to decrypt finalizes GHASH(AAD) */
1089 		GCM_MUL(ctx,Xi);
1090 		ctx->ares = 0;
1091 	}
1092 
1093 	if (BYTE_ORDER == LITTLE_ENDIAN)
1094 #ifdef BSWAP4
1095 		ctr = BSWAP4(ctx->Yi.d[3]);
1096 #else
1097 		ctr = GETU32(ctx->Yi.c+12);
1098 #endif
1099 	else
1100 		ctr = ctx->Yi.d[3];
1101 
1102 	n = ctx->mres;
1103 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1104 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1105 		if (n) {
1106 			while (n && len) {
1107 				u8 c = *(in++);
1108 				*(out++) = c^ctx->EKi.c[n];
1109 				ctx->Xi.c[n] ^= c;
1110 				--len;
1111 				n = (n+1)%16;
1112 			}
1113 			if (n==0) GCM_MUL (ctx,Xi);
1114 			else {
1115 				ctx->mres = n;
1116 				return 0;
1117 			}
1118 		}
1119 #ifdef __STRICT_ALIGNMENT
1120 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1121 			break;
1122 #endif
1123 #if defined(GHASH) && defined(GHASH_CHUNK)
1124 		while (len>=GHASH_CHUNK) {
1125 		    size_t j=GHASH_CHUNK;
1126 
1127 		    GHASH(ctx,in,GHASH_CHUNK);
1128 		    while (j) {
1129 		    	size_t *out_t=(size_t *)out;
1130 		    	const size_t *in_t=(const size_t *)in;
1131 
1132 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1133 			++ctr;
1134 			if (BYTE_ORDER == LITTLE_ENDIAN)
1135 #ifdef BSWAP4
1136 				ctx->Yi.d[3] = BSWAP4(ctr);
1137 #else
1138 				PUTU32(ctx->Yi.c+12,ctr);
1139 #endif
1140 			else
1141 				ctx->Yi.d[3] = ctr;
1142 			for (i=0; i<16/sizeof(size_t); ++i)
1143 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1144 			out += 16;
1145 			in  += 16;
1146 			j   -= 16;
1147 		    }
1148 		    len -= GHASH_CHUNK;
1149 		}
1150 		if ((i = (len&(size_t)-16))) {
1151 		    GHASH(ctx,in,i);
1152 		    while (len>=16) {
1153 		    	size_t *out_t=(size_t *)out;
1154 		    	const size_t *in_t=(const size_t *)in;
1155 
1156 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1157 			++ctr;
1158 			if (BYTE_ORDER == LITTLE_ENDIAN)
1159 #ifdef BSWAP4
1160 				ctx->Yi.d[3] = BSWAP4(ctr);
1161 #else
1162 				PUTU32(ctx->Yi.c+12,ctr);
1163 #endif
1164 			else
1165 				ctx->Yi.d[3] = ctr;
1166 			for (i=0; i<16/sizeof(size_t); ++i)
1167 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1168 			out += 16;
1169 			in  += 16;
1170 			len -= 16;
1171 		    }
1172 		}
1173 #else
1174 		while (len>=16) {
1175 		    	size_t *out_t=(size_t *)out;
1176 		    	const size_t *in_t=(const size_t *)in;
1177 
1178 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1179 			++ctr;
1180 			if (BYTE_ORDER == LITTLE_ENDIAN)
1181 #ifdef BSWAP4
1182 				ctx->Yi.d[3] = BSWAP4(ctr);
1183 #else
1184 				PUTU32(ctx->Yi.c+12,ctr);
1185 #endif
1186 			else
1187 				ctx->Yi.d[3] = ctr;
1188 			for (i=0; i<16/sizeof(size_t); ++i) {
1189 				size_t c = in[i];
1190 				out[i] = c^ctx->EKi.t[i];
1191 				ctx->Xi.t[i] ^= c;
1192 			}
1193 			GCM_MUL(ctx,Xi);
1194 			out += 16;
1195 			in  += 16;
1196 			len -= 16;
1197 		}
1198 #endif
1199 		if (len) {
1200 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1201 			++ctr;
1202 			if (BYTE_ORDER == LITTLE_ENDIAN)
1203 #ifdef BSWAP4
1204 				ctx->Yi.d[3] = BSWAP4(ctr);
1205 #else
1206 				PUTU32(ctx->Yi.c+12,ctr);
1207 #endif
1208 			else
1209 				ctx->Yi.d[3] = ctr;
1210 			while (len--) {
1211 				u8 c = in[n];
1212 				ctx->Xi.c[n] ^= c;
1213 				out[n] = c^ctx->EKi.c[n];
1214 				++n;
1215 			}
1216 		}
1217 
1218 		ctx->mres = n;
1219 		return 0;
1220 	} while(0);
1221 #endif
1222 	for (i=0;i<len;++i) {
1223 		u8 c;
1224 		if (n==0) {
1225 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1226 			++ctr;
1227 			if (BYTE_ORDER == LITTLE_ENDIAN)
1228 #ifdef BSWAP4
1229 				ctx->Yi.d[3] = BSWAP4(ctr);
1230 #else
1231 				PUTU32(ctx->Yi.c+12,ctr);
1232 #endif
1233 			else
1234 				ctx->Yi.d[3] = ctr;
1235 		}
1236 		c = in[i];
1237 		out[i] = c^ctx->EKi.c[n];
1238 		ctx->Xi.c[n] ^= c;
1239 		n = (n+1)%16;
1240 		if (n==0)
1241 			GCM_MUL(ctx,Xi);
1242 	}
1243 
1244 	ctx->mres = n;
1245 	return 0;
1246 }
1247 
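/*
 * Variant of CRYPTO_gcm128_encrypt() that obtains the keystream in
 * bulk from the caller-supplied ctr128_f routine (for example a
 * hardware-accelerated CTR implementation) and hashes whole blocks at
 * a time.
 */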
1248 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1249 		const unsigned char *in, unsigned char *out,
1250 		size_t len, ctr128_f stream)
1251 {
1252 	unsigned int n, ctr;
1253 	size_t i;
1254 	u64   mlen = ctx->len.u[1];
1255 	void *key  = ctx->key;
1256 #ifdef GCM_FUNCREF_4BIT
1257 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1258 # ifdef GHASH
1259 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1260 				const u8 *inp,size_t len)	= ctx->ghash;
1261 # endif
1262 #endif
1263 
1264 	mlen += len;
1265 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1266 		return -1;
1267 	ctx->len.u[1] = mlen;
1268 
1269 	if (ctx->ares) {
1270 		/* First call to encrypt finalizes GHASH(AAD) */
1271 		GCM_MUL(ctx,Xi);
1272 		ctx->ares = 0;
1273 	}
1274 
1275 	if (BYTE_ORDER == LITTLE_ENDIAN)
1276 #ifdef BSWAP4
1277 		ctr = BSWAP4(ctx->Yi.d[3]);
1278 #else
1279 		ctr = GETU32(ctx->Yi.c+12);
1280 #endif
1281 	else
1282 		ctr = ctx->Yi.d[3];
1283 
1284 	n = ctx->mres;
1285 	if (n) {
1286 		while (n && len) {
1287 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1288 			--len;
1289 			n = (n+1)%16;
1290 		}
1291 		if (n==0) GCM_MUL(ctx,Xi);
1292 		else {
1293 			ctx->mres = n;
1294 			return 0;
1295 		}
1296 	}
1297 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1298 	while (len>=GHASH_CHUNK) {
1299 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1300 		ctr += GHASH_CHUNK/16;
1301 		if (BYTE_ORDER == LITTLE_ENDIAN)
1302 #ifdef BSWAP4
1303 			ctx->Yi.d[3] = BSWAP4(ctr);
1304 #else
1305 			PUTU32(ctx->Yi.c+12,ctr);
1306 #endif
1307 		else
1308 			ctx->Yi.d[3] = ctr;
1309 		GHASH(ctx,out,GHASH_CHUNK);
1310 		out += GHASH_CHUNK;
1311 		in  += GHASH_CHUNK;
1312 		len -= GHASH_CHUNK;
1313 	}
1314 #endif
1315 	if ((i = (len&(size_t)-16))) {
1316 		size_t j=i/16;
1317 
1318 		(*stream)(in,out,j,key,ctx->Yi.c);
1319 		ctr += (unsigned int)j;
1320 		if (BYTE_ORDER == LITTLE_ENDIAN)
1321 #ifdef BSWAP4
1322 			ctx->Yi.d[3] = BSWAP4(ctr);
1323 #else
1324 			PUTU32(ctx->Yi.c+12,ctr);
1325 #endif
1326 		else
1327 			ctx->Yi.d[3] = ctr;
1328 		in  += i;
1329 		len -= i;
1330 #if defined(GHASH)
1331 		GHASH(ctx,out,i);
1332 		out += i;
1333 #else
1334 		while (j--) {
1335 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1336 			GCM_MUL(ctx,Xi);
1337 			out += 16;
1338 		}
1339 #endif
1340 	}
1341 	if (len) {
1342 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1343 		++ctr;
1344 		if (BYTE_ORDER == LITTLE_ENDIAN)
1345 #ifdef BSWAP4
1346 			ctx->Yi.d[3] = BSWAP4(ctr);
1347 #else
1348 			PUTU32(ctx->Yi.c+12,ctr);
1349 #endif
1350 		else
1351 			ctx->Yi.d[3] = ctr;
1352 		while (len--) {
1353 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1354 			++n;
1355 		}
1356 	}
1357 
1358 	ctx->mres = n;
1359 	return 0;
1360 }
1361 
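/*
 * Variant of CRYPTO_gcm128_decrypt() that obtains the keystream in
 * bulk from the caller-supplied ctr128_f routine; the ciphertext is
 * hashed before it is XORed with the keystream.
 */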
1362 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1363 		const unsigned char *in, unsigned char *out,
1364 		size_t len,ctr128_f stream)
1365 {
1366 	unsigned int n, ctr;
1367 	size_t i;
1368 	u64   mlen = ctx->len.u[1];
1369 	void *key  = ctx->key;
1370 #ifdef GCM_FUNCREF_4BIT
1371 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1372 # ifdef GHASH
1373 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1374 				const u8 *inp,size_t len)	= ctx->ghash;
1375 # endif
1376 #endif
1377 
1378 	mlen += len;
1379 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1380 		return -1;
1381 	ctx->len.u[1] = mlen;
1382 
1383 	if (ctx->ares) {
1384 		/* First call to decrypt finalizes GHASH(AAD) */
1385 		GCM_MUL(ctx,Xi);
1386 		ctx->ares = 0;
1387 	}
1388 
1389 	if (BYTE_ORDER == LITTLE_ENDIAN)
1390 #ifdef BSWAP4
1391 		ctr = BSWAP4(ctx->Yi.d[3]);
1392 #else
1393 		ctr = GETU32(ctx->Yi.c+12);
1394 #endif
1395 	else
1396 		ctr = ctx->Yi.d[3];
1397 
1398 	n = ctx->mres;
1399 	if (n) {
1400 		while (n && len) {
1401 			u8 c = *(in++);
1402 			*(out++) = c^ctx->EKi.c[n];
1403 			ctx->Xi.c[n] ^= c;
1404 			--len;
1405 			n = (n+1)%16;
1406 		}
1407 		if (n==0) GCM_MUL (ctx,Xi);
1408 		else {
1409 			ctx->mres = n;
1410 			return 0;
1411 		}
1412 	}
1413 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1414 	while (len>=GHASH_CHUNK) {
1415 		GHASH(ctx,in,GHASH_CHUNK);
1416 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1417 		ctr += GHASH_CHUNK/16;
1418 		if (BYTE_ORDER == LITTLE_ENDIAN)
1419 #ifdef BSWAP4
1420 			ctx->Yi.d[3] = BSWAP4(ctr);
1421 #else
1422 			PUTU32(ctx->Yi.c+12,ctr);
1423 #endif
1424 		else
1425 			ctx->Yi.d[3] = ctr;
1426 		out += GHASH_CHUNK;
1427 		in  += GHASH_CHUNK;
1428 		len -= GHASH_CHUNK;
1429 	}
1430 #endif
1431 	if ((i = (len&(size_t)-16))) {
1432 		size_t j=i/16;
1433 
1434 #if defined(GHASH)
1435 		GHASH(ctx,in,i);
1436 #else
1437 		while (j--) {
1438 			size_t k;
1439 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1440 			GCM_MUL(ctx,Xi);
1441 			in += 16;
1442 		}
1443 		j   = i/16;
1444 		in -= i;
1445 #endif
1446 		(*stream)(in,out,j,key,ctx->Yi.c);
1447 		ctr += (unsigned int)j;
1448 		if (BYTE_ORDER == LITTLE_ENDIAN)
1449 #ifdef BSWAP4
1450 			ctx->Yi.d[3] = BSWAP4(ctr);
1451 #else
1452 			PUTU32(ctx->Yi.c+12,ctr);
1453 #endif
1454 		else
1455 			ctx->Yi.d[3] = ctr;
1456 		out += i;
1457 		in  += i;
1458 		len -= i;
1459 	}
1460 	if (len) {
1461 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1462 		++ctr;
1463 		if (BYTE_ORDER == LITTLE_ENDIAN)
1464 #ifdef BSWAP4
1465 			ctx->Yi.d[3] = BSWAP4(ctr);
1466 #else
1467 			PUTU32(ctx->Yi.c+12,ctr);
1468 #endif
1469 		else
1470 			ctx->Yi.d[3] = ctr;
1471 		while (len--) {
1472 			u8 c = in[n];
1473 			ctx->Xi.c[n] ^= c;
1474 			out[n] = c^ctx->EKi.c[n];
1475 			++n;
1476 		}
1477 	}
1478 
1479 	ctx->mres = n;
1480 	return 0;
1481 }
1482 
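/*
 * Finalize: the bit lengths of the AAD and of the message are hashed
 * in, EK0 is XORed in to form the authentication tag, and the result
 * is compared with the supplied tag; 0 is returned if they match.
 */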
1483 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1484 			size_t len)
1485 {
1486 	u64 alen = ctx->len.u[0]<<3;
1487 	u64 clen = ctx->len.u[1]<<3;
1488 #ifdef GCM_FUNCREF_4BIT
1489 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1490 #endif
1491 
1492 	if (ctx->mres || ctx->ares)
1493 		GCM_MUL(ctx,Xi);
1494 
1495 	if (BYTE_ORDER == LITTLE_ENDIAN) {
1496 #ifdef BSWAP8
1497 		alen = BSWAP8(alen);
1498 		clen = BSWAP8(clen);
1499 #else
1500 		u8 *p = ctx->len.c;
1501 
1502 		ctx->len.u[0] = alen;
1503 		ctx->len.u[1] = clen;
1504 
1505 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1506 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1507 #endif
1508 	}
1509 
1510 	ctx->Xi.u[0] ^= alen;
1511 	ctx->Xi.u[1] ^= clen;
1512 	GCM_MUL(ctx,Xi);
1513 
1514 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1515 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1516 
1517 	if (tag && len<=sizeof(ctx->Xi))
1518 		return memcmp(ctx->Xi.c,tag,len);
1519 	else
1520 		return -1;
1521 }
1522 
1523 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1524 {
1525 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1526 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1527 }
1528 
1529 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1530 {
1531 	GCM128_CONTEXT *ret;
1532 
1533 	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1534 		CRYPTO_gcm128_init(ret,key,block);
1535 
1536 	return ret;
1537 }
1538 
1539 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1540 {
1541 	if (ctx) {
1542 		explicit_bzero(ctx,sizeof(*ctx));
1543 		free(ctx);
1544 	}
1545 }
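/*
 * Illustrative call sequence for this API, kept out of the build with
 * #if 0.  This is only a sketch: the AES names come from
 * <openssl/aes.h> and are assumptions not otherwise used in this file;
 * the CRYPTO_gcm128_* calls are the functions defined above.
 */
#if 0
#include <openssl/aes.h>

static int
gcm128_seal_example(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aadlen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT ctx;
	AES_KEY aes;

	if (AES_set_encrypt_key(key, 128, &aes) != 0)
		return -1;

	/* H = E_K(0^128) is computed and the GHASH table is built here. */
	CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);

	/* A 12-byte IV is used directly as the initial counter block. */
	CRYPTO_gcm128_setiv(&ctx, iv, 12);

	/* All AAD has to be supplied before the first encrypt call. */
	if (CRYPTO_gcm128_aad(&ctx, aad, aadlen) != 0)
		return -1;

	if (CRYPTO_gcm128_encrypt(&ctx, pt, ct, len) != 0)
		return -1;

	/* Finalize and copy out a 16-byte authentication tag. */
	CRYPTO_gcm128_tag(&ctx, tag, 16);

	explicit_bzero(&ctx, sizeof(ctx));
	return 0;
}
#endif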
1546