1 /* $OpenBSD: gcm128.c,v 1.10 2014/07/09 16:06:13 miod Exp $ */
2 /* ====================================================================
3  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * 3. All advertising materials mentioning features or use of this
18  *    software must display the following acknowledgment:
19  *    "This product includes software developed by the OpenSSL Project
20  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21  *
22  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23  *    endorse or promote products derived from this software without
24  *    prior written permission. For written permission, please contact
25  *    openssl-core@openssl.org.
26  *
27  * 5. Products derived from this software may not be called "OpenSSL"
28  *    nor may "OpenSSL" appear in their names without prior written
29  *    permission of the OpenSSL Project.
30  *
31  * 6. Redistributions of any form whatsoever must retain the following
32  *    acknowledgment:
33  *    "This product includes software developed by the OpenSSL Project
34  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35  *
36  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47  * OF THE POSSIBILITY OF SUCH DAMAGE.
48  * ====================================================================
49  */
50 
51 #define OPENSSL_FIPSAPI
52 
53 #include <openssl/crypto.h>
54 #include "modes_lcl.h"
55 #include <string.h>
56 
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 #  define NDEBUG
60 # endif
61 #endif
62 #include <assert.h>
63 
64 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
65 /* redefine, because alignment is ensured */
66 #undef	GETU32
67 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
68 #undef	PUTU32
69 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
70 #endif
71 
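/*
 * PACK() places a 16-bit constant in the top 16 bits of a size_t, so that
 * the rem_4bit/rem_8bit reduction tables below can be XORed directly into
 * the high end of the accumulator on both 32- and 64-bit targets.
 * REDUCE1BIT() shifts the 128-bit value V one bit towards the least
 * significant end and, if a bit falls off, folds it back in using the GCM
 * reduction constant (0xe1... encodes the polynomial
 * x^128 + x^7 + x^2 + x + 1 in GCM's reversed bit order).
 */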
72 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
73 #define REDUCE1BIT(V)	\
74 	do { \
75 		if (sizeof(size_t)==8) { \
76 			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
77 			V.lo  = (V.hi<<63)|(V.lo>>1); \
78 			V.hi  = (V.hi>>1 )^T; \
79 		} else { \
80 			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
81 			V.lo  = (V.hi<<63)|(V.lo>>1); \
82 			V.hi  = (V.hi>>1 )^((u64)T<<32); \
83 		} \
84 	} while(0)
85 
/*
 * Although the permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8: 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 selects a lookup-table-driven implementation, referred to
 * as "Shoup's" in the GCM specification; in other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, this would let him attempt to deduce
 * the secret parameter H and, if successful, tamper with messages [which
 * is trivial in CTR mode]. In the "Shoup's" case the leak is not as
 * direct, but there is no reason to believe the approach is resistant to
 * cache-timing attacks either. As for the "8-bit" implementation, it
 * consumes 16 (sixteen) times more memory, 4KB per individual key + 1KB
 * shared. On the plus side it should be about twice as fast as the
 * "4-bit" version; for gcc-generated x86[_64] code the "8-bit" version
 * was observed to run ~75% faster, closer to 100% with commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security/performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - a larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in working-set trimming, meaning that a subsequent malloc
 *   would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread, in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate, for performance reasons.
 */
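/*
 * For reference (a rough sketch, counting 16-byte u128 entries): the
 * "4-bit" variant stores 16 multiples of H, i.e. 256 bytes of Htable per
 * key, plus the small shared rem_4bit table below; the "8-bit" variant
 * stores 256 multiples, i.e. 4KB per key, plus the shared rem_8bit table,
 * hence the "16 times more memory" mentioned above.
 */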
120 #if	TABLE_BITS==8
121 
122 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
123 {
124 	int  i, j;
125 	u128 V;
126 
127 	Htable[0].hi = 0;
128 	Htable[0].lo = 0;
129 	V.hi = H[0];
130 	V.lo = H[1];
131 
132 	for (Htable[128]=V, i=64; i>0; i>>=1) {
133 		REDUCE1BIT(V);
134 		Htable[i] = V;
135 	}
136 
137 	for (i=2; i<256; i<<=1) {
138 		u128 *Hi = Htable+i, H0 = *Hi;
139 		for (j=1; j<i; ++j) {
140 			Hi[j].hi = H0.hi^Htable[j].hi;
141 			Hi[j].lo = H0.lo^Htable[j].lo;
142 		}
143 	}
144 }
145 
146 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
147 {
148 	u128 Z = { 0, 0};
149 	const u8 *xi = (const u8 *)Xi+15;
150 	size_t rem, n = *xi;
151 	static const size_t rem_8bit[256] = {
152 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216 
217 	while (1) {
218 		Z.hi ^= Htable[n].hi;
219 		Z.lo ^= Htable[n].lo;
220 
221 		if ((u8 *)Xi==xi)	break;
222 
223 		n = *(--xi);
224 
225 		rem  = (size_t)Z.lo&0xff;
226 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
227 		Z.hi = (Z.hi>>8);
228 		if (sizeof(size_t)==8)
229 			Z.hi ^= rem_8bit[rem];
230 		else
231 			Z.hi ^= (u64)rem_8bit[rem]<<32;
232 	}
233 
234 	if (BYTE_ORDER == LITTLE_ENDIAN) {
235 #ifdef BSWAP8
236 		Xi[0] = BSWAP8(Z.hi);
237 		Xi[1] = BSWAP8(Z.lo);
238 #else
239 		u8 *p = (u8 *)Xi;
240 		u32 v;
241 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
242 		v = (u32)(Z.hi);	PUTU32(p+4,v);
243 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
244 		v = (u32)(Z.lo);	PUTU32(p+12,v);
245 #endif
246 	}
247 	else {
248 		Xi[0] = Z.hi;
249 		Xi[1] = Z.lo;
250 	}
251 }
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253 
254 #elif	TABLE_BITS==4
255 
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258 	u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260 	int  i;
261 #endif
262 
263 	Htable[0].hi = 0;
264 	Htable[0].lo = 0;
265 	V.hi = H[0];
266 	V.lo = H[1];
267 
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269 	for (Htable[8]=V, i=4; i>0; i>>=1) {
270 		REDUCE1BIT(V);
271 		Htable[i] = V;
272 	}
273 
274 	for (i=2; i<16; i<<=1) {
275 		u128 *Hi = Htable+i;
276 		int   j;
277 		for (V=*Hi, j=1; j<i; ++j) {
278 			Hi[j].hi = V.hi^Htable[j].hi;
279 			Hi[j].lo = V.lo^Htable[j].lo;
280 		}
281 	}
282 #else
283 	Htable[8] = V;
284 	REDUCE1BIT(V);
285 	Htable[4] = V;
286 	REDUCE1BIT(V);
287 	Htable[2] = V;
288 	REDUCE1BIT(V);
289 	Htable[1] = V;
290 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291 	V=Htable[4];
292 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295 	V=Htable[8];
296 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305 	/*
306 	 * ARM assembler expects specific dword order in Htable.
307 	 */
308 	{
309 	int j;
310 
311 	if (BYTE_ORDER == LITTLE_ENDIAN)
312 		for (j=0;j<16;++j) {
313 			V = Htable[j];
314 			Htable[j].hi = V.lo;
315 			Htable[j].lo = V.hi;
316 		}
317 	else
318 		for (j=0;j<16;++j) {
319 			V = Htable[j];
320 			Htable[j].hi = V.lo<<32|V.lo>>32;
321 			Htable[j].lo = V.hi<<32|V.hi>>32;
322 		}
323 	}
324 #endif
325 }
326 
327 #ifndef GHASH_ASM
328 static const size_t rem_4bit[16] = {
329 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
330 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
331 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
332 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
333 
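/*
 * Multiply Xi by H in GF(2^128), processing Xi one nibble at a time from
 * the last byte towards the first: each nibble selects a precomputed
 * multiple of H from Htable, and rem_4bit folds the four bits shifted out
 * of the accumulator back in via the reduction polynomial.
 */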
334 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
335 {
336 	u128 Z;
337 	int cnt = 15;
338 	size_t rem, nlo, nhi;
339 
340 	nlo  = ((const u8 *)Xi)[15];
341 	nhi  = nlo>>4;
342 	nlo &= 0xf;
343 
344 	Z.hi = Htable[nlo].hi;
345 	Z.lo = Htable[nlo].lo;
346 
347 	while (1) {
348 		rem  = (size_t)Z.lo&0xf;
349 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
350 		Z.hi = (Z.hi>>4);
351 		if (sizeof(size_t)==8)
352 			Z.hi ^= rem_4bit[rem];
353 		else
354 			Z.hi ^= (u64)rem_4bit[rem]<<32;
355 
356 		Z.hi ^= Htable[nhi].hi;
357 		Z.lo ^= Htable[nhi].lo;
358 
359 		if (--cnt<0)		break;
360 
361 		nlo  = ((const u8 *)Xi)[cnt];
362 		nhi  = nlo>>4;
363 		nlo &= 0xf;
364 
365 		rem  = (size_t)Z.lo&0xf;
366 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
367 		Z.hi = (Z.hi>>4);
368 		if (sizeof(size_t)==8)
369 			Z.hi ^= rem_4bit[rem];
370 		else
371 			Z.hi ^= (u64)rem_4bit[rem]<<32;
372 
373 		Z.hi ^= Htable[nlo].hi;
374 		Z.lo ^= Htable[nlo].lo;
375 	}
376 
377 	if (BYTE_ORDER == LITTLE_ENDIAN) {
378 #ifdef BSWAP8
379 		Xi[0] = BSWAP8(Z.hi);
380 		Xi[1] = BSWAP8(Z.lo);
381 #else
382 		u8 *p = (u8 *)Xi;
383 		u32 v;
384 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
385 		v = (u32)(Z.hi);	PUTU32(p+4,v);
386 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
387 		v = (u32)(Z.lo);	PUTU32(p+12,v);
388 #endif
389 	}
390 	else {
391 		Xi[0] = Z.hi;
392 		Xi[1] = Z.lo;
393 	}
394 }
395 
396 #if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
404 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
405 				const u8 *inp,size_t len)
406 {
407     u128 Z;
408     int cnt;
409     size_t rem, nlo, nhi;
410 
411 #if 1
412     do {
413 	cnt  = 15;
414 	nlo  = ((const u8 *)Xi)[15];
415 	nlo ^= inp[15];
416 	nhi  = nlo>>4;
417 	nlo &= 0xf;
418 
419 	Z.hi = Htable[nlo].hi;
420 	Z.lo = Htable[nlo].lo;
421 
422 	while (1) {
423 		rem  = (size_t)Z.lo&0xf;
424 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
425 		Z.hi = (Z.hi>>4);
426 		if (sizeof(size_t)==8)
427 			Z.hi ^= rem_4bit[rem];
428 		else
429 			Z.hi ^= (u64)rem_4bit[rem]<<32;
430 
431 		Z.hi ^= Htable[nhi].hi;
432 		Z.lo ^= Htable[nhi].lo;
433 
434 		if (--cnt<0)		break;
435 
436 		nlo  = ((const u8 *)Xi)[cnt];
437 		nlo ^= inp[cnt];
438 		nhi  = nlo>>4;
439 		nlo &= 0xf;
440 
441 		rem  = (size_t)Z.lo&0xf;
442 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
443 		Z.hi = (Z.hi>>4);
444 		if (sizeof(size_t)==8)
445 			Z.hi ^= rem_4bit[rem];
446 		else
447 			Z.hi ^= (u64)rem_4bit[rem]<<32;
448 
449 		Z.hi ^= Htable[nlo].hi;
450 		Z.lo ^= Htable[nlo].lo;
451 	}
452 #else
453     /*
454      * Extra 256+16 bytes per-key plus 512 bytes shared tables
455      * [should] give ~50% improvement... One could have PACK()-ed
456      * the rem_8bit even here, but the priority is to minimize
457      * cache footprint...
458      */
459     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
460     u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
461     static const unsigned short rem_8bit[256] = {
462 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
463 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
464 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
465 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
466 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
467 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
468 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
469 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
470 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
471 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
472 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
473 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
474 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
475 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
476 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
477 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
478 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
479 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
480 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
481 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
482 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
483 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
484 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
485 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
486 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
487 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
488 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
489 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
490 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
491 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
492 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
493 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase costs roughly as much time as it saves
     * per loop iteration. In other words single-block performance is
     * about the same as the straightforward "4-bit" implementation, and
     * only from the second block on does it get faster...
     */
500     for (cnt=0; cnt<16; ++cnt) {
501 	Z.hi = Htable[cnt].hi;
502 	Z.lo = Htable[cnt].lo;
503 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
504 	Hshr4[cnt].hi = (Z.hi>>4);
505 	Hshl4[cnt]    = (u8)(Z.lo<<4);
506     }
507 
508     do {
509 	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
510 		nlo  = ((const u8 *)Xi)[cnt];
511 		nlo ^= inp[cnt];
512 		nhi  = nlo>>4;
513 		nlo &= 0xf;
514 
515 		Z.hi ^= Htable[nlo].hi;
516 		Z.lo ^= Htable[nlo].lo;
517 
518 		rem = (size_t)Z.lo&0xff;
519 
520 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
521 		Z.hi = (Z.hi>>8);
522 
523 		Z.hi ^= Hshr4[nhi].hi;
524 		Z.lo ^= Hshr4[nhi].lo;
525 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
526 	}
527 
528 	nlo  = ((const u8 *)Xi)[0];
529 	nlo ^= inp[0];
530 	nhi  = nlo>>4;
531 	nlo &= 0xf;
532 
533 	Z.hi ^= Htable[nlo].hi;
534 	Z.lo ^= Htable[nlo].lo;
535 
536 	rem = (size_t)Z.lo&0xf;
537 
538 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
539 	Z.hi = (Z.hi>>4);
540 
541 	Z.hi ^= Htable[nhi].hi;
542 	Z.lo ^= Htable[nhi].lo;
543 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
544 #endif
545 
546 	if (BYTE_ORDER == LITTLE_ENDIAN) {
547 #ifdef BSWAP8
548 		Xi[0] = BSWAP8(Z.hi);
549 		Xi[1] = BSWAP8(Z.lo);
550 #else
551 		u8 *p = (u8 *)Xi;
552 		u32 v;
553 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
554 		v = (u32)(Z.hi);	PUTU32(p+4,v);
555 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
556 		v = (u32)(Z.lo);	PUTU32(p+12,v);
557 #endif
558 	}
559 	else {
560 		Xi[0] = Z.hi;
561 		Xi[1] = Z.lo;
562 	}
563     } while (inp+=16, len-=16);
564 }
565 #endif
566 #else
567 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
568 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
569 #endif
570 
571 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
572 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
573 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
 * thrashing effect. In other words the idea is to hash data while it's
 * still in the L1 cache after the encryption pass... */
577 #define GHASH_CHUNK       (3*1024)
578 #endif
579 
580 #else	/* TABLE_BITS */
581 
582 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
583 {
584 	u128 V,Z = { 0,0 };
585 	long X;
586 	int  i,j;
587 	const long *xi = (const long *)Xi;
588 
589 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
590 	V.lo = H[1];
591 
592 	for (j=0; j<16/sizeof(long); ++j) {
593 		if (BYTE_ORDER == LITTLE_ENDIAN) {
594 			if (sizeof(long)==8) {
595 #ifdef BSWAP8
596 				X = (long)(BSWAP8(xi[j]));
597 #else
598 				const u8 *p = (const u8 *)(xi+j);
599 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
600 #endif
601 			}
602 			else {
603 				const u8 *p = (const u8 *)(xi+j);
604 				X = (long)GETU32(p);
605 			}
606 		}
607 		else
608 			X = xi[j];
609 
610 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
611 			u64 M = (u64)(X>>(8*sizeof(long)-1));
612 			Z.hi ^= V.hi&M;
613 			Z.lo ^= V.lo&M;
614 
615 			REDUCE1BIT(V);
616 		}
617 	}
618 
619 	if (BYTE_ORDER == LITTLE_ENDIAN) {
620 #ifdef BSWAP8
621 		Xi[0] = BSWAP8(Z.hi);
622 		Xi[1] = BSWAP8(Z.lo);
623 #else
624 		u8 *p = (u8 *)Xi;
625 		u32 v;
626 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
627 		v = (u32)(Z.hi);	PUTU32(p+4,v);
628 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
629 		v = (u32)(Z.lo);	PUTU32(p+12,v);
630 #endif
631 	}
632 	else {
633 		Xi[0] = Z.hi;
634 		Xi[1] = Z.lo;
635 	}
636 }
637 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
638 
639 #endif
640 
641 #if	TABLE_BITS==4 && defined(GHASH_ASM)
642 # if	!defined(I386_ONLY) && \
643 	(defined(__i386)	|| defined(__i386__)	|| \
644 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
645 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
646 #  define GHASH_ASM_X86_OR_64
647 #  define GCM_FUNCREF_4BIT
648 extern unsigned int OPENSSL_ia32cap_P[2];
649 
650 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
651 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
652 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
653 
654 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
655 #   define GHASH_ASM_X86
656 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
657 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
658 
659 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
660 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 #  endif
662 # elif defined(__arm__) || defined(__arm)
663 #  include "arm_arch.h"
664 #  if __ARM_ARCH__>=7
665 #   define GHASH_ASM_ARM
666 #   define GCM_FUNCREF_4BIT
667 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
668 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
669 #  endif
670 # endif
671 #endif
672 
673 #ifdef GCM_FUNCREF_4BIT
674 # undef  GCM_MUL
675 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
676 # ifdef GHASH
677 #  undef  GHASH
678 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
679 # endif
680 #endif
681 
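/*
 * At this point GCM_MUL(ctx,Xi) computes Xi = Xi * H in GF(2^128), and
 * GHASH(ctx,in,len), when defined, absorbs len bytes (a multiple of 16)
 * into Xi, i.e. Xi = (Xi ^ block) * H for each 16-byte block.
 */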
682 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
683 {
684 	memset(ctx,0,sizeof(*ctx));
685 	ctx->block = block;
686 	ctx->key   = key;
687 
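	/* H = E_K(0^128): the hash key is the cipher applied to the all-zero block */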
688 	(*block)(ctx->H.c,ctx->H.c,key);
689 
690 	if (BYTE_ORDER == LITTLE_ENDIAN) {
691 		/* H is stored in host byte order */
692 #ifdef BSWAP8
693 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
694 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
695 #else
696 		u8 *p = ctx->H.c;
697 		u64 hi,lo;
698 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
699 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
700 		ctx->H.u[0] = hi;
701 		ctx->H.u[1] = lo;
702 #endif
703 	}
704 
705 #if	TABLE_BITS==8
706 	gcm_init_8bit(ctx->Htable,ctx->H.u);
707 #elif	TABLE_BITS==4
708 # if	defined(GHASH_ASM_X86_OR_64)
709 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
710 	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
711 	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
712 		gcm_init_clmul(ctx->Htable,ctx->H.u);
713 		ctx->gmult = gcm_gmult_clmul;
714 		ctx->ghash = gcm_ghash_clmul;
715 		return;
716 	}
717 #  endif
718 	gcm_init_4bit(ctx->Htable,ctx->H.u);
719 #  if	defined(GHASH_ASM_X86)			/* x86 only */
720 #   if	defined(OPENSSL_IA32_SSE2)
721 	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
722 #   else
723 	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
724 #   endif
725 		ctx->gmult = gcm_gmult_4bit_mmx;
726 		ctx->ghash = gcm_ghash_4bit_mmx;
727 	} else {
728 		ctx->gmult = gcm_gmult_4bit_x86;
729 		ctx->ghash = gcm_ghash_4bit_x86;
730 	}
731 #  else
732 	ctx->gmult = gcm_gmult_4bit;
733 	ctx->ghash = gcm_ghash_4bit;
734 #  endif
735 # elif	defined(GHASH_ASM_ARM)
736 	if (OPENSSL_armcap_P & ARMV7_NEON) {
737 		ctx->gmult = gcm_gmult_neon;
738 		ctx->ghash = gcm_ghash_neon;
739 	} else {
740 		gcm_init_4bit(ctx->Htable,ctx->H.u);
741 		ctx->gmult = gcm_gmult_4bit;
742 		ctx->ghash = gcm_ghash_4bit;
743 	}
744 # else
745 	gcm_init_4bit(ctx->Htable,ctx->H.u);
746 # endif
747 #endif
748 }
749 
750 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
751 {
752 	unsigned int ctr;
753 #ifdef GCM_FUNCREF_4BIT
754 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
755 #endif
756 
757 	ctx->Yi.u[0]  = 0;
758 	ctx->Yi.u[1]  = 0;
759 	ctx->Xi.u[0]  = 0;
760 	ctx->Xi.u[1]  = 0;
761 	ctx->len.u[0] = 0;	/* AAD length */
762 	ctx->len.u[1] = 0;	/* message length */
763 	ctx->ares = 0;
764 	ctx->mres = 0;
765 
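	/*
	 * A 96-bit IV is used directly as Y0 = IV || 0^31 || 1; any other
	 * length is first run through GHASH together with its bit length.
	 */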
766 	if (len==12) {
767 		memcpy(ctx->Yi.c,iv,12);
768 		ctx->Yi.c[15]=1;
769 		ctr=1;
770 	}
771 	else {
772 		size_t i;
773 		u64 len0 = len;
774 
775 		while (len>=16) {
776 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
777 			GCM_MUL(ctx,Yi);
778 			iv += 16;
779 			len -= 16;
780 		}
781 		if (len) {
782 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
783 			GCM_MUL(ctx,Yi);
784 		}
785 		len0 <<= 3;
786 		if (BYTE_ORDER == LITTLE_ENDIAN) {
787 #ifdef BSWAP8
788 			ctx->Yi.u[1]  ^= BSWAP8(len0);
789 #else
790 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
791 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
792 			ctx->Yi.c[10] ^= (u8)(len0>>40);
793 			ctx->Yi.c[11] ^= (u8)(len0>>32);
794 			ctx->Yi.c[12] ^= (u8)(len0>>24);
795 			ctx->Yi.c[13] ^= (u8)(len0>>16);
796 			ctx->Yi.c[14] ^= (u8)(len0>>8);
797 			ctx->Yi.c[15] ^= (u8)(len0);
798 #endif
799 		}
800 		else
801 			ctx->Yi.u[1]  ^= len0;
802 
803 		GCM_MUL(ctx,Yi);
804 
805 		if (BYTE_ORDER == LITTLE_ENDIAN)
806 #ifdef BSWAP4
807 			ctr = BSWAP4(ctx->Yi.d[3]);
808 #else
809 			ctr = GETU32(ctx->Yi.c+12);
810 #endif
811 		else
812 			ctr = ctx->Yi.d[3];
813 	}
814 
815 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
816 	++ctr;
817 	if (BYTE_ORDER == LITTLE_ENDIAN)
818 #ifdef BSWAP4
819 		ctx->Yi.d[3] = BSWAP4(ctr);
820 #else
821 		PUTU32(ctx->Yi.c+12,ctr);
822 #endif
823 	else
824 		ctx->Yi.d[3] = ctr;
825 }
826 
827 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
828 {
829 	size_t i;
830 	unsigned int n;
831 	u64 alen = ctx->len.u[0];
832 #ifdef GCM_FUNCREF_4BIT
833 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
834 # ifdef GHASH
835 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
836 				const u8 *inp,size_t len)	= ctx->ghash;
837 # endif
838 #endif
839 
840 	if (ctx->len.u[1]) return -2;
841 
842 	alen += len;
843 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
844 		return -1;
845 	ctx->len.u[0] = alen;
846 
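	/* ares counts AAD bytes already XORed into Xi but not yet multiplied by H */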
847 	n = ctx->ares;
848 	if (n) {
849 		while (n && len) {
850 			ctx->Xi.c[n] ^= *(aad++);
851 			--len;
852 			n = (n+1)%16;
853 		}
854 		if (n==0) GCM_MUL(ctx,Xi);
855 		else {
856 			ctx->ares = n;
857 			return 0;
858 		}
859 	}
860 
861 #ifdef GHASH
862 	if ((i = (len&(size_t)-16))) {
863 		GHASH(ctx,aad,i);
864 		aad += i;
865 		len -= i;
866 	}
867 #else
868 	while (len>=16) {
869 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
870 		GCM_MUL(ctx,Xi);
871 		aad += 16;
872 		len -= 16;
873 	}
874 #endif
875 	if (len) {
876 		n = (unsigned int)len;
877 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
878 	}
879 
880 	ctx->ares = n;
881 	return 0;
882 }
883 
884 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
885 		const unsigned char *in, unsigned char *out,
886 		size_t len)
887 {
888 	unsigned int n, ctr;
889 	size_t i;
890 	u64        mlen  = ctx->len.u[1];
891 	block128_f block = ctx->block;
892 	void      *key   = ctx->key;
893 #ifdef GCM_FUNCREF_4BIT
894 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
895 # ifdef GHASH
896 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
897 				const u8 *inp,size_t len)	= ctx->ghash;
898 # endif
899 #endif
900 
901 #if 0
902 	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
903 #endif
904 	mlen += len;
905 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906 		return -1;
907 	ctx->len.u[1] = mlen;
908 
909 	if (ctx->ares) {
910 		/* First call to encrypt finalizes GHASH(AAD) */
911 		GCM_MUL(ctx,Xi);
912 		ctx->ares = 0;
913 	}
914 
915 	if (BYTE_ORDER == LITTLE_ENDIAN)
916 #ifdef BSWAP4
917 		ctr = BSWAP4(ctx->Yi.d[3]);
918 #else
919 		ctr = GETU32(ctx->Yi.c+12);
920 #endif
921 	else
922 		ctr = ctx->Yi.d[3];
923 
924 	n = ctx->mres;
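	/* mres is the byte offset within the current 16-byte block: EKi holds
	 * that block's key stream and Xi has absorbed the first mres ciphertext
	 * bytes */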
925 #if !defined(OPENSSL_SMALL_FOOTPRINT)
926 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
927 		if (n) {
928 			while (n && len) {
929 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
930 				--len;
931 				n = (n+1)%16;
932 			}
933 			if (n==0) GCM_MUL(ctx,Xi);
934 			else {
935 				ctx->mres = n;
936 				return 0;
937 			}
938 		}
939 #ifdef __STRICT_ALIGNMENT
940 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
941 			break;
942 #endif
943 #if defined(GHASH) && defined(GHASH_CHUNK)
944 		while (len>=GHASH_CHUNK) {
945 		    size_t j=GHASH_CHUNK;
946 
947 		    while (j) {
948 		    	size_t *out_t=(size_t *)out;
949 		    	const size_t *in_t=(const size_t *)in;
950 
951 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
952 			++ctr;
953 			if (BYTE_ORDER == LITTLE_ENDIAN)
954 #ifdef BSWAP4
955 				ctx->Yi.d[3] = BSWAP4(ctr);
956 #else
957 				PUTU32(ctx->Yi.c+12,ctr);
958 #endif
959 			else
960 				ctx->Yi.d[3] = ctr;
961 			for (i=0; i<16/sizeof(size_t); ++i)
962 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
963 			out += 16;
964 			in  += 16;
965 			j   -= 16;
966 		    }
967 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
968 		    len -= GHASH_CHUNK;
969 		}
970 		if ((i = (len&(size_t)-16))) {
971 		    size_t j=i;
972 
973 		    while (len>=16) {
974 		    	size_t *out_t=(size_t *)out;
975 		    	const size_t *in_t=(const size_t *)in;
976 
977 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
978 			++ctr;
979 			if (BYTE_ORDER == LITTLE_ENDIAN)
980 #ifdef BSWAP4
981 				ctx->Yi.d[3] = BSWAP4(ctr);
982 #else
983 				PUTU32(ctx->Yi.c+12,ctr);
984 #endif
985 			else
986 				ctx->Yi.d[3] = ctr;
987 			for (i=0; i<16/sizeof(size_t); ++i)
988 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
989 			out += 16;
990 			in  += 16;
991 			len -= 16;
992 		    }
993 		    GHASH(ctx,out-j,j);
994 		}
995 #else
996 		while (len>=16) {
997 		    	size_t *out_t=(size_t *)out;
998 		    	const size_t *in_t=(const size_t *)in;
999 
1000 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1001 			++ctr;
1002 			if (BYTE_ORDER == LITTLE_ENDIAN)
1003 #ifdef BSWAP4
1004 				ctx->Yi.d[3] = BSWAP4(ctr);
1005 #else
1006 				PUTU32(ctx->Yi.c+12,ctr);
1007 #endif
1008 			else
1009 				ctx->Yi.d[3] = ctr;
1010 			for (i=0; i<16/sizeof(size_t); ++i)
1011 				ctx->Xi.t[i] ^=
1012 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1013 			GCM_MUL(ctx,Xi);
1014 			out += 16;
1015 			in  += 16;
1016 			len -= 16;
1017 		}
1018 #endif
1019 		if (len) {
1020 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1021 			++ctr;
1022 			if (BYTE_ORDER == LITTLE_ENDIAN)
1023 #ifdef BSWAP4
1024 				ctx->Yi.d[3] = BSWAP4(ctr);
1025 #else
1026 				PUTU32(ctx->Yi.c+12,ctr);
1027 #endif
1028 			else
1029 				ctx->Yi.d[3] = ctr;
1030 			while (len--) {
1031 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1032 				++n;
1033 			}
1034 		}
1035 
1036 		ctx->mres = n;
1037 		return 0;
1038 	} while(0);
1039 #endif
1040 	for (i=0;i<len;++i) {
1041 		if (n==0) {
1042 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1043 			++ctr;
1044 			if (BYTE_ORDER == LITTLE_ENDIAN)
1045 #ifdef BSWAP4
1046 				ctx->Yi.d[3] = BSWAP4(ctr);
1047 #else
1048 				PUTU32(ctx->Yi.c+12,ctr);
1049 #endif
1050 			else
1051 				ctx->Yi.d[3] = ctr;
1052 		}
1053 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1054 		n = (n+1)%16;
1055 		if (n==0)
1056 			GCM_MUL(ctx,Xi);
1057 	}
1058 
1059 	ctx->mres = n;
1060 	return 0;
1061 }
1062 
1063 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1064 		const unsigned char *in, unsigned char *out,
1065 		size_t len)
1066 {
1067 	unsigned int n, ctr;
1068 	size_t i;
1069 	u64        mlen  = ctx->len.u[1];
1070 	block128_f block = ctx->block;
1071 	void      *key   = ctx->key;
1072 #ifdef GCM_FUNCREF_4BIT
1073 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1074 # ifdef GHASH
1075 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1076 				const u8 *inp,size_t len)	= ctx->ghash;
1077 # endif
1078 #endif
1079 
1080 	mlen += len;
1081 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1082 		return -1;
1083 	ctx->len.u[1] = mlen;
1084 
1085 	if (ctx->ares) {
1086 		/* First call to decrypt finalizes GHASH(AAD) */
1087 		GCM_MUL(ctx,Xi);
1088 		ctx->ares = 0;
1089 	}
1090 
1091 	if (BYTE_ORDER == LITTLE_ENDIAN)
1092 #ifdef BSWAP4
1093 		ctr = BSWAP4(ctx->Yi.d[3]);
1094 #else
1095 		ctr = GETU32(ctx->Yi.c+12);
1096 #endif
1097 	else
1098 		ctr = ctx->Yi.d[3];
1099 
1100 	n = ctx->mres;
1101 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1102 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1103 		if (n) {
1104 			while (n && len) {
1105 				u8 c = *(in++);
1106 				*(out++) = c^ctx->EKi.c[n];
1107 				ctx->Xi.c[n] ^= c;
1108 				--len;
1109 				n = (n+1)%16;
1110 			}
1111 			if (n==0) GCM_MUL (ctx,Xi);
1112 			else {
1113 				ctx->mres = n;
1114 				return 0;
1115 			}
1116 		}
1117 #ifdef __STRICT_ALIGNMENT
1118 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1119 			break;
1120 #endif
1121 #if defined(GHASH) && defined(GHASH_CHUNK)
1122 		while (len>=GHASH_CHUNK) {
1123 		    size_t j=GHASH_CHUNK;
1124 
1125 		    GHASH(ctx,in,GHASH_CHUNK);
1126 		    while (j) {
1127 		    	size_t *out_t=(size_t *)out;
1128 		    	const size_t *in_t=(const size_t *)in;
1129 
1130 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1131 			++ctr;
1132 			if (BYTE_ORDER == LITTLE_ENDIAN)
1133 #ifdef BSWAP4
1134 				ctx->Yi.d[3] = BSWAP4(ctr);
1135 #else
1136 				PUTU32(ctx->Yi.c+12,ctr);
1137 #endif
1138 			else
1139 				ctx->Yi.d[3] = ctr;
1140 			for (i=0; i<16/sizeof(size_t); ++i)
1141 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1142 			out += 16;
1143 			in  += 16;
1144 			j   -= 16;
1145 		    }
1146 		    len -= GHASH_CHUNK;
1147 		}
1148 		if ((i = (len&(size_t)-16))) {
1149 		    GHASH(ctx,in,i);
1150 		    while (len>=16) {
1151 		    	size_t *out_t=(size_t *)out;
1152 		    	const size_t *in_t=(const size_t *)in;
1153 
1154 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1155 			++ctr;
1156 			if (BYTE_ORDER == LITTLE_ENDIAN)
1157 #ifdef BSWAP4
1158 				ctx->Yi.d[3] = BSWAP4(ctr);
1159 #else
1160 				PUTU32(ctx->Yi.c+12,ctr);
1161 #endif
1162 			else
1163 				ctx->Yi.d[3] = ctr;
1164 			for (i=0; i<16/sizeof(size_t); ++i)
1165 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1166 			out += 16;
1167 			in  += 16;
1168 			len -= 16;
1169 		    }
1170 		}
1171 #else
1172 		while (len>=16) {
1173 		    	size_t *out_t=(size_t *)out;
1174 		    	const size_t *in_t=(const size_t *)in;
1175 
1176 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1177 			++ctr;
1178 			if (BYTE_ORDER == LITTLE_ENDIAN)
1179 #ifdef BSWAP4
1180 				ctx->Yi.d[3] = BSWAP4(ctr);
1181 #else
1182 				PUTU32(ctx->Yi.c+12,ctr);
1183 #endif
1184 			else
1185 				ctx->Yi.d[3] = ctr;
1186 			for (i=0; i<16/sizeof(size_t); ++i) {
1187 				size_t c = in[i];
1188 				out[i] = c^ctx->EKi.t[i];
1189 				ctx->Xi.t[i] ^= c;
1190 			}
1191 			GCM_MUL(ctx,Xi);
1192 			out += 16;
1193 			in  += 16;
1194 			len -= 16;
1195 		}
1196 #endif
1197 		if (len) {
1198 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1199 			++ctr;
1200 			if (BYTE_ORDER == LITTLE_ENDIAN)
1201 #ifdef BSWAP4
1202 				ctx->Yi.d[3] = BSWAP4(ctr);
1203 #else
1204 				PUTU32(ctx->Yi.c+12,ctr);
1205 #endif
1206 			else
1207 				ctx->Yi.d[3] = ctr;
1208 			while (len--) {
1209 				u8 c = in[n];
1210 				ctx->Xi.c[n] ^= c;
1211 				out[n] = c^ctx->EKi.c[n];
1212 				++n;
1213 			}
1214 		}
1215 
1216 		ctx->mres = n;
1217 		return 0;
1218 	} while(0);
1219 #endif
1220 	for (i=0;i<len;++i) {
1221 		u8 c;
1222 		if (n==0) {
1223 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1224 			++ctr;
1225 			if (BYTE_ORDER == LITTLE_ENDIAN)
1226 #ifdef BSWAP4
1227 				ctx->Yi.d[3] = BSWAP4(ctr);
1228 #else
1229 				PUTU32(ctx->Yi.c+12,ctr);
1230 #endif
1231 			else
1232 				ctx->Yi.d[3] = ctr;
1233 		}
1234 		c = in[i];
1235 		out[i] = c^ctx->EKi.c[n];
1236 		ctx->Xi.c[n] ^= c;
1237 		n = (n+1)%16;
1238 		if (n==0)
1239 			GCM_MUL(ctx,Xi);
1240 	}
1241 
1242 	ctx->mres = n;
1243 	return 0;
1244 }
1245 
1246 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1247 		const unsigned char *in, unsigned char *out,
1248 		size_t len, ctr128_f stream)
1249 {
1250 	unsigned int n, ctr;
1251 	size_t i;
1252 	u64   mlen = ctx->len.u[1];
1253 	void *key  = ctx->key;
1254 #ifdef GCM_FUNCREF_4BIT
1255 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1256 # ifdef GHASH
1257 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1258 				const u8 *inp,size_t len)	= ctx->ghash;
1259 # endif
1260 #endif
1261 
1262 	mlen += len;
1263 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1264 		return -1;
1265 	ctx->len.u[1] = mlen;
1266 
1267 	if (ctx->ares) {
1268 		/* First call to encrypt finalizes GHASH(AAD) */
1269 		GCM_MUL(ctx,Xi);
1270 		ctx->ares = 0;
1271 	}
1272 
1273 	if (BYTE_ORDER == LITTLE_ENDIAN)
1274 #ifdef BSWAP4
1275 		ctr = BSWAP4(ctx->Yi.d[3]);
1276 #else
1277 		ctr = GETU32(ctx->Yi.c+12);
1278 #endif
1279 	else
1280 		ctr = ctx->Yi.d[3];
1281 
1282 	n = ctx->mres;
1283 	if (n) {
1284 		while (n && len) {
1285 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1286 			--len;
1287 			n = (n+1)%16;
1288 		}
1289 		if (n==0) GCM_MUL(ctx,Xi);
1290 		else {
1291 			ctx->mres = n;
1292 			return 0;
1293 		}
1294 	}
1295 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
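	/* encrypt GHASH_CHUNK bytes with the stream routine, then hash the
	 * freshly produced ciphertext while it is still in cache */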
1296 	while (len>=GHASH_CHUNK) {
1297 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1298 		ctr += GHASH_CHUNK/16;
1299 		if (BYTE_ORDER == LITTLE_ENDIAN)
1300 #ifdef BSWAP4
1301 			ctx->Yi.d[3] = BSWAP4(ctr);
1302 #else
1303 			PUTU32(ctx->Yi.c+12,ctr);
1304 #endif
1305 		else
1306 			ctx->Yi.d[3] = ctr;
1307 		GHASH(ctx,out,GHASH_CHUNK);
1308 		out += GHASH_CHUNK;
1309 		in  += GHASH_CHUNK;
1310 		len -= GHASH_CHUNK;
1311 	}
1312 #endif
1313 	if ((i = (len&(size_t)-16))) {
1314 		size_t j=i/16;
1315 
1316 		(*stream)(in,out,j,key,ctx->Yi.c);
1317 		ctr += (unsigned int)j;
1318 		if (BYTE_ORDER == LITTLE_ENDIAN)
1319 #ifdef BSWAP4
1320 			ctx->Yi.d[3] = BSWAP4(ctr);
1321 #else
1322 			PUTU32(ctx->Yi.c+12,ctr);
1323 #endif
1324 		else
1325 			ctx->Yi.d[3] = ctr;
1326 		in  += i;
1327 		len -= i;
1328 #if defined(GHASH)
1329 		GHASH(ctx,out,i);
1330 		out += i;
1331 #else
1332 		while (j--) {
1333 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1334 			GCM_MUL(ctx,Xi);
1335 			out += 16;
1336 		}
1337 #endif
1338 	}
1339 	if (len) {
1340 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1341 		++ctr;
1342 		if (BYTE_ORDER == LITTLE_ENDIAN)
1343 #ifdef BSWAP4
1344 			ctx->Yi.d[3] = BSWAP4(ctr);
1345 #else
1346 			PUTU32(ctx->Yi.c+12,ctr);
1347 #endif
1348 		else
1349 			ctx->Yi.d[3] = ctr;
1350 		while (len--) {
1351 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1352 			++n;
1353 		}
1354 	}
1355 
1356 	ctx->mres = n;
1357 	return 0;
1358 }
1359 
1360 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1361 		const unsigned char *in, unsigned char *out,
1362 		size_t len,ctr128_f stream)
1363 {
1364 	unsigned int n, ctr;
1365 	size_t i;
1366 	u64   mlen = ctx->len.u[1];
1367 	void *key  = ctx->key;
1368 #ifdef GCM_FUNCREF_4BIT
1369 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1370 # ifdef GHASH
1371 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1372 				const u8 *inp,size_t len)	= ctx->ghash;
1373 # endif
1374 #endif
1375 
1376 	mlen += len;
1377 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1378 		return -1;
1379 	ctx->len.u[1] = mlen;
1380 
1381 	if (ctx->ares) {
1382 		/* First call to decrypt finalizes GHASH(AAD) */
1383 		GCM_MUL(ctx,Xi);
1384 		ctx->ares = 0;
1385 	}
1386 
1387 	if (BYTE_ORDER == LITTLE_ENDIAN)
1388 #ifdef BSWAP4
1389 		ctr = BSWAP4(ctx->Yi.d[3]);
1390 #else
1391 		ctr = GETU32(ctx->Yi.c+12);
1392 #endif
1393 	else
1394 		ctr = ctx->Yi.d[3];
1395 
1396 	n = ctx->mres;
1397 	if (n) {
1398 		while (n && len) {
1399 			u8 c = *(in++);
1400 			*(out++) = c^ctx->EKi.c[n];
1401 			ctx->Xi.c[n] ^= c;
1402 			--len;
1403 			n = (n+1)%16;
1404 		}
1405 		if (n==0) GCM_MUL (ctx,Xi);
1406 		else {
1407 			ctx->mres = n;
1408 			return 0;
1409 		}
1410 	}
1411 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
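	/* the tag is computed over the ciphertext, so hash each chunk before
	 * decrypting it */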
1412 	while (len>=GHASH_CHUNK) {
1413 		GHASH(ctx,in,GHASH_CHUNK);
1414 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1415 		ctr += GHASH_CHUNK/16;
1416 		if (BYTE_ORDER == LITTLE_ENDIAN)
1417 #ifdef BSWAP4
1418 			ctx->Yi.d[3] = BSWAP4(ctr);
1419 #else
1420 			PUTU32(ctx->Yi.c+12,ctr);
1421 #endif
1422 		else
1423 			ctx->Yi.d[3] = ctr;
1424 		out += GHASH_CHUNK;
1425 		in  += GHASH_CHUNK;
1426 		len -= GHASH_CHUNK;
1427 	}
1428 #endif
1429 	if ((i = (len&(size_t)-16))) {
1430 		size_t j=i/16;
1431 
1432 #if defined(GHASH)
1433 		GHASH(ctx,in,i);
1434 #else
1435 		while (j--) {
1436 			size_t k;
1437 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1438 			GCM_MUL(ctx,Xi);
1439 			in += 16;
1440 		}
1441 		j   = i/16;
1442 		in -= i;
1443 #endif
1444 		(*stream)(in,out,j,key,ctx->Yi.c);
1445 		ctr += (unsigned int)j;
1446 		if (BYTE_ORDER == LITTLE_ENDIAN)
1447 #ifdef BSWAP4
1448 			ctx->Yi.d[3] = BSWAP4(ctr);
1449 #else
1450 			PUTU32(ctx->Yi.c+12,ctr);
1451 #endif
1452 		else
1453 			ctx->Yi.d[3] = ctr;
1454 		out += i;
1455 		in  += i;
1456 		len -= i;
1457 	}
1458 	if (len) {
1459 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1460 		++ctr;
1461 		if (BYTE_ORDER == LITTLE_ENDIAN)
1462 #ifdef BSWAP4
1463 			ctx->Yi.d[3] = BSWAP4(ctr);
1464 #else
1465 			PUTU32(ctx->Yi.c+12,ctr);
1466 #endif
1467 		else
1468 			ctx->Yi.d[3] = ctr;
1469 		while (len--) {
1470 			u8 c = in[n];
1471 			ctx->Xi.c[n] ^= c;
1472 			out[n] = c^ctx->EKi.c[n];
1473 			++n;
1474 		}
1475 	}
1476 
1477 	ctx->mres = n;
1478 	return 0;
1479 }
1480 
1481 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1482 			size_t len)
1483 {
1484 	u64 alen = ctx->len.u[0]<<3;
1485 	u64 clen = ctx->len.u[1]<<3;
1486 #ifdef GCM_FUNCREF_4BIT
1487 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1488 #endif
1489 
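	/* flush any pending partial block, then fold in len(A) || len(C) in bits */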
1490 	if (ctx->mres || ctx->ares)
1491 		GCM_MUL(ctx,Xi);
1492 
1493 	if (BYTE_ORDER == LITTLE_ENDIAN) {
1494 #ifdef BSWAP8
1495 		alen = BSWAP8(alen);
1496 		clen = BSWAP8(clen);
1497 #else
1498 		u8 *p = ctx->len.c;
1499 
1500 		ctx->len.u[0] = alen;
1501 		ctx->len.u[1] = clen;
1502 
1503 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1504 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1505 #endif
1506 	}
1507 
1508 	ctx->Xi.u[0] ^= alen;
1509 	ctx->Xi.u[1] ^= clen;
1510 	GCM_MUL(ctx,Xi);
1511 
1512 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1513 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1514 
1515 	if (tag && len<=sizeof(ctx->Xi))
1516 		return memcmp(ctx->Xi.c,tag,len);
1517 	else
1518 		return -1;
1519 }
1520 
1521 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1522 {
1523 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1524 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1525 }
1526 
1527 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1528 {
1529 	GCM128_CONTEXT *ret;
1530 
1531 	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1532 		CRYPTO_gcm128_init(ret,key,block);
1533 
1534 	return ret;
1535 }
1536 
1537 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1538 {
1539 	if (ctx) {
1540 		OPENSSL_cleanse(ctx,sizeof(*ctx));
1541 		free(ctx);
1542 	}
1543 }
1544
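/*
 * Typical usage, as an illustrative sketch only (it assumes AES from
 * <openssl/aes.h> as the underlying block cipher, much as the EVP AES-GCM
 * code drives this module; error checking omitted):
 *
 *	AES_KEY aes;
 *	GCM128_CONTEXT gcm;
 *	unsigned char tag[16];
 *
 *	AES_set_encrypt_key(key, 128, &aes);
 *	CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *	CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *	CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *	CRYPTO_gcm128_encrypt(&gcm, plaintext, ciphertext, pt_len);
 *	CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
 *
 * Decryption is symmetric: call CRYPTO_gcm128_decrypt() instead and check
 * the tag with CRYPTO_gcm128_finish(), which returns 0 on a match.
 */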