/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^((u64)T<<32); \
	} \
} while(0)
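
/*
 * REDUCE1BIT(V) computes V := V*x in GF(2^128) under GCM's reflected
 * bit order: the 128-bit value is shifted right by one, and if the bit
 * shifted out was set, the constant 0xE1 || 0^120 is XORed back in.
 * That constant encodes the reduction polynomial
 * x^128 + x^7 + x^2 + x + 1 with the coefficients bit-reversed.
 */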

/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 are the lookup-table-driven implementations
 * referred to as "Shoup's" in the GCM specification. In other words
 * OpenSSL does not cover the whole spectrum of possible table-driven
 * implementations. Why? In the non-"Shoup's" case the memory access
 * pattern is segmented in such a manner that it's trivial to see that
 * cache timing information can reveal a fair portion of the
 * intermediate hash value. Given that the ciphertext is always
 * available to an attacker, the attacker can attempt to deduce the
 * secret parameter H and, if successful, tamper with messages [which
 * is utterly trivial in CTR mode]. In the "Shoup's" case it's not as
 * trivial, but there is no reason to believe that it's resistant to
 * cache-timing attacks either. And the thing about the "8-bit"
 * implementation is that it consumes 16 (sixteen) times more memory,
 * 4KB per individual key + 1KB shared. On the pro side, it should be
 * twice as fast as the "4-bit" version. For gcc-generated x86[_64]
 * code the "8-bit" version was observed to run ~75% faster, closer to
 * 100% for commercial compilers... Yet the "4-bit" procedure is
 * preferred, because it's believed to provide a better
 * security-performance balance and adequate all-round performance.
 * "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc() would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
#if	TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int  i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}
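
/*
 * The loops above populate the table so that Htable[n] = n*H for every
 * byte value n, with the index bits taken in reflected order:
 * Htable[128] = H, Htable[64] = H*x, ..., Htable[1] = H*x^7, and
 * Htable[a^b] = Htable[a] ^ Htable[b] fills in the rest.
 */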

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}
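
/*
 * As in the 8-bit case, Htable[n] = n*H with the nibble index in
 * reflected bit order: Htable[8] = H, Htable[4] = H*x,
 * Htable[2] = H*x^2, Htable[1] = H*x^3, and every other entry is the
 * XOR of the entries for its individual bits.
 */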

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
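
/*
 * rem_4bit[r] is the precomputed reduction of the four bits r that
 * fall off the low end when the accumulator is shifted right by 4
 * (i.e. multiplied by x^4); PACK() positions the 16-bit value so it
 * can be XORed straight into the top of Z.hi below.
 */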

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed version of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
	cnt  = 15;
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows the procedure down by
     * approximately as much time as it saves across the loop
     * iterations. In other words single-block performance is
     * approximately the same as for the straightforward "4-bit"
     * implementation, and from there it only gets faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" whose mission is to mitigate the
 * cache-thrashing effect. In other words the idea is to hash data
 * while it's still in the L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif

#else	/* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif
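
/*
 * Whichever TABLE_BITS branch is compiled, GCM_MUL(ctx,Xi) computes
 * Xi := Xi*H in GF(2^128), and GHASH(ctx,in,len), where defined, fuses
 * the GHASH recurrence Xi := (Xi ^ in_i)*H over len/16 input blocks to
 * amortize per-call overhead.
 */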

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}

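/*
 * CRYPTO_gcm128_setiv computes the pre-counter block J0: for the
 * recommended 96-bit IV it is simply IV || 0^31 || 1, while any other
 * IV length is zero-padded to a 16-byte boundary and run through GHASH
 * together with its 64-bit bit length.
 */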
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (is_endian.little)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
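
/*
 * The length checks in CRYPTO_gcm128_aad above and in the
 * encrypt/decrypt routines below correspond to the NIST SP 800-38D
 * limits: at most 2^64 - 1 AAD bits and at most 2^39 - 256 plaintext
 * bits (2^36 - 32 bytes) per invocation.
 */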

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		while (len>=16) {
		    	size_t *out_t=(size_t *)out;
		    	const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			/* word-at-a-time, using the out_t/in_t aliases
			 * declared above (the byte-indexed in[i]/out[i]
			 * here was a bug) */
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in_t[i];
				out_t[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len)
{
	const union { long one; char little; } is_endian = {1};
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (is_endian.little) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	if (ctx) {
		OPENSSL_cleanse(ctx,sizeof(*ctx));
		OPENSSL_free(ctx);
	}
}
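
/*
 * A minimal usage sketch of the one-shot API above, compiled out by
 * default; it assumes the AES block cipher from <openssl/aes.h> as the
 * underlying block128_f, the same pairing the SELFTEST code below
 * uses. The helper name and its fixed 128-bit key / 96-bit IV /
 * 16-byte tag parameters are illustrative, not part of this file's
 * API.
 */
#if 0
#include <openssl/aes.h>

static int gcm128_seal_example(const unsigned char key[16],
			const unsigned char iv[12],
			const unsigned char *aad, size_t aad_len,
			const unsigned char *pt, unsigned char *ct,
			size_t len, unsigned char tag[16])
{
	AES_KEY aes;
	GCM128_CONTEXT ctx;

	if (AES_set_encrypt_key(key, 128, &aes) != 0)
		return -1;
	CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx, iv, 12);
	if (CRYPTO_gcm128_aad(&ctx, aad, aad_len))	/* AAD before payload */
		return -1;
	if (CRYPTO_gcm128_encrypt(&ctx, pt, ct, len))
		return -1;
	CRYPTO_gcm128_tag(&ctx, tag, 16);		/* emit 16-byte tag */
	return 0;
}
#endif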
1556 
1557 #if defined(SELFTEST)
1558 #include <stdio.h>
1559 #include <openssl/aes.h>
1560 
1561 /* Test Case 1 */
1562 static const u8	K1[16],
1563 		*P1=NULL,
1564 		*A1=NULL,
1565 		IV1[12],
1566 		*C1=NULL,
1567 		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1568 
1569 /* Test Case 2 */
1570 #define K2 K1
1571 #define A2 A1
1572 #define IV2 IV1
1573 static const u8	P2[16],
1574 		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1575 		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1576 
1577 /* Test Case 3 */
1578 #define A3 A2
1579 static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1580 		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1581 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1582 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1583 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1584 		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1585 		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1586 			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1587 			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1588 			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1589 		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1590 
1591 /* Test Case 4 */
1592 #define K4 K3
1593 #define IV4 IV3
1594 static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1595 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1596 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1597 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1598 		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1599 			0xab,0xad,0xda,0xd2},
1600 		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1601 			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1602 			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1603 			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1604 		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1605 
1606 /* Test Case 5 */
1607 #define K5 K4
1608 #define P5 P4
1609 #define A5 A4
1610 static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1611 		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1612 			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1613 			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1614 			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1615 		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1616 
1617 /* Test Case 6 */
1618 #define K6 K5
1619 #define P6 P5
1620 #define A6 A5
1621 static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1622 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1623 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1624 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1625 		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1626 			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1627 			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1628 			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1629 		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1630 
1631 /* Test Case 7 */
1632 static const u8 K7[24],
1633 		*P7=NULL,
1634 		*A7=NULL,
1635 		IV7[12],
1636 		*C7=NULL,
1637 		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1638 
1639 /* Test Case 8 */
1640 #define K8 K7
1641 #define IV8 IV7
1642 #define A8 A7
1643 static const u8	P8[16],
1644 		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1645 		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1646 
1647 /* Test Case 9 */
1648 #define A9 A8
1649 static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1650 			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1651 		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1652 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1653 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1654 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1655 		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1656 		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1657 			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1658 			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1659 			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1660 		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1661 
1662 /* Test Case 10 */
1663 #define K10 K9
1664 #define IV10 IV9
1665 static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1666 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1667 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1668 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1669 		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1670 			0xab,0xad,0xda,0xd2},
1671 		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1672 			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1673 			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1674 			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1675 		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1676 
1677 /* Test Case 11 */
1678 #define K11 K10
1679 #define P11 P10
1680 #define A11 A10
1681 static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1682 		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1683 			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1684 			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1685 			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1686 		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1687 
1688 /* Test Case 12 */
1689 #define K12 K11
1690 #define P12 P11
1691 #define A12 A11
1692 static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1693 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1694 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1695 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1696 		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1697 			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1698 			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1699 			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1700 		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1701 
1702 /* Test Case 13 */
1703 static const u8	K13[32],
1704 		*P13=NULL,
1705 		*A13=NULL,
1706 		IV13[12],
1707 		*C13=NULL,
1708 		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1709 
1710 /* Test Case 14 */
1711 #define K14 K13
1712 #define A14 A13
1713 static const u8	P14[16],
1714 		IV14[12],
1715 		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1716 		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1717 
1718 /* Test Case 15 */
1719 #define A15 A14
1720 static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1721 			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1722 		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1723 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1724 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1725 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1726 		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1727 		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1728 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1729 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1730 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1731 		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1732 
1733 /* Test Case 16 */
1734 #define K16 K15
1735 #define IV16 IV15
1736 static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1737 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1738 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1739 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1740 		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1741 			0xab,0xad,0xda,0xd2},
1742 		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1743 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1744 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1745 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1746 		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1747 
1748 /* Test Case 17 */
1749 #define K17 K16
1750 #define P17 P16
1751 #define A17 A16
1752 static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1753 		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1754 			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1755 			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1756 			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1757 		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1758 
1759 /* Test Case 18 */
1760 #define K18 K17
1761 #define P18 P17
1762 #define A18 A17
1763 static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1764 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1765 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1766 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1767 		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1768 			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1769 			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1770 			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1771 		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1772 
1773 /* Test Case 19 */
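/*
 * Beyond the spec vectors: reuses the otherwise-empty test case 1
 * parameters but authenticates 128 bytes of AAD and no payload,
 * exercising the multi-block AAD path in CRYPTO_gcm128_aad().
 */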
1774 #define K19 K1
1775 #define P19 P1
1776 #define IV19 IV1
1777 #define C19 C1
1778 static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1779 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1780 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1781 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
1782 			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1783 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1784 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1785 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1786 		T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1787 
1788 /* Test Case 20 */
1789 #define K20 K1
1790 #define A20 A1
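/*
 * IV20 is 64 bytes, so CRYPTO_gcm128_setiv() takes the
 * "len(IV) != 96 bits" path where the initial counter block is
 * derived by GHASHing the IV.  P20 is 288 zero bytes (18 blocks),
 * enough to exercise the multi-block bulk encryption path.
 */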
1791 static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */
1792 		P20[288],
1793 		C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
1794 			0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
1795 			0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
1796 			0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
1797 			0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
1798 			0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
1799 			0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
1800 			0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
1801 			0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
1802 			0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
1803 			0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
1804 			0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
1805 			0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
1806 			0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
1807 			0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
1808 			0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
1809 			0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
1810 			0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
1811 		T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1812 
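/*
 * TEST_CASE(n) pastes the per-case names together and runs vector n
 * both ways: encrypt P##n, checking the output against C##n and the
 * tag against T##n via CRYPTO_gcm128_finish(), then reset the IV and
 * decrypt C##n, checking the result against P##n.  Empty plaintext,
 * AAD or ciphertext is declared as a null pointer, which the if()
 * guards skip.  Each failure increments ret, which main() returns.
 */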
1813 #define TEST_CASE(n)	do {					\
1814 	u8 out[sizeof(P##n)];					\
1815 	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
1816 	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
1817 	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
1818 	memset(out,0,sizeof(out));				\
1819 	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
1820 	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
1821 	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
1822 	    (C##n && memcmp(out,C##n,sizeof(out))))		\
1823 		ret++, printf ("encrypt test#%d failed.\n",n);	\
1824 	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
1825 	memset(out,0,sizeof(out));				\
1826 	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
1827 	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
1828 	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
1829 	    (P##n && memcmp(out,P##n,sizeof(out))))		\
1830 		ret++, printf ("decrypt test#%d failed.\n",n);	\
1831 	} while(0)
1832 
1833 int main(void)
1834 {
1835 	GCM128_CONTEXT ctx;
1836 	AES_KEY key;
1837 	int ret=0;
1838 
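/*
 * Cases 1-6 are AES-128, 7-12 AES-192 and 13-18 AES-256 from the GCM
 * submission; 19 and 20 are the extra edge cases defined above.
 */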
1839 	TEST_CASE(1);
1840 	TEST_CASE(2);
1841 	TEST_CASE(3);
1842 	TEST_CASE(4);
1843 	TEST_CASE(5);
1844 	TEST_CASE(6);
1845 	TEST_CASE(7);
1846 	TEST_CASE(8);
1847 	TEST_CASE(9);
1848 	TEST_CASE(10);
1849 	TEST_CASE(11);
1850 	TEST_CASE(12);
1851 	TEST_CASE(13);
1852 	TEST_CASE(14);
1853 	TEST_CASE(15);
1854 	TEST_CASE(16);
1855 	TEST_CASE(17);
1856 	TEST_CASE(18);
1857 	TEST_CASE(19);
1858 	TEST_CASE(20);
1859 
1860 #ifdef OPENSSL_CPUID_OBJ
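/*
 * Optional micro-benchmark: each primitive is run once untimed to warm
 * up, then timed with OPENSSL_rdtsc() over the same 1KB buffer.  The
 * three figures printed are cycles per byte for GCM, for plain CTR,
 * and their difference, which approximates the cost of GHASH alone.
 */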
1861 	{
1862 	size_t start,gcm_t,ctr_t,OPENSSL_rdtsc();
1863 	union { u64 u; u8 c[1024]; } buf;
1864 	int i;
1865 
1866 	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1867 	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1868 	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1869 
1870 	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1871 	start = OPENSSL_rdtsc();
1872 	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1873 	gcm_t = OPENSSL_rdtsc() - start;
1874 
1875 	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1876 			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1877 			(block128_f)AES_encrypt);
1878 	start = OPENSSL_rdtsc();
1879 	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1880 			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1881 			(block128_f)AES_encrypt);
1882 	ctr_t = OPENSSL_rdtsc() - start;
1883 
1884 	printf("%.2f-%.2f=%.2f\n",
1885 			gcm_t/(double)sizeof(buf),
1886 			ctr_t/(double)sizeof(buf),
1887 			(gcm_t-ctr_t)/(double)sizeof(buf));
1888 #ifdef GHASH
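/*
 * When a standalone GHASH implementation is compiled in, time it
 * directly as well: 100 passes over the 1KB buffer, reported as
 * cycles per byte.
 */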
1889 	{
1890 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1891 				const u8 *inp,size_t len)	= ctx.ghash;
1892 
1893 	GHASH((&ctx),buf.c,sizeof(buf));
1894 	start = OPENSSL_rdtsc();
1895 	for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1896 	gcm_t = OPENSSL_rdtsc() - start;
1897 	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1898 	}
1899 #endif	/* GHASH */
1900 	}
1901 #endif	/* OPENSSL_CPUID_OBJ */
1902 
1903 	return ret;
1904 }
1905 #endif	/* SELFTEST */
1906