xref: /netbsd-src/external/bsd/file/dist/src/encoding.c (revision 76c7fc5f6b13ed0b1508e6b313e88e59977ed78e)
1 /*	$NetBSD: encoding.c,v 1.8 2019/05/22 17:26:05 christos Exp $	*/
2 
3 /*
4  * Copyright (c) Ian F. Darwin 1986-1995.
5  * Software written by Ian F. Darwin and others;
6  * maintained 1995-present by Christos Zoulas and others.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice immediately at the beginning of the file, without modification,
13  *    this list of conditions, and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 /*
31  * Encoding -- determine the character encoding of a text file.
32  *
33  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
34  * international characters.
35  */
36 
37 #include "file.h"
38 
39 #ifndef	lint
40 #if 0
41 FILE_RCSID("@(#)$File: encoding.c,v 1.20 2019/04/15 16:48:41 christos Exp $")
42 #else
43 __RCSID("$NetBSD: encoding.c,v 1.8 2019/05/22 17:26:05 christos Exp $");
44 #endif
45 #endif	/* lint */
46 
47 #include "magic.h"
48 #include <string.h>
49 #include <memory.h>
50 #include <stdlib.h>
51 
52 
53 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
54 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
55     size_t *);
56 private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
57 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
58 private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
59 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
60 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
61 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
62 
/*
 * Debug tracing.  DPRINTF takes one argument: a complete, parenthesised
 * printf argument list, e.g. DPRINTF(("n=%d\n", n)).  It expands to
 * nothing unless DEBUG_ENCODING is defined, in which case it becomes a
 * printf call.
 */
#ifndef DEBUG_ENCODING
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
68 
69 /*
70  * Try to determine whether text is in some character code we can
71  * identify.  Each of these tests, if it succeeds, will leave
72  * the text converted into one-unichar-per-character Unicode in
73  * ubuf, and the number of characters converted in ulen.
74  */
75 protected int
76 file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
77     size_t *ulen, const char **code, const char **code_mime, const char **type)
78 {
79 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
80 	size_t nbytes = b->flen;
81 	size_t mlen;
82 	int rv = 1, ucs_type;
83 	unsigned char *nbuf = NULL;
84 	unichar *udefbuf;
85 	size_t udeflen;
86 
87 	if (ubuf == NULL)
88 		ubuf = &udefbuf;
89 	if (ulen == NULL)
90 		ulen = &udeflen;
91 
92 	*type = "text";
93 	*ulen = 0;
94 	*code = "unknown";
95 	*code_mime = "binary";
96 
97 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
98 	if ((*ubuf = CAST(unichar *, calloc(CAST(size_t, 1), mlen))) == NULL) {
99 		file_oomem(ms, mlen);
100 		goto done;
101 	}
102 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
103 	if ((nbuf = CAST(unsigned char *,
104 	    calloc(CAST(size_t, 1), mlen))) == NULL) {
105 		file_oomem(ms, mlen);
106 		goto done;
107 	}
108 
109 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
110 		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
111 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
112 			*code = "UTF-7 Unicode";
113 			*code_mime = "utf-7";
114 		} else {
115 			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
116 			*code = "ASCII";
117 			*code_mime = "us-ascii";
118 		}
119 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
120 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
121 		*code = "UTF-8 Unicode (with BOM)";
122 		*code_mime = "utf-8";
123 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
124 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
125 		*code = "UTF-8 Unicode";
126 		*code_mime = "utf-8";
127 	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
128 		if (ucs_type == 1) {
129 			*code = "Little-endian UTF-32 Unicode";
130 			*code_mime = "utf-32le";
131 		} else {
132 			*code = "Big-endian UTF-32 Unicode";
133 			*code_mime = "utf-32be";
134 		}
135 		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
136 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
137 		if (ucs_type == 1) {
138 			*code = "Little-endian UTF-16 Unicode";
139 			*code_mime = "utf-16le";
140 		} else {
141 			*code = "Big-endian UTF-16 Unicode";
142 			*code_mime = "utf-16be";
143 		}
144 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
145 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
146 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
147 		*code = "ISO-8859";
148 		*code_mime = "iso-8859-1";
149 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
150 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
151 		*code = "Non-ISO extended-ASCII";
152 		*code_mime = "unknown-8bit";
153 	} else {
154 		from_ebcdic(buf, nbytes, nbuf);
155 
156 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
157 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
158 			*code = "EBCDIC";
159 			*code_mime = "ebcdic";
160 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
161 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
162 			    *ulen));
163 			*code = "International EBCDIC";
164 			*code_mime = "ebcdic";
165 		} else { /* Doesn't look like text at all */
166 			DPRINTF(("binary\n"));
167 			rv = 0;
168 			*type = "binary";
169 		}
170 	}
171 
172  done:
173 	free(nbuf);
174 	if (ubuf == &udefbuf)
175 		free(udefbuf);
176 
177 	return rv;
178 }
179 
180 /*
181  * This table reflects a particular philosophy about what constitutes
182  * "text," and there is room for disagreement about it.
183  *
184  * Version 3.31 of the file command considered a file to be ASCII if
185  * each of its characters was approved by either the isascii() or
186  * isalpha() function.  On most systems, this would mean that any
187  * file consisting only of characters in the range 0x00 ... 0x7F
188  * would be called ASCII text, but many systems might reasonably
189  * consider some characters outside this range to be alphabetic,
190  * so the file command would call such characters ASCII.  It might
191  * have been more accurate to call this "considered textual on the
192  * local system" than "ASCII."
193  *
194  * It considered a file to be "International language text" if each
195  * of its characters was either an ASCII printing character (according
196  * to the real ASCII standard, not the above test), a character in
197  * the range 0x80 ... 0xFF, or one of the following control characters:
198  * backspace, tab, line feed, vertical tab, form feed, carriage return,
199  * escape.  No attempt was made to determine the language in which files
200  * of this type were written.
201  *
202  *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab seldom appears in real text either, but the table below
 * nevertheless marks it (0x0B) as a text character.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
212  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
213  * character to.  It might be more appropriate to include it in the 8859
214  * set instead of the ASCII set, but it's got to be included in *something*
215  * we recognize or EBCDIC files aren't going to be considered textual.
216  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
217  * and Latin characters, so these should possibly be allowed.  But they
218  * make a real mess on VT100-style displays if they're not paired properly,
219  * so we are probably better off not calling them text.
220  *
221  * A file is considered to be ISO-8859 text if its characters are all
222  * either ASCII, according to the above definition, or printing characters
223  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
224  *
225  * Finally, a file is considered to be international text from some other
226  * character code if its characters are all either ISO-8859 (according to
227  * the above definition) or characters in the range 0x80 ... 0x9F, which
228  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
229  * consider to be printing characters.
230  */
231 
232 #define F 0   /* character never appears in text */
233 #define T 1   /* character appears in plain ASCII text */
234 #define I 2   /* character appears in ISO-8859 text */
235 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
236 
237 private char text_chars[256] = {
238 	/*                  BEL BS HT LF VT FF CR    */
239 	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
240 	/*                              ESC          */
241 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
242 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
243 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
244 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
245 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
246 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
247 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
248 	/*            NEL                            */
249 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
250 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
251 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
252 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
253 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
254 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
255 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
256 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
257 };
258 
259 private int
260 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
261     size_t *ulen)
262 {
263 	size_t i;
264 
265 	*ulen = 0;
266 
267 	for (i = 0; i < nbytes; i++) {
268 		int t = text_chars[buf[i]];
269 
270 		if (t != T)
271 			return 0;
272 
273 		ubuf[(*ulen)++] = buf[i];
274 	}
275 
276 	return 1;
277 }
278 
279 private int
280 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
281 {
282 	size_t i;
283 
284 	*ulen = 0;
285 
286 	for (i = 0; i < nbytes; i++) {
287 		int t = text_chars[buf[i]];
288 
289 		if (t != T && t != I)
290 			return 0;
291 
292 		ubuf[(*ulen)++] = buf[i];
293 	}
294 
295 	return 1;
296 }
297 
298 private int
299 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
300     size_t *ulen)
301 {
302 	size_t i;
303 
304 	*ulen = 0;
305 
306 	for (i = 0; i < nbytes; i++) {
307 		int t = text_chars[buf[i]];
308 
309 		if (t != T && t != I && t != X)
310 			return 0;
311 
312 		ubuf[(*ulen)++] = buf[i];
313 	}
314 
315 	return 1;
316 }
317 
318 /*
319  * Decide whether some text looks like UTF-8. Returns:
320  *
321  *     -1: invalid UTF-8
322  *      0: uses odd control characters, so doesn't look like text
323  *      1: 7-bit text
324  *      2: definitely UTF-8 text (valid high-bit set bytes)
325  *
326  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
327  * ubuf must be big enough!
328  */
329 protected int
330 file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
331 {
332 	size_t i;
333 	int n;
334 	unichar c;
335 	int gotone = 0, ctrl = 0;
336 
337 	if (ubuf)
338 		*ulen = 0;
339 
340 	for (i = 0; i < nbytes; i++) {
341 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
342 			/*
343 			 * Even if the whole file is valid UTF-8 sequences,
344 			 * still reject it if it uses weird control characters.
345 			 */
346 
347 			if (text_chars[buf[i]] != T)
348 				ctrl = 1;
349 
350 			if (ubuf)
351 				ubuf[(*ulen)++] = buf[i];
352 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
353 			return -1;
354 		} else {			   /* 11xxxxxx begins UTF-8 */
355 			int following;
356 
357 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
358 				c = buf[i] & 0x1f;
359 				following = 1;
360 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
361 				c = buf[i] & 0x0f;
362 				following = 2;
363 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
364 				c = buf[i] & 0x07;
365 				following = 3;
366 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
367 				c = buf[i] & 0x03;
368 				following = 4;
369 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
370 				c = buf[i] & 0x01;
371 				following = 5;
372 			} else
373 				return -1;
374 
375 			for (n = 0; n < following; n++) {
376 				i++;
377 				if (i >= nbytes)
378 					goto done;
379 
380 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
381 					return -1;
382 
383 				c = (c << 6) + (buf[i] & 0x3f);
384 			}
385 
386 			if (ubuf)
387 				ubuf[(*ulen)++] = c;
388 			gotone = 1;
389 		}
390 	}
391 done:
392 	return ctrl ? 0 : (gotone ? 2 : 1);
393 }
394 
395 /*
396  * Decide whether some text looks like UTF-8 with BOM. If there is no
397  * BOM, return -1; otherwise return the result of looks_utf8 on the
398  * rest of the text.
399  */
400 private int
401 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
402     size_t *ulen)
403 {
404 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
405 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
406 	else
407 		return -1;
408 }
409 
410 private int
411 looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
412 {
413 	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
414 		switch (buf[3]) {
415 		case '8':
416 		case '9':
417 		case '+':
418 		case '/':
419 			if (ubuf)
420 				*ulen = 0;
421 			return 1;
422 		default:
423 			return -1;
424 		}
425 	else
426 		return -1;
427 }
428 
429 private int
430 looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
431     size_t *ulen)
432 {
433 	int bigend;
434 	size_t i;
435 
436 	if (nbytes < 2)
437 		return 0;
438 
439 	if (bf[0] == 0xff && bf[1] == 0xfe)
440 		bigend = 0;
441 	else if (bf[0] == 0xfe && bf[1] == 0xff)
442 		bigend = 1;
443 	else
444 		return 0;
445 
446 	*ulen = 0;
447 
448 	for (i = 2; i + 1 < nbytes; i += 2) {
449 		/* XXX fix to properly handle chars > 65536 */
450 
451 		if (bigend)
452 			ubf[(*ulen)++] = bf[i + 1]
453 			    | (CAST(unichar, bf[i]) << 8);
454 		else
455 			ubf[(*ulen)++] = bf[i]
456 			    | (CAST(unichar, bf[i + 1]) << 8);
457 
458 		if (ubf[*ulen - 1] == 0xfffe)
459 			return 0;
460 		if (ubf[*ulen - 1] < 128 &&
461 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
462 			return 0;
463 	}
464 
465 	return 1 + bigend;
466 }
467 
468 private int
469 looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
470     size_t *ulen)
471 {
472 	int bigend;
473 	size_t i;
474 
475 	if (nbytes < 4)
476 		return 0;
477 
478 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
479 		bigend = 0;
480 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
481 		bigend = 1;
482 	else
483 		return 0;
484 
485 	*ulen = 0;
486 
487 	for (i = 4; i + 3 < nbytes; i += 4) {
488 		/* XXX fix to properly handle chars > 65536 */
489 
490 		if (bigend)
491 			ubf[(*ulen)++] = CAST(unichar, bf[i + 3])
492 			    | (CAST(unichar, bf[i + 2]) << 8)
493 			    | (CAST(unichar, bf[i + 1]) << 16)
494 			    | (CAST(unichar, bf[i]) << 24);
495 		else
496 			ubf[(*ulen)++] = CAST(unichar, bf[i + 0])
497 			    | (CAST(unichar, bf[i + 1]) << 8)
498 			    | (CAST(unichar, bf[i + 2]) << 16)
499 			    | (CAST(unichar, bf[i + 3]) << 24);
500 
501 		if (ubf[*ulen - 1] == 0xfffe)
502 			return 0;
503 		if (ubf[*ulen - 1] < 128 &&
504 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
505 			return 0;
506 	}
507 
508 	return 1 + bigend;
509 }
510 #undef F
511 #undef T
512 #undef I
513 #undef X
514 
515 /*
516  * This table maps each EBCDIC character to an (8-bit extended) ASCII
517  * character, as specified in the rationale for the dd(1) command in
518  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
519  *
520  * Unfortunately it does not seem to correspond exactly to any of the
521  * five variants of EBCDIC documented in IBM's _Enterprise Systems
522  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
523  * Edition, July, 1999, pp. I-1 - I-4.
524  *
525  * Fortunately, though, all versions of EBCDIC, including this one, agree
526  * on most of the printing characters that also appear in (7-bit) ASCII.
527  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
528  *
529  * Fortunately too, there is general agreement that codes 0x00 through
530  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
531  * remainder printing characters.
532  *
533  * This is sufficient to allow us to identify EBCDIC text and to distinguish
534  * between old-style and internationalized examples of text.
535  */
536 
537 private unsigned char ebcdic_to_ascii[] = {
538   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
539  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
540 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
541 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
542 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
543 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
544 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
545 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
546 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
547 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
548 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
549 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
550 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
551 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
552 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
553 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
554 };
555 
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * NOTE: this table is compiled only when `notdef' is defined, and no
 * code in this file refers to it; from_ebcdic() uses ebcdic_to_ascii.
 * The table is indexed by EBCDIC byte value, 16 entries per row.
 */

private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
590 
591 /*
592  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
593  */
594 private void
595 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
596 {
597 	size_t i;
598 
599 	for (i = 0; i < nbytes; i++) {
600 		out[i] = ebcdic_to_ascii[buf[i]];
601 	}
602 }
603