xref: /netbsd-src/external/bsd/file/dist/src/encoding.c (revision ed75d7a867996c84cfa88e3b8906816277e957f7)
1 /*	$NetBSD: encoding.c,v 1.9 2019/12/17 02:31:05 christos Exp $	*/
2 
3 /*
4  * Copyright (c) Ian F. Darwin 1986-1995.
5  * Software written by Ian F. Darwin and others;
6  * maintained 1995-present by Christos Zoulas and others.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice immediately at the beginning of the file, without modification,
13  *    this list of conditions, and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 /*
31  * Encoding -- determine the character encoding of a text file.
32  *
33  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
34  * international characters.
35  */
36 
37 #include "file.h"
38 
39 #ifndef	lint
40 #if 0
41 FILE_RCSID("@(#)$File: encoding.c,v 1.21 2019/06/08 20:49:14 christos Exp $")
42 #else
43 __RCSID("$NetBSD: encoding.c,v 1.9 2019/12/17 02:31:05 christos Exp $");
44 #endif
45 #endif	/* lint */
46 
47 #include "magic.h"
48 #include <string.h>
49 #include <stdlib.h>
50 
51 
52 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
53 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
54     size_t *);
55 private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
56 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
57 private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
58 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
59 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
60 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
61 
/*
 * DPRINTF((fmt, ...)) forwards its parenthesized argument list to printf()
 * when the file is compiled with DEBUG_ENCODING defined.  Otherwise it
 * expands to nothing, so its arguments are never evaluated.
 */
#ifndef DEBUG_ENCODING
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
67 
68 /*
69  * Try to determine whether text is in some character code we can
70  * identify.  Each of these tests, if it succeeds, will leave
71  * the text converted into one-unichar-per-character Unicode in
72  * ubuf, and the number of characters converted in ulen.
73  */
74 protected int
75 file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
76     size_t *ulen, const char **code, const char **code_mime, const char **type)
77 {
78 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
79 	size_t nbytes = b->flen;
80 	size_t mlen;
81 	int rv = 1, ucs_type;
82 	unsigned char *nbuf = NULL;
83 	unichar *udefbuf;
84 	size_t udeflen;
85 
86 	if (ubuf == NULL)
87 		ubuf = &udefbuf;
88 	if (ulen == NULL)
89 		ulen = &udeflen;
90 
91 	*type = "text";
92 	*ulen = 0;
93 	*code = "unknown";
94 	*code_mime = "binary";
95 
96 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
97 	if ((*ubuf = CAST(unichar *, calloc(CAST(size_t, 1), mlen))) == NULL) {
98 		file_oomem(ms, mlen);
99 		goto done;
100 	}
101 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
102 	if ((nbuf = CAST(unsigned char *,
103 	    calloc(CAST(size_t, 1), mlen))) == NULL) {
104 		file_oomem(ms, mlen);
105 		goto done;
106 	}
107 
108 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
109 		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
110 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
111 			*code = "UTF-7 Unicode";
112 			*code_mime = "utf-7";
113 		} else {
114 			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
115 			*code = "ASCII";
116 			*code_mime = "us-ascii";
117 		}
118 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
119 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
120 		*code = "UTF-8 Unicode (with BOM)";
121 		*code_mime = "utf-8";
122 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
123 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
124 		*code = "UTF-8 Unicode";
125 		*code_mime = "utf-8";
126 	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
127 		if (ucs_type == 1) {
128 			*code = "Little-endian UTF-32 Unicode";
129 			*code_mime = "utf-32le";
130 		} else {
131 			*code = "Big-endian UTF-32 Unicode";
132 			*code_mime = "utf-32be";
133 		}
134 		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
135 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
136 		if (ucs_type == 1) {
137 			*code = "Little-endian UTF-16 Unicode";
138 			*code_mime = "utf-16le";
139 		} else {
140 			*code = "Big-endian UTF-16 Unicode";
141 			*code_mime = "utf-16be";
142 		}
143 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
144 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
145 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
146 		*code = "ISO-8859";
147 		*code_mime = "iso-8859-1";
148 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
149 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
150 		*code = "Non-ISO extended-ASCII";
151 		*code_mime = "unknown-8bit";
152 	} else {
153 		from_ebcdic(buf, nbytes, nbuf);
154 
155 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
156 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
157 			*code = "EBCDIC";
158 			*code_mime = "ebcdic";
159 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
160 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
161 			    *ulen));
162 			*code = "International EBCDIC";
163 			*code_mime = "ebcdic";
164 		} else { /* Doesn't look like text at all */
165 			DPRINTF(("binary\n"));
166 			rv = 0;
167 			*type = "binary";
168 		}
169 	}
170 
171  done:
172 	free(nbuf);
173 	if (ubuf == &udefbuf)
174 		free(udefbuf);
175 
176 	return rv;
177 }
178 
179 /*
180  * This table reflects a particular philosophy about what constitutes
181  * "text," and there is room for disagreement about it.
182  *
183  * Version 3.31 of the file command considered a file to be ASCII if
184  * each of its characters was approved by either the isascii() or
185  * isalpha() function.  On most systems, this would mean that any
186  * file consisting only of characters in the range 0x00 ... 0x7F
187  * would be called ASCII text, but many systems might reasonably
188  * consider some characters outside this range to be alphabetic,
189  * so the file command would call such characters ASCII.  It might
190  * have been more accurate to call this "considered textual on the
191  * local system" than "ASCII."
192  *
193  * It considered a file to be "International language text" if each
194  * of its characters was either an ASCII printing character (according
195  * to the real ASCII standard, not the above test), a character in
196  * the range 0x80 ... 0xFF, or one of the following control characters:
197  * backspace, tab, line feed, vertical tab, form feed, carriage return,
198  * escape.  No attempt was made to determine the language in which files
199  * of this type were written.
200  *
201  *
202  * The table below considers a file to be ASCII if all of its characters
203  * are either ASCII printing characters (again, according to the X3.4
204  * standard, not isascii()) or any of the following controls: bell,
205  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
206  *
207  * I include bell because some programs (particularly shell scripts)
208  * use it literally, even though it is rare in normal text.  I exclude
209  * vertical tab because it never seems to be used in real text.  I also
210  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
211  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
212  * character to.  It might be more appropriate to include it in the 8859
213  * set instead of the ASCII set, but it's got to be included in *something*
214  * we recognize or EBCDIC files aren't going to be considered textual.
215  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
216  * and Latin characters, so these should possibly be allowed.  But they
217  * make a real mess on VT100-style displays if they're not paired properly,
218  * so we are probably better off not calling them text.
219  *
220  * A file is considered to be ISO-8859 text if its characters are all
221  * either ASCII, according to the above definition, or printing characters
222  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
223  *
224  * Finally, a file is considered to be international text from some other
225  * character code if its characters are all either ISO-8859 (according to
226  * the above definition) or characters in the range 0x80 ... 0x9F, which
227  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
228  * consider to be printing characters.
229  */
230 
231 #define F 0   /* character never appears in text */
232 #define T 1   /* character appears in plain ASCII text */
233 #define I 2   /* character appears in ISO-8859 text */
234 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
235 
236 private char text_chars[256] = {
237 	/*                  BEL BS HT LF VT FF CR    */
238 	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
239 	/*                              ESC          */
240 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
241 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
242 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
243 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
244 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
245 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
246 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
247 	/*            NEL                            */
248 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
249 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
250 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
251 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
252 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
253 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
254 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
255 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
256 };
257 
258 private int
259 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
260     size_t *ulen)
261 {
262 	size_t i;
263 
264 	*ulen = 0;
265 
266 	for (i = 0; i < nbytes; i++) {
267 		int t = text_chars[buf[i]];
268 
269 		if (t != T)
270 			return 0;
271 
272 		ubuf[(*ulen)++] = buf[i];
273 	}
274 
275 	return 1;
276 }
277 
278 private int
279 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
280 {
281 	size_t i;
282 
283 	*ulen = 0;
284 
285 	for (i = 0; i < nbytes; i++) {
286 		int t = text_chars[buf[i]];
287 
288 		if (t != T && t != I)
289 			return 0;
290 
291 		ubuf[(*ulen)++] = buf[i];
292 	}
293 
294 	return 1;
295 }
296 
297 private int
298 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
299     size_t *ulen)
300 {
301 	size_t i;
302 
303 	*ulen = 0;
304 
305 	for (i = 0; i < nbytes; i++) {
306 		int t = text_chars[buf[i]];
307 
308 		if (t != T && t != I && t != X)
309 			return 0;
310 
311 		ubuf[(*ulen)++] = buf[i];
312 	}
313 
314 	return 1;
315 }
316 
317 /*
318  * Decide whether some text looks like UTF-8. Returns:
319  *
320  *     -1: invalid UTF-8
321  *      0: uses odd control characters, so doesn't look like text
322  *      1: 7-bit text
323  *      2: definitely UTF-8 text (valid high-bit set bytes)
324  *
325  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
326  * ubuf must be big enough!
327  */
328 protected int
329 file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
330 {
331 	size_t i;
332 	int n;
333 	unichar c;
334 	int gotone = 0, ctrl = 0;
335 
336 	if (ubuf)
337 		*ulen = 0;
338 
339 	for (i = 0; i < nbytes; i++) {
340 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
341 			/*
342 			 * Even if the whole file is valid UTF-8 sequences,
343 			 * still reject it if it uses weird control characters.
344 			 */
345 
346 			if (text_chars[buf[i]] != T)
347 				ctrl = 1;
348 
349 			if (ubuf)
350 				ubuf[(*ulen)++] = buf[i];
351 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
352 			return -1;
353 		} else {			   /* 11xxxxxx begins UTF-8 */
354 			int following;
355 
356 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
357 				c = buf[i] & 0x1f;
358 				following = 1;
359 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
360 				c = buf[i] & 0x0f;
361 				following = 2;
362 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
363 				c = buf[i] & 0x07;
364 				following = 3;
365 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
366 				c = buf[i] & 0x03;
367 				following = 4;
368 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
369 				c = buf[i] & 0x01;
370 				following = 5;
371 			} else
372 				return -1;
373 
374 			for (n = 0; n < following; n++) {
375 				i++;
376 				if (i >= nbytes)
377 					goto done;
378 
379 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
380 					return -1;
381 
382 				c = (c << 6) + (buf[i] & 0x3f);
383 			}
384 
385 			if (ubuf)
386 				ubuf[(*ulen)++] = c;
387 			gotone = 1;
388 		}
389 	}
390 done:
391 	return ctrl ? 0 : (gotone ? 2 : 1);
392 }
393 
394 /*
395  * Decide whether some text looks like UTF-8 with BOM. If there is no
396  * BOM, return -1; otherwise return the result of looks_utf8 on the
397  * rest of the text.
398  */
399 private int
400 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
401     size_t *ulen)
402 {
403 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
404 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
405 	else
406 		return -1;
407 }
408 
409 private int
410 looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
411 {
412 	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
413 		switch (buf[3]) {
414 		case '8':
415 		case '9':
416 		case '+':
417 		case '/':
418 			if (ubuf)
419 				*ulen = 0;
420 			return 1;
421 		default:
422 			return -1;
423 		}
424 	else
425 		return -1;
426 }
427 
428 private int
429 looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
430     size_t *ulen)
431 {
432 	int bigend;
433 	size_t i;
434 
435 	if (nbytes < 2)
436 		return 0;
437 
438 	if (bf[0] == 0xff && bf[1] == 0xfe)
439 		bigend = 0;
440 	else if (bf[0] == 0xfe && bf[1] == 0xff)
441 		bigend = 1;
442 	else
443 		return 0;
444 
445 	*ulen = 0;
446 
447 	for (i = 2; i + 1 < nbytes; i += 2) {
448 		/* XXX fix to properly handle chars > 65536 */
449 
450 		if (bigend)
451 			ubf[(*ulen)++] = bf[i + 1]
452 			    | (CAST(unichar, bf[i]) << 8);
453 		else
454 			ubf[(*ulen)++] = bf[i]
455 			    | (CAST(unichar, bf[i + 1]) << 8);
456 
457 		if (ubf[*ulen - 1] == 0xfffe)
458 			return 0;
459 		if (ubf[*ulen - 1] < 128 &&
460 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
461 			return 0;
462 	}
463 
464 	return 1 + bigend;
465 }
466 
467 private int
468 looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
469     size_t *ulen)
470 {
471 	int bigend;
472 	size_t i;
473 
474 	if (nbytes < 4)
475 		return 0;
476 
477 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
478 		bigend = 0;
479 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
480 		bigend = 1;
481 	else
482 		return 0;
483 
484 	*ulen = 0;
485 
486 	for (i = 4; i + 3 < nbytes; i += 4) {
487 		/* XXX fix to properly handle chars > 65536 */
488 
489 		if (bigend)
490 			ubf[(*ulen)++] = CAST(unichar, bf[i + 3])
491 			    | (CAST(unichar, bf[i + 2]) << 8)
492 			    | (CAST(unichar, bf[i + 1]) << 16)
493 			    | (CAST(unichar, bf[i]) << 24);
494 		else
495 			ubf[(*ulen)++] = CAST(unichar, bf[i + 0])
496 			    | (CAST(unichar, bf[i + 1]) << 8)
497 			    | (CAST(unichar, bf[i + 2]) << 16)
498 			    | (CAST(unichar, bf[i + 3]) << 24);
499 
500 		if (ubf[*ulen - 1] == 0xfffe)
501 			return 0;
502 		if (ubf[*ulen - 1] < 128 &&
503 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
504 			return 0;
505 	}
506 
507 	return 1 + bigend;
508 }
509 #undef F
510 #undef T
511 #undef I
512 #undef X
513 
514 /*
515  * This table maps each EBCDIC character to an (8-bit extended) ASCII
516  * character, as specified in the rationale for the dd(1) command in
517  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
518  *
519  * Unfortunately it does not seem to correspond exactly to any of the
520  * five variants of EBCDIC documented in IBM's _Enterprise Systems
521  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
522  * Edition, July, 1999, pp. I-1 - I-4.
523  *
524  * Fortunately, though, all versions of EBCDIC, including this one, agree
525  * on most of the printing characters that also appear in (7-bit) ASCII.
526  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
527  *
528  * Fortunately too, there is general agreement that codes 0x00 through
529  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
530  * remainder printing characters.
531  *
532  * This is sufficient to allow us to identify EBCDIC text and to distinguish
533  * between old-style and internationalized examples of text.
534  */
535 
536 private unsigned char ebcdic_to_ascii[] = {
537   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
538  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
539 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
540 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
541 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
542 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
543 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
544 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
545 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
546 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
547 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
548 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
549 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
550 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
551 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
552 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
553 };
554 
555 #ifdef notdef
556 /*
557  * The following EBCDIC-to-ASCII table may relate more closely to reality,
558  * or at least to modern reality.  It comes from
559  *
560  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
561  *
562  * and maps the characters of EBCDIC code page 1047 (the code used for
563  * Unix-derived software on IBM's 390 systems) to the corresponding
564  * characters from ISO 8859-1.
565  *
566  * If this table is used instead of the above one, some of the special
567  * cases for the NEL character can be taken out of the code.
568  */
569 
570 private unsigned char ebcdic_1047_to_8859[] = {
571 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
572 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
573 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
574 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
575 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
576 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
577 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
578 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
579 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
580 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
581 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
582 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
583 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
584 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
585 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
586 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
587 };
588 #endif
589 
590 /*
591  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
592  */
593 private void
594 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
595 {
596 	size_t i;
597 
598 	for (i = 0; i < nbytes; i++) {
599 		out[i] = ebcdic_to_ascii[buf[i]];
600 	}
601 }
602