/*
 * Copyright (c) Ian F. Darwin 1986-1995.
 * Software written by Ian F. Darwin and others;
 * maintained 1995-present by Christos Zoulas and others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Encoding -- determine the character encoding of a text file.
 *
 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 * international characters.
 */

3579343712SPeter Avalos #include "file.h"
3679343712SPeter Avalos 
3779343712SPeter Avalos #ifndef	lint
38*3b9cdfa3SAntonio Huete Jimenez FILE_RCSID("@(#)$File: encoding.c,v 1.39 2022/09/13 18:46:07 christos Exp $")
3979343712SPeter Avalos #endif	/* lint */
4079343712SPeter Avalos 
4179343712SPeter Avalos #include "magic.h"
4279343712SPeter Avalos #include <string.h>
4379343712SPeter Avalos #include <stdlib.h>
4479343712SPeter Avalos 
4579343712SPeter Avalos 
46970935fdSSascha Wildner private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
4779343712SPeter Avalos     size_t *);
48970935fdSSascha Wildner private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
49970935fdSSascha Wildner     size_t *);
50970935fdSSascha Wildner private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
51970935fdSSascha Wildner     size_t *);
52970935fdSSascha Wildner private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
53970935fdSSascha Wildner     size_t *);
54970935fdSSascha Wildner private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
55970935fdSSascha Wildner     size_t *);
56970935fdSSascha Wildner private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
57970935fdSSascha Wildner     size_t *);
58970935fdSSascha Wildner private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
59970935fdSSascha Wildner     size_t *);
6079343712SPeter Avalos private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
6179343712SPeter Avalos 
/*
 * Debug tracing.  DPRINTF takes one fully parenthesised printf argument
 * list, e.g. DPRINTF(("n=%d\n", n)), and expands to nothing (its
 * arguments are not evaluated) unless DEBUG_ENCODING is defined.
 */
#ifdef DEBUG_ENCODING
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif

6879343712SPeter Avalos /*
6979343712SPeter Avalos  * Try to determine whether text is in some character code we can
7079343712SPeter Avalos  * identify.  Each of these tests, if it succeeds, will leave
71970935fdSSascha Wildner  * the text converted into one-file_unichar_t-per-character Unicode in
7279343712SPeter Avalos  * ubuf, and the number of characters converted in ulen.
7379343712SPeter Avalos  */
7479343712SPeter Avalos protected int
file_encoding(struct magic_set * ms,const struct buffer * b,file_unichar_t ** ubuf,size_t * ulen,const char ** code,const char ** code_mime,const char ** type)75970935fdSSascha Wildner file_encoding(struct magic_set *ms, const struct buffer *b,
76970935fdSSascha Wildner     file_unichar_t **ubuf, size_t *ulen, const char **code,
77970935fdSSascha Wildner     const char **code_mime, const char **type)
7879343712SPeter Avalos {
796fca56fbSSascha Wildner 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
806fca56fbSSascha Wildner 	size_t nbytes = b->flen;
8179343712SPeter Avalos 	size_t mlen;
8279343712SPeter Avalos 	int rv = 1, ucs_type;
83970935fdSSascha Wildner 	file_unichar_t *udefbuf;
846fca56fbSSascha Wildner 	size_t udeflen;
856fca56fbSSascha Wildner 
866fca56fbSSascha Wildner 	if (ubuf == NULL)
876fca56fbSSascha Wildner 		ubuf = &udefbuf;
886fca56fbSSascha Wildner 	if (ulen == NULL)
896fca56fbSSascha Wildner 		ulen = &udeflen;
9079343712SPeter Avalos 
9117b11469SPeter Avalos 	*type = "text";
92e8af9738SPeter Avalos 	*ulen = 0;
93e8af9738SPeter Avalos 	*code = "unknown";
94e8af9738SPeter Avalos 	*code_mime = "binary";
95e8af9738SPeter Avalos 
96970935fdSSascha Wildner 	if (nbytes > ms->encoding_max)
97970935fdSSascha Wildner 		nbytes = ms->encoding_max;
98970935fdSSascha Wildner 
99e8af9738SPeter Avalos 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
100970935fdSSascha Wildner 	*ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
101970935fdSSascha Wildner 	if (*ubuf == NULL) {
10279343712SPeter Avalos 		file_oomem(ms, mlen);
10379343712SPeter Avalos 		goto done;
10479343712SPeter Avalos 	}
10579343712SPeter Avalos 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
106c30bd091SSascha Wildner 		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
107c30bd091SSascha Wildner 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
108970935fdSSascha Wildner 			*code = "Unicode text, UTF-7";
109c30bd091SSascha Wildner 			*code_mime = "utf-7";
110c30bd091SSascha Wildner 		} else {
111e4d4ce0cSPeter Avalos 			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
11279343712SPeter Avalos 			*code = "ASCII";
11379343712SPeter Avalos 			*code_mime = "us-ascii";
114c30bd091SSascha Wildner 		}
11579343712SPeter Avalos 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
116e4d4ce0cSPeter Avalos 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
117970935fdSSascha Wildner 		*code = "Unicode text, UTF-8 (with BOM)";
11879343712SPeter Avalos 		*code_mime = "utf-8";
11979343712SPeter Avalos 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
120e4d4ce0cSPeter Avalos 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
121970935fdSSascha Wildner 		*code = "Unicode text, UTF-8";
12279343712SPeter Avalos 		*code_mime = "utf-8";
1236fca56fbSSascha Wildner 	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
1246fca56fbSSascha Wildner 		if (ucs_type == 1) {
125970935fdSSascha Wildner 			*code = "Unicode text, UTF-32, little-endian";
1266fca56fbSSascha Wildner 			*code_mime = "utf-32le";
1276fca56fbSSascha Wildner 		} else {
128970935fdSSascha Wildner 			*code = "Unicode text, UTF-32, big-endian";
1296fca56fbSSascha Wildner 			*code_mime = "utf-32be";
1306fca56fbSSascha Wildner 		}
1316fca56fbSSascha Wildner 		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
13279343712SPeter Avalos 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
13379343712SPeter Avalos 		if (ucs_type == 1) {
134970935fdSSascha Wildner 			*code = "Unicode text, UTF-16, little-endian";
13579343712SPeter Avalos 			*code_mime = "utf-16le";
13679343712SPeter Avalos 		} else {
137970935fdSSascha Wildner 			*code = "Unicode text, UTF-16, big-endian";
13879343712SPeter Avalos 			*code_mime = "utf-16be";
13979343712SPeter Avalos 		}
140e4d4ce0cSPeter Avalos 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
14179343712SPeter Avalos 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
142e4d4ce0cSPeter Avalos 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
14379343712SPeter Avalos 		*code = "ISO-8859";
14479343712SPeter Avalos 		*code_mime = "iso-8859-1";
14579343712SPeter Avalos 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
146e4d4ce0cSPeter Avalos 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
14779343712SPeter Avalos 		*code = "Non-ISO extended-ASCII";
14879343712SPeter Avalos 		*code_mime = "unknown-8bit";
14979343712SPeter Avalos 	} else {
150*3b9cdfa3SAntonio Huete Jimenez 		unsigned char *nbuf;
151*3b9cdfa3SAntonio Huete Jimenez 
152*3b9cdfa3SAntonio Huete Jimenez 		mlen = (nbytes + 1) * sizeof(nbuf[0]);
153*3b9cdfa3SAntonio Huete Jimenez 		if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) {
154*3b9cdfa3SAntonio Huete Jimenez 			file_oomem(ms, mlen);
155*3b9cdfa3SAntonio Huete Jimenez 			goto done;
156*3b9cdfa3SAntonio Huete Jimenez 		}
15779343712SPeter Avalos 		from_ebcdic(buf, nbytes, nbuf);
15879343712SPeter Avalos 
15979343712SPeter Avalos 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
160e4d4ce0cSPeter Avalos 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
16179343712SPeter Avalos 			*code = "EBCDIC";
16279343712SPeter Avalos 			*code_mime = "ebcdic";
16379343712SPeter Avalos 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
164e4d4ce0cSPeter Avalos 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
165e4d4ce0cSPeter Avalos 			    *ulen));
16679343712SPeter Avalos 			*code = "International EBCDIC";
16779343712SPeter Avalos 			*code_mime = "ebcdic";
16879343712SPeter Avalos 		} else { /* Doesn't look like text at all */
169f72f8299SJan Lentfer 			DPRINTF(("binary\n"));
17079343712SPeter Avalos 			rv = 0;
17179343712SPeter Avalos 			*type = "binary";
17279343712SPeter Avalos 		}
173*3b9cdfa3SAntonio Huete Jimenez 		free(nbuf);
17479343712SPeter Avalos 	}
17579343712SPeter Avalos 
17679343712SPeter Avalos  done:
1776fca56fbSSascha Wildner 	if (ubuf == &udefbuf)
1786fca56fbSSascha Wildner 		free(udefbuf);
17979343712SPeter Avalos 
18079343712SPeter Avalos 	return rv;
18179343712SPeter Avalos }
18279343712SPeter Avalos 
18379343712SPeter Avalos /*
18479343712SPeter Avalos  * This table reflects a particular philosophy about what constitutes
18579343712SPeter Avalos  * "text," and there is room for disagreement about it.
18679343712SPeter Avalos  *
18779343712SPeter Avalos  * Version 3.31 of the file command considered a file to be ASCII if
18879343712SPeter Avalos  * each of its characters was approved by either the isascii() or
18979343712SPeter Avalos  * isalpha() function.  On most systems, this would mean that any
19079343712SPeter Avalos  * file consisting only of characters in the range 0x00 ... 0x7F
19179343712SPeter Avalos  * would be called ASCII text, but many systems might reasonably
19279343712SPeter Avalos  * consider some characters outside this range to be alphabetic,
19379343712SPeter Avalos  * so the file command would call such characters ASCII.  It might
19479343712SPeter Avalos  * have been more accurate to call this "considered textual on the
19579343712SPeter Avalos  * local system" than "ASCII."
19679343712SPeter Avalos  *
19779343712SPeter Avalos  * It considered a file to be "International language text" if each
19879343712SPeter Avalos  * of its characters was either an ASCII printing character (according
19979343712SPeter Avalos  * to the real ASCII standard, not the above test), a character in
20079343712SPeter Avalos  * the range 0x80 ... 0xFF, or one of the following control characters:
20179343712SPeter Avalos  * backspace, tab, line feed, vertical tab, form feed, carriage return,
20279343712SPeter Avalos  * escape.  No attempt was made to determine the language in which files
20379343712SPeter Avalos  * of this type were written.
20479343712SPeter Avalos  *
20579343712SPeter Avalos  *
20679343712SPeter Avalos  * The table below considers a file to be ASCII if all of its characters
20779343712SPeter Avalos  * are either ASCII printing characters (again, according to the X3.4
20879343712SPeter Avalos  * standard, not isascii()) or any of the following controls: bell,
20979343712SPeter Avalos  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
21079343712SPeter Avalos  *
21179343712SPeter Avalos  * I include bell because some programs (particularly shell scripts)
21279343712SPeter Avalos  * use it literally, even though it is rare in normal text.  I exclude
21379343712SPeter Avalos  * vertical tab because it never seems to be used in real text.  I also
21479343712SPeter Avalos  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
21579343712SPeter Avalos  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
21679343712SPeter Avalos  * character to.  It might be more appropriate to include it in the 8859
21779343712SPeter Avalos  * set instead of the ASCII set, but it's got to be included in *something*
21879343712SPeter Avalos  * we recognize or EBCDIC files aren't going to be considered textual.
21979343712SPeter Avalos  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
22079343712SPeter Avalos  * and Latin characters, so these should possibly be allowed.  But they
22179343712SPeter Avalos  * make a real mess on VT100-style displays if they're not paired properly,
22279343712SPeter Avalos  * so we are probably better off not calling them text.
22379343712SPeter Avalos  *
22479343712SPeter Avalos  * A file is considered to be ISO-8859 text if its characters are all
22579343712SPeter Avalos  * either ASCII, according to the above definition, or printing characters
22679343712SPeter Avalos  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
22779343712SPeter Avalos  *
22879343712SPeter Avalos  * Finally, a file is considered to be international text from some other
22979343712SPeter Avalos  * character code if its characters are all either ISO-8859 (according to
23079343712SPeter Avalos  * the above definition) or characters in the range 0x80 ... 0x9F, which
23179343712SPeter Avalos  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
23279343712SPeter Avalos  * consider to be printing characters.
23379343712SPeter Avalos  */
23479343712SPeter Avalos 
23579343712SPeter Avalos #define F 0   /* character never appears in text */
23679343712SPeter Avalos #define T 1   /* character appears in plain ASCII text */
23779343712SPeter Avalos #define I 2   /* character appears in ISO-8859 text */
23879343712SPeter Avalos #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
23979343712SPeter Avalos 
24079343712SPeter Avalos private char text_chars[256] = {
241c30bd091SSascha Wildner 	/*                  BEL BS HT LF VT FF CR    */
242c30bd091SSascha Wildner 	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
24379343712SPeter Avalos 	/*                              ESC          */
24479343712SPeter Avalos 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
24579343712SPeter Avalos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
24679343712SPeter Avalos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
24779343712SPeter Avalos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
24879343712SPeter Avalos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
24979343712SPeter Avalos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
25079343712SPeter Avalos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
25179343712SPeter Avalos 	/*            NEL                            */
25279343712SPeter Avalos 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
25379343712SPeter Avalos 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
25479343712SPeter Avalos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
25579343712SPeter Avalos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
25679343712SPeter Avalos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
25779343712SPeter Avalos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
25879343712SPeter Avalos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
25979343712SPeter Avalos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
26079343712SPeter Avalos };
26179343712SPeter Avalos 
262970935fdSSascha Wildner #define LOOKS(NAME, COND) \
263970935fdSSascha Wildner private int \
264970935fdSSascha Wildner looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
265970935fdSSascha Wildner     size_t *ulen) \
266970935fdSSascha Wildner { \
267614728caSSascha Wildner 	size_t i; \
268970935fdSSascha Wildner \
269970935fdSSascha Wildner 	*ulen = 0; \
270970935fdSSascha Wildner \
271970935fdSSascha Wildner 	for (i = 0; i < nbytes; i++) { \
272970935fdSSascha Wildner 		int t = text_chars[buf[i]]; \
273970935fdSSascha Wildner \
274970935fdSSascha Wildner 		if (COND) \
275970935fdSSascha Wildner 			return 0; \
276970935fdSSascha Wildner \
277970935fdSSascha Wildner 		ubuf[(*ulen)++] = buf[i]; \
278970935fdSSascha Wildner 	} \
279970935fdSSascha Wildner 	return 1; \
28079343712SPeter Avalos }
28179343712SPeter Avalos 
282970935fdSSascha Wildner LOOKS(ascii, t != T)
283970935fdSSascha Wildner LOOKS(latin1, t != T && t != I)
284970935fdSSascha Wildner LOOKS(extended, t != T && t != I && t != X)
28579343712SPeter Avalos 
/*
 * Decide whether some text looks like UTF-8. Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
 * ubuf must be big enough!
 */

// from: https://golang.org/src/unicode/utf8/utf8.go

/*
 * Lead-byte descriptors.  The low nibble is the total length of the
 * sequence in bytes; the high nibble is the index into accept_ranges[]
 * that constrains the second byte.
 */
#define	XX 0xF1 // invalid: size 1
#define	AS 0xF0 // ASCII: size 1
#define	S1 0x02 // accept 0, size 2
#define	S2 0x13 // accept 1, size 3
#define	S3 0x03 // accept 0, size 3
#define	S4 0x23 // accept 2, size 3
#define	S5 0x34 // accept 3, size 4
#define	S6 0x04 // accept 0, size 4
#define	S7 0x44 // accept 4, size 4

/* Lowest and highest value of an ordinary continuation byte. */
#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
static const uint8_t first[] = {
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
} accept_ranges[16] = {
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// Only the first five entries are meaningful; the rest are zero-filled.
	{ LOCB, HICB },
	{ 0xA0, HICB },
	{ LOCB, 0x9F },
	{ 0x90, HICB },
	{ LOCB, 0x8F },
};

34979343712SPeter Avalos protected int
file_looks_utf8(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)350970935fdSSascha Wildner file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
351970935fdSSascha Wildner     size_t *ulen)
35279343712SPeter Avalos {
35379343712SPeter Avalos 	size_t i;
35479343712SPeter Avalos 	int n;
355970935fdSSascha Wildner 	file_unichar_t c;
35679343712SPeter Avalos 	int gotone = 0, ctrl = 0;
35779343712SPeter Avalos 
35879343712SPeter Avalos 	if (ubuf)
35979343712SPeter Avalos 		*ulen = 0;
36079343712SPeter Avalos 
36179343712SPeter Avalos 	for (i = 0; i < nbytes; i++) {
36279343712SPeter Avalos 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
36379343712SPeter Avalos 			/*
36479343712SPeter Avalos 			 * Even if the whole file is valid UTF-8 sequences,
36579343712SPeter Avalos 			 * still reject it if it uses weird control characters.
36679343712SPeter Avalos 			 */
36779343712SPeter Avalos 
36879343712SPeter Avalos 			if (text_chars[buf[i]] != T)
36979343712SPeter Avalos 				ctrl = 1;
37079343712SPeter Avalos 
37179343712SPeter Avalos 			if (ubuf)
37279343712SPeter Avalos 				ubuf[(*ulen)++] = buf[i];
37379343712SPeter Avalos 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
37479343712SPeter Avalos 			return -1;
37579343712SPeter Avalos 		} else {			   /* 11xxxxxx begins UTF-8 */
37679343712SPeter Avalos 			int following;
377970935fdSSascha Wildner 			uint8_t x = first[buf[i]];
378614728caSSascha Wildner 			const struct accept_range *ar =
379614728caSSascha Wildner 			    &accept_ranges[(unsigned int)x >> 4];
380970935fdSSascha Wildner 			if (x == XX)
381970935fdSSascha Wildner 				return -1;
38279343712SPeter Avalos 
38379343712SPeter Avalos 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
38479343712SPeter Avalos 				c = buf[i] & 0x1f;
38579343712SPeter Avalos 				following = 1;
38679343712SPeter Avalos 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
38779343712SPeter Avalos 				c = buf[i] & 0x0f;
38879343712SPeter Avalos 				following = 2;
38979343712SPeter Avalos 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
39079343712SPeter Avalos 				c = buf[i] & 0x07;
39179343712SPeter Avalos 				following = 3;
39279343712SPeter Avalos 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
39379343712SPeter Avalos 				c = buf[i] & 0x03;
39479343712SPeter Avalos 				following = 4;
39579343712SPeter Avalos 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
39679343712SPeter Avalos 				c = buf[i] & 0x01;
39779343712SPeter Avalos 				following = 5;
39879343712SPeter Avalos 			} else
39979343712SPeter Avalos 				return -1;
40079343712SPeter Avalos 
40179343712SPeter Avalos 			for (n = 0; n < following; n++) {
40279343712SPeter Avalos 				i++;
40379343712SPeter Avalos 				if (i >= nbytes)
40479343712SPeter Avalos 					goto done;
40579343712SPeter Avalos 
406970935fdSSascha Wildner 				if (n == 0 &&
407970935fdSSascha Wildner 				     (buf[i] < ar->lo || buf[i] > ar->hi))
408970935fdSSascha Wildner 					return -1;
409970935fdSSascha Wildner 
41079343712SPeter Avalos 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
41179343712SPeter Avalos 					return -1;
41279343712SPeter Avalos 
41379343712SPeter Avalos 				c = (c << 6) + (buf[i] & 0x3f);
41479343712SPeter Avalos 			}
41579343712SPeter Avalos 
41679343712SPeter Avalos 			if (ubuf)
41779343712SPeter Avalos 				ubuf[(*ulen)++] = c;
41879343712SPeter Avalos 			gotone = 1;
41979343712SPeter Avalos 		}
42079343712SPeter Avalos 	}
42179343712SPeter Avalos done:
42279343712SPeter Avalos 	return ctrl ? 0 : (gotone ? 2 : 1);
42379343712SPeter Avalos }
42479343712SPeter Avalos 
42579343712SPeter Avalos /*
42679343712SPeter Avalos  * Decide whether some text looks like UTF-8 with BOM. If there is no
42779343712SPeter Avalos  * BOM, return -1; otherwise return the result of looks_utf8 on the
42879343712SPeter Avalos  * rest of the text.
42979343712SPeter Avalos  */
43079343712SPeter Avalos private int
looks_utf8_with_BOM(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)431970935fdSSascha Wildner looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
432970935fdSSascha Wildner     file_unichar_t *ubuf, size_t *ulen)
43379343712SPeter Avalos {
43479343712SPeter Avalos 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
43579343712SPeter Avalos 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
43679343712SPeter Avalos 	else
43779343712SPeter Avalos 		return -1;
43879343712SPeter Avalos }
43979343712SPeter Avalos 
44079343712SPeter Avalos private int
looks_utf7(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)441970935fdSSascha Wildner looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
442970935fdSSascha Wildner     size_t *ulen)
443c30bd091SSascha Wildner {
444c30bd091SSascha Wildner 	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
445c30bd091SSascha Wildner 		switch (buf[3]) {
446c30bd091SSascha Wildner 		case '8':
447c30bd091SSascha Wildner 		case '9':
448c30bd091SSascha Wildner 		case '+':
449c30bd091SSascha Wildner 		case '/':
450c30bd091SSascha Wildner 			if (ubuf)
451c30bd091SSascha Wildner 				*ulen = 0;
452c30bd091SSascha Wildner 			return 1;
453c30bd091SSascha Wildner 		default:
454c30bd091SSascha Wildner 			return -1;
455c30bd091SSascha Wildner 		}
456c30bd091SSascha Wildner 	else
457c30bd091SSascha Wildner 		return -1;
458c30bd091SSascha Wildner }
459c30bd091SSascha Wildner 
460*3b9cdfa3SAntonio Huete Jimenez #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
461*3b9cdfa3SAntonio Huete Jimenez #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
462*3b9cdfa3SAntonio Huete Jimenez #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
463*3b9cdfa3SAntonio Huete Jimenez 
464c30bd091SSascha Wildner private int
looks_ucs16(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)465970935fdSSascha Wildner looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
46679343712SPeter Avalos     size_t *ulen)
46779343712SPeter Avalos {
46879343712SPeter Avalos 	int bigend;
469*3b9cdfa3SAntonio Huete Jimenez 	uint32_t hi;
47079343712SPeter Avalos 	size_t i;
47179343712SPeter Avalos 
47279343712SPeter Avalos 	if (nbytes < 2)
47379343712SPeter Avalos 		return 0;
47479343712SPeter Avalos 
4756fca56fbSSascha Wildner 	if (bf[0] == 0xff && bf[1] == 0xfe)
47679343712SPeter Avalos 		bigend = 0;
4776fca56fbSSascha Wildner 	else if (bf[0] == 0xfe && bf[1] == 0xff)
47879343712SPeter Avalos 		bigend = 1;
47979343712SPeter Avalos 	else
48079343712SPeter Avalos 		return 0;
48179343712SPeter Avalos 
48279343712SPeter Avalos 	*ulen = 0;
483*3b9cdfa3SAntonio Huete Jimenez 	hi = 0;
48479343712SPeter Avalos 
48579343712SPeter Avalos 	for (i = 2; i + 1 < nbytes; i += 2) {
486*3b9cdfa3SAntonio Huete Jimenez 		uint32_t uc;
48779343712SPeter Avalos 
48879343712SPeter Avalos 		if (bigend)
489*3b9cdfa3SAntonio Huete Jimenez 			uc = CAST(uint32_t,
490*3b9cdfa3SAntonio Huete Jimenez 			    bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));
49179343712SPeter Avalos 		else
492*3b9cdfa3SAntonio Huete Jimenez 			uc = CAST(uint32_t,
493*3b9cdfa3SAntonio Huete Jimenez 			    bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));
49479343712SPeter Avalos 
495*3b9cdfa3SAntonio Huete Jimenez 		uc &= 0xffff;
496*3b9cdfa3SAntonio Huete Jimenez 
497*3b9cdfa3SAntonio Huete Jimenez 		switch (uc) {
498*3b9cdfa3SAntonio Huete Jimenez 		case 0xfffe:
499*3b9cdfa3SAntonio Huete Jimenez 		case 0xffff:
50079343712SPeter Avalos 			return 0;
501*3b9cdfa3SAntonio Huete Jimenez 		default:
502*3b9cdfa3SAntonio Huete Jimenez 			if (UCS16_NOCHAR(uc))
503*3b9cdfa3SAntonio Huete Jimenez 				return 0;
504*3b9cdfa3SAntonio Huete Jimenez 			break;
505*3b9cdfa3SAntonio Huete Jimenez 		}
506*3b9cdfa3SAntonio Huete Jimenez 		if (hi) {
507*3b9cdfa3SAntonio Huete Jimenez 			if (!UCS16_LOSURR(uc))
508*3b9cdfa3SAntonio Huete Jimenez 				return 0;
509*3b9cdfa3SAntonio Huete Jimenez 			uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);
510*3b9cdfa3SAntonio Huete Jimenez 			hi = 0;
511*3b9cdfa3SAntonio Huete Jimenez 		}
512*3b9cdfa3SAntonio Huete Jimenez 		if (uc < 128 && text_chars[CAST(size_t, uc)] != T)
513*3b9cdfa3SAntonio Huete Jimenez 			return 0;
514*3b9cdfa3SAntonio Huete Jimenez 		ubf[(*ulen)++] = uc;
515*3b9cdfa3SAntonio Huete Jimenez 		if (UCS16_HISURR(uc))
516*3b9cdfa3SAntonio Huete Jimenez 			hi = uc - 0xd800 + 1;
517*3b9cdfa3SAntonio Huete Jimenez 		if (UCS16_LOSURR(uc))
51879343712SPeter Avalos 			return 0;
51979343712SPeter Avalos 	}
52079343712SPeter Avalos 
52179343712SPeter Avalos 	return 1 + bigend;
52279343712SPeter Avalos }
52379343712SPeter Avalos 
5246fca56fbSSascha Wildner private int
looks_ucs32(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)525970935fdSSascha Wildner looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
5266fca56fbSSascha Wildner     size_t *ulen)
5276fca56fbSSascha Wildner {
5286fca56fbSSascha Wildner 	int bigend;
5296fca56fbSSascha Wildner 	size_t i;
5306fca56fbSSascha Wildner 
5316fca56fbSSascha Wildner 	if (nbytes < 4)
5326fca56fbSSascha Wildner 		return 0;
5336fca56fbSSascha Wildner 
5346fca56fbSSascha Wildner 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
5356fca56fbSSascha Wildner 		bigend = 0;
5366fca56fbSSascha Wildner 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
5376fca56fbSSascha Wildner 		bigend = 1;
5386fca56fbSSascha Wildner 	else
5396fca56fbSSascha Wildner 		return 0;
5406fca56fbSSascha Wildner 
5416fca56fbSSascha Wildner 	*ulen = 0;
5426fca56fbSSascha Wildner 
5436fca56fbSSascha Wildner 	for (i = 4; i + 3 < nbytes; i += 4) {
5446fca56fbSSascha Wildner 		/* XXX fix to properly handle chars > 65536 */
5456fca56fbSSascha Wildner 
5466fca56fbSSascha Wildner 		if (bigend)
547970935fdSSascha Wildner 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
548970935fdSSascha Wildner 			    | (CAST(file_unichar_t, bf[i + 2]) << 8)
549970935fdSSascha Wildner 			    | (CAST(file_unichar_t, bf[i + 1]) << 16)
550970935fdSSascha Wildner 			    | (CAST(file_unichar_t, bf[i]) << 24);
5516fca56fbSSascha Wildner 		else
552970935fdSSascha Wildner 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
553970935fdSSascha Wildner 			    | (CAST(file_unichar_t, bf[i + 1]) << 8)
554970935fdSSascha Wildner 			    | (CAST(file_unichar_t, bf[i + 2]) << 16)
555970935fdSSascha Wildner 			    | (CAST(file_unichar_t, bf[i + 3]) << 24);
5566fca56fbSSascha Wildner 
5576fca56fbSSascha Wildner 		if (ubf[*ulen - 1] == 0xfffe)
5586fca56fbSSascha Wildner 			return 0;
5596fca56fbSSascha Wildner 		if (ubf[*ulen - 1] < 128 &&
5606fca56fbSSascha Wildner 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
5616fca56fbSSascha Wildner 			return 0;
5626fca56fbSSascha Wildner 	}
5636fca56fbSSascha Wildner 
5646fca56fbSSascha Wildner 	return 1 + bigend;
5656fca56fbSSascha Wildner }
56679343712SPeter Avalos #undef F
56779343712SPeter Avalos #undef T
56879343712SPeter Avalos #undef I
56979343712SPeter Avalos #undef X
57079343712SPeter Avalos 
57179343712SPeter Avalos /*
57279343712SPeter Avalos  * This table maps each EBCDIC character to an (8-bit extended) ASCII
57379343712SPeter Avalos  * character, as specified in the rationale for the dd(1) command in
57479343712SPeter Avalos  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
57579343712SPeter Avalos  *
57679343712SPeter Avalos  * Unfortunately it does not seem to correspond exactly to any of the
57779343712SPeter Avalos  * five variants of EBCDIC documented in IBM's _Enterprise Systems
57879343712SPeter Avalos  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
57979343712SPeter Avalos  * Edition, July, 1999, pp. I-1 - I-4.
58079343712SPeter Avalos  *
58179343712SPeter Avalos  * Fortunately, though, all versions of EBCDIC, including this one, agree
58279343712SPeter Avalos  * on most of the printing characters that also appear in (7-bit) ASCII.
58379343712SPeter Avalos  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
58479343712SPeter Avalos  *
58579343712SPeter Avalos  * Fortunately too, there is general agreement that codes 0x00 through
58679343712SPeter Avalos  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
58779343712SPeter Avalos  * remainder printing characters.
58879343712SPeter Avalos  *
58979343712SPeter Avalos  * This is sufficient to allow us to identify EBCDIC text and to distinguish
59079343712SPeter Avalos  * between old-style and internationalized examples of text.
59179343712SPeter Avalos  */
59279343712SPeter Avalos 
59379343712SPeter Avalos private unsigned char ebcdic_to_ascii[] = {
59479343712SPeter Avalos   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
59579343712SPeter Avalos  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
59679343712SPeter Avalos 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
59779343712SPeter Avalos 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
59879343712SPeter Avalos ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
59979343712SPeter Avalos '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
60079343712SPeter Avalos '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
60179343712SPeter Avalos 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
60279343712SPeter Avalos 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
60379343712SPeter Avalos 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
60479343712SPeter Avalos 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
60579343712SPeter Avalos 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
60679343712SPeter Avalos '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
60779343712SPeter Avalos '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
60879343712SPeter Avalos '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
60979343712SPeter Avalos '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
61079343712SPeter Avalos };
61179343712SPeter Avalos 
61279343712SPeter Avalos #ifdef notdef
61379343712SPeter Avalos /*
61479343712SPeter Avalos  * The following EBCDIC-to-ASCII table may relate more closely to reality,
61579343712SPeter Avalos  * or at least to modern reality.  It comes from
61679343712SPeter Avalos  *
61779343712SPeter Avalos  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
61879343712SPeter Avalos  *
61979343712SPeter Avalos  * and maps the characters of EBCDIC code page 1047 (the code used for
62079343712SPeter Avalos  * Unix-derived software on IBM's 390 systems) to the corresponding
62179343712SPeter Avalos  * characters from ISO 8859-1.
62279343712SPeter Avalos  *
62379343712SPeter Avalos  * If this table is used instead of the above one, some of the special
62479343712SPeter Avalos  * cases for the NEL character can be taken out of the code.
62579343712SPeter Avalos  */
62679343712SPeter Avalos 
62779343712SPeter Avalos private unsigned char ebcdic_1047_to_8859[] = {
62879343712SPeter Avalos 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
62979343712SPeter Avalos 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
63079343712SPeter Avalos 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
63179343712SPeter Avalos 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
63279343712SPeter Avalos 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
63379343712SPeter Avalos 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
63479343712SPeter Avalos 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
63579343712SPeter Avalos 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
63679343712SPeter Avalos 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
63779343712SPeter Avalos 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
63879343712SPeter Avalos 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
63979343712SPeter Avalos 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
64079343712SPeter Avalos 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
64179343712SPeter Avalos 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
64279343712SPeter Avalos 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
64379343712SPeter Avalos 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
64479343712SPeter Avalos };
64579343712SPeter Avalos #endif
64679343712SPeter Avalos 
64779343712SPeter Avalos /*
64879343712SPeter Avalos  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
64979343712SPeter Avalos  */
65079343712SPeter Avalos private void
from_ebcdic(const unsigned char * buf,size_t nbytes,unsigned char * out)65179343712SPeter Avalos from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
65279343712SPeter Avalos {
65379343712SPeter Avalos 	size_t i;
65479343712SPeter Avalos 
65579343712SPeter Avalos 	for (i = 0; i < nbytes; i++) {
65679343712SPeter Avalos 		out[i] = ebcdic_to_ascii[buf[i]];
65779343712SPeter Avalos 	}
65879343712SPeter Avalos }
659