xref: /freebsd-src/contrib/file/src/encoding.c (revision ae316d1d1cffd71ab7751f94e10118777a88e027)
1b6cee71dSXin LI /*
2b6cee71dSXin LI  * Copyright (c) Ian F. Darwin 1986-1995.
3b6cee71dSXin LI  * Software written by Ian F. Darwin and others;
4b6cee71dSXin LI  * maintained 1995-present by Christos Zoulas and others.
5b6cee71dSXin LI  *
6b6cee71dSXin LI  * Redistribution and use in source and binary forms, with or without
7b6cee71dSXin LI  * modification, are permitted provided that the following conditions
8b6cee71dSXin LI  * are met:
9b6cee71dSXin LI  * 1. Redistributions of source code must retain the above copyright
10b6cee71dSXin LI  *    notice immediately at the beginning of the file, without modification,
11b6cee71dSXin LI  *    this list of conditions, and the following disclaimer.
12b6cee71dSXin LI  * 2. Redistributions in binary form must reproduce the above copyright
13b6cee71dSXin LI  *    notice, this list of conditions and the following disclaimer in the
14b6cee71dSXin LI  *    documentation and/or other materials provided with the distribution.
15b6cee71dSXin LI  *
16b6cee71dSXin LI  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17b6cee71dSXin LI  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18b6cee71dSXin LI  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19b6cee71dSXin LI  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20b6cee71dSXin LI  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21b6cee71dSXin LI  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22b6cee71dSXin LI  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23b6cee71dSXin LI  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24b6cee71dSXin LI  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25b6cee71dSXin LI  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26b6cee71dSXin LI  * SUCH DAMAGE.
27b6cee71dSXin LI  */
28b6cee71dSXin LI /*
29b6cee71dSXin LI  * Encoding -- determine the character encoding of a text file.
30b6cee71dSXin LI  *
31b6cee71dSXin LI  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
32b6cee71dSXin LI  * international characters.
33b6cee71dSXin LI  */
34b6cee71dSXin LI 
35b6cee71dSXin LI #include "file.h"
36b6cee71dSXin LI 
37b6cee71dSXin LI #ifndef	lint
38*ae316d1dSXin LI FILE_RCSID("@(#)$File: encoding.c,v 1.43 2024/10/29 20:56:48 christos Exp $")
39b6cee71dSXin LI #endif	/* lint */
40b6cee71dSXin LI 
41b6cee71dSXin LI #include "magic.h"
42b6cee71dSXin LI #include <string.h>
43b6cee71dSXin LI #include <stdlib.h>
44b6cee71dSXin LI 
45b6cee71dSXin LI 
46898496eeSXin LI file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
47b6cee71dSXin LI     size_t *);
48898496eeSXin LI file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
4943a5ec4eSXin LI     size_t *);
50898496eeSXin LI file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
5143a5ec4eSXin LI     size_t *);
52898496eeSXin LI file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
5343a5ec4eSXin LI     size_t *);
54898496eeSXin LI file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
5543a5ec4eSXin LI     size_t *);
56898496eeSXin LI file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
5743a5ec4eSXin LI     size_t *);
58898496eeSXin LI file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
5943a5ec4eSXin LI     size_t *);
60898496eeSXin LI file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
61b6cee71dSXin LI 
/*
 * Debug tracing.  With DEBUG_ENCODING defined, DPRINTF((fmt, ...)) expands
 * to a printf call; otherwise it expands to nothing.  The single macro
 * argument is a complete parenthesized printf argument list, which is why
 * call sites use doubled parentheses.
 */
#ifdef DEBUG_ENCODING
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
67b6cee71dSXin LI 
68b6cee71dSXin LI /*
69b6cee71dSXin LI  * Try to determine whether text is in some character code we can
70b6cee71dSXin LI  * identify.  Each of these tests, if it succeeds, will leave
7143a5ec4eSXin LI  * the text converted into one-file_unichar_t-per-character Unicode in
72b6cee71dSXin LI  * ubuf, and the number of characters converted in ulen.
73b6cee71dSXin LI  */
74898496eeSXin LI file_protected int
7543a5ec4eSXin LI file_encoding(struct magic_set *ms, const struct buffer *b,
7643a5ec4eSXin LI     file_unichar_t **ubuf, size_t *ulen, const char **code,
7743a5ec4eSXin LI     const char **code_mime, const char **type)
78b6cee71dSXin LI {
7948c779cdSXin LI 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
8058a0f0d0SEitan Adler 	size_t nbytes = b->flen;
81b6cee71dSXin LI 	size_t mlen;
82b6cee71dSXin LI 	int rv = 1, ucs_type;
8343a5ec4eSXin LI 	file_unichar_t *udefbuf;
8458a0f0d0SEitan Adler 	size_t udeflen;
8558a0f0d0SEitan Adler 
8658a0f0d0SEitan Adler 	if (ubuf == NULL)
8758a0f0d0SEitan Adler 		ubuf = &udefbuf;
8858a0f0d0SEitan Adler 	if (ulen == NULL)
8958a0f0d0SEitan Adler 		ulen = &udeflen;
90b6cee71dSXin LI 
91b6cee71dSXin LI 	*type = "text";
92b6cee71dSXin LI 	*ulen = 0;
93b6cee71dSXin LI 	*code = "unknown";
94b6cee71dSXin LI 	*code_mime = "binary";
95b6cee71dSXin LI 
9643a5ec4eSXin LI 	if (nbytes > ms->encoding_max)
9743a5ec4eSXin LI 		nbytes = ms->encoding_max;
9843a5ec4eSXin LI 
99b6cee71dSXin LI 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
10043a5ec4eSXin LI 	*ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
10143a5ec4eSXin LI 	if (*ubuf == NULL) {
102b6cee71dSXin LI 		file_oomem(ms, mlen);
103b6cee71dSXin LI 		goto done;
104b6cee71dSXin LI 	}
105b6cee71dSXin LI 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
1065f0216bdSXin LI 		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
1075f0216bdSXin LI 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
10843a5ec4eSXin LI 			*code = "Unicode text, UTF-7";
1095f0216bdSXin LI 			*code_mime = "utf-7";
1105f0216bdSXin LI 		} else {
111b6cee71dSXin LI 			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
112b6cee71dSXin LI 			*code = "ASCII";
113b6cee71dSXin LI 			*code_mime = "us-ascii";
1145f0216bdSXin LI 		}
115b6cee71dSXin LI 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
116b6cee71dSXin LI 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
11743a5ec4eSXin LI 		*code = "Unicode text, UTF-8 (with BOM)";
118b6cee71dSXin LI 		*code_mime = "utf-8";
119b6cee71dSXin LI 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
120b6cee71dSXin LI 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
12143a5ec4eSXin LI 		*code = "Unicode text, UTF-8";
122b6cee71dSXin LI 		*code_mime = "utf-8";
12348c779cdSXin LI 	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
12448c779cdSXin LI 		if (ucs_type == 1) {
12543a5ec4eSXin LI 			*code = "Unicode text, UTF-32, little-endian";
12648c779cdSXin LI 			*code_mime = "utf-32le";
12748c779cdSXin LI 		} else {
12843a5ec4eSXin LI 			*code = "Unicode text, UTF-32, big-endian";
12948c779cdSXin LI 			*code_mime = "utf-32be";
13048c779cdSXin LI 		}
13148c779cdSXin LI 		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
132b6cee71dSXin LI 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
133b6cee71dSXin LI 		if (ucs_type == 1) {
13443a5ec4eSXin LI 			*code = "Unicode text, UTF-16, little-endian";
135b6cee71dSXin LI 			*code_mime = "utf-16le";
136b6cee71dSXin LI 		} else {
13743a5ec4eSXin LI 			*code = "Unicode text, UTF-16, big-endian";
138b6cee71dSXin LI 			*code_mime = "utf-16be";
139b6cee71dSXin LI 		}
140b6cee71dSXin LI 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
141b6cee71dSXin LI 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
142b6cee71dSXin LI 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
143b6cee71dSXin LI 		*code = "ISO-8859";
144b6cee71dSXin LI 		*code_mime = "iso-8859-1";
145b6cee71dSXin LI 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
146b6cee71dSXin LI 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
147b6cee71dSXin LI 		*code = "Non-ISO extended-ASCII";
148b6cee71dSXin LI 		*code_mime = "unknown-8bit";
149b6cee71dSXin LI 	} else {
150a4d6d3b8SXin LI 		unsigned char *nbuf;
151a4d6d3b8SXin LI 
152a4d6d3b8SXin LI 		mlen = (nbytes + 1) * sizeof(nbuf[0]);
153a4d6d3b8SXin LI 		if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) {
154a4d6d3b8SXin LI 			file_oomem(ms, mlen);
155a4d6d3b8SXin LI 			goto done;
156a4d6d3b8SXin LI 		}
157b6cee71dSXin LI 		from_ebcdic(buf, nbytes, nbuf);
158b6cee71dSXin LI 
159b6cee71dSXin LI 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
160b6cee71dSXin LI 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
161b6cee71dSXin LI 			*code = "EBCDIC";
162b6cee71dSXin LI 			*code_mime = "ebcdic";
163b6cee71dSXin LI 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
164b6cee71dSXin LI 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
165b6cee71dSXin LI 			    *ulen));
166b6cee71dSXin LI 			*code = "International EBCDIC";
167b6cee71dSXin LI 			*code_mime = "ebcdic";
168b6cee71dSXin LI 		} else { /* Doesn't look like text at all */
169b6cee71dSXin LI 			DPRINTF(("binary\n"));
170b6cee71dSXin LI 			rv = 0;
171b6cee71dSXin LI 			*type = "binary";
172b6cee71dSXin LI 		}
173a4d6d3b8SXin LI 		free(nbuf);
174b6cee71dSXin LI 	}
175b6cee71dSXin LI 
176b6cee71dSXin LI  done:
17758a0f0d0SEitan Adler 	if (ubuf == &udefbuf)
17858a0f0d0SEitan Adler 		free(udefbuf);
179b6cee71dSXin LI 
180b6cee71dSXin LI 	return rv;
181b6cee71dSXin LI }
182b6cee71dSXin LI 
/*
 * This table reflects a particular philosophy about what constitutes
 * "text," and there is room for disagreement about it.
 *
 * Version 3.31 of the file command considered a file to be ASCII if
 * each of its characters was approved by either the isascii() or
 * isalpha() function.  On most systems, this would mean that any
 * file consisting only of characters in the range 0x00 ... 0x7F
 * would be called ASCII text, but many systems might reasonably
 * consider some characters outside this range to be alphabetic,
 * so the file command would call such characters ASCII.  It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab was originally excluded because it never seemed to be used in
 * real text, but the table below now accepts it (0x0B is marked T).
 * I also include, with hesitation, the X3.64/ECMA-43 control nextline
 * (0x85), because that's what the dd EBCDIC->ASCII table maps the EBCDIC
 * newline character to.  It might be more appropriate to include it in
 * the 8859 set instead of the ASCII set, but it's got to be included in
 * *something* we recognize or EBCDIC files aren't going to be considered
 * textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
 */
234b6cee71dSXin LI 
235b6cee71dSXin LI #define F 0   /* character never appears in text */
236b6cee71dSXin LI #define T 1   /* character appears in plain ASCII text */
237b6cee71dSXin LI #define I 2   /* character appears in ISO-8859 text */
238b6cee71dSXin LI #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
239b6cee71dSXin LI 
240898496eeSXin LI file_private char text_chars[256] = {
2415f0216bdSXin LI 	/*                  BEL BS HT LF VT FF CR    */
2425f0216bdSXin LI 	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
243b6cee71dSXin LI 	/*                              ESC          */
244b6cee71dSXin LI 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
245b6cee71dSXin LI 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
246b6cee71dSXin LI 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
247b6cee71dSXin LI 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
248b6cee71dSXin LI 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
249b6cee71dSXin LI 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
250b6cee71dSXin LI 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
251b6cee71dSXin LI 	/*            NEL                            */
252b6cee71dSXin LI 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
253b6cee71dSXin LI 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
254b6cee71dSXin LI 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
255b6cee71dSXin LI 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
256b6cee71dSXin LI 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
257b6cee71dSXin LI 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
258b6cee71dSXin LI 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
259b6cee71dSXin LI 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
260b6cee71dSXin LI };
261b6cee71dSXin LI 
26243a5ec4eSXin LI #define LOOKS(NAME, COND) \
263898496eeSXin LI file_private int \
26443a5ec4eSXin LI looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
26543a5ec4eSXin LI     size_t *ulen) \
26643a5ec4eSXin LI { \
26743a5ec4eSXin LI 	size_t i; \
26843a5ec4eSXin LI \
26943a5ec4eSXin LI 	*ulen = 0; \
27043a5ec4eSXin LI \
27143a5ec4eSXin LI 	for (i = 0; i < nbytes; i++) { \
27243a5ec4eSXin LI 		int t = text_chars[buf[i]]; \
27343a5ec4eSXin LI \
27443a5ec4eSXin LI 		if (COND) \
27543a5ec4eSXin LI 			return 0; \
27643a5ec4eSXin LI \
27743a5ec4eSXin LI 		ubuf[(*ulen)++] = buf[i]; \
27843a5ec4eSXin LI 	} \
27943a5ec4eSXin LI 	return 1; \
280b6cee71dSXin LI }
281b6cee71dSXin LI 
28243a5ec4eSXin LI LOOKS(ascii, t != T)
28343a5ec4eSXin LI LOOKS(latin1, t != T && t != I)
28443a5ec4eSXin LI LOOKS(extended, t != T && t != I && t != X)
285b6cee71dSXin LI 
/*
 * Decide whether some text looks like UTF-8. Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
 * ubuf must be big enough!
 */
29743a5ec4eSXin LI 
// from: https://golang.org/src/unicode/utf8/utf8.go

/*
 * Codes stored in first[] below.  The low nibble is the length in bytes
 * of the sequence introduced by the lead byte; the high nibble is the
 * index into accept_ranges[] of the range the second byte must fall in
 * (file_looks_utf8() uses `x >> 4' for that lookup).  XX and AS mark
 * invalid lead bytes and plain ASCII respectively.
 */
#define	XX 0xF1 // invalid: size 1
#define	AS 0xF0 // ASCII: size 1
#define	S1 0x02 // accept 0, size 2
#define	S2 0x13 // accept 1, size 3
#define	S3 0x03 // accept 0, size 3
#define	S4 0x23 // accept 2, size 3
#define	S5 0x34 // accept 3, size 4
#define	S6 0x04 // accept 0, size 4
#define	S7 0x44 // accept 4, size 4

/* Bounds of an ordinary continuation byte (10xxxxxx). */
#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
// It is indexed by the byte value.  0x80-0xC1 and 0xF5-0xFF are XX, so no
// sequence longer than four bytes is ever accepted.
static const uint8_t first[] = {
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};
33443a5ec4eSXin LI 
// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
// The table is indexed by the high nibble of a first[] entry.  Only entries
// 0-4 are initialized explicitly; the remaining eleven are zero-filled by
// the initializer.  The high nibble of XX and AS is 15, which is why the
// array needs 16 slots even though file_looks_utf8() rejects XX before
// reading the range.
static struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
} accept_ranges[16] = {
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
	{ LOCB, HICB },	// 0: any continuation byte
	{ 0xA0, HICB },	// 1: used by S2 (lead byte 0xE0)
	{ LOCB, 0x9F },	// 2: used by S4 (lead byte 0xED)
	{ 0x90, HICB },	// 3: used by S5 (lead byte 0xF0)
	{ LOCB, 0x8F },	// 4: used by S7 (lead byte 0xF4)
};
34843a5ec4eSXin LI 
349898496eeSXin LI file_protected int
35043a5ec4eSXin LI file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
35143a5ec4eSXin LI     size_t *ulen)
352b6cee71dSXin LI {
353b6cee71dSXin LI 	size_t i;
354b6cee71dSXin LI 	int n;
35543a5ec4eSXin LI 	file_unichar_t c;
356b6cee71dSXin LI 	int gotone = 0, ctrl = 0;
357b6cee71dSXin LI 
358b6cee71dSXin LI 	if (ubuf)
359b6cee71dSXin LI 		*ulen = 0;
360b6cee71dSXin LI 
361b6cee71dSXin LI 	for (i = 0; i < nbytes; i++) {
362b6cee71dSXin LI 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
363b6cee71dSXin LI 			/*
364b6cee71dSXin LI 			 * Even if the whole file is valid UTF-8 sequences,
365b6cee71dSXin LI 			 * still reject it if it uses weird control characters.
366b6cee71dSXin LI 			 */
367b6cee71dSXin LI 
368b6cee71dSXin LI 			if (text_chars[buf[i]] != T)
369b6cee71dSXin LI 				ctrl = 1;
370b6cee71dSXin LI 
371b6cee71dSXin LI 			if (ubuf)
372b6cee71dSXin LI 				ubuf[(*ulen)++] = buf[i];
373b6cee71dSXin LI 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
374b6cee71dSXin LI 			return -1;
375b6cee71dSXin LI 		} else {			   /* 11xxxxxx begins UTF-8 */
376b6cee71dSXin LI 			int following;
37743a5ec4eSXin LI 			uint8_t x = first[buf[i]];
37843a5ec4eSXin LI 			const struct accept_range *ar =
37943a5ec4eSXin LI 			    &accept_ranges[(unsigned int)x >> 4];
38043a5ec4eSXin LI 			if (x == XX)
38143a5ec4eSXin LI 				return -1;
382b6cee71dSXin LI 
383b6cee71dSXin LI 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
384b6cee71dSXin LI 				c = buf[i] & 0x1f;
385b6cee71dSXin LI 				following = 1;
386b6cee71dSXin LI 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
387b6cee71dSXin LI 				c = buf[i] & 0x0f;
388b6cee71dSXin LI 				following = 2;
389b6cee71dSXin LI 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
390b6cee71dSXin LI 				c = buf[i] & 0x07;
391b6cee71dSXin LI 				following = 3;
392b6cee71dSXin LI 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
393b6cee71dSXin LI 				c = buf[i] & 0x03;
394b6cee71dSXin LI 				following = 4;
395b6cee71dSXin LI 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
396b6cee71dSXin LI 				c = buf[i] & 0x01;
397b6cee71dSXin LI 				following = 5;
398b6cee71dSXin LI 			} else
399b6cee71dSXin LI 				return -1;
400b6cee71dSXin LI 
401b6cee71dSXin LI 			for (n = 0; n < following; n++) {
402b6cee71dSXin LI 				i++;
403b6cee71dSXin LI 				if (i >= nbytes)
404b6cee71dSXin LI 					goto done;
405b6cee71dSXin LI 
40643a5ec4eSXin LI 				if (n == 0 &&
40743a5ec4eSXin LI 				     (buf[i] < ar->lo || buf[i] > ar->hi))
40843a5ec4eSXin LI 					return -1;
40943a5ec4eSXin LI 
410b6cee71dSXin LI 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
411b6cee71dSXin LI 					return -1;
412b6cee71dSXin LI 
413b6cee71dSXin LI 				c = (c << 6) + (buf[i] & 0x3f);
414b6cee71dSXin LI 			}
415b6cee71dSXin LI 
416b6cee71dSXin LI 			if (ubuf)
417b6cee71dSXin LI 				ubuf[(*ulen)++] = c;
418b6cee71dSXin LI 			gotone = 1;
419b6cee71dSXin LI 		}
420b6cee71dSXin LI 	}
421b6cee71dSXin LI done:
422b6cee71dSXin LI 	return ctrl ? 0 : (gotone ? 2 : 1);
423b6cee71dSXin LI }
424b6cee71dSXin LI 
425b6cee71dSXin LI /*
426b6cee71dSXin LI  * Decide whether some text looks like UTF-8 with BOM. If there is no
427b6cee71dSXin LI  * BOM, return -1; otherwise return the result of looks_utf8 on the
428b6cee71dSXin LI  * rest of the text.
429b6cee71dSXin LI  */
430898496eeSXin LI file_private int
43143a5ec4eSXin LI looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
43243a5ec4eSXin LI     file_unichar_t *ubuf, size_t *ulen)
433b6cee71dSXin LI {
434b6cee71dSXin LI 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
435b6cee71dSXin LI 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
436b6cee71dSXin LI 	else
437b6cee71dSXin LI 		return -1;
438b6cee71dSXin LI }
439b6cee71dSXin LI 
440898496eeSXin LI file_private int
44143a5ec4eSXin LI looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
44243a5ec4eSXin LI     size_t *ulen)
4435f0216bdSXin LI {
4445f0216bdSXin LI 	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
4455f0216bdSXin LI 		switch (buf[3]) {
4465f0216bdSXin LI 		case '8':
4475f0216bdSXin LI 		case '9':
4485f0216bdSXin LI 		case '+':
4495f0216bdSXin LI 		case '/':
4505f0216bdSXin LI 			if (ubuf)
4515f0216bdSXin LI 				*ulen = 0;
4525f0216bdSXin LI 			return 1;
4535f0216bdSXin LI 		default:
4545f0216bdSXin LI 			return -1;
4555f0216bdSXin LI 		}
4565f0216bdSXin LI 	else
4575f0216bdSXin LI 		return -1;
4585f0216bdSXin LI }
4595f0216bdSXin LI 
460a4d6d3b8SXin LI #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
461a4d6d3b8SXin LI #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
462a4d6d3b8SXin LI #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
463a4d6d3b8SXin LI 
464898496eeSXin LI file_private int
46543a5ec4eSXin LI looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
466b6cee71dSXin LI     size_t *ulen)
467b6cee71dSXin LI {
468b6cee71dSXin LI 	int bigend;
469a4d6d3b8SXin LI 	uint32_t hi;
470b6cee71dSXin LI 	size_t i;
471b6cee71dSXin LI 
472b6cee71dSXin LI 	if (nbytes < 2)
473b6cee71dSXin LI 		return 0;
474b6cee71dSXin LI 
47548c779cdSXin LI 	if (bf[0] == 0xff && bf[1] == 0xfe)
476b6cee71dSXin LI 		bigend = 0;
47748c779cdSXin LI 	else if (bf[0] == 0xfe && bf[1] == 0xff)
478b6cee71dSXin LI 		bigend = 1;
479b6cee71dSXin LI 	else
480b6cee71dSXin LI 		return 0;
481b6cee71dSXin LI 
482b6cee71dSXin LI 	*ulen = 0;
483a4d6d3b8SXin LI 	hi = 0;
484b6cee71dSXin LI 
485b6cee71dSXin LI 	for (i = 2; i + 1 < nbytes; i += 2) {
486a4d6d3b8SXin LI 		uint32_t uc;
487b6cee71dSXin LI 
488b6cee71dSXin LI 		if (bigend)
489a2dfb722SXin LI 			uc = CAST(uint32_t,
490a2dfb722SXin LI 			    bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));
491b6cee71dSXin LI 		else
492a2dfb722SXin LI 			uc = CAST(uint32_t,
493a2dfb722SXin LI 			    bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));
494b6cee71dSXin LI 
495a4d6d3b8SXin LI 		uc &= 0xffff;
496a4d6d3b8SXin LI 
497a4d6d3b8SXin LI 		switch (uc) {
498a4d6d3b8SXin LI 		case 0xfffe:
499a4d6d3b8SXin LI 		case 0xffff:
500b6cee71dSXin LI 			return 0;
501a4d6d3b8SXin LI 		default:
502a4d6d3b8SXin LI 			if (UCS16_NOCHAR(uc))
503a4d6d3b8SXin LI 				return 0;
504a4d6d3b8SXin LI 			break;
505a4d6d3b8SXin LI 		}
506a4d6d3b8SXin LI 		if (hi) {
507a4d6d3b8SXin LI 			if (!UCS16_LOSURR(uc))
508a4d6d3b8SXin LI 				return 0;
509a4d6d3b8SXin LI 			uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);
510a4d6d3b8SXin LI 			hi = 0;
511a4d6d3b8SXin LI 		}
512a4d6d3b8SXin LI 		if (uc < 128 && text_chars[CAST(size_t, uc)] != T)
513a4d6d3b8SXin LI 			return 0;
514a4d6d3b8SXin LI 		ubf[(*ulen)++] = uc;
515a4d6d3b8SXin LI 		if (UCS16_HISURR(uc))
516a4d6d3b8SXin LI 			hi = uc - 0xd800 + 1;
517a4d6d3b8SXin LI 		if (UCS16_LOSURR(uc))
518b6cee71dSXin LI 			return 0;
519b6cee71dSXin LI 	}
520b6cee71dSXin LI 
521b6cee71dSXin LI 	return 1 + bigend;
522b6cee71dSXin LI }
523b6cee71dSXin LI 
524898496eeSXin LI file_private int
52543a5ec4eSXin LI looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
52648c779cdSXin LI     size_t *ulen)
52748c779cdSXin LI {
52848c779cdSXin LI 	int bigend;
52948c779cdSXin LI 	size_t i;
53048c779cdSXin LI 
53148c779cdSXin LI 	if (nbytes < 4)
53248c779cdSXin LI 		return 0;
53348c779cdSXin LI 
53448c779cdSXin LI 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
53548c779cdSXin LI 		bigend = 0;
53648c779cdSXin LI 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
53748c779cdSXin LI 		bigend = 1;
53848c779cdSXin LI 	else
53948c779cdSXin LI 		return 0;
54048c779cdSXin LI 
54148c779cdSXin LI 	*ulen = 0;
54248c779cdSXin LI 
54348c779cdSXin LI 	for (i = 4; i + 3 < nbytes; i += 4) {
54448c779cdSXin LI 		/* XXX fix to properly handle chars > 65536 */
54548c779cdSXin LI 
54648c779cdSXin LI 		if (bigend)
54743a5ec4eSXin LI 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
54843a5ec4eSXin LI 			    | (CAST(file_unichar_t, bf[i + 2]) << 8)
54943a5ec4eSXin LI 			    | (CAST(file_unichar_t, bf[i + 1]) << 16)
55043a5ec4eSXin LI 			    | (CAST(file_unichar_t, bf[i]) << 24);
55148c779cdSXin LI 		else
55243a5ec4eSXin LI 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
55343a5ec4eSXin LI 			    | (CAST(file_unichar_t, bf[i + 1]) << 8)
55443a5ec4eSXin LI 			    | (CAST(file_unichar_t, bf[i + 2]) << 16)
55543a5ec4eSXin LI 			    | (CAST(file_unichar_t, bf[i + 3]) << 24);
55648c779cdSXin LI 
55748c779cdSXin LI 		if (ubf[*ulen - 1] == 0xfffe)
55848c779cdSXin LI 			return 0;
55948c779cdSXin LI 		if (ubf[*ulen - 1] < 128 &&
56048c779cdSXin LI 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
56148c779cdSXin LI 			return 0;
56248c779cdSXin LI 	}
56348c779cdSXin LI 
56448c779cdSXin LI 	return 1 + bigend;
56548c779cdSXin LI }
566b6cee71dSXin LI #undef F
567b6cee71dSXin LI #undef T
568b6cee71dSXin LI #undef I
569b6cee71dSXin LI #undef X
570b6cee71dSXin LI 
571b6cee71dSXin LI /*
572b6cee71dSXin LI  * This table maps each EBCDIC character to an (8-bit extended) ASCII
573b6cee71dSXin LI  * character, as specified in the rationale for the dd(1) command in
574b6cee71dSXin LI  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
575b6cee71dSXin LI  *
576b6cee71dSXin LI  * Unfortunately it does not seem to correspond exactly to any of the
577b6cee71dSXin LI  * five variants of EBCDIC documented in IBM's _Enterprise Systems
578b6cee71dSXin LI  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
579b6cee71dSXin LI  * Edition, July, 1999, pp. I-1 - I-4.
580b6cee71dSXin LI  *
581b6cee71dSXin LI  * Fortunately, though, all versions of EBCDIC, including this one, agree
582b6cee71dSXin LI  * on most of the printing characters that also appear in (7-bit) ASCII.
583b6cee71dSXin LI  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
584b6cee71dSXin LI  *
585b6cee71dSXin LI  * Fortunately too, there is general agreement that codes 0x00 through
586b6cee71dSXin LI  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
587b6cee71dSXin LI  * remainder printing characters.
588b6cee71dSXin LI  *
589b6cee71dSXin LI  * This is sufficient to allow us to identify EBCDIC text and to distinguish
590b6cee71dSXin LI  * between old-style and internationalized examples of text.
591b6cee71dSXin LI  */
592b6cee71dSXin LI 
593898496eeSXin LI file_private unsigned char ebcdic_to_ascii[] = {
594b6cee71dSXin LI   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
595b6cee71dSXin LI  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
596b6cee71dSXin LI 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
597b6cee71dSXin LI 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
598b6cee71dSXin LI ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
599b6cee71dSXin LI '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
600b6cee71dSXin LI '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
601b6cee71dSXin LI 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
602b6cee71dSXin LI 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
603b6cee71dSXin LI 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
604b6cee71dSXin LI 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
605b6cee71dSXin LI 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
606b6cee71dSXin LI '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
607b6cee71dSXin LI '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
608b6cee71dSXin LI '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
609b6cee71dSXin LI '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
610b6cee71dSXin LI };
611b6cee71dSXin LI 
/*
 * Everything down to the matching #endif is compiled only when `notdef'
 * is defined; nothing in this file defines it, so the table is kept here
 * as reference material rather than live code.
 */
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

file_private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
646b6cee71dSXin LI 
647b6cee71dSXin LI /*
648b6cee71dSXin LI  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
649b6cee71dSXin LI  */
650898496eeSXin LI file_private void
651b6cee71dSXin LI from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
652b6cee71dSXin LI {
653b6cee71dSXin LI 	size_t i;
654b6cee71dSXin LI 
655b6cee71dSXin LI 	for (i = 0; i < nbytes; i++) {
656b6cee71dSXin LI 		out[i] = ebcdic_to_ascii[buf[i]];
657b6cee71dSXin LI 	}
658b6cee71dSXin LI }
659