/*	$NetBSD: encoding.c,v 1.12 2023/08/18 19:00:11 christos Exp $	*/

/*
 * Copyright (c) Ian F. Darwin 1986-1995.
 * Software written by Ian F. Darwin and others;
 * maintained 1995-present by Christos Zoulas and others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Encoding -- determine the character encoding of a text file.
 *
 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 * international characters.
 */
361b108b8bSchristos 
371b108b8bSchristos #include "file.h"
381b108b8bSchristos 
391b108b8bSchristos #ifndef	lint
401b108b8bSchristos #if 0
41*ddb17682Schristos FILE_RCSID("@(#)$File: encoding.c,v 1.42 2022/12/26 17:31:14 christos Exp $")
421b108b8bSchristos #else
43*ddb17682Schristos __RCSID("$NetBSD: encoding.c,v 1.12 2023/08/18 19:00:11 christos Exp $");
441b108b8bSchristos #endif
451b108b8bSchristos #endif	/* lint */
461b108b8bSchristos 
471b108b8bSchristos #include "magic.h"
481b108b8bSchristos #include <string.h>
491b108b8bSchristos #include <stdlib.h>
501b108b8bSchristos 
511b108b8bSchristos 
52*ddb17682Schristos file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
531b108b8bSchristos     size_t *);
54*ddb17682Schristos file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
55d87b0039Schristos     size_t *);
56*ddb17682Schristos file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
57d87b0039Schristos     size_t *);
58*ddb17682Schristos file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
59d87b0039Schristos     size_t *);
60*ddb17682Schristos file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
61d87b0039Schristos     size_t *);
62*ddb17682Schristos file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
63d87b0039Schristos     size_t *);
64*ddb17682Schristos file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
65d87b0039Schristos     size_t *);
66*ddb17682Schristos file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
671b108b8bSchristos 
/*
 * Debug tracing.  DPRINTF takes one argument: a complete, parenthesized
 * printf argument list, e.g. DPRINTF(("len %d\n", len));
 * With DEBUG_ENCODING defined it expands to a printf call; otherwise it
 * expands to nothing, so its arguments are never evaluated.
 */
#ifdef DEBUG_ENCODING
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
735ccaa8c0Schristos 
741b108b8bSchristos /*
751b108b8bSchristos  * Try to determine whether text is in some character code we can
761b108b8bSchristos  * identify.  Each of these tests, if it succeeds, will leave
77d87b0039Schristos  * the text converted into one-file_unichar_t-per-character Unicode in
781b108b8bSchristos  * ubuf, and the number of characters converted in ulen.
791b108b8bSchristos  */
80*ddb17682Schristos file_protected int
file_encoding(struct magic_set * ms,const struct buffer * b,file_unichar_t ** ubuf,size_t * ulen,const char ** code,const char ** code_mime,const char ** type)81d87b0039Schristos file_encoding(struct magic_set *ms, const struct buffer *b,
82d87b0039Schristos     file_unichar_t **ubuf, size_t *ulen, const char **code,
83d87b0039Schristos     const char **code_mime, const char **type)
841b108b8bSchristos {
85c02f7f97Schristos 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
865efe63deSchristos 	size_t nbytes = b->flen;
871b108b8bSchristos 	size_t mlen;
881b108b8bSchristos 	int rv = 1, ucs_type;
89d87b0039Schristos 	file_unichar_t *udefbuf;
905efe63deSchristos 	size_t udeflen;
915efe63deSchristos 
925efe63deSchristos 	if (ubuf == NULL)
935efe63deSchristos 		ubuf = &udefbuf;
945efe63deSchristos 	if (ulen == NULL)
955efe63deSchristos 		ulen = &udeflen;
961b108b8bSchristos 
97f50962bcSchristos 	*type = "text";
988df916c0Schristos 	*ulen = 0;
998df916c0Schristos 	*code = "unknown";
1008df916c0Schristos 	*code_mime = "binary";
1018df916c0Schristos 
102d87b0039Schristos 	if (nbytes > ms->encoding_max)
103d87b0039Schristos 		nbytes = ms->encoding_max;
104d87b0039Schristos 
1058df916c0Schristos 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
106d87b0039Schristos 	*ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
107d87b0039Schristos 	if (*ubuf == NULL) {
1081b108b8bSchristos 		file_oomem(ms, mlen);
1091b108b8bSchristos 		goto done;
1101b108b8bSchristos 	}
1111b108b8bSchristos 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
11274db5203Schristos 		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
11374db5203Schristos 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
114d87b0039Schristos 			*code = "Unicode text, UTF-7";
11574db5203Schristos 			*code_mime = "utf-7";
11674db5203Schristos 		} else {
1175ccaa8c0Schristos 			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
1181b108b8bSchristos 			*code = "ASCII";
1191b108b8bSchristos 			*code_mime = "us-ascii";
12074db5203Schristos 		}
1211b108b8bSchristos 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
1225ccaa8c0Schristos 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
123d87b0039Schristos 		*code = "Unicode text, UTF-8 (with BOM)";
1241b108b8bSchristos 		*code_mime = "utf-8";
1251b108b8bSchristos 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
1265ccaa8c0Schristos 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
127d87b0039Schristos 		*code = "Unicode text, UTF-8";
1281b108b8bSchristos 		*code_mime = "utf-8";
129d0c65b7bSchristos 	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
130d0c65b7bSchristos 		if (ucs_type == 1) {
131d87b0039Schristos 			*code = "Unicode text, UTF-32, little-endian";
132d0c65b7bSchristos 			*code_mime = "utf-32le";
133d0c65b7bSchristos 		} else {
134d87b0039Schristos 			*code = "Unicode text, UTF-32, big-endian";
135d0c65b7bSchristos 			*code_mime = "utf-32be";
136d0c65b7bSchristos 		}
137d0c65b7bSchristos 		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
1381b108b8bSchristos 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
1391b108b8bSchristos 		if (ucs_type == 1) {
140d87b0039Schristos 			*code = "Unicode text, UTF-16, little-endian";
1411b108b8bSchristos 			*code_mime = "utf-16le";
1421b108b8bSchristos 		} else {
143d87b0039Schristos 			*code = "Unicode text, UTF-16, big-endian";
1441b108b8bSchristos 			*code_mime = "utf-16be";
1451b108b8bSchristos 		}
1465ccaa8c0Schristos 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
1471b108b8bSchristos 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
1485ccaa8c0Schristos 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
1491b108b8bSchristos 		*code = "ISO-8859";
1501b108b8bSchristos 		*code_mime = "iso-8859-1";
1511b108b8bSchristos 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
1525ccaa8c0Schristos 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
1531b108b8bSchristos 		*code = "Non-ISO extended-ASCII";
1541b108b8bSchristos 		*code_mime = "unknown-8bit";
1551b108b8bSchristos 	} else {
1561d4cb158Schristos 		unsigned char *nbuf;
1571d4cb158Schristos 
1581d4cb158Schristos 		mlen = (nbytes + 1) * sizeof(nbuf[0]);
1591d4cb158Schristos 		if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) {
1601d4cb158Schristos 			file_oomem(ms, mlen);
1611d4cb158Schristos 			goto done;
1621d4cb158Schristos 		}
1631b108b8bSchristos 		from_ebcdic(buf, nbytes, nbuf);
1641b108b8bSchristos 
1651b108b8bSchristos 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
1665ccaa8c0Schristos 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
1671b108b8bSchristos 			*code = "EBCDIC";
1681b108b8bSchristos 			*code_mime = "ebcdic";
1691b108b8bSchristos 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
1705ccaa8c0Schristos 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
1715ccaa8c0Schristos 			    *ulen));
1721b108b8bSchristos 			*code = "International EBCDIC";
1731b108b8bSchristos 			*code_mime = "ebcdic";
1741b108b8bSchristos 		} else { /* Doesn't look like text at all */
1755ccaa8c0Schristos 			DPRINTF(("binary\n"));
1761b108b8bSchristos 			rv = 0;
1771b108b8bSchristos 			*type = "binary";
1781b108b8bSchristos 		}
1791d4cb158Schristos 		free(nbuf);
1801b108b8bSchristos 	}
1811b108b8bSchristos 
1821b108b8bSchristos  done:
1835efe63deSchristos 	if (ubuf == &udefbuf)
1845efe63deSchristos 		free(udefbuf);
1851b108b8bSchristos 
1861b108b8bSchristos 	return rv;
1871b108b8bSchristos }
1881b108b8bSchristos 
/*
 * This table reflects a particular philosophy about what constitutes
 * "text," and there is room for disagreement about it.
 *
 * Version 3.31 of the file command considered a file to be ASCII if
 * each of its characters was approved by either the isascii() or
 * isalpha() function.  On most systems, this would mean that any
 * file consisting only of characters in the range 0x00 ... 0x7F
 * would be called ASCII text, but many systems might reasonably
 * consider some characters outside this range to be alphabetic,
 * so the file command would call such characters ASCII.  It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab was originally excluded because it never seems to be used in real
 * text, but the table below now accepts it.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
 */
2401b108b8bSchristos 
2411b108b8bSchristos #define F 0   /* character never appears in text */
2421b108b8bSchristos #define T 1   /* character appears in plain ASCII text */
2431b108b8bSchristos #define I 2   /* character appears in ISO-8859 text */
2441b108b8bSchristos #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
2451b108b8bSchristos 
246*ddb17682Schristos file_private char text_chars[256] = {
24774db5203Schristos 	/*                  BEL BS HT LF VT FF CR    */
24874db5203Schristos 	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
2491b108b8bSchristos 	/*                              ESC          */
2501b108b8bSchristos 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
2511b108b8bSchristos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
2521b108b8bSchristos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
2531b108b8bSchristos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
2541b108b8bSchristos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
2551b108b8bSchristos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
2561b108b8bSchristos 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
2571b108b8bSchristos 	/*            NEL                            */
2581b108b8bSchristos 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
2591b108b8bSchristos 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
2601b108b8bSchristos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
2611b108b8bSchristos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
2621b108b8bSchristos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
2631b108b8bSchristos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
2641b108b8bSchristos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
2651b108b8bSchristos 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
2661b108b8bSchristos };
2671b108b8bSchristos 
268d87b0039Schristos #define LOOKS(NAME, COND) \
269*ddb17682Schristos file_private int \
270d87b0039Schristos looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
271d87b0039Schristos     size_t *ulen) \
272d87b0039Schristos { \
2731d4cb158Schristos 	size_t i; \
274d87b0039Schristos \
275d87b0039Schristos 	*ulen = 0; \
276d87b0039Schristos \
277d87b0039Schristos 	for (i = 0; i < nbytes; i++) { \
278d87b0039Schristos 		int t = text_chars[buf[i]]; \
279d87b0039Schristos \
280d87b0039Schristos 		if (COND) \
281d87b0039Schristos 			return 0; \
282d87b0039Schristos \
283d87b0039Schristos 		ubuf[(*ulen)++] = buf[i]; \
284d87b0039Schristos 	} \
285d87b0039Schristos 	return 1; \
2861b108b8bSchristos }
2871b108b8bSchristos 
288d87b0039Schristos LOOKS(ascii, t != T)
289d87b0039Schristos LOOKS(latin1, t != T && t != I)
290d87b0039Schristos LOOKS(extended, t != T && t != I && t != X)
2911b108b8bSchristos 
/*
 * Decide whether some text looks like UTF-8. Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
 * ubuf must be big enough!
 *
 * A multi-byte sequence that is cut off by the end of the buffer is
 * tolerated rather than reported as invalid.
 */
303d87b0039Schristos 
// from: https://golang.org/src/unicode/utf8/utf8.go

// Lead-byte descriptors stored in first[].  The high nibble selects an
// entry of accept_ranges[] (the allowed range of the second byte); the
// low nibble is the total sequence length in bytes.
#define	XX 0xF1 // invalid: size 1
#define	AS 0xF0 // ASCII: size 1
#define	S1 0x02 // accept 0, size 2
#define	S2 0x13 // accept 1, size 3
#define	S3 0x03 // accept 0, size 3
#define	S4 0x23 // accept 2, size 3
#define	S5 0x34 // accept 3, size 4
#define	S6 0x04 // accept 0, size 4
#define	S7 0x44 // accept 4, size 4

// Lowest and highest value of an ordinary continuation byte.
#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
static const uint8_t first[] = {
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.  The table is read-only and used only within this file, so it
// is static const rather than a writable global symbol.
static const struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
} accept_ranges[16] = {
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// Entries past the five listed here are zero-initialized.
	{ LOCB, HICB },
	{ 0xA0, HICB },
	{ LOCB, 0x9F },
	{ 0x90, HICB },
	{ LOCB, 0x8F },
};
354d87b0039Schristos 
355*ddb17682Schristos file_protected int
file_looks_utf8(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)356d87b0039Schristos file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
357d87b0039Schristos     size_t *ulen)
3581b108b8bSchristos {
3591b108b8bSchristos 	size_t i;
3601b108b8bSchristos 	int n;
361d87b0039Schristos 	file_unichar_t c;
3621b108b8bSchristos 	int gotone = 0, ctrl = 0;
3631b108b8bSchristos 
3641b108b8bSchristos 	if (ubuf)
3651b108b8bSchristos 		*ulen = 0;
3661b108b8bSchristos 
3671b108b8bSchristos 	for (i = 0; i < nbytes; i++) {
3681b108b8bSchristos 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
3691b108b8bSchristos 			/*
3701b108b8bSchristos 			 * Even if the whole file is valid UTF-8 sequences,
3711b108b8bSchristos 			 * still reject it if it uses weird control characters.
3721b108b8bSchristos 			 */
3731b108b8bSchristos 
3741b108b8bSchristos 			if (text_chars[buf[i]] != T)
3751b108b8bSchristos 				ctrl = 1;
3761b108b8bSchristos 
3771b108b8bSchristos 			if (ubuf)
3781b108b8bSchristos 				ubuf[(*ulen)++] = buf[i];
3791b108b8bSchristos 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
3801b108b8bSchristos 			return -1;
3811b108b8bSchristos 		} else {			   /* 11xxxxxx begins UTF-8 */
3821b108b8bSchristos 			int following;
383d87b0039Schristos 			uint8_t x = first[buf[i]];
384d87b0039Schristos 			const struct accept_range *ar =
385d87b0039Schristos 			    &accept_ranges[(unsigned int)x >> 4];
386d87b0039Schristos 			if (x == XX)
387d87b0039Schristos 				return -1;
3881b108b8bSchristos 
3891b108b8bSchristos 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
3901b108b8bSchristos 				c = buf[i] & 0x1f;
3911b108b8bSchristos 				following = 1;
3921b108b8bSchristos 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
3931b108b8bSchristos 				c = buf[i] & 0x0f;
3941b108b8bSchristos 				following = 2;
3951b108b8bSchristos 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
3961b108b8bSchristos 				c = buf[i] & 0x07;
3971b108b8bSchristos 				following = 3;
3981b108b8bSchristos 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
3991b108b8bSchristos 				c = buf[i] & 0x03;
4001b108b8bSchristos 				following = 4;
4011b108b8bSchristos 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
4021b108b8bSchristos 				c = buf[i] & 0x01;
4031b108b8bSchristos 				following = 5;
4041b108b8bSchristos 			} else
4051b108b8bSchristos 				return -1;
4061b108b8bSchristos 
4071b108b8bSchristos 			for (n = 0; n < following; n++) {
4081b108b8bSchristos 				i++;
4091b108b8bSchristos 				if (i >= nbytes)
4101b108b8bSchristos 					goto done;
4111b108b8bSchristos 
412d87b0039Schristos 				if (n == 0 &&
413d87b0039Schristos 				     (buf[i] < ar->lo || buf[i] > ar->hi))
414d87b0039Schristos 					return -1;
415d87b0039Schristos 
4161b108b8bSchristos 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
4171b108b8bSchristos 					return -1;
4181b108b8bSchristos 
4191b108b8bSchristos 				c = (c << 6) + (buf[i] & 0x3f);
4201b108b8bSchristos 			}
4211b108b8bSchristos 
4221b108b8bSchristos 			if (ubuf)
4231b108b8bSchristos 				ubuf[(*ulen)++] = c;
4241b108b8bSchristos 			gotone = 1;
4251b108b8bSchristos 		}
4261b108b8bSchristos 	}
4271b108b8bSchristos done:
4281b108b8bSchristos 	return ctrl ? 0 : (gotone ? 2 : 1);
4291b108b8bSchristos }
4301b108b8bSchristos 
4311b108b8bSchristos /*
4321b108b8bSchristos  * Decide whether some text looks like UTF-8 with BOM. If there is no
4331b108b8bSchristos  * BOM, return -1; otherwise return the result of looks_utf8 on the
4341b108b8bSchristos  * rest of the text.
4351b108b8bSchristos  */
436*ddb17682Schristos file_private int
looks_utf8_with_BOM(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)437d87b0039Schristos looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
438d87b0039Schristos     file_unichar_t *ubuf, size_t *ulen)
4391b108b8bSchristos {
4401b108b8bSchristos 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
4411b108b8bSchristos 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
4421b108b8bSchristos 	else
4431b108b8bSchristos 		return -1;
4441b108b8bSchristos }
4451b108b8bSchristos 
446*ddb17682Schristos file_private int
looks_utf7(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)447d87b0039Schristos looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
448d87b0039Schristos     size_t *ulen)
44974db5203Schristos {
45074db5203Schristos 	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
45174db5203Schristos 		switch (buf[3]) {
45274db5203Schristos 		case '8':
45374db5203Schristos 		case '9':
45474db5203Schristos 		case '+':
45574db5203Schristos 		case '/':
45674db5203Schristos 			if (ubuf)
45774db5203Schristos 				*ulen = 0;
45874db5203Schristos 			return 1;
45974db5203Schristos 		default:
46074db5203Schristos 			return -1;
46174db5203Schristos 		}
46274db5203Schristos 	else
46374db5203Schristos 		return -1;
46474db5203Schristos }
46574db5203Schristos 
4661d4cb158Schristos #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
4671d4cb158Schristos #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
4681d4cb158Schristos #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
4691d4cb158Schristos 
470*ddb17682Schristos file_private int
looks_ucs16(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)471d87b0039Schristos looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
4721b108b8bSchristos     size_t *ulen)
4731b108b8bSchristos {
4741b108b8bSchristos 	int bigend;
4751d4cb158Schristos 	uint32_t hi;
4761b108b8bSchristos 	size_t i;
4771b108b8bSchristos 
4781b108b8bSchristos 	if (nbytes < 2)
4791b108b8bSchristos 		return 0;
4801b108b8bSchristos 
481d0c65b7bSchristos 	if (bf[0] == 0xff && bf[1] == 0xfe)
4821b108b8bSchristos 		bigend = 0;
483d0c65b7bSchristos 	else if (bf[0] == 0xfe && bf[1] == 0xff)
4841b108b8bSchristos 		bigend = 1;
4851b108b8bSchristos 	else
4861b108b8bSchristos 		return 0;
4871b108b8bSchristos 
4881b108b8bSchristos 	*ulen = 0;
4891d4cb158Schristos 	hi = 0;
4901b108b8bSchristos 
4911b108b8bSchristos 	for (i = 2; i + 1 < nbytes; i += 2) {
4921d4cb158Schristos 		uint32_t uc;
4931b108b8bSchristos 
4941b108b8bSchristos 		if (bigend)
4951d4cb158Schristos 			uc = CAST(uint32_t,
4961d4cb158Schristos 			    bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));
4971b108b8bSchristos 		else
4981d4cb158Schristos 			uc = CAST(uint32_t,
4991d4cb158Schristos 			    bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));
5001b108b8bSchristos 
5011d4cb158Schristos 		uc &= 0xffff;
5021d4cb158Schristos 
5031d4cb158Schristos 		switch (uc) {
5041d4cb158Schristos 		case 0xfffe:
5051d4cb158Schristos 		case 0xffff:
5061b108b8bSchristos 			return 0;
5071d4cb158Schristos 		default:
5081d4cb158Schristos 			if (UCS16_NOCHAR(uc))
5091d4cb158Schristos 				return 0;
5101d4cb158Schristos 			break;
5111d4cb158Schristos 		}
5121d4cb158Schristos 		if (hi) {
5131d4cb158Schristos 			if (!UCS16_LOSURR(uc))
5141d4cb158Schristos 				return 0;
5151d4cb158Schristos 			uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);
5161d4cb158Schristos 			hi = 0;
5171d4cb158Schristos 		}
5181d4cb158Schristos 		if (uc < 128 && text_chars[CAST(size_t, uc)] != T)
5191d4cb158Schristos 			return 0;
5201d4cb158Schristos 		ubf[(*ulen)++] = uc;
5211d4cb158Schristos 		if (UCS16_HISURR(uc))
5221d4cb158Schristos 			hi = uc - 0xd800 + 1;
5231d4cb158Schristos 		if (UCS16_LOSURR(uc))
5241b108b8bSchristos 			return 0;
5251b108b8bSchristos 	}
5261b108b8bSchristos 
5271b108b8bSchristos 	return 1 + bigend;
5281b108b8bSchristos }
5291b108b8bSchristos 
530*ddb17682Schristos file_private int
looks_ucs32(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)531d87b0039Schristos looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
532d0c65b7bSchristos     size_t *ulen)
533d0c65b7bSchristos {
534d0c65b7bSchristos 	int bigend;
535d0c65b7bSchristos 	size_t i;
536d0c65b7bSchristos 
537d0c65b7bSchristos 	if (nbytes < 4)
538d0c65b7bSchristos 		return 0;
539d0c65b7bSchristos 
540d0c65b7bSchristos 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
541d0c65b7bSchristos 		bigend = 0;
542d0c65b7bSchristos 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
543d0c65b7bSchristos 		bigend = 1;
544d0c65b7bSchristos 	else
545d0c65b7bSchristos 		return 0;
546d0c65b7bSchristos 
547d0c65b7bSchristos 	*ulen = 0;
548d0c65b7bSchristos 
549d0c65b7bSchristos 	for (i = 4; i + 3 < nbytes; i += 4) {
550d0c65b7bSchristos 		/* XXX fix to properly handle chars > 65536 */
551d0c65b7bSchristos 
552d0c65b7bSchristos 		if (bigend)
553d87b0039Schristos 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
554d87b0039Schristos 			    | (CAST(file_unichar_t, bf[i + 2]) << 8)
555d87b0039Schristos 			    | (CAST(file_unichar_t, bf[i + 1]) << 16)
556d87b0039Schristos 			    | (CAST(file_unichar_t, bf[i]) << 24);
557d0c65b7bSchristos 		else
558d87b0039Schristos 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
559d87b0039Schristos 			    | (CAST(file_unichar_t, bf[i + 1]) << 8)
560d87b0039Schristos 			    | (CAST(file_unichar_t, bf[i + 2]) << 16)
561d87b0039Schristos 			    | (CAST(file_unichar_t, bf[i + 3]) << 24);
562d0c65b7bSchristos 
563d0c65b7bSchristos 		if (ubf[*ulen - 1] == 0xfffe)
564d0c65b7bSchristos 			return 0;
565d0c65b7bSchristos 		if (ubf[*ulen - 1] < 128 &&
566d0c65b7bSchristos 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
567d0c65b7bSchristos 			return 0;
568d0c65b7bSchristos 	}
569d0c65b7bSchristos 
570d0c65b7bSchristos 	return 1 + bigend;
571d0c65b7bSchristos }
5721b108b8bSchristos #undef F
5731b108b8bSchristos #undef T
5741b108b8bSchristos #undef I
5751b108b8bSchristos #undef X
5761b108b8bSchristos 
/*
 * This table maps each EBCDIC character to an (8-bit extended) ASCII
 * character, as specified in the rationale for the dd(1) command in
 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 *
 * Unfortunately it does not seem to correspond exactly to any of the
 * five variants of EBCDIC documented in IBM's _Enterprise Systems
 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 * Edition, July, 1999, pp. I-1 - I-4.
 *
 * Fortunately, though, all versions of EBCDIC, including this one, agree
 * on most of the printing characters that also appear in (7-bit) ASCII.
 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 *
 * Fortunately too, there is general agreement that codes 0x00 through
 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 * remainder printing characters.
 *
 * This is sufficient to allow us to identify EBCDIC text and to distinguish
 * between old-style and internationalized examples of text.
 */
5981b108b8bSchristos 
599*ddb17682Schristos file_private unsigned char ebcdic_to_ascii[] = {
6001b108b8bSchristos   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
6011b108b8bSchristos  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
6021b108b8bSchristos 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
6031b108b8bSchristos 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
6041b108b8bSchristos ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
6051b108b8bSchristos '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
6061b108b8bSchristos '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
6071b108b8bSchristos 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
6081b108b8bSchristos 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
6091b108b8bSchristos 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
6101b108b8bSchristos 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
6111b108b8bSchristos 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
6121b108b8bSchristos '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
6131b108b8bSchristos '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
6141b108b8bSchristos '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
6151b108b8bSchristos '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
6161b108b8bSchristos };
6171b108b8bSchristos 
6181b108b8bSchristos #ifdef notdef
6191b108b8bSchristos /*
6201b108b8bSchristos  * The following EBCDIC-to-ASCII table may relate more closely to reality,
6211b108b8bSchristos  * or at least to modern reality.  It comes from
6221b108b8bSchristos  *
6231b108b8bSchristos  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
6241b108b8bSchristos  *
6251b108b8bSchristos  * and maps the characters of EBCDIC code page 1047 (the code used for
6261b108b8bSchristos  * Unix-derived software on IBM's 390 systems) to the corresponding
6271b108b8bSchristos  * characters from ISO 8859-1.
6281b108b8bSchristos  *
6291b108b8bSchristos  * If this table is used instead of the above one, some of the special
6301b108b8bSchristos  * cases for the NEL character can be taken out of the code.
6311b108b8bSchristos  */
6321b108b8bSchristos 
633*ddb17682Schristos file_private unsigned char ebcdic_1047_to_8859[] = {
6341b108b8bSchristos 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
6351b108b8bSchristos 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
6361b108b8bSchristos 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
6371b108b8bSchristos 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
6381b108b8bSchristos 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
6391b108b8bSchristos 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
6401b108b8bSchristos 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
6411b108b8bSchristos 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
6421b108b8bSchristos 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
6431b108b8bSchristos 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
6441b108b8bSchristos 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
6451b108b8bSchristos 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
6461b108b8bSchristos 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
6471b108b8bSchristos 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
6481b108b8bSchristos 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
6491b108b8bSchristos 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
6501b108b8bSchristos };
6511b108b8bSchristos #endif
6521b108b8bSchristos 
6531b108b8bSchristos /*
6541b108b8bSchristos  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
6551b108b8bSchristos  */
656*ddb17682Schristos file_private void
from_ebcdic(const unsigned char * buf,size_t nbytes,unsigned char * out)6571b108b8bSchristos from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
6581b108b8bSchristos {
6591b108b8bSchristos 	size_t i;
6601b108b8bSchristos 
6611b108b8bSchristos 	for (i = 0; i < nbytes; i++) {
6621b108b8bSchristos 		out[i] = ebcdic_to_ascii[buf[i]];
6631b108b8bSchristos 	}
6641b108b8bSchristos }
665