/*
 * Copyright (c) Ian F. Darwin 1986-1995.
 * Software written by Ian F. Darwin and others;
 * maintained 1995-present by Christos Zoulas and others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Encoding -- determine the character encoding of a text file.
 *
 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 * international characters.
 */

3579343712SPeter Avalos #include "file.h"
3679343712SPeter Avalos
3779343712SPeter Avalos #ifndef lint
38*3b9cdfa3SAntonio Huete Jimenez FILE_RCSID("@(#)$File: encoding.c,v 1.39 2022/09/13 18:46:07 christos Exp $")
3979343712SPeter Avalos #endif /* lint */
4079343712SPeter Avalos
4179343712SPeter Avalos #include "magic.h"
4279343712SPeter Avalos #include <string.h>
4379343712SPeter Avalos #include <stdlib.h>
4479343712SPeter Avalos
4579343712SPeter Avalos
46970935fdSSascha Wildner private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
4779343712SPeter Avalos size_t *);
48970935fdSSascha Wildner private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
49970935fdSSascha Wildner size_t *);
50970935fdSSascha Wildner private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
51970935fdSSascha Wildner size_t *);
52970935fdSSascha Wildner private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
53970935fdSSascha Wildner size_t *);
54970935fdSSascha Wildner private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
55970935fdSSascha Wildner size_t *);
56970935fdSSascha Wildner private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
57970935fdSSascha Wildner size_t *);
58970935fdSSascha Wildner private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
59970935fdSSascha Wildner size_t *);
6079343712SPeter Avalos private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
6179343712SPeter Avalos
/*
 * Debug tracing.  DPRINTF((fmt, args...)) takes a doubly parenthesised
 * argument list: it is handed to printf when DEBUG_ENCODING is defined
 * and expands to nothing (arguments are not evaluated) otherwise.
 */
#ifdef DEBUG_ENCODING
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif

6879343712SPeter Avalos /*
6979343712SPeter Avalos * Try to determine whether text is in some character code we can
7079343712SPeter Avalos * identify. Each of these tests, if it succeeds, will leave
71970935fdSSascha Wildner * the text converted into one-file_unichar_t-per-character Unicode in
7279343712SPeter Avalos * ubuf, and the number of characters converted in ulen.
7379343712SPeter Avalos */
7479343712SPeter Avalos protected int
file_encoding(struct magic_set * ms,const struct buffer * b,file_unichar_t ** ubuf,size_t * ulen,const char ** code,const char ** code_mime,const char ** type)75970935fdSSascha Wildner file_encoding(struct magic_set *ms, const struct buffer *b,
76970935fdSSascha Wildner file_unichar_t **ubuf, size_t *ulen, const char **code,
77970935fdSSascha Wildner const char **code_mime, const char **type)
7879343712SPeter Avalos {
796fca56fbSSascha Wildner const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
806fca56fbSSascha Wildner size_t nbytes = b->flen;
8179343712SPeter Avalos size_t mlen;
8279343712SPeter Avalos int rv = 1, ucs_type;
83970935fdSSascha Wildner file_unichar_t *udefbuf;
846fca56fbSSascha Wildner size_t udeflen;
856fca56fbSSascha Wildner
866fca56fbSSascha Wildner if (ubuf == NULL)
876fca56fbSSascha Wildner ubuf = &udefbuf;
886fca56fbSSascha Wildner if (ulen == NULL)
896fca56fbSSascha Wildner ulen = &udeflen;
9079343712SPeter Avalos
9117b11469SPeter Avalos *type = "text";
92e8af9738SPeter Avalos *ulen = 0;
93e8af9738SPeter Avalos *code = "unknown";
94e8af9738SPeter Avalos *code_mime = "binary";
95e8af9738SPeter Avalos
96970935fdSSascha Wildner if (nbytes > ms->encoding_max)
97970935fdSSascha Wildner nbytes = ms->encoding_max;
98970935fdSSascha Wildner
99e8af9738SPeter Avalos mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
100970935fdSSascha Wildner *ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
101970935fdSSascha Wildner if (*ubuf == NULL) {
10279343712SPeter Avalos file_oomem(ms, mlen);
10379343712SPeter Avalos goto done;
10479343712SPeter Avalos }
10579343712SPeter Avalos if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
106c30bd091SSascha Wildner if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
107c30bd091SSascha Wildner DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
108970935fdSSascha Wildner *code = "Unicode text, UTF-7";
109c30bd091SSascha Wildner *code_mime = "utf-7";
110c30bd091SSascha Wildner } else {
111e4d4ce0cSPeter Avalos DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
11279343712SPeter Avalos *code = "ASCII";
11379343712SPeter Avalos *code_mime = "us-ascii";
114c30bd091SSascha Wildner }
11579343712SPeter Avalos } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
116e4d4ce0cSPeter Avalos DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
117970935fdSSascha Wildner *code = "Unicode text, UTF-8 (with BOM)";
11879343712SPeter Avalos *code_mime = "utf-8";
11979343712SPeter Avalos } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
120e4d4ce0cSPeter Avalos DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
121970935fdSSascha Wildner *code = "Unicode text, UTF-8";
12279343712SPeter Avalos *code_mime = "utf-8";
1236fca56fbSSascha Wildner } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
1246fca56fbSSascha Wildner if (ucs_type == 1) {
125970935fdSSascha Wildner *code = "Unicode text, UTF-32, little-endian";
1266fca56fbSSascha Wildner *code_mime = "utf-32le";
1276fca56fbSSascha Wildner } else {
128970935fdSSascha Wildner *code = "Unicode text, UTF-32, big-endian";
1296fca56fbSSascha Wildner *code_mime = "utf-32be";
1306fca56fbSSascha Wildner }
1316fca56fbSSascha Wildner DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
13279343712SPeter Avalos } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
13379343712SPeter Avalos if (ucs_type == 1) {
134970935fdSSascha Wildner *code = "Unicode text, UTF-16, little-endian";
13579343712SPeter Avalos *code_mime = "utf-16le";
13679343712SPeter Avalos } else {
137970935fdSSascha Wildner *code = "Unicode text, UTF-16, big-endian";
13879343712SPeter Avalos *code_mime = "utf-16be";
13979343712SPeter Avalos }
140e4d4ce0cSPeter Avalos DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
14179343712SPeter Avalos } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
142e4d4ce0cSPeter Avalos DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
14379343712SPeter Avalos *code = "ISO-8859";
14479343712SPeter Avalos *code_mime = "iso-8859-1";
14579343712SPeter Avalos } else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
146e4d4ce0cSPeter Avalos DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
14779343712SPeter Avalos *code = "Non-ISO extended-ASCII";
14879343712SPeter Avalos *code_mime = "unknown-8bit";
14979343712SPeter Avalos } else {
150*3b9cdfa3SAntonio Huete Jimenez unsigned char *nbuf;
151*3b9cdfa3SAntonio Huete Jimenez
152*3b9cdfa3SAntonio Huete Jimenez mlen = (nbytes + 1) * sizeof(nbuf[0]);
153*3b9cdfa3SAntonio Huete Jimenez if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) {
154*3b9cdfa3SAntonio Huete Jimenez file_oomem(ms, mlen);
155*3b9cdfa3SAntonio Huete Jimenez goto done;
156*3b9cdfa3SAntonio Huete Jimenez }
15779343712SPeter Avalos from_ebcdic(buf, nbytes, nbuf);
15879343712SPeter Avalos
15979343712SPeter Avalos if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
160e4d4ce0cSPeter Avalos DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
16179343712SPeter Avalos *code = "EBCDIC";
16279343712SPeter Avalos *code_mime = "ebcdic";
16379343712SPeter Avalos } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
164e4d4ce0cSPeter Avalos DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
165e4d4ce0cSPeter Avalos *ulen));
16679343712SPeter Avalos *code = "International EBCDIC";
16779343712SPeter Avalos *code_mime = "ebcdic";
16879343712SPeter Avalos } else { /* Doesn't look like text at all */
169f72f8299SJan Lentfer DPRINTF(("binary\n"));
17079343712SPeter Avalos rv = 0;
17179343712SPeter Avalos *type = "binary";
17279343712SPeter Avalos }
173*3b9cdfa3SAntonio Huete Jimenez free(nbuf);
17479343712SPeter Avalos }
17579343712SPeter Avalos
17679343712SPeter Avalos done:
1776fca56fbSSascha Wildner if (ubuf == &udefbuf)
1786fca56fbSSascha Wildner free(udefbuf);
17979343712SPeter Avalos
18079343712SPeter Avalos return rv;
18179343712SPeter Avalos }
18279343712SPeter Avalos
/*
 * This table reflects a particular philosophy about what constitutes
 * "text," and there is room for disagreement about it.
 *
 * Version 3.31 of the file command considered a file to be ASCII if
 * each of its characters was approved by either the isascii() or
 * isalpha() function. On most systems, this would mean that any
 * file consisting only of characters in the range 0x00 ... 0x7F
 * would be called ASCII text, but many systems might reasonably
 * consider some characters outside this range to be alphabetic,
 * so the file command would call such characters ASCII. It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape. No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text. Vertical
 * tab (0x0B) is accepted by the table as well, although it rarely
 * appears in real text. I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to. It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed. But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
 */

23579343712SPeter Avalos #define F 0 /* character never appears in text */
23679343712SPeter Avalos #define T 1 /* character appears in plain ASCII text */
23779343712SPeter Avalos #define I 2 /* character appears in ISO-8859 text */
23879343712SPeter Avalos #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
23979343712SPeter Avalos
24079343712SPeter Avalos private char text_chars[256] = {
241c30bd091SSascha Wildner /* BEL BS HT LF VT FF CR */
242c30bd091SSascha Wildner F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */
24379343712SPeter Avalos /* ESC */
24479343712SPeter Avalos F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
24579343712SPeter Avalos T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
24679343712SPeter Avalos T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
24779343712SPeter Avalos T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
24879343712SPeter Avalos T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
24979343712SPeter Avalos T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
25079343712SPeter Avalos T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
25179343712SPeter Avalos /* NEL */
25279343712SPeter Avalos X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
25379343712SPeter Avalos X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
25479343712SPeter Avalos I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
25579343712SPeter Avalos I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
25679343712SPeter Avalos I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
25779343712SPeter Avalos I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
25879343712SPeter Avalos I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
25979343712SPeter Avalos I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
26079343712SPeter Avalos };
26179343712SPeter Avalos
262970935fdSSascha Wildner #define LOOKS(NAME, COND) \
263970935fdSSascha Wildner private int \
264970935fdSSascha Wildner looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
265970935fdSSascha Wildner size_t *ulen) \
266970935fdSSascha Wildner { \
267614728caSSascha Wildner size_t i; \
268970935fdSSascha Wildner \
269970935fdSSascha Wildner *ulen = 0; \
270970935fdSSascha Wildner \
271970935fdSSascha Wildner for (i = 0; i < nbytes; i++) { \
272970935fdSSascha Wildner int t = text_chars[buf[i]]; \
273970935fdSSascha Wildner \
274970935fdSSascha Wildner if (COND) \
275970935fdSSascha Wildner return 0; \
276970935fdSSascha Wildner \
277970935fdSSascha Wildner ubuf[(*ulen)++] = buf[i]; \
278970935fdSSascha Wildner } \
279970935fdSSascha Wildner return 1; \
28079343712SPeter Avalos }
28179343712SPeter Avalos
282970935fdSSascha Wildner LOOKS(ascii, t != T)
283970935fdSSascha Wildner LOOKS(latin1, t != T && t != I)
284970935fdSSascha Wildner LOOKS(extended, t != T && t != I && t != X)
28579343712SPeter Avalos
/*
 * Decide whether some text looks like UTF-8. Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
 * ubuf must be big enough!
 */

// from: https://golang.org/src/unicode/utf8/utf8.go

// Lead-byte descriptors stored in first[].  The low nibble is the total
// sequence length in bytes; the high nibble is the index into
// accept_ranges[] that constrains the second byte.
#define XX 0xF1 // invalid: size 1
#define AS 0xF0 // ASCII: size 1
#define S1 0x02 // accept 0, size 2
#define S2 0x13 // accept 1, size 3
#define S3 0x03 // accept 0, size 3
#define S4 0x23 // accept 2, size 3
#define S5 0x34 // accept 3, size 4
#define S6 0x04 // accept 0, size 4
#define S7 0x44 // accept 4, size 4

// Default bounds of a UTF-8 continuation byte.
#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
static const uint8_t first[] = {
	//  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
	//  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
	XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
	S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
	S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
	S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
};

// acceptRanges has size 16 to avoid bounds checks in the code that uses it:
// the index is a 4-bit value (descriptor >> 4).  Unlisted entries are zero.
// The table is read-only and local to this file.
static const struct accept_range accept_ranges[16] = {
	{ LOCB, HICB },
	{ 0xA0, HICB },
	{ LOCB, 0x9F },
	{ 0x90, HICB },
	{ LOCB, 0x8F },
};

34979343712SPeter Avalos protected int
file_looks_utf8(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)350970935fdSSascha Wildner file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
351970935fdSSascha Wildner size_t *ulen)
35279343712SPeter Avalos {
35379343712SPeter Avalos size_t i;
35479343712SPeter Avalos int n;
355970935fdSSascha Wildner file_unichar_t c;
35679343712SPeter Avalos int gotone = 0, ctrl = 0;
35779343712SPeter Avalos
35879343712SPeter Avalos if (ubuf)
35979343712SPeter Avalos *ulen = 0;
36079343712SPeter Avalos
36179343712SPeter Avalos for (i = 0; i < nbytes; i++) {
36279343712SPeter Avalos if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
36379343712SPeter Avalos /*
36479343712SPeter Avalos * Even if the whole file is valid UTF-8 sequences,
36579343712SPeter Avalos * still reject it if it uses weird control characters.
36679343712SPeter Avalos */
36779343712SPeter Avalos
36879343712SPeter Avalos if (text_chars[buf[i]] != T)
36979343712SPeter Avalos ctrl = 1;
37079343712SPeter Avalos
37179343712SPeter Avalos if (ubuf)
37279343712SPeter Avalos ubuf[(*ulen)++] = buf[i];
37379343712SPeter Avalos } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
37479343712SPeter Avalos return -1;
37579343712SPeter Avalos } else { /* 11xxxxxx begins UTF-8 */
37679343712SPeter Avalos int following;
377970935fdSSascha Wildner uint8_t x = first[buf[i]];
378614728caSSascha Wildner const struct accept_range *ar =
379614728caSSascha Wildner &accept_ranges[(unsigned int)x >> 4];
380970935fdSSascha Wildner if (x == XX)
381970935fdSSascha Wildner return -1;
38279343712SPeter Avalos
38379343712SPeter Avalos if ((buf[i] & 0x20) == 0) { /* 110xxxxx */
38479343712SPeter Avalos c = buf[i] & 0x1f;
38579343712SPeter Avalos following = 1;
38679343712SPeter Avalos } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */
38779343712SPeter Avalos c = buf[i] & 0x0f;
38879343712SPeter Avalos following = 2;
38979343712SPeter Avalos } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */
39079343712SPeter Avalos c = buf[i] & 0x07;
39179343712SPeter Avalos following = 3;
39279343712SPeter Avalos } else if ((buf[i] & 0x04) == 0) { /* 111110xx */
39379343712SPeter Avalos c = buf[i] & 0x03;
39479343712SPeter Avalos following = 4;
39579343712SPeter Avalos } else if ((buf[i] & 0x02) == 0) { /* 1111110x */
39679343712SPeter Avalos c = buf[i] & 0x01;
39779343712SPeter Avalos following = 5;
39879343712SPeter Avalos } else
39979343712SPeter Avalos return -1;
40079343712SPeter Avalos
40179343712SPeter Avalos for (n = 0; n < following; n++) {
40279343712SPeter Avalos i++;
40379343712SPeter Avalos if (i >= nbytes)
40479343712SPeter Avalos goto done;
40579343712SPeter Avalos
406970935fdSSascha Wildner if (n == 0 &&
407970935fdSSascha Wildner (buf[i] < ar->lo || buf[i] > ar->hi))
408970935fdSSascha Wildner return -1;
409970935fdSSascha Wildner
41079343712SPeter Avalos if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
41179343712SPeter Avalos return -1;
41279343712SPeter Avalos
41379343712SPeter Avalos c = (c << 6) + (buf[i] & 0x3f);
41479343712SPeter Avalos }
41579343712SPeter Avalos
41679343712SPeter Avalos if (ubuf)
41779343712SPeter Avalos ubuf[(*ulen)++] = c;
41879343712SPeter Avalos gotone = 1;
41979343712SPeter Avalos }
42079343712SPeter Avalos }
42179343712SPeter Avalos done:
42279343712SPeter Avalos return ctrl ? 0 : (gotone ? 2 : 1);
42379343712SPeter Avalos }
42479343712SPeter Avalos
42579343712SPeter Avalos /*
42679343712SPeter Avalos * Decide whether some text looks like UTF-8 with BOM. If there is no
42779343712SPeter Avalos * BOM, return -1; otherwise return the result of looks_utf8 on the
42879343712SPeter Avalos * rest of the text.
42979343712SPeter Avalos */
43079343712SPeter Avalos private int
looks_utf8_with_BOM(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)431970935fdSSascha Wildner looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
432970935fdSSascha Wildner file_unichar_t *ubuf, size_t *ulen)
43379343712SPeter Avalos {
43479343712SPeter Avalos if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
43579343712SPeter Avalos return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
43679343712SPeter Avalos else
43779343712SPeter Avalos return -1;
43879343712SPeter Avalos }
43979343712SPeter Avalos
44079343712SPeter Avalos private int
looks_utf7(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)441970935fdSSascha Wildner looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
442970935fdSSascha Wildner size_t *ulen)
443c30bd091SSascha Wildner {
444c30bd091SSascha Wildner if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
445c30bd091SSascha Wildner switch (buf[3]) {
446c30bd091SSascha Wildner case '8':
447c30bd091SSascha Wildner case '9':
448c30bd091SSascha Wildner case '+':
449c30bd091SSascha Wildner case '/':
450c30bd091SSascha Wildner if (ubuf)
451c30bd091SSascha Wildner *ulen = 0;
452c30bd091SSascha Wildner return 1;
453c30bd091SSascha Wildner default:
454c30bd091SSascha Wildner return -1;
455c30bd091SSascha Wildner }
456c30bd091SSascha Wildner else
457c30bd091SSascha Wildner return -1;
458c30bd091SSascha Wildner }
459c30bd091SSascha Wildner
460*3b9cdfa3SAntonio Huete Jimenez #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
461*3b9cdfa3SAntonio Huete Jimenez #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
462*3b9cdfa3SAntonio Huete Jimenez #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
463*3b9cdfa3SAntonio Huete Jimenez
464c30bd091SSascha Wildner private int
looks_ucs16(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)465970935fdSSascha Wildner looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
46679343712SPeter Avalos size_t *ulen)
46779343712SPeter Avalos {
46879343712SPeter Avalos int bigend;
469*3b9cdfa3SAntonio Huete Jimenez uint32_t hi;
47079343712SPeter Avalos size_t i;
47179343712SPeter Avalos
47279343712SPeter Avalos if (nbytes < 2)
47379343712SPeter Avalos return 0;
47479343712SPeter Avalos
4756fca56fbSSascha Wildner if (bf[0] == 0xff && bf[1] == 0xfe)
47679343712SPeter Avalos bigend = 0;
4776fca56fbSSascha Wildner else if (bf[0] == 0xfe && bf[1] == 0xff)
47879343712SPeter Avalos bigend = 1;
47979343712SPeter Avalos else
48079343712SPeter Avalos return 0;
48179343712SPeter Avalos
48279343712SPeter Avalos *ulen = 0;
483*3b9cdfa3SAntonio Huete Jimenez hi = 0;
48479343712SPeter Avalos
48579343712SPeter Avalos for (i = 2; i + 1 < nbytes; i += 2) {
486*3b9cdfa3SAntonio Huete Jimenez uint32_t uc;
48779343712SPeter Avalos
48879343712SPeter Avalos if (bigend)
489*3b9cdfa3SAntonio Huete Jimenez uc = CAST(uint32_t,
490*3b9cdfa3SAntonio Huete Jimenez bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));
49179343712SPeter Avalos else
492*3b9cdfa3SAntonio Huete Jimenez uc = CAST(uint32_t,
493*3b9cdfa3SAntonio Huete Jimenez bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));
49479343712SPeter Avalos
495*3b9cdfa3SAntonio Huete Jimenez uc &= 0xffff;
496*3b9cdfa3SAntonio Huete Jimenez
497*3b9cdfa3SAntonio Huete Jimenez switch (uc) {
498*3b9cdfa3SAntonio Huete Jimenez case 0xfffe:
499*3b9cdfa3SAntonio Huete Jimenez case 0xffff:
50079343712SPeter Avalos return 0;
501*3b9cdfa3SAntonio Huete Jimenez default:
502*3b9cdfa3SAntonio Huete Jimenez if (UCS16_NOCHAR(uc))
503*3b9cdfa3SAntonio Huete Jimenez return 0;
504*3b9cdfa3SAntonio Huete Jimenez break;
505*3b9cdfa3SAntonio Huete Jimenez }
506*3b9cdfa3SAntonio Huete Jimenez if (hi) {
507*3b9cdfa3SAntonio Huete Jimenez if (!UCS16_LOSURR(uc))
508*3b9cdfa3SAntonio Huete Jimenez return 0;
509*3b9cdfa3SAntonio Huete Jimenez uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);
510*3b9cdfa3SAntonio Huete Jimenez hi = 0;
511*3b9cdfa3SAntonio Huete Jimenez }
512*3b9cdfa3SAntonio Huete Jimenez if (uc < 128 && text_chars[CAST(size_t, uc)] != T)
513*3b9cdfa3SAntonio Huete Jimenez return 0;
514*3b9cdfa3SAntonio Huete Jimenez ubf[(*ulen)++] = uc;
515*3b9cdfa3SAntonio Huete Jimenez if (UCS16_HISURR(uc))
516*3b9cdfa3SAntonio Huete Jimenez hi = uc - 0xd800 + 1;
517*3b9cdfa3SAntonio Huete Jimenez if (UCS16_LOSURR(uc))
51879343712SPeter Avalos return 0;
51979343712SPeter Avalos }
52079343712SPeter Avalos
52179343712SPeter Avalos return 1 + bigend;
52279343712SPeter Avalos }
52379343712SPeter Avalos
5246fca56fbSSascha Wildner private int
looks_ucs32(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)525970935fdSSascha Wildner looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
5266fca56fbSSascha Wildner size_t *ulen)
5276fca56fbSSascha Wildner {
5286fca56fbSSascha Wildner int bigend;
5296fca56fbSSascha Wildner size_t i;
5306fca56fbSSascha Wildner
5316fca56fbSSascha Wildner if (nbytes < 4)
5326fca56fbSSascha Wildner return 0;
5336fca56fbSSascha Wildner
5346fca56fbSSascha Wildner if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
5356fca56fbSSascha Wildner bigend = 0;
5366fca56fbSSascha Wildner else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
5376fca56fbSSascha Wildner bigend = 1;
5386fca56fbSSascha Wildner else
5396fca56fbSSascha Wildner return 0;
5406fca56fbSSascha Wildner
5416fca56fbSSascha Wildner *ulen = 0;
5426fca56fbSSascha Wildner
5436fca56fbSSascha Wildner for (i = 4; i + 3 < nbytes; i += 4) {
5446fca56fbSSascha Wildner /* XXX fix to properly handle chars > 65536 */
5456fca56fbSSascha Wildner
5466fca56fbSSascha Wildner if (bigend)
547970935fdSSascha Wildner ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
548970935fdSSascha Wildner | (CAST(file_unichar_t, bf[i + 2]) << 8)
549970935fdSSascha Wildner | (CAST(file_unichar_t, bf[i + 1]) << 16)
550970935fdSSascha Wildner | (CAST(file_unichar_t, bf[i]) << 24);
5516fca56fbSSascha Wildner else
552970935fdSSascha Wildner ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
553970935fdSSascha Wildner | (CAST(file_unichar_t, bf[i + 1]) << 8)
554970935fdSSascha Wildner | (CAST(file_unichar_t, bf[i + 2]) << 16)
555970935fdSSascha Wildner | (CAST(file_unichar_t, bf[i + 3]) << 24);
5566fca56fbSSascha Wildner
5576fca56fbSSascha Wildner if (ubf[*ulen - 1] == 0xfffe)
5586fca56fbSSascha Wildner return 0;
5596fca56fbSSascha Wildner if (ubf[*ulen - 1] < 128 &&
5606fca56fbSSascha Wildner text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
5616fca56fbSSascha Wildner return 0;
5626fca56fbSSascha Wildner }
5636fca56fbSSascha Wildner
5646fca56fbSSascha Wildner return 1 + bigend;
5656fca56fbSSascha Wildner }
56679343712SPeter Avalos #undef F
56779343712SPeter Avalos #undef T
56879343712SPeter Avalos #undef I
56979343712SPeter Avalos #undef X
57079343712SPeter Avalos
57179343712SPeter Avalos /*
57279343712SPeter Avalos * This table maps each EBCDIC character to an (8-bit extended) ASCII
57379343712SPeter Avalos * character, as specified in the rationale for the dd(1) command in
57479343712SPeter Avalos * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
57579343712SPeter Avalos *
57679343712SPeter Avalos * Unfortunately it does not seem to correspond exactly to any of the
57779343712SPeter Avalos * five variants of EBCDIC documented in IBM's _Enterprise Systems
57879343712SPeter Avalos * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
57979343712SPeter Avalos * Edition, July, 1999, pp. I-1 - I-4.
58079343712SPeter Avalos *
58179343712SPeter Avalos * Fortunately, though, all versions of EBCDIC, including this one, agree
58279343712SPeter Avalos * on most of the printing characters that also appear in (7-bit) ASCII.
58379343712SPeter Avalos * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
58479343712SPeter Avalos *
58579343712SPeter Avalos * Fortunately too, there is general agreement that codes 0x00 through
58679343712SPeter Avalos * 0x3F represent control characters, 0x41 a nonbreaking space, and the
58779343712SPeter Avalos * remainder printing characters.
58879343712SPeter Avalos *
58979343712SPeter Avalos * This is sufficient to allow us to identify EBCDIC text and to distinguish
59079343712SPeter Avalos * between old-style and internationalized examples of text.
59179343712SPeter Avalos */
59279343712SPeter Avalos
59379343712SPeter Avalos private unsigned char ebcdic_to_ascii[] = {
59479343712SPeter Avalos 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
59579343712SPeter Avalos 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
59679343712SPeter Avalos 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
59779343712SPeter Avalos 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
59879343712SPeter Avalos ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
59979343712SPeter Avalos '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
60079343712SPeter Avalos '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
60179343712SPeter Avalos 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
60279343712SPeter Avalos 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
60379343712SPeter Avalos 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
60479343712SPeter Avalos 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
60579343712SPeter Avalos 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
60679343712SPeter Avalos '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
60779343712SPeter Avalos '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
60879343712SPeter Avalos '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
60979343712SPeter Avalos '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
61079343712SPeter Avalos };
61179343712SPeter Avalos
61279343712SPeter Avalos #ifdef notdef
61379343712SPeter Avalos /*
61479343712SPeter Avalos * The following EBCDIC-to-ASCII table may relate more closely to reality,
61579343712SPeter Avalos * or at least to modern reality. It comes from
61679343712SPeter Avalos *
61779343712SPeter Avalos * http://ftp.s390.ibm.com/products/oe/bpxqp9.html
61879343712SPeter Avalos *
61979343712SPeter Avalos * and maps the characters of EBCDIC code page 1047 (the code used for
62079343712SPeter Avalos * Unix-derived software on IBM's 390 systems) to the corresponding
62179343712SPeter Avalos * characters from ISO 8859-1.
62279343712SPeter Avalos *
62379343712SPeter Avalos * If this table is used instead of the above one, some of the special
62479343712SPeter Avalos * cases for the NEL character can be taken out of the code.
62579343712SPeter Avalos */
62679343712SPeter Avalos
62779343712SPeter Avalos private unsigned char ebcdic_1047_to_8859[] = {
62879343712SPeter Avalos 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
62979343712SPeter Avalos 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
63079343712SPeter Avalos 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
63179343712SPeter Avalos 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
63279343712SPeter Avalos 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
63379343712SPeter Avalos 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
63479343712SPeter Avalos 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
63579343712SPeter Avalos 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
63679343712SPeter Avalos 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
63779343712SPeter Avalos 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
63879343712SPeter Avalos 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
63979343712SPeter Avalos 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
64079343712SPeter Avalos 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
64179343712SPeter Avalos 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
64279343712SPeter Avalos 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
64379343712SPeter Avalos 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
64479343712SPeter Avalos };
64579343712SPeter Avalos #endif
64679343712SPeter Avalos
64779343712SPeter Avalos /*
64879343712SPeter Avalos * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
64979343712SPeter Avalos */
65079343712SPeter Avalos private void
from_ebcdic(const unsigned char * buf,size_t nbytes,unsigned char * out)65179343712SPeter Avalos from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
65279343712SPeter Avalos {
65379343712SPeter Avalos size_t i;
65479343712SPeter Avalos
65579343712SPeter Avalos for (i = 0; i < nbytes; i++) {
65679343712SPeter Avalos out[i] = ebcdic_to_ascii[buf[i]];
65779343712SPeter Avalos }
65879343712SPeter Avalos }
659