/*
 * Copyright (c) Ian F. Darwin 1986-1995.
 * Software written by Ian F. Darwin and others;
 * maintained 1995-present by Christos Zoulas and others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
27b6cee71dSXin LI */ 28b6cee71dSXin LI /* 29b6cee71dSXin LI * Encoding -- determine the character encoding of a text file. 30b6cee71dSXin LI * 31b6cee71dSXin LI * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 32b6cee71dSXin LI * international characters. 33b6cee71dSXin LI */ 34b6cee71dSXin LI 35b6cee71dSXin LI #include "file.h" 36b6cee71dSXin LI 37b6cee71dSXin LI #ifndef lint 38*ae316d1dSXin LI FILE_RCSID("@(#)$File: encoding.c,v 1.43 2024/10/29 20:56:48 christos Exp $") 39b6cee71dSXin LI #endif /* lint */ 40b6cee71dSXin LI 41b6cee71dSXin LI #include "magic.h" 42b6cee71dSXin LI #include <string.h> 43b6cee71dSXin LI #include <stdlib.h> 44b6cee71dSXin LI 45b6cee71dSXin LI 46898496eeSXin LI file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *, 47b6cee71dSXin LI size_t *); 48898496eeSXin LI file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *, 4943a5ec4eSXin LI size_t *); 50898496eeSXin LI file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *, 5143a5ec4eSXin LI size_t *); 52898496eeSXin LI file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *, 5343a5ec4eSXin LI size_t *); 54898496eeSXin LI file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *, 5543a5ec4eSXin LI size_t *); 56898496eeSXin LI file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *, 5743a5ec4eSXin LI size_t *); 58898496eeSXin LI file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *, 5943a5ec4eSXin LI size_t *); 60898496eeSXin LI file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *); 61b6cee71dSXin LI 62b6cee71dSXin LI #ifdef DEBUG_ENCODING 63b6cee71dSXin LI #define DPRINTF(a) printf a 64b6cee71dSXin LI #else 65b6cee71dSXin LI #define DPRINTF(a) 66b6cee71dSXin LI #endif 67b6cee71dSXin LI 68b6cee71dSXin LI /* 69b6cee71dSXin LI * Try to determine whether text is in some character code we can 
70b6cee71dSXin LI * identify. Each of these tests, if it succeeds, will leave 7143a5ec4eSXin LI * the text converted into one-file_unichar_t-per-character Unicode in 72b6cee71dSXin LI * ubuf, and the number of characters converted in ulen. 73b6cee71dSXin LI */ 74898496eeSXin LI file_protected int 7543a5ec4eSXin LI file_encoding(struct magic_set *ms, const struct buffer *b, 7643a5ec4eSXin LI file_unichar_t **ubuf, size_t *ulen, const char **code, 7743a5ec4eSXin LI const char **code_mime, const char **type) 78b6cee71dSXin LI { 7948c779cdSXin LI const unsigned char *buf = CAST(const unsigned char *, b->fbuf); 8058a0f0d0SEitan Adler size_t nbytes = b->flen; 81b6cee71dSXin LI size_t mlen; 82b6cee71dSXin LI int rv = 1, ucs_type; 8343a5ec4eSXin LI file_unichar_t *udefbuf; 8458a0f0d0SEitan Adler size_t udeflen; 8558a0f0d0SEitan Adler 8658a0f0d0SEitan Adler if (ubuf == NULL) 8758a0f0d0SEitan Adler ubuf = &udefbuf; 8858a0f0d0SEitan Adler if (ulen == NULL) 8958a0f0d0SEitan Adler ulen = &udeflen; 90b6cee71dSXin LI 91b6cee71dSXin LI *type = "text"; 92b6cee71dSXin LI *ulen = 0; 93b6cee71dSXin LI *code = "unknown"; 94b6cee71dSXin LI *code_mime = "binary"; 95b6cee71dSXin LI 9643a5ec4eSXin LI if (nbytes > ms->encoding_max) 9743a5ec4eSXin LI nbytes = ms->encoding_max; 9843a5ec4eSXin LI 99b6cee71dSXin LI mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 10043a5ec4eSXin LI *ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen)); 10143a5ec4eSXin LI if (*ubuf == NULL) { 102b6cee71dSXin LI file_oomem(ms, mlen); 103b6cee71dSXin LI goto done; 104b6cee71dSXin LI } 105b6cee71dSXin LI if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 1065f0216bdSXin LI if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) { 1075f0216bdSXin LI DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); 10843a5ec4eSXin LI *code = "Unicode text, UTF-7"; 1095f0216bdSXin LI *code_mime = "utf-7"; 1105f0216bdSXin LI } else { 111b6cee71dSXin LI DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 112b6cee71dSXin LI *code = "ASCII"; 
113b6cee71dSXin LI *code_mime = "us-ascii"; 1145f0216bdSXin LI } 115b6cee71dSXin LI } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 116b6cee71dSXin LI DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 11743a5ec4eSXin LI *code = "Unicode text, UTF-8 (with BOM)"; 118b6cee71dSXin LI *code_mime = "utf-8"; 119b6cee71dSXin LI } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 120b6cee71dSXin LI DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 12143a5ec4eSXin LI *code = "Unicode text, UTF-8"; 122b6cee71dSXin LI *code_mime = "utf-8"; 12348c779cdSXin LI } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { 12448c779cdSXin LI if (ucs_type == 1) { 12543a5ec4eSXin LI *code = "Unicode text, UTF-32, little-endian"; 12648c779cdSXin LI *code_mime = "utf-32le"; 12748c779cdSXin LI } else { 12843a5ec4eSXin LI *code = "Unicode text, UTF-32, big-endian"; 12948c779cdSXin LI *code_mime = "utf-32be"; 13048c779cdSXin LI } 13148c779cdSXin LI DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); 132b6cee71dSXin LI } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 133b6cee71dSXin LI if (ucs_type == 1) { 13443a5ec4eSXin LI *code = "Unicode text, UTF-16, little-endian"; 135b6cee71dSXin LI *code_mime = "utf-16le"; 136b6cee71dSXin LI } else { 13743a5ec4eSXin LI *code = "Unicode text, UTF-16, big-endian"; 138b6cee71dSXin LI *code_mime = "utf-16be"; 139b6cee71dSXin LI } 140b6cee71dSXin LI DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 141b6cee71dSXin LI } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 142b6cee71dSXin LI DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 143b6cee71dSXin LI *code = "ISO-8859"; 144b6cee71dSXin LI *code_mime = "iso-8859-1"; 145b6cee71dSXin LI } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 146b6cee71dSXin LI DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 147b6cee71dSXin LI *code = "Non-ISO extended-ASCII"; 148b6cee71dSXin LI *code_mime = "unknown-8bit"; 149b6cee71dSXin LI } else { 150a4d6d3b8SXin 
LI unsigned char *nbuf; 151a4d6d3b8SXin LI 152a4d6d3b8SXin LI mlen = (nbytes + 1) * sizeof(nbuf[0]); 153a4d6d3b8SXin LI if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) { 154a4d6d3b8SXin LI file_oomem(ms, mlen); 155a4d6d3b8SXin LI goto done; 156a4d6d3b8SXin LI } 157b6cee71dSXin LI from_ebcdic(buf, nbytes, nbuf); 158b6cee71dSXin LI 159b6cee71dSXin LI if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 160b6cee71dSXin LI DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 161b6cee71dSXin LI *code = "EBCDIC"; 162b6cee71dSXin LI *code_mime = "ebcdic"; 163b6cee71dSXin LI } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 164b6cee71dSXin LI DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 165b6cee71dSXin LI *ulen)); 166b6cee71dSXin LI *code = "International EBCDIC"; 167b6cee71dSXin LI *code_mime = "ebcdic"; 168b6cee71dSXin LI } else { /* Doesn't look like text at all */ 169b6cee71dSXin LI DPRINTF(("binary\n")); 170b6cee71dSXin LI rv = 0; 171b6cee71dSXin LI *type = "binary"; 172b6cee71dSXin LI } 173a4d6d3b8SXin LI free(nbuf); 174b6cee71dSXin LI } 175b6cee71dSXin LI 176b6cee71dSXin LI done: 17758a0f0d0SEitan Adler if (ubuf == &udefbuf) 17858a0f0d0SEitan Adler free(udefbuf); 179b6cee71dSXin LI 180b6cee71dSXin LI return rv; 181b6cee71dSXin LI } 182b6cee71dSXin LI 183b6cee71dSXin LI /* 184b6cee71dSXin LI * This table reflects a particular philosophy about what constitutes 185b6cee71dSXin LI * "text," and there is room for disagreement about it. 186b6cee71dSXin LI * 187b6cee71dSXin LI * Version 3.31 of the file command considered a file to be ASCII if 188b6cee71dSXin LI * each of its characters was approved by either the isascii() or 189b6cee71dSXin LI * isalpha() function. On most systems, this would mean that any 190b6cee71dSXin LI * file consisting only of characters in the range 0x00 ... 
 * 0x7F would be called ASCII text, but many systems might reasonably
 * consider some characters outside this range to be alphabetic,
 * so the file command would call such characters ASCII.  It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab was originally excluded because it never seems to be used in real
 * text, but the table below now marks it (0x0B) as text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.
 * It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
233b6cee71dSXin LI */ 234b6cee71dSXin LI 235b6cee71dSXin LI #define F 0 /* character never appears in text */ 236b6cee71dSXin LI #define T 1 /* character appears in plain ASCII text */ 237b6cee71dSXin LI #define I 2 /* character appears in ISO-8859 text */ 238b6cee71dSXin LI #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 239b6cee71dSXin LI 240898496eeSXin LI file_private char text_chars[256] = { 2415f0216bdSXin LI /* BEL BS HT LF VT FF CR */ 2425f0216bdSXin LI F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */ 243b6cee71dSXin LI /* ESC */ 244b6cee71dSXin LI F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 245b6cee71dSXin LI T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 246b6cee71dSXin LI T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 247b6cee71dSXin LI T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 248b6cee71dSXin LI T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 249b6cee71dSXin LI T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 250b6cee71dSXin LI T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 251b6cee71dSXin LI /* NEL */ 252b6cee71dSXin LI X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 253b6cee71dSXin LI X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 254b6cee71dSXin LI I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 255b6cee71dSXin LI I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 256b6cee71dSXin LI I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 257b6cee71dSXin LI I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 258b6cee71dSXin LI I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 259b6cee71dSXin LI I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 260b6cee71dSXin LI }; 261b6cee71dSXin LI 26243a5ec4eSXin LI #define LOOKS(NAME, COND) \ 263898496eeSXin LI file_private int \ 26443a5ec4eSXin LI looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t 
*ubuf, \ 26543a5ec4eSXin LI size_t *ulen) \ 26643a5ec4eSXin LI { \ 26743a5ec4eSXin LI size_t i; \ 26843a5ec4eSXin LI \ 26943a5ec4eSXin LI *ulen = 0; \ 27043a5ec4eSXin LI \ 27143a5ec4eSXin LI for (i = 0; i < nbytes; i++) { \ 27243a5ec4eSXin LI int t = text_chars[buf[i]]; \ 27343a5ec4eSXin LI \ 27443a5ec4eSXin LI if (COND) \ 27543a5ec4eSXin LI return 0; \ 27643a5ec4eSXin LI \ 27743a5ec4eSXin LI ubuf[(*ulen)++] = buf[i]; \ 27843a5ec4eSXin LI } \ 27943a5ec4eSXin LI return 1; \ 280b6cee71dSXin LI } 281b6cee71dSXin LI 28243a5ec4eSXin LI LOOKS(ascii, t != T) 28343a5ec4eSXin LI LOOKS(latin1, t != T && t != I) 28443a5ec4eSXin LI LOOKS(extended, t != T && t != I && t != X) 285b6cee71dSXin LI 286b6cee71dSXin LI /* 287b6cee71dSXin LI * Decide whether some text looks like UTF-8. Returns: 288b6cee71dSXin LI * 289b6cee71dSXin LI * -1: invalid UTF-8 290b6cee71dSXin LI * 0: uses odd control characters, so doesn't look like text 291b6cee71dSXin LI * 1: 7-bit text 292b6cee71dSXin LI * 2: definitely UTF-8 text (valid high-bit set bytes) 293b6cee71dSXin LI * 294b6cee71dSXin LI * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 295b6cee71dSXin LI * ubuf must be big enough! 
296b6cee71dSXin LI */ 29743a5ec4eSXin LI 29843a5ec4eSXin LI // from: https://golang.org/src/unicode/utf8/utf8.go 29943a5ec4eSXin LI 30043a5ec4eSXin LI #define XX 0xF1 // invalid: size 1 30143a5ec4eSXin LI #define AS 0xF0 // ASCII: size 1 30243a5ec4eSXin LI #define S1 0x02 // accept 0, size 2 30343a5ec4eSXin LI #define S2 0x13 // accept 1, size 3 30443a5ec4eSXin LI #define S3 0x03 // accept 0, size 3 30543a5ec4eSXin LI #define S4 0x23 // accept 2, size 3 30643a5ec4eSXin LI #define S5 0x34 // accept 3, size 4 30743a5ec4eSXin LI #define S6 0x04 // accept 0, size 4 30843a5ec4eSXin LI #define S7 0x44 // accept 4, size 4 30943a5ec4eSXin LI 31043a5ec4eSXin LI #define LOCB 0x80 31143a5ec4eSXin LI #define HICB 0xBF 31243a5ec4eSXin LI 31343a5ec4eSXin LI // first is information about the first byte in a UTF-8 sequence. 31443a5ec4eSXin LI static const uint8_t first[] = { 31543a5ec4eSXin LI // 1 2 3 4 5 6 7 8 9 A B C D E F 31643a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F 31743a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F 31843a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F 31943a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F 32043a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F 32143a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F 32243a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F 32343a5ec4eSXin LI AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F 32443a5ec4eSXin LI // 1 2 3 4 5 6 7 8 9 A B C D E F 32543a5ec4eSXin LI XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F 32643a5ec4eSXin LI XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F 32743a5ec4eSXin LI XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, 
XX, XX, XX, XX, XX, // 0xA0-0xAF 32843a5ec4eSXin LI XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF 32943a5ec4eSXin LI XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF 33043a5ec4eSXin LI S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF 33143a5ec4eSXin LI S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF 33243a5ec4eSXin LI S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF 33343a5ec4eSXin LI }; 33443a5ec4eSXin LI 33543a5ec4eSXin LI // acceptRange gives the range of valid values for the second byte in a UTF-8 33643a5ec4eSXin LI // sequence. 337*ae316d1dSXin LI static struct accept_range { 33843a5ec4eSXin LI uint8_t lo; // lowest value for second byte. 33943a5ec4eSXin LI uint8_t hi; // highest value for second byte. 34043a5ec4eSXin LI } accept_ranges[16] = { 34143a5ec4eSXin LI // acceptRanges has size 16 to avoid bounds checks in the code that uses it. 34243a5ec4eSXin LI { LOCB, HICB }, 34343a5ec4eSXin LI { 0xA0, HICB }, 34443a5ec4eSXin LI { LOCB, 0x9F }, 34543a5ec4eSXin LI { 0x90, HICB }, 34643a5ec4eSXin LI { LOCB, 0x8F }, 34743a5ec4eSXin LI }; 34843a5ec4eSXin LI 349898496eeSXin LI file_protected int 35043a5ec4eSXin LI file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, 35143a5ec4eSXin LI size_t *ulen) 352b6cee71dSXin LI { 353b6cee71dSXin LI size_t i; 354b6cee71dSXin LI int n; 35543a5ec4eSXin LI file_unichar_t c; 356b6cee71dSXin LI int gotone = 0, ctrl = 0; 357b6cee71dSXin LI 358b6cee71dSXin LI if (ubuf) 359b6cee71dSXin LI *ulen = 0; 360b6cee71dSXin LI 361b6cee71dSXin LI for (i = 0; i < nbytes; i++) { 362b6cee71dSXin LI if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 363b6cee71dSXin LI /* 364b6cee71dSXin LI * Even if the whole file is valid UTF-8 sequences, 365b6cee71dSXin LI * still reject it if it uses weird control characters. 
366b6cee71dSXin LI */ 367b6cee71dSXin LI 368b6cee71dSXin LI if (text_chars[buf[i]] != T) 369b6cee71dSXin LI ctrl = 1; 370b6cee71dSXin LI 371b6cee71dSXin LI if (ubuf) 372b6cee71dSXin LI ubuf[(*ulen)++] = buf[i]; 373b6cee71dSXin LI } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 374b6cee71dSXin LI return -1; 375b6cee71dSXin LI } else { /* 11xxxxxx begins UTF-8 */ 376b6cee71dSXin LI int following; 37743a5ec4eSXin LI uint8_t x = first[buf[i]]; 37843a5ec4eSXin LI const struct accept_range *ar = 37943a5ec4eSXin LI &accept_ranges[(unsigned int)x >> 4]; 38043a5ec4eSXin LI if (x == XX) 38143a5ec4eSXin LI return -1; 382b6cee71dSXin LI 383b6cee71dSXin LI if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 384b6cee71dSXin LI c = buf[i] & 0x1f; 385b6cee71dSXin LI following = 1; 386b6cee71dSXin LI } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 387b6cee71dSXin LI c = buf[i] & 0x0f; 388b6cee71dSXin LI following = 2; 389b6cee71dSXin LI } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 390b6cee71dSXin LI c = buf[i] & 0x07; 391b6cee71dSXin LI following = 3; 392b6cee71dSXin LI } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 393b6cee71dSXin LI c = buf[i] & 0x03; 394b6cee71dSXin LI following = 4; 395b6cee71dSXin LI } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 396b6cee71dSXin LI c = buf[i] & 0x01; 397b6cee71dSXin LI following = 5; 398b6cee71dSXin LI } else 399b6cee71dSXin LI return -1; 400b6cee71dSXin LI 401b6cee71dSXin LI for (n = 0; n < following; n++) { 402b6cee71dSXin LI i++; 403b6cee71dSXin LI if (i >= nbytes) 404b6cee71dSXin LI goto done; 405b6cee71dSXin LI 40643a5ec4eSXin LI if (n == 0 && 40743a5ec4eSXin LI (buf[i] < ar->lo || buf[i] > ar->hi)) 40843a5ec4eSXin LI return -1; 40943a5ec4eSXin LI 410b6cee71dSXin LI if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 411b6cee71dSXin LI return -1; 412b6cee71dSXin LI 413b6cee71dSXin LI c = (c << 6) + (buf[i] & 0x3f); 414b6cee71dSXin LI } 415b6cee71dSXin LI 416b6cee71dSXin LI if (ubuf) 417b6cee71dSXin LI ubuf[(*ulen)++] = c; 
418b6cee71dSXin LI gotone = 1; 419b6cee71dSXin LI } 420b6cee71dSXin LI } 421b6cee71dSXin LI done: 422b6cee71dSXin LI return ctrl ? 0 : (gotone ? 2 : 1); 423b6cee71dSXin LI } 424b6cee71dSXin LI 425b6cee71dSXin LI /* 426b6cee71dSXin LI * Decide whether some text looks like UTF-8 with BOM. If there is no 427b6cee71dSXin LI * BOM, return -1; otherwise return the result of looks_utf8 on the 428b6cee71dSXin LI * rest of the text. 429b6cee71dSXin LI */ 430898496eeSXin LI file_private int 43143a5ec4eSXin LI looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, 43243a5ec4eSXin LI file_unichar_t *ubuf, size_t *ulen) 433b6cee71dSXin LI { 434b6cee71dSXin LI if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 435b6cee71dSXin LI return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 436b6cee71dSXin LI else 437b6cee71dSXin LI return -1; 438b6cee71dSXin LI } 439b6cee71dSXin LI 440898496eeSXin LI file_private int 44143a5ec4eSXin LI looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, 44243a5ec4eSXin LI size_t *ulen) 4435f0216bdSXin LI { 4445f0216bdSXin LI if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v') 4455f0216bdSXin LI switch (buf[3]) { 4465f0216bdSXin LI case '8': 4475f0216bdSXin LI case '9': 4485f0216bdSXin LI case '+': 4495f0216bdSXin LI case '/': 4505f0216bdSXin LI if (ubuf) 4515f0216bdSXin LI *ulen = 0; 4525f0216bdSXin LI return 1; 4535f0216bdSXin LI default: 4545f0216bdSXin LI return -1; 4555f0216bdSXin LI } 4565f0216bdSXin LI else 4575f0216bdSXin LI return -1; 4585f0216bdSXin LI } 4595f0216bdSXin LI 460a4d6d3b8SXin LI #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef) 461a4d6d3b8SXin LI #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff) 462a4d6d3b8SXin LI #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff) 463a4d6d3b8SXin LI 464898496eeSXin LI file_private int 46543a5ec4eSXin LI looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, 466b6cee71dSXin LI size_t *ulen) 
467b6cee71dSXin LI { 468b6cee71dSXin LI int bigend; 469a4d6d3b8SXin LI uint32_t hi; 470b6cee71dSXin LI size_t i; 471b6cee71dSXin LI 472b6cee71dSXin LI if (nbytes < 2) 473b6cee71dSXin LI return 0; 474b6cee71dSXin LI 47548c779cdSXin LI if (bf[0] == 0xff && bf[1] == 0xfe) 476b6cee71dSXin LI bigend = 0; 47748c779cdSXin LI else if (bf[0] == 0xfe && bf[1] == 0xff) 478b6cee71dSXin LI bigend = 1; 479b6cee71dSXin LI else 480b6cee71dSXin LI return 0; 481b6cee71dSXin LI 482b6cee71dSXin LI *ulen = 0; 483a4d6d3b8SXin LI hi = 0; 484b6cee71dSXin LI 485b6cee71dSXin LI for (i = 2; i + 1 < nbytes; i += 2) { 486a4d6d3b8SXin LI uint32_t uc; 487b6cee71dSXin LI 488b6cee71dSXin LI if (bigend) 489a2dfb722SXin LI uc = CAST(uint32_t, 490a2dfb722SXin LI bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8)); 491b6cee71dSXin LI else 492a2dfb722SXin LI uc = CAST(uint32_t, 493a2dfb722SXin LI bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8)); 494b6cee71dSXin LI 495a4d6d3b8SXin LI uc &= 0xffff; 496a4d6d3b8SXin LI 497a4d6d3b8SXin LI switch (uc) { 498a4d6d3b8SXin LI case 0xfffe: 499a4d6d3b8SXin LI case 0xffff: 500b6cee71dSXin LI return 0; 501a4d6d3b8SXin LI default: 502a4d6d3b8SXin LI if (UCS16_NOCHAR(uc)) 503a4d6d3b8SXin LI return 0; 504a4d6d3b8SXin LI break; 505a4d6d3b8SXin LI } 506a4d6d3b8SXin LI if (hi) { 507a4d6d3b8SXin LI if (!UCS16_LOSURR(uc)) 508a4d6d3b8SXin LI return 0; 509a4d6d3b8SXin LI uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00); 510a4d6d3b8SXin LI hi = 0; 511a4d6d3b8SXin LI } 512a4d6d3b8SXin LI if (uc < 128 && text_chars[CAST(size_t, uc)] != T) 513a4d6d3b8SXin LI return 0; 514a4d6d3b8SXin LI ubf[(*ulen)++] = uc; 515a4d6d3b8SXin LI if (UCS16_HISURR(uc)) 516a4d6d3b8SXin LI hi = uc - 0xd800 + 1; 517a4d6d3b8SXin LI if (UCS16_LOSURR(uc)) 518b6cee71dSXin LI return 0; 519b6cee71dSXin LI } 520b6cee71dSXin LI 521b6cee71dSXin LI return 1 + bigend; 522b6cee71dSXin LI } 523b6cee71dSXin LI 524898496eeSXin LI file_private int 52543a5ec4eSXin LI looks_ucs32(const unsigned char *bf, size_t nbytes, 
file_unichar_t *ubf, 52648c779cdSXin LI size_t *ulen) 52748c779cdSXin LI { 52848c779cdSXin LI int bigend; 52948c779cdSXin LI size_t i; 53048c779cdSXin LI 53148c779cdSXin LI if (nbytes < 4) 53248c779cdSXin LI return 0; 53348c779cdSXin LI 53448c779cdSXin LI if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) 53548c779cdSXin LI bigend = 0; 53648c779cdSXin LI else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) 53748c779cdSXin LI bigend = 1; 53848c779cdSXin LI else 53948c779cdSXin LI return 0; 54048c779cdSXin LI 54148c779cdSXin LI *ulen = 0; 54248c779cdSXin LI 54348c779cdSXin LI for (i = 4; i + 3 < nbytes; i += 4) { 54448c779cdSXin LI /* XXX fix to properly handle chars > 65536 */ 54548c779cdSXin LI 54648c779cdSXin LI if (bigend) 54743a5ec4eSXin LI ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3]) 54843a5ec4eSXin LI | (CAST(file_unichar_t, bf[i + 2]) << 8) 54943a5ec4eSXin LI | (CAST(file_unichar_t, bf[i + 1]) << 16) 55043a5ec4eSXin LI | (CAST(file_unichar_t, bf[i]) << 24); 55148c779cdSXin LI else 55243a5ec4eSXin LI ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0]) 55343a5ec4eSXin LI | (CAST(file_unichar_t, bf[i + 1]) << 8) 55443a5ec4eSXin LI | (CAST(file_unichar_t, bf[i + 2]) << 16) 55543a5ec4eSXin LI | (CAST(file_unichar_t, bf[i + 3]) << 24); 55648c779cdSXin LI 55748c779cdSXin LI if (ubf[*ulen - 1] == 0xfffe) 55848c779cdSXin LI return 0; 55948c779cdSXin LI if (ubf[*ulen - 1] < 128 && 56048c779cdSXin LI text_chars[CAST(size_t, ubf[*ulen - 1])] != T) 56148c779cdSXin LI return 0; 56248c779cdSXin LI } 56348c779cdSXin LI 56448c779cdSXin LI return 1 + bigend; 56548c779cdSXin LI } 566b6cee71dSXin LI #undef F 567b6cee71dSXin LI #undef T 568b6cee71dSXin LI #undef I 569b6cee71dSXin LI #undef X 570b6cee71dSXin LI 571b6cee71dSXin LI /* 572b6cee71dSXin LI * This table maps each EBCDIC character to an (8-bit extended) ASCII 573b6cee71dSXin LI * character, as specified in the rationale for the dd(1) command in 574b6cee71dSXin LI * draft 11.2 (September, 
 * 1991) of the POSIX P1003.2 standard.
 *
 * Unfortunately it does not seem to correspond exactly to any of the
 * five variants of EBCDIC documented in IBM's _Enterprise Systems
 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 * Edition, July, 1999, pp. I-1 - I-4.
 *
 * Fortunately, though, all versions of EBCDIC, including this one, agree
 * on most of the printing characters that also appear in (7-bit) ASCII.
 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 *
 * Fortunately too, there is general agreement that codes 0x00 through
 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 * remainder printing characters.
 *
 * This is sufficient to allow us to identify EBCDIC text and to distinguish
 * between old-style and internationalized examples of text.
591b6cee71dSXin LI */ 592b6cee71dSXin LI 593898496eeSXin LI file_private unsigned char ebcdic_to_ascii[] = { 594b6cee71dSXin LI 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 595b6cee71dSXin LI 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 596b6cee71dSXin LI 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 597b6cee71dSXin LI 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 598b6cee71dSXin LI ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 599b6cee71dSXin LI '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 600b6cee71dSXin LI '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 601b6cee71dSXin LI 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 602b6cee71dSXin LI 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 603b6cee71dSXin LI 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 604b6cee71dSXin LI 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 605b6cee71dSXin LI 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 606b6cee71dSXin LI '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 607b6cee71dSXin LI '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 608b6cee71dSXin LI '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 609b6cee71dSXin LI '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 610b6cee71dSXin LI }; 611b6cee71dSXin LI 612b6cee71dSXin LI #ifdef notdef 613b6cee71dSXin LI /* 614b6cee71dSXin LI * The following EBCDIC-to-ASCII table may relate more closely to reality, 615b6cee71dSXin LI * or at least to modern reality. 
It comes from 616b6cee71dSXin LI * 617b6cee71dSXin LI * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 618b6cee71dSXin LI * 619b6cee71dSXin LI * and maps the characters of EBCDIC code page 1047 (the code used for 620b6cee71dSXin LI * Unix-derived software on IBM's 390 systems) to the corresponding 621b6cee71dSXin LI * characters from ISO 8859-1. 622b6cee71dSXin LI * 623b6cee71dSXin LI * If this table is used instead of the above one, some of the special 624b6cee71dSXin LI * cases for the NEL character can be taken out of the code. 625b6cee71dSXin LI */ 626b6cee71dSXin LI 627898496eeSXin LI file_private unsigned char ebcdic_1047_to_8859[] = { 628b6cee71dSXin LI 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 629b6cee71dSXin LI 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 630b6cee71dSXin LI 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 631b6cee71dSXin LI 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 632b6cee71dSXin LI 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 633b6cee71dSXin LI 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 634b6cee71dSXin LI 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 635b6cee71dSXin LI 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 636b6cee71dSXin LI 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 637b6cee71dSXin LI 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 638b6cee71dSXin LI 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 639b6cee71dSXin LI 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 640b6cee71dSXin LI 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 641b6cee71dSXin LI 
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 642b6cee71dSXin LI 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 643b6cee71dSXin LI 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 644b6cee71dSXin LI }; 645b6cee71dSXin LI #endif 646b6cee71dSXin LI 647b6cee71dSXin LI /* 648b6cee71dSXin LI * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 649b6cee71dSXin LI */ 650898496eeSXin LI file_private void 651b6cee71dSXin LI from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 652b6cee71dSXin LI { 653b6cee71dSXin LI size_t i; 654b6cee71dSXin LI 655b6cee71dSXin LI for (i = 0; i < nbytes; i++) { 656b6cee71dSXin LI out[i] = ebcdic_to_ascii[buf[i]]; 657b6cee71dSXin LI } 658b6cee71dSXin LI } 659