1 /* $NetBSD: encoding.c,v 1.10 2021/04/09 19:11:42 christos Exp $ */ 2 3 /* 4 * Copyright (c) Ian F. Darwin 1986-1995. 5 * Software written by Ian F. Darwin and others; 6 * maintained 1995-present by Christos Zoulas and others. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice immediately at the beginning of the file, without modification, 13 * this list of conditions, and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 /* 31 * Encoding -- determine the character encoding of a text file. 32 * 33 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 34 * international characters. 
35 */ 36 37 #include "file.h" 38 39 #ifndef lint 40 #if 0 41 FILE_RCSID("@(#)$File: encoding.c,v 1.27 2021/02/05 21:33:49 christos Exp $") 42 #else 43 __RCSID("$NetBSD: encoding.c,v 1.10 2021/04/09 19:11:42 christos Exp $"); 44 #endif 45 #endif /* lint */ 46 47 #include "magic.h" 48 #include <string.h> 49 #include <stdlib.h> 50 51 52 private int looks_ascii(const unsigned char *, size_t, file_unichar_t *, 53 size_t *); 54 private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *, 55 size_t *); 56 private int looks_utf7(const unsigned char *, size_t, file_unichar_t *, 57 size_t *); 58 private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *, 59 size_t *); 60 private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *, 61 size_t *); 62 private int looks_latin1(const unsigned char *, size_t, file_unichar_t *, 63 size_t *); 64 private int looks_extended(const unsigned char *, size_t, file_unichar_t *, 65 size_t *); 66 private void from_ebcdic(const unsigned char *, size_t, unsigned char *); 67 68 #ifdef DEBUG_ENCODING 69 #define DPRINTF(a) printf a 70 #else 71 #define DPRINTF(a) 72 #endif 73 74 /* 75 * Try to determine whether text is in some character code we can 76 * identify. Each of these tests, if it succeeds, will leave 77 * the text converted into one-file_unichar_t-per-character Unicode in 78 * ubuf, and the number of characters converted in ulen. 
79 */ 80 protected int 81 file_encoding(struct magic_set *ms, const struct buffer *b, 82 file_unichar_t **ubuf, size_t *ulen, const char **code, 83 const char **code_mime, const char **type) 84 { 85 const unsigned char *buf = CAST(const unsigned char *, b->fbuf); 86 size_t nbytes = b->flen; 87 size_t mlen; 88 int rv = 1, ucs_type; 89 unsigned char *nbuf = NULL; 90 file_unichar_t *udefbuf; 91 size_t udeflen; 92 93 if (ubuf == NULL) 94 ubuf = &udefbuf; 95 if (ulen == NULL) 96 ulen = &udeflen; 97 98 *type = "text"; 99 *ulen = 0; 100 *code = "unknown"; 101 *code_mime = "binary"; 102 103 if (nbytes > ms->encoding_max) 104 nbytes = ms->encoding_max; 105 106 mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 107 *ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen)); 108 if (*ubuf == NULL) { 109 file_oomem(ms, mlen); 110 goto done; 111 } 112 mlen = (nbytes + 1) * sizeof(nbuf[0]); 113 if ((nbuf = CAST(unsigned char *, 114 calloc(CAST(size_t, 1), mlen))) == NULL) { 115 file_oomem(ms, mlen); 116 goto done; 117 } 118 119 if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 120 if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) { 121 DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); 122 *code = "Unicode text, UTF-7"; 123 *code_mime = "utf-7"; 124 } else { 125 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 126 *code = "ASCII"; 127 *code_mime = "us-ascii"; 128 } 129 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 130 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 131 *code = "Unicode text, UTF-8 (with BOM)"; 132 *code_mime = "utf-8"; 133 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 134 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 135 *code = "Unicode text, UTF-8"; 136 *code_mime = "utf-8"; 137 } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { 138 if (ucs_type == 1) { 139 *code = "Unicode text, UTF-32, little-endian"; 140 *code_mime = "utf-32le"; 141 } else { 142 *code = "Unicode text, UTF-32, big-endian"; 143 *code_mime = "utf-32be"; 
144 } 145 DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); 146 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 147 if (ucs_type == 1) { 148 *code = "Unicode text, UTF-16, little-endian"; 149 *code_mime = "utf-16le"; 150 } else { 151 *code = "Unicode text, UTF-16, big-endian"; 152 *code_mime = "utf-16be"; 153 } 154 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 155 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 156 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 157 *code = "ISO-8859"; 158 *code_mime = "iso-8859-1"; 159 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 160 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 161 *code = "Non-ISO extended-ASCII"; 162 *code_mime = "unknown-8bit"; 163 } else { 164 from_ebcdic(buf, nbytes, nbuf); 165 166 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 167 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 168 *code = "EBCDIC"; 169 *code_mime = "ebcdic"; 170 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 171 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 172 *ulen)); 173 *code = "International EBCDIC"; 174 *code_mime = "ebcdic"; 175 } else { /* Doesn't look like text at all */ 176 DPRINTF(("binary\n")); 177 rv = 0; 178 *type = "binary"; 179 } 180 } 181 182 done: 183 free(nbuf); 184 if (ubuf == &udefbuf) 185 free(udefbuf); 186 187 return rv; 188 } 189 190 /* 191 * This table reflects a particular philosophy about what constitutes 192 * "text," and there is room for disagreement about it. 193 * 194 * Version 3.31 of the file command considered a file to be ASCII if 195 * each of its characters was approved by either the isascii() or 196 * isalpha() function. On most systems, this would mean that any 197 * file consisting only of characters in the range 0x00 ... 0x7F 198 * would be called ASCII text, but many systems might reasonably 199 * consider some characters outside this range to be alphabetic, 200 * so the file command would call such characters ASCII. 
It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  I exclude
 * vertical tab because it never seems to be used in real text.
 * (NB: the table as it currently stands nevertheless marks vertical
 * tab, 0x0b, as T, so this paragraph and the table disagree.)  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
230 * 231 * A file is considered to be ISO-8859 text if its characters are all 232 * either ASCII, according to the above definition, or printing characters 233 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 234 * 235 * Finally, a file is considered to be international text from some other 236 * character code if its characters are all either ISO-8859 (according to 237 * the above definition) or characters in the range 0x80 ... 0x9F, which 238 * ISO-8859 considers to be control characters but the IBM PC and Macintosh 239 * consider to be printing characters. 240 */ 241 242 #define F 0 /* character never appears in text */ 243 #define T 1 /* character appears in plain ASCII text */ 244 #define I 2 /* character appears in ISO-8859 text */ 245 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 246 247 private char text_chars[256] = { 248 /* BEL BS HT LF VT FF CR */ 249 F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */ 250 /* ESC */ 251 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 252 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 253 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 254 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 255 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 256 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 257 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 258 /* NEL */ 259 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 260 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 261 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 262 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 263 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 264 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 265 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 266 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 267 }; 268 269 #define LOOKS(NAME, COND) \ 270 
private int \ 271 looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \ 272 size_t *ulen) \ 273 { \ 274 size_t i, u; \ 275 unsigned char dist[256]; \ 276 memset(dist, 0, sizeof(dist)); \ 277 \ 278 *ulen = 0; \ 279 \ 280 for (i = 0; i < nbytes; i++) { \ 281 int t = text_chars[buf[i]]; \ 282 \ 283 if (COND) \ 284 return 0; \ 285 \ 286 ubuf[(*ulen)++] = buf[i]; \ 287 dist[buf[i]]++; \ 288 } \ 289 u = 0; \ 290 for (i = 0; i < __arraycount(dist); i++) { \ 291 if (dist[i]) \ 292 u++; \ 293 } \ 294 if (u < 3) \ 295 return 0; \ 296 \ 297 return 1; \ 298 } 299 300 LOOKS(ascii, t != T) 301 LOOKS(latin1, t != T && t != I) 302 LOOKS(extended, t != T && t != I && t != X) 303 304 /* 305 * Decide whether some text looks like UTF-8. Returns: 306 * 307 * -1: invalid UTF-8 308 * 0: uses odd control characters, so doesn't look like text 309 * 1: 7-bit text 310 * 2: definitely UTF-8 text (valid high-bit set bytes) 311 * 312 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 313 * ubuf must be big enough! 314 */ 315 316 // from: https://golang.org/src/unicode/utf8/utf8.go 317 318 #define XX 0xF1 // invalid: size 1 319 #define AS 0xF0 // ASCII: size 1 320 #define S1 0x02 // accept 0, size 2 321 #define S2 0x13 // accept 1, size 3 322 #define S3 0x03 // accept 0, size 3 323 #define S4 0x23 // accept 2, size 3 324 #define S5 0x34 // accept 3, size 4 325 #define S6 0x04 // accept 0, size 4 326 #define S7 0x44 // accept 4, size 4 327 328 #define LOCB 0x80 329 #define HICB 0xBF 330 331 // first is information about the first byte in a UTF-8 sequence. 
332 static const uint8_t first[] = { 333 // 1 2 3 4 5 6 7 8 9 A B C D E F 334 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F 335 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F 336 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F 337 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F 338 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F 339 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F 340 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F 341 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F 342 // 1 2 3 4 5 6 7 8 9 A B C D E F 343 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F 344 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F 345 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF 346 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF 347 XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF 348 S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF 349 S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF 350 S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF 351 }; 352 353 // acceptRange gives the range of valid values for the second byte in a UTF-8 354 // sequence. 355 struct accept_range { 356 uint8_t lo; // lowest value for second byte. 357 uint8_t hi; // highest value for second byte. 358 } accept_ranges[16] = { 359 // acceptRanges has size 16 to avoid bounds checks in the code that uses it. 
360 { LOCB, HICB }, 361 { 0xA0, HICB }, 362 { LOCB, 0x9F }, 363 { 0x90, HICB }, 364 { LOCB, 0x8F }, 365 }; 366 367 protected int 368 file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, 369 size_t *ulen) 370 { 371 size_t i; 372 int n; 373 file_unichar_t c; 374 int gotone = 0, ctrl = 0; 375 376 if (ubuf) 377 *ulen = 0; 378 379 for (i = 0; i < nbytes; i++) { 380 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 381 /* 382 * Even if the whole file is valid UTF-8 sequences, 383 * still reject it if it uses weird control characters. 384 */ 385 386 if (text_chars[buf[i]] != T) 387 ctrl = 1; 388 389 if (ubuf) 390 ubuf[(*ulen)++] = buf[i]; 391 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 392 return -1; 393 } else { /* 11xxxxxx begins UTF-8 */ 394 int following; 395 uint8_t x = first[buf[i]]; 396 const struct accept_range *ar = 397 &accept_ranges[(unsigned int)x >> 4]; 398 if (x == XX) 399 return -1; 400 401 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 402 c = buf[i] & 0x1f; 403 following = 1; 404 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 405 c = buf[i] & 0x0f; 406 following = 2; 407 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 408 c = buf[i] & 0x07; 409 following = 3; 410 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 411 c = buf[i] & 0x03; 412 following = 4; 413 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 414 c = buf[i] & 0x01; 415 following = 5; 416 } else 417 return -1; 418 419 for (n = 0; n < following; n++) { 420 i++; 421 if (i >= nbytes) 422 goto done; 423 424 if (n == 0 && 425 (buf[i] < ar->lo || buf[i] > ar->hi)) 426 return -1; 427 428 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 429 return -1; 430 431 c = (c << 6) + (buf[i] & 0x3f); 432 } 433 434 if (ubuf) 435 ubuf[(*ulen)++] = c; 436 gotone = 1; 437 } 438 } 439 done: 440 return ctrl ? 0 : (gotone ? 2 : 1); 441 } 442 443 /* 444 * Decide whether some text looks like UTF-8 with BOM. 
If there is no 445 * BOM, return -1; otherwise return the result of looks_utf8 on the 446 * rest of the text. 447 */ 448 private int 449 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, 450 file_unichar_t *ubuf, size_t *ulen) 451 { 452 if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 453 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 454 else 455 return -1; 456 } 457 458 private int 459 looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, 460 size_t *ulen) 461 { 462 if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v') 463 switch (buf[3]) { 464 case '8': 465 case '9': 466 case '+': 467 case '/': 468 if (ubuf) 469 *ulen = 0; 470 return 1; 471 default: 472 return -1; 473 } 474 else 475 return -1; 476 } 477 478 private int 479 looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, 480 size_t *ulen) 481 { 482 int bigend; 483 size_t i; 484 485 if (nbytes < 2) 486 return 0; 487 488 if (bf[0] == 0xff && bf[1] == 0xfe) 489 bigend = 0; 490 else if (bf[0] == 0xfe && bf[1] == 0xff) 491 bigend = 1; 492 else 493 return 0; 494 495 *ulen = 0; 496 497 for (i = 2; i + 1 < nbytes; i += 2) { 498 /* XXX fix to properly handle chars > 65536 */ 499 500 if (bigend) 501 ubf[(*ulen)++] = bf[i + 1] 502 | (CAST(file_unichar_t, bf[i]) << 8); 503 else 504 ubf[(*ulen)++] = bf[i] 505 | (CAST(file_unichar_t, bf[i + 1]) << 8); 506 507 if (ubf[*ulen - 1] == 0xfffe) 508 return 0; 509 if (ubf[*ulen - 1] < 128 && 510 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) 511 return 0; 512 } 513 514 return 1 + bigend; 515 } 516 517 private int 518 looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, 519 size_t *ulen) 520 { 521 int bigend; 522 size_t i; 523 524 if (nbytes < 4) 525 return 0; 526 527 if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) 528 bigend = 0; 529 else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) 530 bigend = 1; 531 else 532 return 0; 533 534 
*ulen = 0; 535 536 for (i = 4; i + 3 < nbytes; i += 4) { 537 /* XXX fix to properly handle chars > 65536 */ 538 539 if (bigend) 540 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3]) 541 | (CAST(file_unichar_t, bf[i + 2]) << 8) 542 | (CAST(file_unichar_t, bf[i + 1]) << 16) 543 | (CAST(file_unichar_t, bf[i]) << 24); 544 else 545 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0]) 546 | (CAST(file_unichar_t, bf[i + 1]) << 8) 547 | (CAST(file_unichar_t, bf[i + 2]) << 16) 548 | (CAST(file_unichar_t, bf[i + 3]) << 24); 549 550 if (ubf[*ulen - 1] == 0xfffe) 551 return 0; 552 if (ubf[*ulen - 1] < 128 && 553 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) 554 return 0; 555 } 556 557 return 1 + bigend; 558 } 559 #undef F 560 #undef T 561 #undef I 562 #undef X 563 564 /* 565 * This table maps each EBCDIC character to an (8-bit extended) ASCII 566 * character, as specified in the rationale for the dd(1) command in 567 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 568 * 569 * Unfortunately it does not seem to correspond exactly to any of the 570 * five variants of EBCDIC documented in IBM's _Enterprise Systems 571 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 572 * Edition, July, 1999, pp. I-1 - I-4. 573 * 574 * Fortunately, though, all versions of EBCDIC, including this one, agree 575 * on most of the printing characters that also appear in (7-bit) ASCII. 576 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 577 * 578 * Fortunately too, there is general agreement that codes 0x00 through 579 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 580 * remainder printing characters. 581 * 582 * This is sufficient to allow us to identify EBCDIC text and to distinguish 583 * between old-style and internationalized examples of text. 
584 */ 585 586 private unsigned char ebcdic_to_ascii[] = { 587 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 588 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 589 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 590 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 591 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 592 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 593 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 594 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 595 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 596 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 597 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 598 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 599 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 600 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 601 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 602 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 603 }; 604 605 #ifdef notdef 606 /* 607 * The following EBCDIC-to-ASCII table may relate more closely to reality, 608 * or at least to modern reality. It comes from 609 * 610 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 611 * 612 * and maps the characters of EBCDIC code page 1047 (the code used for 613 * Unix-derived software on IBM's 390 systems) to the corresponding 614 * characters from ISO 8859-1. 615 * 616 * If this table is used instead of the above one, some of the special 617 * cases for the NEL character can be taken out of the code. 
618 */ 619 620 private unsigned char ebcdic_1047_to_8859[] = { 621 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 622 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 623 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 624 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 625 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 626 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 627 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 628 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 629 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 630 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 631 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 632 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 633 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 634 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 635 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 636 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 637 }; 638 #endif 639 640 /* 641 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 642 */ 643 private void 644 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 645 { 646 size_t i; 647 648 for (i = 0; i < nbytes; i++) { 649 out[i] = ebcdic_to_ascii[buf[i]]; 650 } 651 } 652