1 /* $NetBSD: encoding.c,v 1.12 2023/08/18 19:00:11 christos Exp $ */ 2 3 /* 4 * Copyright (c) Ian F. Darwin 1986-1995. 5 * Software written by Ian F. Darwin and others; 6 * maintained 1995-present by Christos Zoulas and others. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice immediately at the beginning of the file, without modification, 13 * this list of conditions, and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 /* 31 * Encoding -- determine the character encoding of a text file. 32 * 33 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 34 * international characters. 
35 */ 36 37 #include "file.h" 38 39 #ifndef lint 40 #if 0 41 FILE_RCSID("@(#)$File: encoding.c,v 1.42 2022/12/26 17:31:14 christos Exp $") 42 #else 43 __RCSID("$NetBSD: encoding.c,v 1.12 2023/08/18 19:00:11 christos Exp $"); 44 #endif 45 #endif /* lint */ 46 47 #include "magic.h" 48 #include <string.h> 49 #include <stdlib.h> 50 51 52 file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *, 53 size_t *); 54 file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *, 55 size_t *); 56 file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *, 57 size_t *); 58 file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *, 59 size_t *); 60 file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *, 61 size_t *); 62 file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *, 63 size_t *); 64 file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *, 65 size_t *); 66 file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *); 67 68 #ifdef DEBUG_ENCODING 69 #define DPRINTF(a) printf a 70 #else 71 #define DPRINTF(a) 72 #endif 73 74 /* 75 * Try to determine whether text is in some character code we can 76 * identify. Each of these tests, if it succeeds, will leave 77 * the text converted into one-file_unichar_t-per-character Unicode in 78 * ubuf, and the number of characters converted in ulen. 
79 */ 80 file_protected int 81 file_encoding(struct magic_set *ms, const struct buffer *b, 82 file_unichar_t **ubuf, size_t *ulen, const char **code, 83 const char **code_mime, const char **type) 84 { 85 const unsigned char *buf = CAST(const unsigned char *, b->fbuf); 86 size_t nbytes = b->flen; 87 size_t mlen; 88 int rv = 1, ucs_type; 89 file_unichar_t *udefbuf; 90 size_t udeflen; 91 92 if (ubuf == NULL) 93 ubuf = &udefbuf; 94 if (ulen == NULL) 95 ulen = &udeflen; 96 97 *type = "text"; 98 *ulen = 0; 99 *code = "unknown"; 100 *code_mime = "binary"; 101 102 if (nbytes > ms->encoding_max) 103 nbytes = ms->encoding_max; 104 105 mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 106 *ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen)); 107 if (*ubuf == NULL) { 108 file_oomem(ms, mlen); 109 goto done; 110 } 111 if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 112 if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) { 113 DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); 114 *code = "Unicode text, UTF-7"; 115 *code_mime = "utf-7"; 116 } else { 117 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 118 *code = "ASCII"; 119 *code_mime = "us-ascii"; 120 } 121 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 122 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 123 *code = "Unicode text, UTF-8 (with BOM)"; 124 *code_mime = "utf-8"; 125 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 126 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 127 *code = "Unicode text, UTF-8"; 128 *code_mime = "utf-8"; 129 } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { 130 if (ucs_type == 1) { 131 *code = "Unicode text, UTF-32, little-endian"; 132 *code_mime = "utf-32le"; 133 } else { 134 *code = "Unicode text, UTF-32, big-endian"; 135 *code_mime = "utf-32be"; 136 } 137 DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); 138 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 139 if (ucs_type == 1) { 140 *code = "Unicode text, UTF-16, 
little-endian"; 141 *code_mime = "utf-16le"; 142 } else { 143 *code = "Unicode text, UTF-16, big-endian"; 144 *code_mime = "utf-16be"; 145 } 146 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 147 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 148 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 149 *code = "ISO-8859"; 150 *code_mime = "iso-8859-1"; 151 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 152 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 153 *code = "Non-ISO extended-ASCII"; 154 *code_mime = "unknown-8bit"; 155 } else { 156 unsigned char *nbuf; 157 158 mlen = (nbytes + 1) * sizeof(nbuf[0]); 159 if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) { 160 file_oomem(ms, mlen); 161 goto done; 162 } 163 from_ebcdic(buf, nbytes, nbuf); 164 165 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 166 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 167 *code = "EBCDIC"; 168 *code_mime = "ebcdic"; 169 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 170 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 171 *ulen)); 172 *code = "International EBCDIC"; 173 *code_mime = "ebcdic"; 174 } else { /* Doesn't look like text at all */ 175 DPRINTF(("binary\n")); 176 rv = 0; 177 *type = "binary"; 178 } 179 free(nbuf); 180 } 181 182 done: 183 if (ubuf == &udefbuf) 184 free(udefbuf); 185 186 return rv; 187 } 188 189 /* 190 * This table reflects a particular philosophy about what constitutes 191 * "text," and there is room for disagreement about it. 192 * 193 * Version 3.31 of the file command considered a file to be ASCII if 194 * each of its characters was approved by either the isascii() or 195 * isalpha() function. On most systems, this would mean that any 196 * file consisting only of characters in the range 0x00 ... 0x7F 197 * would be called ASCII text, but many systems might reasonably 198 * consider some characters outside this range to be alphabetic, 199 * so the file command would call such characters ASCII. 
It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  I exclude
 * vertical tab because it never seems to be used in real text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
229 * 230 * A file is considered to be ISO-8859 text if its characters are all 231 * either ASCII, according to the above definition, or printing characters 232 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 233 * 234 * Finally, a file is considered to be international text from some other 235 * character code if its characters are all either ISO-8859 (according to 236 * the above definition) or characters in the range 0x80 ... 0x9F, which 237 * ISO-8859 considers to be control characters but the IBM PC and Macintosh 238 * consider to be printing characters. 239 */ 240 241 #define F 0 /* character never appears in text */ 242 #define T 1 /* character appears in plain ASCII text */ 243 #define I 2 /* character appears in ISO-8859 text */ 244 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 245 246 file_private char text_chars[256] = { 247 /* BEL BS HT LF VT FF CR */ 248 F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */ 249 /* ESC */ 250 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 251 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 252 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 253 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 254 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 255 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 256 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 257 /* NEL */ 258 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 259 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 260 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 261 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 262 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 263 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 264 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 265 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 266 }; 267 268 #define LOOKS(NAME, COND) \ 
269 file_private int \ 270 looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \ 271 size_t *ulen) \ 272 { \ 273 size_t i; \ 274 \ 275 *ulen = 0; \ 276 \ 277 for (i = 0; i < nbytes; i++) { \ 278 int t = text_chars[buf[i]]; \ 279 \ 280 if (COND) \ 281 return 0; \ 282 \ 283 ubuf[(*ulen)++] = buf[i]; \ 284 } \ 285 return 1; \ 286 } 287 288 LOOKS(ascii, t != T) 289 LOOKS(latin1, t != T && t != I) 290 LOOKS(extended, t != T && t != I && t != X) 291 292 /* 293 * Decide whether some text looks like UTF-8. Returns: 294 * 295 * -1: invalid UTF-8 296 * 0: uses odd control characters, so doesn't look like text 297 * 1: 7-bit text 298 * 2: definitely UTF-8 text (valid high-bit set bytes) 299 * 300 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 301 * ubuf must be big enough! 302 */ 303 304 // from: https://golang.org/src/unicode/utf8/utf8.go 305 306 #define XX 0xF1 // invalid: size 1 307 #define AS 0xF0 // ASCII: size 1 308 #define S1 0x02 // accept 0, size 2 309 #define S2 0x13 // accept 1, size 3 310 #define S3 0x03 // accept 0, size 3 311 #define S4 0x23 // accept 2, size 3 312 #define S5 0x34 // accept 3, size 4 313 #define S6 0x04 // accept 0, size 4 314 #define S7 0x44 // accept 4, size 4 315 316 #define LOCB 0x80 317 #define HICB 0xBF 318 319 // first is information about the first byte in a UTF-8 sequence. 
320 static const uint8_t first[] = { 321 // 1 2 3 4 5 6 7 8 9 A B C D E F 322 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F 323 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F 324 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F 325 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F 326 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F 327 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F 328 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F 329 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F 330 // 1 2 3 4 5 6 7 8 9 A B C D E F 331 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F 332 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F 333 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF 334 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF 335 XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF 336 S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF 337 S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF 338 S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF 339 }; 340 341 // acceptRange gives the range of valid values for the second byte in a UTF-8 342 // sequence. 343 struct accept_range { 344 uint8_t lo; // lowest value for second byte. 345 uint8_t hi; // highest value for second byte. 346 } accept_ranges[16] = { 347 // acceptRanges has size 16 to avoid bounds checks in the code that uses it. 
348 { LOCB, HICB }, 349 { 0xA0, HICB }, 350 { LOCB, 0x9F }, 351 { 0x90, HICB }, 352 { LOCB, 0x8F }, 353 }; 354 355 file_protected int 356 file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, 357 size_t *ulen) 358 { 359 size_t i; 360 int n; 361 file_unichar_t c; 362 int gotone = 0, ctrl = 0; 363 364 if (ubuf) 365 *ulen = 0; 366 367 for (i = 0; i < nbytes; i++) { 368 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 369 /* 370 * Even if the whole file is valid UTF-8 sequences, 371 * still reject it if it uses weird control characters. 372 */ 373 374 if (text_chars[buf[i]] != T) 375 ctrl = 1; 376 377 if (ubuf) 378 ubuf[(*ulen)++] = buf[i]; 379 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 380 return -1; 381 } else { /* 11xxxxxx begins UTF-8 */ 382 int following; 383 uint8_t x = first[buf[i]]; 384 const struct accept_range *ar = 385 &accept_ranges[(unsigned int)x >> 4]; 386 if (x == XX) 387 return -1; 388 389 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 390 c = buf[i] & 0x1f; 391 following = 1; 392 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 393 c = buf[i] & 0x0f; 394 following = 2; 395 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 396 c = buf[i] & 0x07; 397 following = 3; 398 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 399 c = buf[i] & 0x03; 400 following = 4; 401 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 402 c = buf[i] & 0x01; 403 following = 5; 404 } else 405 return -1; 406 407 for (n = 0; n < following; n++) { 408 i++; 409 if (i >= nbytes) 410 goto done; 411 412 if (n == 0 && 413 (buf[i] < ar->lo || buf[i] > ar->hi)) 414 return -1; 415 416 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 417 return -1; 418 419 c = (c << 6) + (buf[i] & 0x3f); 420 } 421 422 if (ubuf) 423 ubuf[(*ulen)++] = c; 424 gotone = 1; 425 } 426 } 427 done: 428 return ctrl ? 0 : (gotone ? 2 : 1); 429 } 430 431 /* 432 * Decide whether some text looks like UTF-8 with BOM. 
If there is no 433 * BOM, return -1; otherwise return the result of looks_utf8 on the 434 * rest of the text. 435 */ 436 file_private int 437 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, 438 file_unichar_t *ubuf, size_t *ulen) 439 { 440 if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 441 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 442 else 443 return -1; 444 } 445 446 file_private int 447 looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, 448 size_t *ulen) 449 { 450 if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v') 451 switch (buf[3]) { 452 case '8': 453 case '9': 454 case '+': 455 case '/': 456 if (ubuf) 457 *ulen = 0; 458 return 1; 459 default: 460 return -1; 461 } 462 else 463 return -1; 464 } 465 466 #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef) 467 #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff) 468 #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff) 469 470 file_private int 471 looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, 472 size_t *ulen) 473 { 474 int bigend; 475 uint32_t hi; 476 size_t i; 477 478 if (nbytes < 2) 479 return 0; 480 481 if (bf[0] == 0xff && bf[1] == 0xfe) 482 bigend = 0; 483 else if (bf[0] == 0xfe && bf[1] == 0xff) 484 bigend = 1; 485 else 486 return 0; 487 488 *ulen = 0; 489 hi = 0; 490 491 for (i = 2; i + 1 < nbytes; i += 2) { 492 uint32_t uc; 493 494 if (bigend) 495 uc = CAST(uint32_t, 496 bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8)); 497 else 498 uc = CAST(uint32_t, 499 bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8)); 500 501 uc &= 0xffff; 502 503 switch (uc) { 504 case 0xfffe: 505 case 0xffff: 506 return 0; 507 default: 508 if (UCS16_NOCHAR(uc)) 509 return 0; 510 break; 511 } 512 if (hi) { 513 if (!UCS16_LOSURR(uc)) 514 return 0; 515 uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00); 516 hi = 0; 517 } 518 if (uc < 128 && text_chars[CAST(size_t, uc)] != T) 519 return 0; 520 ubf[(*ulen)++] = uc; 
521 if (UCS16_HISURR(uc)) 522 hi = uc - 0xd800 + 1; 523 if (UCS16_LOSURR(uc)) 524 return 0; 525 } 526 527 return 1 + bigend; 528 } 529 530 file_private int 531 looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, 532 size_t *ulen) 533 { 534 int bigend; 535 size_t i; 536 537 if (nbytes < 4) 538 return 0; 539 540 if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) 541 bigend = 0; 542 else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) 543 bigend = 1; 544 else 545 return 0; 546 547 *ulen = 0; 548 549 for (i = 4; i + 3 < nbytes; i += 4) { 550 /* XXX fix to properly handle chars > 65536 */ 551 552 if (bigend) 553 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3]) 554 | (CAST(file_unichar_t, bf[i + 2]) << 8) 555 | (CAST(file_unichar_t, bf[i + 1]) << 16) 556 | (CAST(file_unichar_t, bf[i]) << 24); 557 else 558 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0]) 559 | (CAST(file_unichar_t, bf[i + 1]) << 8) 560 | (CAST(file_unichar_t, bf[i + 2]) << 16) 561 | (CAST(file_unichar_t, bf[i + 3]) << 24); 562 563 if (ubf[*ulen - 1] == 0xfffe) 564 return 0; 565 if (ubf[*ulen - 1] < 128 && 566 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) 567 return 0; 568 } 569 570 return 1 + bigend; 571 } 572 #undef F 573 #undef T 574 #undef I 575 #undef X 576 577 /* 578 * This table maps each EBCDIC character to an (8-bit extended) ASCII 579 * character, as specified in the rationale for the dd(1) command in 580 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 581 * 582 * Unfortunately it does not seem to correspond exactly to any of the 583 * five variants of EBCDIC documented in IBM's _Enterprise Systems 584 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 585 * Edition, July, 1999, pp. I-1 - I-4. 586 * 587 * Fortunately, though, all versions of EBCDIC, including this one, agree 588 * on most of the printing characters that also appear in (7-bit) ASCII. 
589 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 590 * 591 * Fortunately too, there is general agreement that codes 0x00 through 592 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 593 * remainder printing characters. 594 * 595 * This is sufficient to allow us to identify EBCDIC text and to distinguish 596 * between old-style and internationalized examples of text. 597 */ 598 599 file_private unsigned char ebcdic_to_ascii[] = { 600 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 601 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 602 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 603 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 604 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 605 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 606 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 607 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 608 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 609 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 610 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 611 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 612 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 613 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 614 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 615 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 616 }; 617 618 #ifdef notdef 619 /* 620 * The following EBCDIC-to-ASCII table may relate more closely to reality, 621 * or at least to modern reality. 
It comes from 622 * 623 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 624 * 625 * and maps the characters of EBCDIC code page 1047 (the code used for 626 * Unix-derived software on IBM's 390 systems) to the corresponding 627 * characters from ISO 8859-1. 628 * 629 * If this table is used instead of the above one, some of the special 630 * cases for the NEL character can be taken out of the code. 631 */ 632 633 file_private unsigned char ebcdic_1047_to_8859[] = { 634 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 635 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 636 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 637 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 638 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 639 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 640 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 641 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 642 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 643 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 644 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 645 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 646 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 647 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 648 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 649 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 650 }; 651 #endif 652 653 /* 654 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 
655 */ 656 file_private void 657 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 658 { 659 size_t i; 660 661 for (i = 0; i < nbytes; i++) { 662 out[i] = ebcdic_to_ascii[buf[i]]; 663 } 664 } 665