1 /* $NetBSD: encoding.c,v 1.8 2019/05/22 17:26:05 christos Exp $ */ 2 3 /* 4 * Copyright (c) Ian F. Darwin 1986-1995. 5 * Software written by Ian F. Darwin and others; 6 * maintained 1995-present by Christos Zoulas and others. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice immediately at the beginning of the file, without modification, 13 * this list of conditions, and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 /* 31 * Encoding -- determine the character encoding of a text file. 32 * 33 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 34 * international characters. 
35 */ 36 37 #include "file.h" 38 39 #ifndef lint 40 #if 0 41 FILE_RCSID("@(#)$File: encoding.c,v 1.20 2019/04/15 16:48:41 christos Exp $") 42 #else 43 __RCSID("$NetBSD: encoding.c,v 1.8 2019/05/22 17:26:05 christos Exp $"); 44 #endif 45 #endif /* lint */ 46 47 #include "magic.h" 48 #include <string.h> 49 #include <memory.h> 50 #include <stdlib.h> 51 52 53 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 54 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, 55 size_t *); 56 private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *); 57 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); 58 private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *); 59 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 60 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 61 private void from_ebcdic(const unsigned char *, size_t, unsigned char *); 62 63 #ifdef DEBUG_ENCODING 64 #define DPRINTF(a) printf a 65 #else 66 #define DPRINTF(a) 67 #endif 68 69 /* 70 * Try to determine whether text is in some character code we can 71 * identify. Each of these tests, if it succeeds, will leave 72 * the text converted into one-unichar-per-character Unicode in 73 * ubuf, and the number of characters converted in ulen. 
74 */ 75 protected int 76 file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf, 77 size_t *ulen, const char **code, const char **code_mime, const char **type) 78 { 79 const unsigned char *buf = CAST(const unsigned char *, b->fbuf); 80 size_t nbytes = b->flen; 81 size_t mlen; 82 int rv = 1, ucs_type; 83 unsigned char *nbuf = NULL; 84 unichar *udefbuf; 85 size_t udeflen; 86 87 if (ubuf == NULL) 88 ubuf = &udefbuf; 89 if (ulen == NULL) 90 ulen = &udeflen; 91 92 *type = "text"; 93 *ulen = 0; 94 *code = "unknown"; 95 *code_mime = "binary"; 96 97 mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 98 if ((*ubuf = CAST(unichar *, calloc(CAST(size_t, 1), mlen))) == NULL) { 99 file_oomem(ms, mlen); 100 goto done; 101 } 102 mlen = (nbytes + 1) * sizeof(nbuf[0]); 103 if ((nbuf = CAST(unsigned char *, 104 calloc(CAST(size_t, 1), mlen))) == NULL) { 105 file_oomem(ms, mlen); 106 goto done; 107 } 108 109 if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 110 if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) { 111 DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); 112 *code = "UTF-7 Unicode"; 113 *code_mime = "utf-7"; 114 } else { 115 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 116 *code = "ASCII"; 117 *code_mime = "us-ascii"; 118 } 119 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 120 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 121 *code = "UTF-8 Unicode (with BOM)"; 122 *code_mime = "utf-8"; 123 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 124 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 125 *code = "UTF-8 Unicode"; 126 *code_mime = "utf-8"; 127 } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { 128 if (ucs_type == 1) { 129 *code = "Little-endian UTF-32 Unicode"; 130 *code_mime = "utf-32le"; 131 } else { 132 *code = "Big-endian UTF-32 Unicode"; 133 *code_mime = "utf-32be"; 134 } 135 DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); 136 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 137 if 
(ucs_type == 1) { 138 *code = "Little-endian UTF-16 Unicode"; 139 *code_mime = "utf-16le"; 140 } else { 141 *code = "Big-endian UTF-16 Unicode"; 142 *code_mime = "utf-16be"; 143 } 144 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 145 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 146 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 147 *code = "ISO-8859"; 148 *code_mime = "iso-8859-1"; 149 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 150 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 151 *code = "Non-ISO extended-ASCII"; 152 *code_mime = "unknown-8bit"; 153 } else { 154 from_ebcdic(buf, nbytes, nbuf); 155 156 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 157 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 158 *code = "EBCDIC"; 159 *code_mime = "ebcdic"; 160 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 161 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 162 *ulen)); 163 *code = "International EBCDIC"; 164 *code_mime = "ebcdic"; 165 } else { /* Doesn't look like text at all */ 166 DPRINTF(("binary\n")); 167 rv = 0; 168 *type = "binary"; 169 } 170 } 171 172 done: 173 free(nbuf); 174 if (ubuf == &udefbuf) 175 free(udefbuf); 176 177 return rv; 178 } 179 180 /* 181 * This table reflects a particular philosophy about what constitutes 182 * "text," and there is room for disagreement about it. 183 * 184 * Version 3.31 of the file command considered a file to be ASCII if 185 * each of its characters was approved by either the isascii() or 186 * isalpha() function. On most systems, this would mean that any 187 * file consisting only of characters in the range 0x00 ... 0x7F 188 * would be called ASCII text, but many systems might reasonably 189 * consider some characters outside this range to be alphabetic, 190 * so the file command would call such characters ASCII. It might 191 * have been more accurate to call this "considered textual on the 192 * local system" than "ASCII." 
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab was originally excluded because it never seems to be used in real
 * text, but the table as it stands marks it (0x0B) as text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
224 * 225 * Finally, a file is considered to be international text from some other 226 * character code if its characters are all either ISO-8859 (according to 227 * the above definition) or characters in the range 0x80 ... 0x9F, which 228 * ISO-8859 considers to be control characters but the IBM PC and Macintosh 229 * consider to be printing characters. 230 */ 231 232 #define F 0 /* character never appears in text */ 233 #define T 1 /* character appears in plain ASCII text */ 234 #define I 2 /* character appears in ISO-8859 text */ 235 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 236 237 private char text_chars[256] = { 238 /* BEL BS HT LF VT FF CR */ 239 F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */ 240 /* ESC */ 241 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 242 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 243 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 244 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 245 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 246 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 247 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 248 /* NEL */ 249 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 250 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 251 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 252 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 253 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 254 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 255 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 256 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 257 }; 258 259 private int 260 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, 261 size_t *ulen) 262 { 263 size_t i; 264 265 *ulen = 0; 266 267 for (i = 0; i < nbytes; i++) { 268 int t = text_chars[buf[i]]; 269 270 if (t != T) 271 return 0; 272 273 
ubuf[(*ulen)++] = buf[i]; 274 } 275 276 return 1; 277 } 278 279 private int 280 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 281 { 282 size_t i; 283 284 *ulen = 0; 285 286 for (i = 0; i < nbytes; i++) { 287 int t = text_chars[buf[i]]; 288 289 if (t != T && t != I) 290 return 0; 291 292 ubuf[(*ulen)++] = buf[i]; 293 } 294 295 return 1; 296 } 297 298 private int 299 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, 300 size_t *ulen) 301 { 302 size_t i; 303 304 *ulen = 0; 305 306 for (i = 0; i < nbytes; i++) { 307 int t = text_chars[buf[i]]; 308 309 if (t != T && t != I && t != X) 310 return 0; 311 312 ubuf[(*ulen)++] = buf[i]; 313 } 314 315 return 1; 316 } 317 318 /* 319 * Decide whether some text looks like UTF-8. Returns: 320 * 321 * -1: invalid UTF-8 322 * 0: uses odd control characters, so doesn't look like text 323 * 1: 7-bit text 324 * 2: definitely UTF-8 text (valid high-bit set bytes) 325 * 326 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 327 * ubuf must be big enough! 328 */ 329 protected int 330 file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 331 { 332 size_t i; 333 int n; 334 unichar c; 335 int gotone = 0, ctrl = 0; 336 337 if (ubuf) 338 *ulen = 0; 339 340 for (i = 0; i < nbytes; i++) { 341 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 342 /* 343 * Even if the whole file is valid UTF-8 sequences, 344 * still reject it if it uses weird control characters. 
345 */ 346 347 if (text_chars[buf[i]] != T) 348 ctrl = 1; 349 350 if (ubuf) 351 ubuf[(*ulen)++] = buf[i]; 352 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 353 return -1; 354 } else { /* 11xxxxxx begins UTF-8 */ 355 int following; 356 357 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 358 c = buf[i] & 0x1f; 359 following = 1; 360 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 361 c = buf[i] & 0x0f; 362 following = 2; 363 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 364 c = buf[i] & 0x07; 365 following = 3; 366 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 367 c = buf[i] & 0x03; 368 following = 4; 369 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 370 c = buf[i] & 0x01; 371 following = 5; 372 } else 373 return -1; 374 375 for (n = 0; n < following; n++) { 376 i++; 377 if (i >= nbytes) 378 goto done; 379 380 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 381 return -1; 382 383 c = (c << 6) + (buf[i] & 0x3f); 384 } 385 386 if (ubuf) 387 ubuf[(*ulen)++] = c; 388 gotone = 1; 389 } 390 } 391 done: 392 return ctrl ? 0 : (gotone ? 2 : 1); 393 } 394 395 /* 396 * Decide whether some text looks like UTF-8 with BOM. If there is no 397 * BOM, return -1; otherwise return the result of looks_utf8 on the 398 * rest of the text. 
399 */ 400 private int 401 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf, 402 size_t *ulen) 403 { 404 if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 405 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 406 else 407 return -1; 408 } 409 410 private int 411 looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 412 { 413 if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v') 414 switch (buf[3]) { 415 case '8': 416 case '9': 417 case '+': 418 case '/': 419 if (ubuf) 420 *ulen = 0; 421 return 1; 422 default: 423 return -1; 424 } 425 else 426 return -1; 427 } 428 429 private int 430 looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf, 431 size_t *ulen) 432 { 433 int bigend; 434 size_t i; 435 436 if (nbytes < 2) 437 return 0; 438 439 if (bf[0] == 0xff && bf[1] == 0xfe) 440 bigend = 0; 441 else if (bf[0] == 0xfe && bf[1] == 0xff) 442 bigend = 1; 443 else 444 return 0; 445 446 *ulen = 0; 447 448 for (i = 2; i + 1 < nbytes; i += 2) { 449 /* XXX fix to properly handle chars > 65536 */ 450 451 if (bigend) 452 ubf[(*ulen)++] = bf[i + 1] 453 | (CAST(unichar, bf[i]) << 8); 454 else 455 ubf[(*ulen)++] = bf[i] 456 | (CAST(unichar, bf[i + 1]) << 8); 457 458 if (ubf[*ulen - 1] == 0xfffe) 459 return 0; 460 if (ubf[*ulen - 1] < 128 && 461 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) 462 return 0; 463 } 464 465 return 1 + bigend; 466 } 467 468 private int 469 looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf, 470 size_t *ulen) 471 { 472 int bigend; 473 size_t i; 474 475 if (nbytes < 4) 476 return 0; 477 478 if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) 479 bigend = 0; 480 else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) 481 bigend = 1; 482 else 483 return 0; 484 485 *ulen = 0; 486 487 for (i = 4; i + 3 < nbytes; i += 4) { 488 /* XXX fix to properly handle chars > 65536 */ 489 490 if (bigend) 491 ubf[(*ulen)++] = 
CAST(unichar, bf[i + 3]) 492 | (CAST(unichar, bf[i + 2]) << 8) 493 | (CAST(unichar, bf[i + 1]) << 16) 494 | (CAST(unichar, bf[i]) << 24); 495 else 496 ubf[(*ulen)++] = CAST(unichar, bf[i + 0]) 497 | (CAST(unichar, bf[i + 1]) << 8) 498 | (CAST(unichar, bf[i + 2]) << 16) 499 | (CAST(unichar, bf[i + 3]) << 24); 500 501 if (ubf[*ulen - 1] == 0xfffe) 502 return 0; 503 if (ubf[*ulen - 1] < 128 && 504 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) 505 return 0; 506 } 507 508 return 1 + bigend; 509 } 510 #undef F 511 #undef T 512 #undef I 513 #undef X 514 515 /* 516 * This table maps each EBCDIC character to an (8-bit extended) ASCII 517 * character, as specified in the rationale for the dd(1) command in 518 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 519 * 520 * Unfortunately it does not seem to correspond exactly to any of the 521 * five variants of EBCDIC documented in IBM's _Enterprise Systems 522 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 523 * Edition, July, 1999, pp. I-1 - I-4. 524 * 525 * Fortunately, though, all versions of EBCDIC, including this one, agree 526 * on most of the printing characters that also appear in (7-bit) ASCII. 527 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 528 * 529 * Fortunately too, there is general agreement that codes 0x00 through 530 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 531 * remainder printing characters. 532 * 533 * This is sufficient to allow us to identify EBCDIC text and to distinguish 534 * between old-style and internationalized examples of text. 
535 */ 536 537 private unsigned char ebcdic_to_ascii[] = { 538 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 539 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 540 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 541 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 542 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 543 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 544 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 545 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 546 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 547 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 548 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 549 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 550 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 551 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 552 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 553 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 554 }; 555 556 #ifdef notdef 557 /* 558 * The following EBCDIC-to-ASCII table may relate more closely to reality, 559 * or at least to modern reality. It comes from 560 * 561 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 562 * 563 * and maps the characters of EBCDIC code page 1047 (the code used for 564 * Unix-derived software on IBM's 390 systems) to the corresponding 565 * characters from ISO 8859-1. 566 * 567 * If this table is used instead of the above one, some of the special 568 * cases for the NEL character can be taken out of the code. 
569 */ 570 571 private unsigned char ebcdic_1047_to_8859[] = { 572 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 573 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 574 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 575 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 576 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 577 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 578 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 579 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 580 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 581 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 582 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 583 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 584 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 585 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 586 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 587 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 588 }; 589 #endif 590 591 /* 592 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 593 */ 594 private void 595 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 596 { 597 size_t i; 598 599 for (i = 0; i < nbytes; i++) { 600 out[i] = ebcdic_to_ascii[buf[i]]; 601 } 602 } 603