1 /* $NetBSD: encoding.c,v 1.1.1.2 2011/05/12 20:46:52 christos Exp $ */ 2 3 /* 4 * Copyright (c) Ian F. Darwin 1986-1995. 5 * Software written by Ian F. Darwin and others; 6 * maintained 1995-present by Christos Zoulas and others. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice immediately at the beginning of the file, without modification, 13 * this list of conditions, and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 /* 31 * Encoding -- determine the character encoding of a text file. 32 * 33 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 34 * international characters. 
 */

#include "file.h"

#ifndef lint
#if 0
FILE_RCSID("@(#)$File: encoding.c,v 1.5 2010/07/21 16:47:17 christos Exp $")
#else
__RCSID("$NetBSD: encoding.c,v 1.1.1.2 2011/05/12 20:46:52 christos Exp $");
#endif
#endif /* lint */

#include "magic.h"
#include <string.h>
#include <memory.h>
#include <stdlib.h>


/*
 * Forward declarations of the file-local detectors and the EBCDIC
 * translator.  Each detector takes the input bytes and their count, an
 * output unichar buffer, and a pointer to the output character count.
 */
private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
    size_t *);
private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);

/*
 * Debug tracing.  With DEBUG_ENCODING defined, DPRINTF((fmt, ...)) expands
 * to printf(fmt, ...); otherwise it expands to nothing.  The argument must
 * therefore be a complete, parenthesised printf argument list.
 */
#ifdef DEBUG_ENCODING
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif

/*
 * Try to determine whether text is in some character code we can
 * identify.  Each of these tests, if it succeeds, will leave
 * the text converted into one-unichar-per-character Unicode in
 * ubuf, and the number of characters converted in ulen.
72 */ 73 protected int 74 file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) 75 { 76 size_t mlen; 77 int rv = 1, ucs_type; 78 unsigned char *nbuf = NULL; 79 80 mlen = (nbytes + 1) * sizeof(nbuf[0]); 81 if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) { 82 file_oomem(ms, mlen); 83 goto done; 84 } 85 mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 86 if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) { 87 file_oomem(ms, mlen); 88 goto done; 89 } 90 91 *type = "text"; 92 if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 93 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 94 *code = "ASCII"; 95 *code_mime = "us-ascii"; 96 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 97 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 98 *code = "UTF-8 Unicode (with BOM)"; 99 *code_mime = "utf-8"; 100 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 101 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 102 *code = "UTF-8 Unicode (with BOM)"; 103 *code = "UTF-8 Unicode"; 104 *code_mime = "utf-8"; 105 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 106 if (ucs_type == 1) { 107 *code = "Little-endian UTF-16 Unicode"; 108 *code_mime = "utf-16le"; 109 } else { 110 *code = "Big-endian UTF-16 Unicode"; 111 *code_mime = "utf-16be"; 112 } 113 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 114 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 115 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 116 *code = "ISO-8859"; 117 *code_mime = "iso-8859-1"; 118 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 119 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 120 *code = "Non-ISO extended-ASCII"; 121 *code_mime = "unknown-8bit"; 122 } else { 123 from_ebcdic(buf, nbytes, nbuf); 124 125 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 126 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 127 *code = 
"EBCDIC"; 128 *code_mime = "ebcdic"; 129 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 130 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 131 *ulen)); 132 *code = "International EBCDIC"; 133 *code_mime = "ebcdic"; 134 } else { /* Doesn't look like text at all */ 135 DPRINTF(("binary\n")); 136 rv = 0; 137 *type = "binary"; 138 } 139 } 140 141 done: 142 if (nbuf) 143 free(nbuf); 144 145 return rv; 146 } 147 148 /* 149 * This table reflects a particular philosophy about what constitutes 150 * "text," and there is room for disagreement about it. 151 * 152 * Version 3.31 of the file command considered a file to be ASCII if 153 * each of its characters was approved by either the isascii() or 154 * isalpha() function. On most systems, this would mean that any 155 * file consisting only of characters in the range 0x00 ... 0x7F 156 * would be called ASCII text, but many systems might reasonably 157 * consider some characters outside this range to be alphabetic, 158 * so the file command would call such characters ASCII. It might 159 * have been more accurate to call this "considered textual on the 160 * local system" than "ASCII." 161 * 162 * It considered a file to be "International language text" if each 163 * of its characters was either an ASCII printing character (according 164 * to the real ASCII standard, not the above test), a character in 165 * the range 0x80 ... 0xFF, or one of the following control characters: 166 * backspace, tab, line feed, vertical tab, form feed, carriage return, 167 * escape. No attempt was made to determine the language in which files 168 * of this type were written. 169 * 170 * 171 * The table below considers a file to be ASCII if all of its characters 172 * are either ASCII printing characters (again, according to the X3.4 173 * standard, not isascii()) or any of the following controls: bell, 174 * backspace, tab, line feed, form feed, carriage return, esc, nextline. 
175 * 176 * I include bell because some programs (particularly shell scripts) 177 * use it literally, even though it is rare in normal text. I exclude 178 * vertical tab because it never seems to be used in real text. I also 179 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85), 180 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline 181 * character to. It might be more appropriate to include it in the 8859 182 * set instead of the ASCII set, but it's got to be included in *something* 183 * we recognize or EBCDIC files aren't going to be considered textual. 184 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek 185 * and Latin characters, so these should possibly be allowed. But they 186 * make a real mess on VT100-style displays if they're not paired properly, 187 * so we are probably better off not calling them text. 188 * 189 * A file is considered to be ISO-8859 text if its characters are all 190 * either ASCII, according to the above definition, or printing characters 191 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 192 * 193 * Finally, a file is considered to be international text from some other 194 * character code if its characters are all either ISO-8859 (according to 195 * the above definition) or characters in the range 0x80 ... 0x9F, which 196 * ISO-8859 considers to be control characters but the IBM PC and Macintosh 197 * consider to be printing characters. 
198 */ 199 200 #define F 0 /* character never appears in text */ 201 #define T 1 /* character appears in plain ASCII text */ 202 #define I 2 /* character appears in ISO-8859 text */ 203 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 204 205 private char text_chars[256] = { 206 /* BEL BS HT LF FF CR */ 207 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 208 /* ESC */ 209 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 210 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 211 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 212 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 213 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 214 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 215 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 216 /* NEL */ 217 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 218 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 219 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 220 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 221 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 222 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 223 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 224 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 225 }; 226 227 private int 228 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, 229 size_t *ulen) 230 { 231 size_t i; 232 233 *ulen = 0; 234 235 for (i = 0; i < nbytes; i++) { 236 int t = text_chars[buf[i]]; 237 238 if (t != T) 239 return 0; 240 241 ubuf[(*ulen)++] = buf[i]; 242 } 243 244 return 1; 245 } 246 247 private int 248 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 249 { 250 size_t i; 251 252 *ulen = 0; 253 254 for (i = 0; i < nbytes; i++) { 255 int t = text_chars[buf[i]]; 256 257 if (t != T && t != I) 258 return 0; 259 260 ubuf[(*ulen)++] = buf[i]; 261 } 262 
263 return 1; 264 } 265 266 private int 267 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, 268 size_t *ulen) 269 { 270 size_t i; 271 272 *ulen = 0; 273 274 for (i = 0; i < nbytes; i++) { 275 int t = text_chars[buf[i]]; 276 277 if (t != T && t != I && t != X) 278 return 0; 279 280 ubuf[(*ulen)++] = buf[i]; 281 } 282 283 return 1; 284 } 285 286 /* 287 * Decide whether some text looks like UTF-8. Returns: 288 * 289 * -1: invalid UTF-8 290 * 0: uses odd control characters, so doesn't look like text 291 * 1: 7-bit text 292 * 2: definitely UTF-8 text (valid high-bit set bytes) 293 * 294 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 295 * ubuf must be big enough! 296 */ 297 protected int 298 file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 299 { 300 size_t i; 301 int n; 302 unichar c; 303 int gotone = 0, ctrl = 0; 304 305 if (ubuf) 306 *ulen = 0; 307 308 for (i = 0; i < nbytes; i++) { 309 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 310 /* 311 * Even if the whole file is valid UTF-8 sequences, 312 * still reject it if it uses weird control characters. 
313 */ 314 315 if (text_chars[buf[i]] != T) 316 ctrl = 1; 317 318 if (ubuf) 319 ubuf[(*ulen)++] = buf[i]; 320 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 321 return -1; 322 } else { /* 11xxxxxx begins UTF-8 */ 323 int following; 324 325 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 326 c = buf[i] & 0x1f; 327 following = 1; 328 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 329 c = buf[i] & 0x0f; 330 following = 2; 331 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 332 c = buf[i] & 0x07; 333 following = 3; 334 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 335 c = buf[i] & 0x03; 336 following = 4; 337 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 338 c = buf[i] & 0x01; 339 following = 5; 340 } else 341 return -1; 342 343 for (n = 0; n < following; n++) { 344 i++; 345 if (i >= nbytes) 346 goto done; 347 348 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 349 return -1; 350 351 c = (c << 6) + (buf[i] & 0x3f); 352 } 353 354 if (ubuf) 355 ubuf[(*ulen)++] = c; 356 gotone = 1; 357 } 358 } 359 done: 360 return ctrl ? 0 : (gotone ? 2 : 1); 361 } 362 363 /* 364 * Decide whether some text looks like UTF-8 with BOM. If there is no 365 * BOM, return -1; otherwise return the result of looks_utf8 on the 366 * rest of the text. 
367 */ 368 private int 369 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf, 370 size_t *ulen) 371 { 372 if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 373 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 374 else 375 return -1; 376 } 377 378 private int 379 looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, 380 size_t *ulen) 381 { 382 int bigend; 383 size_t i; 384 385 if (nbytes < 2) 386 return 0; 387 388 if (buf[0] == 0xff && buf[1] == 0xfe) 389 bigend = 0; 390 else if (buf[0] == 0xfe && buf[1] == 0xff) 391 bigend = 1; 392 else 393 return 0; 394 395 *ulen = 0; 396 397 for (i = 2; i + 1 < nbytes; i += 2) { 398 /* XXX fix to properly handle chars > 65536 */ 399 400 if (bigend) 401 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 402 else 403 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 404 405 if (ubuf[*ulen - 1] == 0xfffe) 406 return 0; 407 if (ubuf[*ulen - 1] < 128 && 408 text_chars[(size_t)ubuf[*ulen - 1]] != T) 409 return 0; 410 } 411 412 return 1 + bigend; 413 } 414 415 #undef F 416 #undef T 417 #undef I 418 #undef X 419 420 /* 421 * This table maps each EBCDIC character to an (8-bit extended) ASCII 422 * character, as specified in the rationale for the dd(1) command in 423 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 424 * 425 * Unfortunately it does not seem to correspond exactly to any of the 426 * five variants of EBCDIC documented in IBM's _Enterprise Systems 427 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 428 * Edition, July, 1999, pp. I-1 - I-4. 429 * 430 * Fortunately, though, all versions of EBCDIC, including this one, agree 431 * on most of the printing characters that also appear in (7-bit) ASCII. 432 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 
433 * 434 * Fortunately too, there is general agreement that codes 0x00 through 435 * 0x3F represent control characters, 0x41 a nonbreaking space, and the 436 * remainder printing characters. 437 * 438 * This is sufficient to allow us to identify EBCDIC text and to distinguish 439 * between old-style and internationalized examples of text. 440 */ 441 442 private unsigned char ebcdic_to_ascii[] = { 443 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 444 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 445 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 446 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 447 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 448 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 449 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 450 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 451 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 452 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 453 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 454 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 455 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 456 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 457 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 458 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 459 }; 460 461 #ifdef notdef 462 /* 463 * The following EBCDIC-to-ASCII table may relate more closely to reality, 464 * or at least to modern reality. 
 * It comes from
 *
 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

/*
 * Alternative table, compiled only when `notdef' is defined; from_ebcdic()
 * below uses ebcdic_to_ascii, not this table.
 */
private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif

/*
 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
498 */ 499 private void 500 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 501 { 502 size_t i; 503 504 for (i = 0; i < nbytes; i++) { 505 out[i] = ebcdic_to_ascii[buf[i]]; 506 } 507 } 508