Lines Matching +full:no +full:- +full:bf
4 * Copyright (c) Ian F. Darwin 1986-1995.
6 * maintained 1995-present by Christos Zoulas and others.
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
31 * Encoding -- determine the character encoding of a text file.
33 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
77 * the text converted into one-file_unichar_t-per-character Unicode in
85 const unsigned char *buf = CAST(const unsigned char *, b->fbuf); in file_encoding()
86 size_t nbytes = b->flen; in file_encoding()
102 if (nbytes > ms->encoding_max) in file_encoding()
103 nbytes = ms->encoding_max; in file_encoding()
113 DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); in file_encoding()
114 *code = "Unicode text, UTF-7"; in file_encoding()
115 *code_mime = "utf-7"; in file_encoding()
119 *code_mime = "us-ascii"; in file_encoding()
123 *code = "Unicode text, UTF-8 (with BOM)"; in file_encoding()
124 *code_mime = "utf-8"; in file_encoding()
127 *code = "Unicode text, UTF-8"; in file_encoding()
128 *code_mime = "utf-8"; in file_encoding()
131 *code = "Unicode text, UTF-32, little-endian"; in file_encoding()
132 *code_mime = "utf-32le"; in file_encoding()
134 *code = "Unicode text, UTF-32, big-endian"; in file_encoding()
135 *code_mime = "utf-32be"; in file_encoding()
140 *code = "Unicode text, UTF-16, little-endian"; in file_encoding()
141 *code_mime = "utf-16le"; in file_encoding()
143 *code = "Unicode text, UTF-16, big-endian"; in file_encoding()
144 *code_mime = "utf-16be"; in file_encoding()
149 *code = "ISO-8859"; in file_encoding()
150 *code_mime = "iso-8859-1"; in file_encoding()
153 *code = "Non-ISO extended-ASCII"; in file_encoding()
154 *code_mime = "unknown-8bit"; in file_encoding()
208 * escape. No attempt was made to determine the language in which files
220 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
221 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
227 * make a real mess on VT100-style displays if they're not paired properly,
230 * A file is considered to be ISO-8859 text if its characters are all
232 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
235 * character code if its characters are all either ISO-8859 (according to
237 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
243 #define I 2 /* character appears in ISO-8859 text */
244 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
293 * Decide whether some text looks like UTF-8. Returns:
295 * -1: invalid UTF-8
297 * 1: 7-bit text
298 * 2: definitely UTF-8 text (valid high-bit set bytes)
300 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
319 // first is information about the first byte in a UTF-8 sequence.
322 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
323 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
324 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
325 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
326 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
327 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
328 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
329 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
331 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
332 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
333 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
334 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
335 XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
336 S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
337 S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
338 S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
341 // acceptRange gives the range of valid values for the second byte in a UTF-8
370 * Even if the whole file is valid UTF-8 sequences, in file_looks_utf8()
380 return -1; in file_looks_utf8()
381 } else { /* 11xxxxxx begins UTF-8 */ in file_looks_utf8()
387 return -1; in file_looks_utf8()
405 return -1; in file_looks_utf8()
413 (buf[i] < ar->lo || buf[i] > ar->hi)) in file_looks_utf8()
414 return -1; in file_looks_utf8()
417 return -1; in file_looks_utf8()
432 * Decide whether some text looks like UTF-8 with BOM. If there is no
433 * BOM, return -1; otherwise return the result of looks_utf8 on the
441 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); in looks_utf8_with_BOM()
443 return -1; in looks_utf8_with_BOM()
460 return -1; in looks_utf7()
463 return -1; in looks_utf7()
471 looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, in looks_ucs16() argument
481 if (bf[0] == 0xff && bf[1] == 0xfe) in looks_ucs16()
483 else if (bf[0] == 0xfe && bf[1] == 0xff) in looks_ucs16()
496 bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8)); in looks_ucs16()
499 bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8)); in looks_ucs16()
515 uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00); in looks_ucs16()
522 hi = uc - 0xd800 + 1; in looks_ucs16()
531 looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf, in looks_ucs32() argument
540 if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) in looks_ucs32()
542 else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) in looks_ucs32()
553 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3]) in looks_ucs32()
554 | (CAST(file_unichar_t, bf[i + 2]) << 8) in looks_ucs32()
555 | (CAST(file_unichar_t, bf[i + 1]) << 16) in looks_ucs32()
556 | (CAST(file_unichar_t, bf[i]) << 24); in looks_ucs32()
558 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0]) in looks_ucs32()
559 | (CAST(file_unichar_t, bf[i + 1]) << 8) in looks_ucs32()
560 | (CAST(file_unichar_t, bf[i + 2]) << 16) in looks_ucs32()
561 | (CAST(file_unichar_t, bf[i + 3]) << 24); in looks_ucs32()
563 if (ubf[*ulen - 1] == 0xfffe) in looks_ucs32()
565 if (ubf[*ulen - 1] < 128 && in looks_ucs32()
566 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) in looks_ucs32()
578 * This table maps each EBCDIC character to an (8-bit extended) ASCII
584 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
585 * Edition, July, 1999, pp. I-1 - I-4.
588 * on most of the printing characters that also appear in (7-bit) ASCII.
596 * between old-style and internationalized examples of text.
606 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
620 * The following EBCDIC-to-ASCII table may relate more closely to reality,
626 * Unix-derived software on IBM's 390 systems) to the corresponding
627 * characters from ISO 8859-1.
654 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.