encoding.c - OpenGrok cross reference for /netbsd-src/external/bsd/file/dist/src/encoding.c

Lines Matching +full:no +full:- +full:bf
4  * Copyright (c) Ian F. Darwin 1986-1995.
6  * maintained 1995-present by Christos Zoulas and others.
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
31  * Encoding -- determine the character encoding of a text file.
33  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
77  * the text converted into one-file_unichar_t-per-character Unicode in
85 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);  in file_encoding()
86 	size_t nbytes = b->flen;  in file_encoding()
102 	if (nbytes > ms->encoding_max)  in file_encoding()
103 		nbytes = ms->encoding_max;  in file_encoding()
113 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));  in file_encoding()
114 			*code = "Unicode text, UTF-7";  in file_encoding()
115 			*code_mime = "utf-7";  in file_encoding()
119 			*code_mime = "us-ascii";  in file_encoding()
123 		*code = "Unicode text, UTF-8 (with BOM)";  in file_encoding()
124 		*code_mime = "utf-8";  in file_encoding()
127 		*code = "Unicode text, UTF-8";  in file_encoding()
128 		*code_mime = "utf-8";  in file_encoding()
131 			*code = "Unicode text, UTF-32, little-endian";  in file_encoding()
132 			*code_mime = "utf-32le";  in file_encoding()
134 			*code = "Unicode text, UTF-32, big-endian";  in file_encoding()
135 			*code_mime = "utf-32be";  in file_encoding()
140 			*code = "Unicode text, UTF-16, little-endian";  in file_encoding()
141 			*code_mime = "utf-16le";  in file_encoding()
143 			*code = "Unicode text, UTF-16, big-endian";  in file_encoding()
144 			*code_mime = "utf-16be";  in file_encoding()
149 		*code = "ISO-8859";  in file_encoding()
150 		*code_mime = "iso-8859-1";  in file_encoding()
153 		*code = "Non-ISO extended-ASCII";  in file_encoding()
154 		*code_mime = "unknown-8bit";  in file_encoding()
208  * escape.  No attempt was made to determine the language in which files
220  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
221  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
227  * make a real mess on VT100-style displays if they're not paired properly,
230  * A file is considered to be ISO-8859 text if its characters are all
232  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
235  * character code if its characters are all either ISO-8859 (according to
237  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
243 #define I 2   /* character appears in ISO-8859 text */
244 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
293  * Decide whether some text looks like UTF-8. Returns:
295  *     -1: invalid UTF-8
297  *      1: 7-bit text
298  *      2: definitely UTF-8 text (valid high-bit set bytes)
300  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
319 // first is information about the first byte in a UTF-8 sequence.
322     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
323     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
324     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
325     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
326     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
327     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
328     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
329     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
331     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
332     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
333     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
334     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
335     XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
336     S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
337     S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
338     S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
341 // acceptRange gives the range of valid values for the second byte in a UTF-8
370 			 * Even if the whole file is valid UTF-8 sequences,  in file_looks_utf8()
380 			return -1;  in file_looks_utf8()
381 		} else {			   /* 11xxxxxx begins UTF-8 */  in file_looks_utf8()
387 				return -1;  in file_looks_utf8()
405 				return -1;  in file_looks_utf8()
413 				     (buf[i] < ar->lo || buf[i] > ar->hi))  in file_looks_utf8()
414 					return -1;  in file_looks_utf8()
417 					return -1;  in file_looks_utf8()
432  * Decide whether some text looks like UTF-8 with BOM. If there is no
433  * BOM, return -1; otherwise return the result of looks_utf8 on the
441 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);  in looks_utf8_with_BOM()
443 		return -1;  in looks_utf8_with_BOM()
460 			return -1;  in looks_utf7()
463 		return -1;  in looks_utf7()
471 looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,  in looks_ucs16()  argument
481 	if (bf[0] == 0xff && bf[1] == 0xfe)  in looks_ucs16()
483 	else if (bf[0] == 0xfe && bf[1] == 0xff)  in looks_ucs16()
496 			    bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));  in looks_ucs16()
499 			    bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));  in looks_ucs16()
515 			uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);  in looks_ucs16()
522 			hi = uc - 0xd800 + 1;  in looks_ucs16()
531 looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,  in looks_ucs32()  argument
540 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)  in looks_ucs32()
542 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)  in looks_ucs32()
553 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])  in looks_ucs32()
554 			    | (CAST(file_unichar_t, bf[i + 2]) << 8)  in looks_ucs32()
555 			    | (CAST(file_unichar_t, bf[i + 1]) << 16)  in looks_ucs32()
556 			    | (CAST(file_unichar_t, bf[i]) << 24);  in looks_ucs32()
558 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])  in looks_ucs32()
559 			    | (CAST(file_unichar_t, bf[i + 1]) << 8)   in looks_ucs32()
560 			    | (CAST(file_unichar_t, bf[i + 2]) << 16)  in looks_ucs32()
561 			    | (CAST(file_unichar_t, bf[i + 3]) << 24);  in looks_ucs32()
563 		if (ubf[*ulen - 1] == 0xfffe)  in looks_ucs32()
565 		if (ubf[*ulen - 1] < 128 &&  in looks_ucs32()
566 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)  in looks_ucs32()
578  * This table maps each EBCDIC character to an (8-bit extended) ASCII
584  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
585  * Edition, July, 1999, pp. I-1 - I-4.
588  * on most of the printing characters that also appear in (7-bit) ASCII.
596  * between old-style and internationalized examples of text.
606 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
620  * The following EBCDIC-to-ASCII table may relate more closely to reality,
626  * Unix-derived software on IBM's 390 systems) to the corresponding
627  * characters from ISO 8859-1.
654  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.