xref: /netbsd-src/external/bsd/file/dist/src/encoding.c (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 /*	$NetBSD: encoding.c,v 1.10 2021/04/09 19:11:42 christos Exp $	*/
2 
3 /*
4  * Copyright (c) Ian F. Darwin 1986-1995.
5  * Software written by Ian F. Darwin and others;
6  * maintained 1995-present by Christos Zoulas and others.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice immediately at the beginning of the file, without modification,
13  *    this list of conditions, and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 /*
31  * Encoding -- determine the character encoding of a text file.
32  *
33  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
34  * international characters.
35  */
36 
37 #include "file.h"
38 
39 #ifndef	lint
40 #if 0
41 FILE_RCSID("@(#)$File: encoding.c,v 1.27 2021/02/05 21:33:49 christos Exp $")
42 #else
43 __RCSID("$NetBSD: encoding.c,v 1.10 2021/04/09 19:11:42 christos Exp $");
44 #endif
45 #endif	/* lint */
46 
47 #include "magic.h"
48 #include <string.h>
49 #include <stdlib.h>
50 
51 
52 private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
53     size_t *);
54 private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
55     size_t *);
56 private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
57     size_t *);
58 private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
59     size_t *);
60 private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
61     size_t *);
62 private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
63     size_t *);
64 private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
65     size_t *);
66 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
67 
/*
 * Debug tracing.  When DEBUG_ENCODING is defined, DPRINTF((fmt, ...))
 * expands to a printf call; otherwise it expands to nothing.  The argument
 * must carry its own parentheses because it is pasted directly after the
 * word "printf".
 */
#ifdef DEBUG_ENCODING
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
73 
74 /*
75  * Try to determine whether text is in some character code we can
76  * identify.  Each of these tests, if it succeeds, will leave
77  * the text converted into one-file_unichar_t-per-character Unicode in
78  * ubuf, and the number of characters converted in ulen.
79  */
80 protected int
81 file_encoding(struct magic_set *ms, const struct buffer *b,
82     file_unichar_t **ubuf, size_t *ulen, const char **code,
83     const char **code_mime, const char **type)
84 {
85 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
86 	size_t nbytes = b->flen;
87 	size_t mlen;
88 	int rv = 1, ucs_type;
89 	unsigned char *nbuf = NULL;
90 	file_unichar_t *udefbuf;
91 	size_t udeflen;
92 
93 	if (ubuf == NULL)
94 		ubuf = &udefbuf;
95 	if (ulen == NULL)
96 		ulen = &udeflen;
97 
98 	*type = "text";
99 	*ulen = 0;
100 	*code = "unknown";
101 	*code_mime = "binary";
102 
103 	if (nbytes > ms->encoding_max)
104 		nbytes = ms->encoding_max;
105 
106 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
107 	*ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
108 	if (*ubuf == NULL) {
109 		file_oomem(ms, mlen);
110 		goto done;
111 	}
112 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
113 	if ((nbuf = CAST(unsigned char *,
114 	    calloc(CAST(size_t, 1), mlen))) == NULL) {
115 		file_oomem(ms, mlen);
116 		goto done;
117 	}
118 
119 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
120 		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
121 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
122 			*code = "Unicode text, UTF-7";
123 			*code_mime = "utf-7";
124 		} else {
125 			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
126 			*code = "ASCII";
127 			*code_mime = "us-ascii";
128 		}
129 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
130 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
131 		*code = "Unicode text, UTF-8 (with BOM)";
132 		*code_mime = "utf-8";
133 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
134 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
135 		*code = "Unicode text, UTF-8";
136 		*code_mime = "utf-8";
137 	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
138 		if (ucs_type == 1) {
139 			*code = "Unicode text, UTF-32, little-endian";
140 			*code_mime = "utf-32le";
141 		} else {
142 			*code = "Unicode text, UTF-32, big-endian";
143 			*code_mime = "utf-32be";
144 		}
145 		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
146 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
147 		if (ucs_type == 1) {
148 			*code = "Unicode text, UTF-16, little-endian";
149 			*code_mime = "utf-16le";
150 		} else {
151 			*code = "Unicode text, UTF-16, big-endian";
152 			*code_mime = "utf-16be";
153 		}
154 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
155 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
156 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
157 		*code = "ISO-8859";
158 		*code_mime = "iso-8859-1";
159 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
160 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
161 		*code = "Non-ISO extended-ASCII";
162 		*code_mime = "unknown-8bit";
163 	} else {
164 		from_ebcdic(buf, nbytes, nbuf);
165 
166 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
167 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
168 			*code = "EBCDIC";
169 			*code_mime = "ebcdic";
170 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
171 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
172 			    *ulen));
173 			*code = "International EBCDIC";
174 			*code_mime = "ebcdic";
175 		} else { /* Doesn't look like text at all */
176 			DPRINTF(("binary\n"));
177 			rv = 0;
178 			*type = "binary";
179 		}
180 	}
181 
182  done:
183 	free(nbuf);
184 	if (ubuf == &udefbuf)
185 		free(udefbuf);
186 
187 	return rv;
188 }
189 
190 /*
191  * This table reflects a particular philosophy about what constitutes
192  * "text," and there is room for disagreement about it.
193  *
194  * Version 3.31 of the file command considered a file to be ASCII if
195  * each of its characters was approved by either the isascii() or
196  * isalpha() function.  On most systems, this would mean that any
197  * file consisting only of characters in the range 0x00 ... 0x7F
198  * would be called ASCII text, but many systems might reasonably
199  * consider some characters outside this range to be alphabetic,
200  * so the file command would call such characters ASCII.  It might
201  * have been more accurate to call this "considered textual on the
202  * local system" than "ASCII."
203  *
204  * It considered a file to be "International language text" if each
205  * of its characters was either an ASCII printing character (according
206  * to the real ASCII standard, not the above test), a character in
207  * the range 0x80 ... 0xFF, or one of the following control characters:
208  * backspace, tab, line feed, vertical tab, form feed, carriage return,
209  * escape.  No attempt was made to determine the language in which files
210  * of this type were written.
211  *
212  *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab was originally excluded because it never seems to be used in real
 * text, but the table below now marks it as text.  I also
221  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
222  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
223  * character to.  It might be more appropriate to include it in the 8859
224  * set instead of the ASCII set, but it's got to be included in *something*
225  * we recognize or EBCDIC files aren't going to be considered textual.
226  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
227  * and Latin characters, so these should possibly be allowed.  But they
228  * make a real mess on VT100-style displays if they're not paired properly,
229  * so we are probably better off not calling them text.
230  *
231  * A file is considered to be ISO-8859 text if its characters are all
232  * either ASCII, according to the above definition, or printing characters
233  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
234  *
235  * Finally, a file is considered to be international text from some other
236  * character code if its characters are all either ISO-8859 (according to
237  * the above definition) or characters in the range 0x80 ... 0x9F, which
238  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
239  * consider to be printing characters.
240  */
241 
242 #define F 0   /* character never appears in text */
243 #define T 1   /* character appears in plain ASCII text */
244 #define I 2   /* character appears in ISO-8859 text */
245 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
246 
247 private char text_chars[256] = {
248 	/*                  BEL BS HT LF VT FF CR    */
249 	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
250 	/*                              ESC          */
251 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
252 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
253 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
254 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
255 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
256 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
257 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
258 	/*            NEL                            */
259 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
260 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
261 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
262 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
263 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
264 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
265 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
266 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
267 };
268 
269 #define LOOKS(NAME, COND) \
270 private int \
271 looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
272     size_t *ulen) \
273 { \
274 	size_t i, u; \
275 	unsigned char dist[256]; \
276 	memset(dist, 0, sizeof(dist)); \
277 \
278 	*ulen = 0; \
279 \
280 	for (i = 0; i < nbytes; i++) { \
281 		int t = text_chars[buf[i]]; \
282 \
283 		if (COND) \
284 			return 0; \
285 \
286 		ubuf[(*ulen)++] = buf[i]; \
287 		dist[buf[i]]++; \
288 	} \
289 	u = 0; \
290 	for (i = 0; i < __arraycount(dist); i++) { \
291 		if (dist[i]) \
292 			u++; \
293 	} \
294 	if (u < 3) \
295 		return 0; \
296 \
297 	return 1; \
298 }
299 
300 LOOKS(ascii, t != T)
301 LOOKS(latin1, t != T && t != I)
302 LOOKS(extended, t != T && t != I && t != X)
303 
304 /*
305  * Decide whether some text looks like UTF-8. Returns:
306  *
307  *     -1: invalid UTF-8
308  *      0: uses odd control characters, so doesn't look like text
309  *      1: 7-bit text
310  *      2: definitely UTF-8 text (valid high-bit set bytes)
311  *
312  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
313  * ubuf must be big enough!
314  */
315 
316 // from: https://golang.org/src/unicode/utf8/utf8.go
317 
// Encoding of the entries of first[]: the low nibble is the length in
// bytes of the sequence the lead byte starts, and the high nibble is the
// index into accept_ranges[] that constrains the second byte.
#define	XX 0xF1 // invalid: size 1
#define	AS 0xF0 // ASCII: size 1
#define	S1 0x02 // accept 0, size 2
#define	S2 0x13 // accept 1, size 3
#define	S3 0x03 // accept 0, size 3
#define	S4 0x23 // accept 2, size 3
#define	S5 0x34 // accept 3, size 4
#define	S6 0x04 // accept 0, size 4
#define	S7 0x44 // accept 4, size 4

// Default lower and upper bound of a UTF-8 continuation byte.
#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
// It has one entry for every possible byte value.
static const uint8_t first[256] = {
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};

// accept_ranges gives the range of valid values for the second byte in a
// UTF-8 sequence.  It has 16 entries so that any high nibble taken from
// first[] is a valid index without a bounds check; the entries past the
// five listed here are zero-filled.  The table is read-only and private to
// this file, hence static const.
static const struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
} accept_ranges[16] = {
	{ LOCB, HICB },
	{ 0xA0, HICB },
	{ LOCB, 0x9F },
	{ 0x90, HICB },
	{ LOCB, 0x8F },
};
366 
367 protected int
368 file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
369     size_t *ulen)
370 {
371 	size_t i;
372 	int n;
373 	file_unichar_t c;
374 	int gotone = 0, ctrl = 0;
375 
376 	if (ubuf)
377 		*ulen = 0;
378 
379 	for (i = 0; i < nbytes; i++) {
380 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
381 			/*
382 			 * Even if the whole file is valid UTF-8 sequences,
383 			 * still reject it if it uses weird control characters.
384 			 */
385 
386 			if (text_chars[buf[i]] != T)
387 				ctrl = 1;
388 
389 			if (ubuf)
390 				ubuf[(*ulen)++] = buf[i];
391 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
392 			return -1;
393 		} else {			   /* 11xxxxxx begins UTF-8 */
394 			int following;
395 			uint8_t x = first[buf[i]];
396 			const struct accept_range *ar =
397 			    &accept_ranges[(unsigned int)x >> 4];
398 			if (x == XX)
399 				return -1;
400 
401 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
402 				c = buf[i] & 0x1f;
403 				following = 1;
404 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
405 				c = buf[i] & 0x0f;
406 				following = 2;
407 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
408 				c = buf[i] & 0x07;
409 				following = 3;
410 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
411 				c = buf[i] & 0x03;
412 				following = 4;
413 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
414 				c = buf[i] & 0x01;
415 				following = 5;
416 			} else
417 				return -1;
418 
419 			for (n = 0; n < following; n++) {
420 				i++;
421 				if (i >= nbytes)
422 					goto done;
423 
424 				if (n == 0 &&
425 				     (buf[i] < ar->lo || buf[i] > ar->hi))
426 					return -1;
427 
428 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
429 					return -1;
430 
431 				c = (c << 6) + (buf[i] & 0x3f);
432 			}
433 
434 			if (ubuf)
435 				ubuf[(*ulen)++] = c;
436 			gotone = 1;
437 		}
438 	}
439 done:
440 	return ctrl ? 0 : (gotone ? 2 : 1);
441 }
442 
443 /*
444  * Decide whether some text looks like UTF-8 with BOM. If there is no
445  * BOM, return -1; otherwise return the result of looks_utf8 on the
446  * rest of the text.
447  */
448 private int
449 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
450     file_unichar_t *ubuf, size_t *ulen)
451 {
452 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
453 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
454 	else
455 		return -1;
456 }
457 
458 private int
459 looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
460     size_t *ulen)
461 {
462 	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
463 		switch (buf[3]) {
464 		case '8':
465 		case '9':
466 		case '+':
467 		case '/':
468 			if (ubuf)
469 				*ulen = 0;
470 			return 1;
471 		default:
472 			return -1;
473 		}
474 	else
475 		return -1;
476 }
477 
478 private int
479 looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
480     size_t *ulen)
481 {
482 	int bigend;
483 	size_t i;
484 
485 	if (nbytes < 2)
486 		return 0;
487 
488 	if (bf[0] == 0xff && bf[1] == 0xfe)
489 		bigend = 0;
490 	else if (bf[0] == 0xfe && bf[1] == 0xff)
491 		bigend = 1;
492 	else
493 		return 0;
494 
495 	*ulen = 0;
496 
497 	for (i = 2; i + 1 < nbytes; i += 2) {
498 		/* XXX fix to properly handle chars > 65536 */
499 
500 		if (bigend)
501 			ubf[(*ulen)++] = bf[i + 1]
502 			    | (CAST(file_unichar_t, bf[i]) << 8);
503 		else
504 			ubf[(*ulen)++] = bf[i]
505 			    | (CAST(file_unichar_t, bf[i + 1]) << 8);
506 
507 		if (ubf[*ulen - 1] == 0xfffe)
508 			return 0;
509 		if (ubf[*ulen - 1] < 128 &&
510 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
511 			return 0;
512 	}
513 
514 	return 1 + bigend;
515 }
516 
517 private int
518 looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
519     size_t *ulen)
520 {
521 	int bigend;
522 	size_t i;
523 
524 	if (nbytes < 4)
525 		return 0;
526 
527 	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
528 		bigend = 0;
529 	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
530 		bigend = 1;
531 	else
532 		return 0;
533 
534 	*ulen = 0;
535 
536 	for (i = 4; i + 3 < nbytes; i += 4) {
537 		/* XXX fix to properly handle chars > 65536 */
538 
539 		if (bigend)
540 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
541 			    | (CAST(file_unichar_t, bf[i + 2]) << 8)
542 			    | (CAST(file_unichar_t, bf[i + 1]) << 16)
543 			    | (CAST(file_unichar_t, bf[i]) << 24);
544 		else
545 			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
546 			    | (CAST(file_unichar_t, bf[i + 1]) << 8)
547 			    | (CAST(file_unichar_t, bf[i + 2]) << 16)
548 			    | (CAST(file_unichar_t, bf[i + 3]) << 24);
549 
550 		if (ubf[*ulen - 1] == 0xfffe)
551 			return 0;
552 		if (ubf[*ulen - 1] < 128 &&
553 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
554 			return 0;
555 	}
556 
557 	return 1 + bigend;
558 }
559 #undef F
560 #undef T
561 #undef I
562 #undef X
563 
564 /*
565  * This table maps each EBCDIC character to an (8-bit extended) ASCII
566  * character, as specified in the rationale for the dd(1) command in
567  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
568  *
569  * Unfortunately it does not seem to correspond exactly to any of the
570  * five variants of EBCDIC documented in IBM's _Enterprise Systems
571  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
572  * Edition, July, 1999, pp. I-1 - I-4.
573  *
574  * Fortunately, though, all versions of EBCDIC, including this one, agree
575  * on most of the printing characters that also appear in (7-bit) ASCII.
576  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
577  *
578  * Fortunately too, there is general agreement that codes 0x00 through
579  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
580  * remainder printing characters.
581  *
582  * This is sufficient to allow us to identify EBCDIC text and to distinguish
583  * between old-style and internationalized examples of text.
584  */
585 
586 private unsigned char ebcdic_to_ascii[] = {
587   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
588  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
589 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
590 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
591 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
592 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
593 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
594 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
595 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
596 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
597 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
598 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
599 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
600 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
601 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
602 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
603 };
604 
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * The table is compiled only when "notdef" is defined and nothing in the
 * code shown here refers to it; it is kept for reference.
 */

private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
639 
640 /*
641  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
642  */
643 private void
644 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
645 {
646 	size_t i;
647 
648 	for (i = 0; i < nbytes; i++) {
649 		out[i] = ebcdic_to_ascii[buf[i]];
650 	}
651 }
652