1 /* $NetBSD: encoding.c,v 1.12 2023/08/18 19:00:11 christos Exp $ */
2
3 /*
4 * Copyright (c) Ian F. Darwin 1986-1995.
5 * Software written by Ian F. Darwin and others;
6 * maintained 1995-present by Christos Zoulas and others.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice immediately at the beginning of the file, without modification,
13 * this list of conditions, and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30 /*
31 * Encoding -- determine the character encoding of a text file.
32 *
33 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
34 * international characters.
35 */
36
37 #include "file.h"
38
39 #ifndef lint
40 #if 0
41 FILE_RCSID("@(#)$File: encoding.c,v 1.42 2022/12/26 17:31:14 christos Exp $")
42 #else
43 __RCSID("$NetBSD: encoding.c,v 1.12 2023/08/18 19:00:11 christos Exp $");
44 #endif
45 #endif /* lint */
46
47 #include "magic.h"
48 #include <string.h>
49 #include <stdlib.h>
50
51
52 file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
53 size_t *);
54 file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
55 size_t *);
56 file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
57 size_t *);
58 file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
59 size_t *);
60 file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
61 size_t *);
62 file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
63 size_t *);
64 file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
65 size_t *);
66 file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
67
/*
 * Debug tracing.  DPRINTF((fmt, ...)) hands its parenthesised argument
 * list to printf when DEBUG_ENCODING is defined; otherwise it expands to
 * nothing, so its arguments are not evaluated.
 */
#ifndef DEBUG_ENCODING
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
73
74 /*
75 * Try to determine whether text is in some character code we can
76 * identify. Each of these tests, if it succeeds, will leave
77 * the text converted into one-file_unichar_t-per-character Unicode in
78 * ubuf, and the number of characters converted in ulen.
79 */
80 file_protected int
file_encoding(struct magic_set * ms,const struct buffer * b,file_unichar_t ** ubuf,size_t * ulen,const char ** code,const char ** code_mime,const char ** type)81 file_encoding(struct magic_set *ms, const struct buffer *b,
82 file_unichar_t **ubuf, size_t *ulen, const char **code,
83 const char **code_mime, const char **type)
84 {
85 const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
86 size_t nbytes = b->flen;
87 size_t mlen;
88 int rv = 1, ucs_type;
89 file_unichar_t *udefbuf;
90 size_t udeflen;
91
92 if (ubuf == NULL)
93 ubuf = &udefbuf;
94 if (ulen == NULL)
95 ulen = &udeflen;
96
97 *type = "text";
98 *ulen = 0;
99 *code = "unknown";
100 *code_mime = "binary";
101
102 if (nbytes > ms->encoding_max)
103 nbytes = ms->encoding_max;
104
105 mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
106 *ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
107 if (*ubuf == NULL) {
108 file_oomem(ms, mlen);
109 goto done;
110 }
111 if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
112 if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
113 DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
114 *code = "Unicode text, UTF-7";
115 *code_mime = "utf-7";
116 } else {
117 DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
118 *code = "ASCII";
119 *code_mime = "us-ascii";
120 }
121 } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
122 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
123 *code = "Unicode text, UTF-8 (with BOM)";
124 *code_mime = "utf-8";
125 } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
126 DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
127 *code = "Unicode text, UTF-8";
128 *code_mime = "utf-8";
129 } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
130 if (ucs_type == 1) {
131 *code = "Unicode text, UTF-32, little-endian";
132 *code_mime = "utf-32le";
133 } else {
134 *code = "Unicode text, UTF-32, big-endian";
135 *code_mime = "utf-32be";
136 }
137 DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
138 } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
139 if (ucs_type == 1) {
140 *code = "Unicode text, UTF-16, little-endian";
141 *code_mime = "utf-16le";
142 } else {
143 *code = "Unicode text, UTF-16, big-endian";
144 *code_mime = "utf-16be";
145 }
146 DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
147 } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
148 DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
149 *code = "ISO-8859";
150 *code_mime = "iso-8859-1";
151 } else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
152 DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
153 *code = "Non-ISO extended-ASCII";
154 *code_mime = "unknown-8bit";
155 } else {
156 unsigned char *nbuf;
157
158 mlen = (nbytes + 1) * sizeof(nbuf[0]);
159 if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) {
160 file_oomem(ms, mlen);
161 goto done;
162 }
163 from_ebcdic(buf, nbytes, nbuf);
164
165 if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
166 DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
167 *code = "EBCDIC";
168 *code_mime = "ebcdic";
169 } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
170 DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
171 *ulen));
172 *code = "International EBCDIC";
173 *code_mime = "ebcdic";
174 } else { /* Doesn't look like text at all */
175 DPRINTF(("binary\n"));
176 rv = 0;
177 *type = "binary";
178 }
179 free(nbuf);
180 }
181
182 done:
183 if (ubuf == &udefbuf)
184 free(udefbuf);
185
186 return rv;
187 }
188
189 /*
190 * This table reflects a particular philosophy about what constitutes
191 * "text," and there is room for disagreement about it.
192 *
193 * Version 3.31 of the file command considered a file to be ASCII if
194 * each of its characters was approved by either the isascii() or
195 * isalpha() function. On most systems, this would mean that any
196 * file consisting only of characters in the range 0x00 ... 0x7F
197 * would be called ASCII text, but many systems might reasonably
198 * consider some characters outside this range to be alphabetic,
199 * so the file command would call such characters ASCII. It might
200 * have been more accurate to call this "considered textual on the
201 * local system" than "ASCII."
202 *
203 * It considered a file to be "International language text" if each
204 * of its characters was either an ASCII printing character (according
205 * to the real ASCII standard, not the above test), a character in
206 * the range 0x80 ... 0xFF, or one of the following control characters:
207 * backspace, tab, line feed, vertical tab, form feed, carriage return,
208 * escape. No attempt was made to determine the language in which files
209 * of this type were written.
210 *
211 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab is accepted as well (0x0B is marked T in the table), although it
 * hardly ever seems to be used in real text.  I also
220 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
221 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
222 * character to. It might be more appropriate to include it in the 8859
223 * set instead of the ASCII set, but it's got to be included in *something*
224 * we recognize or EBCDIC files aren't going to be considered textual.
225 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
226 * and Latin characters, so these should possibly be allowed. But they
227 * make a real mess on VT100-style displays if they're not paired properly,
228 * so we are probably better off not calling them text.
229 *
230 * A file is considered to be ISO-8859 text if its characters are all
231 * either ASCII, according to the above definition, or printing characters
232 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
233 *
234 * Finally, a file is considered to be international text from some other
235 * character code if its characters are all either ISO-8859 (according to
236 * the above definition) or characters in the range 0x80 ... 0x9F, which
237 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
238 * consider to be printing characters.
239 */
240
241 #define F 0 /* character never appears in text */
242 #define T 1 /* character appears in plain ASCII text */
243 #define I 2 /* character appears in ISO-8859 text */
244 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
245
246 file_private char text_chars[256] = {
247 /* BEL BS HT LF VT FF CR */
248 F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */
249 /* ESC */
250 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
251 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
252 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
253 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
254 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
255 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
256 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
257 /* NEL */
258 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
259 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
260 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
261 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
262 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
263 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
264 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
265 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
266 };
267
268 #define LOOKS(NAME, COND) \
269 file_private int \
270 looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
271 size_t *ulen) \
272 { \
273 size_t i; \
274 \
275 *ulen = 0; \
276 \
277 for (i = 0; i < nbytes; i++) { \
278 int t = text_chars[buf[i]]; \
279 \
280 if (COND) \
281 return 0; \
282 \
283 ubuf[(*ulen)++] = buf[i]; \
284 } \
285 return 1; \
286 }
287
288 LOOKS(ascii, t != T)
289 LOOKS(latin1, t != T && t != I)
290 LOOKS(extended, t != T && t != I && t != X)
291
292 /*
293 * Decide whether some text looks like UTF-8. Returns:
294 *
295 * -1: invalid UTF-8
296 * 0: uses odd control characters, so doesn't look like text
297 * 1: 7-bit text
298 * 2: definitely UTF-8 text (valid high-bit set bytes)
299 *
300 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
301 * ubuf must be big enough!
302 */
303
// from: https://golang.org/src/unicode/utf8/utf8.go

/*
 * Per-lead-byte information stored in first[].  The high nibble is an index
 * into accept_ranges[]; the low nibble is the sequence length in bytes.
 */
#define XX 0xF1 // invalid: size 1
#define AS 0xF0 // ASCII: size 1
#define S1 0x02 // accept 0, size 2
#define S2 0x13 // accept 1, size 3
#define S3 0x03 // accept 0, size 3
#define S4 0x23 // accept 2, size 3
#define S5 0x34 // accept 3, size 4
#define S6 0x04 // accept 0, size 4
#define S7 0x44 // accept 4, size 4

/* Lowest and highest value of an ordinary continuation byte. */
#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
// The explicit size makes a missing row a compile-time error instead of
// a silently shorter table.
static const uint8_t first[256] = {
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
	AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
	XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
	XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
	S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
	S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
	S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
};

// acceptRanges has size 16 to avoid bounds checks in the code that uses it:
// any high nibble taken from first[] is a valid index.  Entries that are not
// listed are zero-initialised.  The table is read-only and used only in this
// file, hence static const.
static const struct accept_range accept_ranges[16] = {
	{ LOCB, HICB },
	{ 0xA0, HICB },
	{ LOCB, 0x9F },
	{ 0x90, HICB },
	{ LOCB, 0x8F },
};
354
355 file_protected int
file_looks_utf8(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)356 file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
357 size_t *ulen)
358 {
359 size_t i;
360 int n;
361 file_unichar_t c;
362 int gotone = 0, ctrl = 0;
363
364 if (ubuf)
365 *ulen = 0;
366
367 for (i = 0; i < nbytes; i++) {
368 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
369 /*
370 * Even if the whole file is valid UTF-8 sequences,
371 * still reject it if it uses weird control characters.
372 */
373
374 if (text_chars[buf[i]] != T)
375 ctrl = 1;
376
377 if (ubuf)
378 ubuf[(*ulen)++] = buf[i];
379 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
380 return -1;
381 } else { /* 11xxxxxx begins UTF-8 */
382 int following;
383 uint8_t x = first[buf[i]];
384 const struct accept_range *ar =
385 &accept_ranges[(unsigned int)x >> 4];
386 if (x == XX)
387 return -1;
388
389 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */
390 c = buf[i] & 0x1f;
391 following = 1;
392 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */
393 c = buf[i] & 0x0f;
394 following = 2;
395 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */
396 c = buf[i] & 0x07;
397 following = 3;
398 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */
399 c = buf[i] & 0x03;
400 following = 4;
401 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */
402 c = buf[i] & 0x01;
403 following = 5;
404 } else
405 return -1;
406
407 for (n = 0; n < following; n++) {
408 i++;
409 if (i >= nbytes)
410 goto done;
411
412 if (n == 0 &&
413 (buf[i] < ar->lo || buf[i] > ar->hi))
414 return -1;
415
416 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
417 return -1;
418
419 c = (c << 6) + (buf[i] & 0x3f);
420 }
421
422 if (ubuf)
423 ubuf[(*ulen)++] = c;
424 gotone = 1;
425 }
426 }
427 done:
428 return ctrl ? 0 : (gotone ? 2 : 1);
429 }
430
431 /*
432 * Decide whether some text looks like UTF-8 with BOM. If there is no
433 * BOM, return -1; otherwise return the result of looks_utf8 on the
434 * rest of the text.
435 */
436 file_private int
looks_utf8_with_BOM(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)437 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
438 file_unichar_t *ubuf, size_t *ulen)
439 {
440 if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
441 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
442 else
443 return -1;
444 }
445
446 file_private int
looks_utf7(const unsigned char * buf,size_t nbytes,file_unichar_t * ubuf,size_t * ulen)447 looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
448 size_t *ulen)
449 {
450 if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
451 switch (buf[3]) {
452 case '8':
453 case '9':
454 case '+':
455 case '/':
456 if (ubuf)
457 *ulen = 0;
458 return 1;
459 default:
460 return -1;
461 }
462 else
463 return -1;
464 }
465
466 #define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
467 #define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
468 #define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
469
470 file_private int
looks_ucs16(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)471 looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
472 size_t *ulen)
473 {
474 int bigend;
475 uint32_t hi;
476 size_t i;
477
478 if (nbytes < 2)
479 return 0;
480
481 if (bf[0] == 0xff && bf[1] == 0xfe)
482 bigend = 0;
483 else if (bf[0] == 0xfe && bf[1] == 0xff)
484 bigend = 1;
485 else
486 return 0;
487
488 *ulen = 0;
489 hi = 0;
490
491 for (i = 2; i + 1 < nbytes; i += 2) {
492 uint32_t uc;
493
494 if (bigend)
495 uc = CAST(uint32_t,
496 bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));
497 else
498 uc = CAST(uint32_t,
499 bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));
500
501 uc &= 0xffff;
502
503 switch (uc) {
504 case 0xfffe:
505 case 0xffff:
506 return 0;
507 default:
508 if (UCS16_NOCHAR(uc))
509 return 0;
510 break;
511 }
512 if (hi) {
513 if (!UCS16_LOSURR(uc))
514 return 0;
515 uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);
516 hi = 0;
517 }
518 if (uc < 128 && text_chars[CAST(size_t, uc)] != T)
519 return 0;
520 ubf[(*ulen)++] = uc;
521 if (UCS16_HISURR(uc))
522 hi = uc - 0xd800 + 1;
523 if (UCS16_LOSURR(uc))
524 return 0;
525 }
526
527 return 1 + bigend;
528 }
529
530 file_private int
looks_ucs32(const unsigned char * bf,size_t nbytes,file_unichar_t * ubf,size_t * ulen)531 looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
532 size_t *ulen)
533 {
534 int bigend;
535 size_t i;
536
537 if (nbytes < 4)
538 return 0;
539
540 if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
541 bigend = 0;
542 else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
543 bigend = 1;
544 else
545 return 0;
546
547 *ulen = 0;
548
549 for (i = 4; i + 3 < nbytes; i += 4) {
550 /* XXX fix to properly handle chars > 65536 */
551
552 if (bigend)
553 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
554 | (CAST(file_unichar_t, bf[i + 2]) << 8)
555 | (CAST(file_unichar_t, bf[i + 1]) << 16)
556 | (CAST(file_unichar_t, bf[i]) << 24);
557 else
558 ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
559 | (CAST(file_unichar_t, bf[i + 1]) << 8)
560 | (CAST(file_unichar_t, bf[i + 2]) << 16)
561 | (CAST(file_unichar_t, bf[i + 3]) << 24);
562
563 if (ubf[*ulen - 1] == 0xfffe)
564 return 0;
565 if (ubf[*ulen - 1] < 128 &&
566 text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
567 return 0;
568 }
569
570 return 1 + bigend;
571 }
572 #undef F
573 #undef T
574 #undef I
575 #undef X
576
577 /*
578 * This table maps each EBCDIC character to an (8-bit extended) ASCII
579 * character, as specified in the rationale for the dd(1) command in
580 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
581 *
582 * Unfortunately it does not seem to correspond exactly to any of the
583 * five variants of EBCDIC documented in IBM's _Enterprise Systems
584 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
585 * Edition, July, 1999, pp. I-1 - I-4.
586 *
587 * Fortunately, though, all versions of EBCDIC, including this one, agree
588 * on most of the printing characters that also appear in (7-bit) ASCII.
589 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
590 *
591 * Fortunately too, there is general agreement that codes 0x00 through
592 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
593 * remainder printing characters.
594 *
595 * This is sufficient to allow us to identify EBCDIC text and to distinguish
596 * between old-style and internationalized examples of text.
597 */
598
599 file_private unsigned char ebcdic_to_ascii[] = {
600 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
601 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
602 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
603 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
604 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
605 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
606 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
607 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
608 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
609 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
610 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
611 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
612 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
613 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
614 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
615 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
616 };
617
#ifdef notdef
/*
 * Alternative EBCDIC-to-ASCII table, compiled only when notdef is defined.
 * It may relate more closely to reality, or at least to modern reality,
 * than ebcdic_to_ascii[].  It comes from
 *
 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

file_private unsigned char ebcdic_1047_to_8859[] = {
	0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F,	/* 0x00 */
	0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,	/* 0x08 */
	0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87,	/* 0x10 */
	0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,	/* 0x18 */
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B,	/* 0x20 */
	0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,	/* 0x28 */
	0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,	/* 0x30 */
	0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,	/* 0x38 */
	0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5,	/* 0x40 */
	0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,	/* 0x48 */
	0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF,	/* 0x50 */
	0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,	/* 0x58 */
	0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5,	/* 0x60 */
	0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,	/* 0x68 */
	0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF,	/* 0x70 */
	0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,	/* 0x78 */
	0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,	/* 0x80 */
	0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,	/* 0x88 */
	0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,	/* 0x90 */
	0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,	/* 0x98 */
	0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,	/* 0xA0 */
	0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,	/* 0xA8 */
	0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC,	/* 0xB0 */
	0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,	/* 0xB8 */
	0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,	/* 0xC0 */
	0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,	/* 0xC8 */
	0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,	/* 0xD0 */
	0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,	/* 0xD8 */
	0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,	/* 0xE0 */
	0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,	/* 0xE8 */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,	/* 0xF0 */
	0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F	/* 0xF8 */
};
#endif
652
653 /*
654 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
655 */
656 file_private void
from_ebcdic(const unsigned char * buf,size_t nbytes,unsigned char * out)657 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
658 {
659 size_t i;
660
661 for (i = 0; i < nbytes; i++) {
662 out[i] = ebcdic_to_ascii[buf[i]];
663 }
664 }
665