1 #ifdef PLAN9 2 #include <u.h> 3 #include <libc.h> 4 #include <bio.h> 5 #else 6 #include <sys/types.h> 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <string.h> 10 #include <unistd.h> 11 #include <errno.h> 12 #include "plan9.h" 13 #endif 14 #include "hdr.h" 15 16 /* 17 the our_* routines are implementations for the corresponding library 18 routines. for a while, i tried to actually name them wctomb etc 19 but stopped that after i found a system which made wchar_t an 20 unsigned char. 21 */ 22 23 #ifdef PLAN9 24 long getrune(Biobuf *); 25 long getisorune(Biobuf *); 26 #else 27 long getrune(FILE *); 28 long getisorune(FILE *); 29 #endif 30 int our_wctomb(char *s, unsigned long wc); 31 int our_mbtowc(unsigned long *p, char *s, unsigned n); 32 int runetoisoutf(char *str, Rune *rune); 33 int fullisorune(char *str, int n); 34 int isochartorune(Rune *rune, char *str); 35 36 void 37 utf_in(int fd, long *notused, struct convert *out) 38 { 39 #ifndef PLAN9 40 FILE *fp; 41 #else /* PLAN9 */ 42 Biobuf b; 43 #endif /* PLAN9 */ 44 Rune *r; 45 long l; 46 47 USED(notused); 48 #ifndef PLAN9 49 if((fp = fdopen(fd, "r")) == NULL){ 50 EPR "%s: input setup error: %s\n", argv0, strerror(errno)); 51 #else /* PLAN9 */ 52 if(Binit(&b, fd, OREAD) < 0){ 53 EPR "%s: input setup error: %r\n", argv0); 54 #endif /* PLAN9 */ 55 EXIT(1, "input error"); 56 } 57 r = runes; 58 for(;;) 59 #ifndef PLAN9 60 switch(l = getrune(fp)) 61 #else /* PLAN9 */ 62 switch(l = getrune(&b)) 63 #endif /* PLAN9 */ 64 { 65 case -1: 66 goto done; 67 case -2: 68 if(squawk) 69 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput); 70 if(clean) 71 continue; 72 nerrors++; 73 l = Runeerror; 74 default: 75 *r++ = l; 76 if(r >= &runes[N]){ 77 OUT(out, runes, r-runes); 78 r = runes; 79 } 80 } 81 done: 82 if(r > runes) 83 OUT(out, runes, r-runes); 84 } 85 86 void 87 utf_out(Rune *base, int n, long *notused) 88 { 89 char *p; 90 Rune *r; 91 92 USED(notused); 93 nrunes += n; 94 for(r = base, p = obuf; n-- > 0; r++){ 95 p += our_wctomb(p, *r); 96 } 97 noutput += p-obuf; 98 write(1, obuf, p-obuf); 99 } 100 101 void 102 isoutf_in(int fd, long *notused, struct convert *out) 103 { 104 #ifndef PLAN9 105 FILE *fp; 106 #else /* PLAN9 */ 107 Biobuf b; 108 #endif /* PLAN9 */ 109 Rune *r; 110 long l; 111 112 USED(notused); 113 #ifndef PLAN9 114 if((fp = fdopen(fd, "r")) == 0){ 115 EPR "%s: input setup error: %s\n", argv0, strerror(errno)); 116 #else /* PLAN9 */ 117 if(Binit(&b, fd, OREAD) < 0){ 118 EPR "%s: input setup error: %r\n", argv0); 119 #endif /* PLAN9 */ 120 EXIT(1, "input error"); 121 } 122 r = runes; 123 for(;;) 124 #ifndef PLAN9 125 switch(l = getisorune(fp)) 126 #else /* PLAN9 */ 127 switch(l = getisorune(&b)) 128 #endif /* PLAN9 */ 129 { 130 case -1: 131 goto done; 132 case -2: 133 if(squawk) 134 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput); 135 if(clean) 136 continue; 137 nerrors++; 138 l = Runeerror; 139 default: 140 *r++ = l; 141 if(r >= &runes[N]){ 142 OUT(out, runes, r-runes); 143 r = runes; 144 } 145 } 146 done: 147 if(r > runes) 148 OUT(out, runes, r-runes); 149 } 150 151 void 152 isoutf_out(Rune *base, int n, long *notused) 153 { 154 char *p; 155 Rune *r; 156 157 USED(notused); 158 nrunes += n; 159 for(r = base, p = obuf; n-- > 0; r++) 160 p += runetoisoutf(p, r); 161 noutput += p-obuf; 162 write(1, obuf, p-obuf); 163 } 164 165 long 166 #ifndef PLAN9 167 getrune(FILE *fp) 168 #else /* PLAN9 */ 169 getrune(Biobuf *bp) 170 #endif /* PLAN9 */ 171 { 172 int c, i; 173 char str[UTFmax]; /* MB_LEN_MAX really */ 174 unsigned long l; 175 int n; 176 177 for(i = 0;;){ 178 #ifndef PLAN9 179 c = getc(fp); 180 #else /* PLAN9 */ 181 c = Bgetc(bp); 182 #endif /* PLAN9 */ 183 if(c < 0) 184 return(c); 185 ninput++; 186 str[i++] = c; 187 n = our_mbtowc(&l, str, i); 188 if(n == -1) 189 return(-2); 190 if(n > 0) 191 return(l); 192 } 193 } 194 195 long 196 #ifndef PLAN9 197 getisorune(FILE *fp) 198 #else /* PLAN9 */ 199 getisorune(Biobuf *bp) 200 #endif /* PLAN9 */ 201 { 202 int c, i; 203 Rune rune; 204 char str[UTFmax]; /* MB_LEN_MAX really */ 205 206 for(i = 0;;){ 207 #ifndef PLAN9 208 c = getc(fp); 209 #else /* PLAN9 */ 210 c = Bgetc(bp); 211 #endif /* PLAN9 */ 212 if(c < 0) 213 return(c); 214 ninput++; 215 str[i++] = c; 216 if(fullisorune(str, i)) 217 break; 218 } 219 isochartorune(&rune, str); 220 if(rune == Runeerror) 221 return -2; 222 return(rune); 223 } 224 225 enum 226 { 227 Char1 = Runeself, Rune1 = Runeself, 228 Char21 = 0xA1, Rune21 = 0x0100, 229 Char22 = 0xF6, Rune22 = 0x4016, 230 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */ 231 Esc = 0xBE, Bad = Runeerror 232 }; 233 234 static uchar U[256]; 235 static uchar T[256]; 236 237 static 238 void 239 mktable(void) 240 { 241 int i, u; 242 243 for(i=0; i<256; i++) { 244 u = i + (0x5E - 0xA0); 245 if(i < 0xA0) 246 u = i + (0xDF - 0x7F); 247 if(i < 0x7F) 248 u = i + (0x00 - 0x21); 249 if(i < 0x21) 250 u = i + (0xBE - 0x00); 251 U[i] = u; 252 T[u] = i; 253 } 254 } 255 256 int 257 isochartorune(Rune *rune, char *str) 258 { 259 int c, c1, c2; 260 long l; 261 262 if(U[0] == 0) 263 mktable(); 264 265 /* 266 * one character sequence 267 * 00000-0009F => 00-9F 268 */ 269 c = *(uchar*)str; 270 if(c < Char1) { 271 *rune = c; 272 return 1; 273 } 274 275 /* 276 * two character sequence 277 * 000A0-000FF => A0; A0-FF 278 */ 279 c1 = *(uchar*)(str+1); 280 if(c < Char21) { 281 if(c1 >= Rune1 && c1 < Rune21) { 282 *rune = c1; 283 return 2; 284 } 285 goto bad; 286 } 287 288 /* 289 * two character sequence 290 * 00100-04015 => A1-F5; 21-7E/A0-FF 291 */ 292 c1 = U[c1]; 293 if(c1 >= Esc) 294 goto bad; 295 if(c < Char22) { 296 *rune = (c-Char21)*Esc + c1 + Rune21; 297 return 2; 298 } 299 300 /* 301 * three character sequence 302 * 04016-38E2D => A6-FB; 21-7E/A0-FF 303 */ 304 c2 = U[*(uchar*)(str+2)]; 305 if(c2 >= Esc) 306 goto bad; 307 if(c < Char3) { 308 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22; 309 if(l >= Rune3) 310 goto bad; 311 *rune = l; 312 return 3; 313 } 314 315 /* 316 * bad decoding 317 */ 318 bad: 319 *rune = Bad; 320 return 1; 321 } 322 323 int 324 runetoisoutf(char *str, Rune *rune) 325 { 326 long c; 327 328 if(T[0] == 0) 329 mktable(); 330 331 /* 332 * one character sequence 333 * 00000-0009F => 00-9F 334 */ 335 c = *rune; 336 if(c < Rune1) { 337 str[0] = c; 338 return 1; 339 } 340 341 /* 342 * two character sequence 343 * 000A0-000FF => A0; A0-FF 344 */ 345 if(c < Rune21) { 346 str[0] = Char1; 347 str[1] = c; 348 return 2; 349 } 350 351 /* 352 * two character sequence 353 * 00100-04015 => A1-F5; 21-7E/A0-FF 354 */ 355 if(c < Rune22) { 356 c -= Rune21; 357 str[0] = c/Esc + Char21; 358 str[1] = T[c%Esc]; 359 return 2; 360 } 361 362 /* 363 * three character sequence 364 * 04016-38E2D => A6-FB; 21-7E/A0-FF 365 */ 366 c -= Rune22; 367 str[0] = c/(Esc*Esc) + Char22; 368 str[1] = T[c/Esc%Esc]; 369 str[2] = T[c%Esc]; 370 return 3; 371 } 372 373 int 374 fullisorune(char *str, int n) 375 { 376 int c; 377 378 if(n > 0) { 379 c = *(uchar*)str; 380 if(c < Char1) 381 return 1; 382 if(n > 1) 383 if(c < Char22 || n > 2) 384 return 1; 385 } 386 return 0; 387 } 388 389 #ifdef PLAN9 390 int errno; 391 #endif 392 393 enum 394 { 395 T1 = 0x00, 396 Tx = 0x80, 397 T2 = 0xC0, 398 T3 = 0xE0, 399 T4 = 0xF0, 400 T5 = 0xF8, 401 T6 = 0xFC, 402 403 Bit1 = 7, 404 Bitx = 6, 405 Bit2 = 5, 406 Bit3 = 4, 407 Bit4 = 3, 408 Bit5 = 2, 409 Bit6 = 2, 410 411 Mask1 = (1<<Bit1)-1, 412 Maskx = (1<<Bitx)-1, 413 Mask2 = (1<<Bit2)-1, 414 Mask3 = (1<<Bit3)-1, 415 Mask4 = (1<<Bit4)-1, 416 Mask5 = (1<<Bit5)-1, 417 Mask6 = (1<<Bit6)-1, 418 419 Wchar1 = (1UL<<Bit1)-1, 420 Wchar2 = (1UL<<(Bit2+Bitx))-1, 421 Wchar3 = (1UL<<(Bit3+2*Bitx))-1, 422 Wchar4 = (1UL<<(Bit4+3*Bitx))-1, 423 Wchar5 = (1UL<<(Bit5+4*Bitx))-1 424 425 #ifndef EILSEQ 426 , /* we hate ansi c's comma rules */ 427 EILSEQ = 123 428 #endif /* PLAN9 */ 429 }; 430 431 int 432 our_wctomb(char *s, unsigned long wc) 433 { 434 if(s == 0) 435 return 0; /* no shift states */ 436 if(wc & ~Wchar2) { 437 if(wc & ~Wchar4) { 438 if(wc & ~Wchar5) { 439 /* 6 bytes */ 440 s[0] = T6 | ((wc >> 5*Bitx) & Mask6); 441 s[1] = Tx | ((wc >> 4*Bitx) & Maskx); 442 s[2] = Tx | ((wc >> 3*Bitx) & Maskx); 443 s[3] = Tx | ((wc >> 2*Bitx) & Maskx); 444 s[4] = Tx | ((wc >> 1*Bitx) & Maskx); 445 s[5] = Tx | (wc & Maskx); 446 return 6; 447 } 448 /* 5 bytes */ 449 s[0] = T5 | (wc >> 4*Bitx); 450 s[1] = Tx | ((wc >> 3*Bitx) & Maskx); 451 s[2] = Tx | ((wc >> 2*Bitx) & Maskx); 452 s[3] = Tx | ((wc >> 1*Bitx) & Maskx); 453 s[4] = Tx | (wc & Maskx); 454 return 5; 455 } 456 if(wc & ~Wchar3) { 457 /* 4 bytes */ 458 s[0] = T4 | (wc >> 3*Bitx); 459 s[1] = Tx | ((wc >> 2*Bitx) & Maskx); 460 s[2] = Tx | ((wc >> 1*Bitx) & Maskx); 461 s[3] = Tx | (wc & Maskx); 462 return 4; 463 } 464 /* 3 bytes */ 465 s[0] = T3 | (wc >> 2*Bitx); 466 s[1] = Tx | ((wc >> 1*Bitx) & Maskx); 467 s[2] = Tx | (wc & Maskx); 468 return 3; 469 } 470 if(wc & ~Wchar1) { 471 /* 2 bytes */ 472 s[0] = T2 | (wc >> 1*Bitx); 473 s[1] = Tx | (wc & Maskx); 474 return 2; 475 } 476 /* 1 byte */ 477 s[0] = T1 | wc; 478 return 1; 479 } 480 481 int 482 our_mbtowc(unsigned long *p, char *s, unsigned n) 483 { 484 uchar *us; 485 int c0, c1, c2, c3, c4, c5; 486 unsigned long wc; 487 488 if(s == 0) 489 return 0; /* no shift states */ 490 491 if(n < 1) 492 goto badlen; 493 us = (uchar*)s; 494 c0 = us[0]; 495 if(c0 >= T3) { 496 if(n < 3) 497 goto badlen; 498 c1 = us[1] ^ Tx; 499 c2 = us[2] ^ Tx; 500 if((c1|c2) & T2) 501 goto bad; 502 if(c0 >= T5) { 503 if(n < 5) 504 goto badlen; 505 c3 = us[3] ^ Tx; 506 c4 = us[4] ^ Tx; 507 if((c3|c4) & T2) 508 goto bad; 509 if(c0 >= T6) { 510 /* 6 bytes */ 511 if(n < 6) 512 goto badlen; 513 c5 = us[5] ^ Tx; 514 if(c5 & T2) 515 goto bad; 516 wc = ((((((((((c0 & Mask6) << Bitx) | 517 c1) << Bitx) | c2) << Bitx) | 518 c3) << Bitx) | c4) << Bitx) | c5; 519 if(wc <= Wchar5) 520 goto bad; 521 *p = wc; 522 return 6; 523 } 524 /* 5 bytes */ 525 wc = ((((((((c0 & Mask5) << Bitx) | 526 c1) << Bitx) | c2) << Bitx) | 527 c3) << Bitx) | c4; 528 if(wc <= Wchar4) 529 goto bad; 530 *p = wc; 531 return 5; 532 } 533 if(c0 >= T4) { 534 /* 4 bytes */ 535 if(n < 4) 536 goto badlen; 537 c3 = us[3] ^ Tx; 538 if(c3 & T2) 539 goto bad; 540 wc = ((((((c0 & Mask4) << Bitx) | 541 c1) << Bitx) | c2) << Bitx) | 542 c3; 543 if(wc <= Wchar3) 544 goto bad; 545 *p = wc; 546 return 4; 547 } 548 /* 3 bytes */ 549 wc = ((((c0 & Mask3) << Bitx) | 550 c1) << Bitx) | c2; 551 if(wc <= Wchar2) 552 goto bad; 553 *p = wc; 554 return 3; 555 } 556 if(c0 >= T2) { 557 /* 2 bytes */ 558 if(n < 2) 559 goto badlen; 560 c1 = us[1] ^ Tx; 561 if(c1 & T2) 562 goto bad; 563 wc = ((c0 & Mask2) << Bitx) | 564 c1; 565 if(wc <= Wchar1) 566 goto bad; 567 *p = wc; 568 return 2; 569 } 570 /* 1 byte */ 571 if(c0 >= Tx) 572 goto bad; 573 *p = c0; 574 return 1; 575 576 bad: 577 errno = EILSEQ; 578 return -1; 579 badlen: 580 return -2; 581 } 582