1 #ifdef PLAN9 2 #include <u.h> 3 #include <libc.h> 4 #include <bio.h> 5 #else 6 #include <sys/types.h> 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <string.h> 10 #include <unistd.h> 11 #include <errno.h> 12 #include "plan9.h" 13 #endif 14 #include "hdr.h" 15 16 /* 17 the our_* routines are implementations for the corresponding library 18 routines. for a while, i tried to actually name them wctomb etc 19 but stopped that after i found a system which made wchar_t an 20 unsigned char. 21 */ 22 23 int our_wctomb(char *s, unsigned long wc); 24 int our_mbtowc(unsigned long *p, char *s, unsigned n); 25 int runetoisoutf(char *str, Rune *rune); 26 int fullisorune(char *str, int n); 27 int isochartorune(Rune *rune, char *str); 28 29 void 30 utf_in(int fd, long *notused, struct convert *out) 31 { 32 char buf[N]; 33 int i, j, c, n, tot; 34 ulong l; 35 36 USED(notused); 37 tot = 0; 38 while((n = read(fd, buf+tot, N-tot)) >= 0){ 39 tot += n; 40 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){ 41 c = our_mbtowc(&l, buf+i, tot-i); 42 if(c == -1){ 43 if(squawk) 44 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); 45 if(clean){ 46 i++; 47 continue; 48 } 49 nerrors++; 50 l = Runeerror; 51 c = 1; 52 } 53 runes[j++] = l; 54 i += c; 55 } 56 OUT(out, runes, j); 57 tot -= i; 58 ninput += i; 59 if(tot) 60 memmove(buf, buf+i, tot); 61 if(n == 0) 62 break; 63 } 64 OUT(out, runes, 0); 65 } 66 67 void 68 utf_out(Rune *base, int n, long *notused) 69 { 70 char *p; 71 Rune *r; 72 73 USED(notused); 74 nrunes += n; 75 for(r = base, p = obuf; n-- > 0; r++){ 76 p += our_wctomb(p, *r); 77 } 78 noutput += p-obuf; 79 write(1, obuf, p-obuf); 80 } 81 82 void 83 isoutf_in(int fd, long *notused, struct convert *out) 84 { 85 char buf[N]; 86 int i, j, c, n, tot; 87 88 USED(notused); 89 tot = 0; 90 while((n = read(fd, buf+tot, N-tot)) >= 0){ 91 tot += n; 92 for(i=j=0; i<tot; ){ 93 if(!fullisorune(buf+i, tot-i)) 94 break; 95 c = isochartorune(&runes[j], buf+i); 96 if(runes[j] == Runeerror && c == 1){ 97 if(squawk) 98 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); 99 if(clean){ 100 i++; 101 continue; 102 } 103 nerrors++; 104 } 105 j++; 106 i += c; 107 } 108 OUT(out, runes, j); 109 tot -= i; 110 ninput += i; 111 if(tot) 112 memmove(buf, buf+i, tot); 113 if(n == 0) 114 break; 115 } 116 OUT(out, runes, 0); 117 } 118 119 void 120 isoutf_out(Rune *base, int n, long *notused) 121 { 122 char *p; 123 Rune *r; 124 125 USED(notused); 126 nrunes += n; 127 for(r = base, p = obuf; n-- > 0; r++) 128 p += runetoisoutf(p, r); 129 noutput += p-obuf; 130 write(1, obuf, p-obuf); 131 } 132 133 134 enum 135 { 136 Char1 = Runeself, Rune1 = Runeself, 137 Char21 = 0xA1, Rune21 = 0x0100, 138 Char22 = 0xF6, Rune22 = 0x4016, 139 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */ 140 Esc = 0xBE, Bad = Runeerror 141 }; 142 143 static uchar U[256]; 144 static uchar T[256]; 145 146 static 147 void 148 mktable(void) 149 { 150 int i, u; 151 152 for(i=0; i<256; i++) { 153 u = i + (0x5E - 0xA0); 154 if(i < 0xA0) 155 u = i + (0xDF - 0x7F); 156 if(i < 0x7F) 157 u = i + (0x00 - 0x21); 158 if(i < 0x21) 159 u = i + (0xBE - 0x00); 160 U[i] = u; 161 T[u] = i; 162 } 163 } 164 165 int 166 isochartorune(Rune *rune, char *str) 167 { 168 int c, c1, c2; 169 long l; 170 171 if(U[0] == 0) 172 mktable(); 173 174 /* 175 * one character sequence 176 * 00000-0009F => 00-9F 177 */ 178 c = *(uchar*)str; 179 if(c < Char1) { 180 *rune = c; 181 return 1; 182 } 183 184 /* 185 * two character sequence 186 * 000A0-000FF => A0; A0-FF 187 */ 188 c1 = *(uchar*)(str+1); 189 if(c < Char21) { 190 if(c1 >= Rune1 && c1 < Rune21) { 191 *rune = c1; 192 return 2; 193 } 194 goto bad; 195 } 196 197 /* 198 * two character sequence 199 * 00100-04015 => A1-F5; 21-7E/A0-FF 200 */ 201 c1 = U[c1]; 202 if(c1 >= Esc) 203 goto bad; 204 if(c < Char22) { 205 *rune = (c-Char21)*Esc + c1 + Rune21; 206 return 2; 207 } 208 209 /* 210 * three character sequence 211 * 04016-38E2D => A6-FB; 21-7E/A0-FF 212 */ 213 c2 = U[*(uchar*)(str+2)]; 214 if(c2 >= Esc) 215 goto bad; 216 if(c < Char3) { 217 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22; 218 if(l >= Rune3) 219 goto bad; 220 *rune = l; 221 return 3; 222 } 223 224 /* 225 * bad decoding 226 */ 227 bad: 228 *rune = Bad; 229 return 1; 230 } 231 232 int 233 runetoisoutf(char *str, Rune *rune) 234 { 235 long c; 236 237 if(T[0] == 0) 238 mktable(); 239 240 /* 241 * one character sequence 242 * 00000-0009F => 00-9F 243 */ 244 c = *rune; 245 if(c < Rune1) { 246 str[0] = c; 247 return 1; 248 } 249 250 /* 251 * two character sequence 252 * 000A0-000FF => A0; A0-FF 253 */ 254 if(c < Rune21) { 255 str[0] = Char1; 256 str[1] = c; 257 return 2; 258 } 259 260 /* 261 * two character sequence 262 * 00100-04015 => A1-F5; 21-7E/A0-FF 263 */ 264 if(c < Rune22) { 265 c -= Rune21; 266 str[0] = c/Esc + Char21; 267 str[1] = T[c%Esc]; 268 return 2; 269 } 270 271 /* 272 * three character sequence 273 * 04016-38E2D => A6-FB; 21-7E/A0-FF 274 */ 275 c -= Rune22; 276 str[0] = c/(Esc*Esc) + Char22; 277 str[1] = T[c/Esc%Esc]; 278 str[2] = T[c%Esc]; 279 return 3; 280 } 281 282 int 283 fullisorune(char *str, int n) 284 { 285 int c; 286 287 if(n > 0) { 288 c = *(uchar*)str; 289 if(c < Char1) 290 return 1; 291 if(n > 1) 292 if(c < Char22 || n > 2) 293 return 1; 294 } 295 return 0; 296 } 297 298 #ifdef PLAN9 299 int errno; 300 #endif 301 302 enum 303 { 304 T1 = 0x00, 305 Tx = 0x80, 306 T2 = 0xC0, 307 T3 = 0xE0, 308 T4 = 0xF0, 309 T5 = 0xF8, 310 T6 = 0xFC, 311 312 Bit1 = 7, 313 Bitx = 6, 314 Bit2 = 5, 315 Bit3 = 4, 316 Bit4 = 3, 317 Bit5 = 2, 318 Bit6 = 2, 319 320 Mask1 = (1<<Bit1)-1, 321 Maskx = (1<<Bitx)-1, 322 Mask2 = (1<<Bit2)-1, 323 Mask3 = (1<<Bit3)-1, 324 Mask4 = (1<<Bit4)-1, 325 Mask5 = (1<<Bit5)-1, 326 Mask6 = (1<<Bit6)-1, 327 328 Wchar1 = (1UL<<Bit1)-1, 329 Wchar2 = (1UL<<(Bit2+Bitx))-1, 330 Wchar3 = (1UL<<(Bit3+2*Bitx))-1, 331 Wchar4 = (1UL<<(Bit4+3*Bitx))-1, 332 Wchar5 = (1UL<<(Bit5+4*Bitx))-1, 333 334 #ifndef EILSEQ 335 EILSEQ = 123, 336 #endif /* EILSEQ */ 337 }; 338 339 int 340 our_wctomb(char *s, unsigned long wc) 341 { 342 if(s == 0) 343 return 0; /* no shift states */ 344 if(wc & ~Wchar2) { 345 if(wc & ~Wchar4) { 346 if(wc & ~Wchar5) { 347 /* 6 bytes */ 348 s[0] = T6 | ((wc >> 5*Bitx) & Mask6); 349 s[1] = Tx | ((wc >> 4*Bitx) & Maskx); 350 s[2] = Tx | ((wc >> 3*Bitx) & Maskx); 351 s[3] = Tx | ((wc >> 2*Bitx) & Maskx); 352 s[4] = Tx | ((wc >> 1*Bitx) & Maskx); 353 s[5] = Tx | (wc & Maskx); 354 return 6; 355 } 356 /* 5 bytes */ 357 s[0] = T5 | (wc >> 4*Bitx); 358 s[1] = Tx | ((wc >> 3*Bitx) & Maskx); 359 s[2] = Tx | ((wc >> 2*Bitx) & Maskx); 360 s[3] = Tx | ((wc >> 1*Bitx) & Maskx); 361 s[4] = Tx | (wc & Maskx); 362 return 5; 363 } 364 if(wc & ~Wchar3) { 365 /* 4 bytes */ 366 s[0] = T4 | (wc >> 3*Bitx); 367 s[1] = Tx | ((wc >> 2*Bitx) & Maskx); 368 s[2] = Tx | ((wc >> 1*Bitx) & Maskx); 369 s[3] = Tx | (wc & Maskx); 370 return 4; 371 } 372 /* 3 bytes */ 373 s[0] = T3 | (wc >> 2*Bitx); 374 s[1] = Tx | ((wc >> 1*Bitx) & Maskx); 375 s[2] = Tx | (wc & Maskx); 376 return 3; 377 } 378 if(wc & ~Wchar1) { 379 /* 2 bytes */ 380 s[0] = T2 | (wc >> 1*Bitx); 381 s[1] = Tx | (wc & Maskx); 382 return 2; 383 } 384 /* 1 byte */ 385 s[0] = T1 | wc; 386 return 1; 387 } 388 389 int 390 our_mbtowc(unsigned long *p, char *s, unsigned n) 391 { 392 uchar *us; 393 int c0, c1, c2, c3, c4, c5; 394 unsigned long wc; 395 396 if(s == 0) 397 return 0; /* no shift states */ 398 399 if(n < 1) 400 goto bad; 401 us = (uchar*)s; 402 c0 = us[0]; 403 if(c0 >= T3) { 404 if(n < 3) 405 goto bad; 406 c1 = us[1] ^ Tx; 407 c2 = us[2] ^ Tx; 408 if((c1|c2) & T2) 409 goto bad; 410 if(c0 >= T5) { 411 if(n < 5) 412 goto bad; 413 c3 = us[3] ^ Tx; 414 c4 = us[4] ^ Tx; 415 if((c3|c4) & T2) 416 goto bad; 417 if(c0 >= T6) { 418 /* 6 bytes */ 419 if(n < 6) 420 goto bad; 421 c5 = us[5] ^ Tx; 422 if(c5 & T2) 423 goto bad; 424 wc = ((((((((((c0 & Mask6) << Bitx) | 425 c1) << Bitx) | c2) << Bitx) | 426 c3) << Bitx) | c4) << Bitx) | c5; 427 if(wc <= Wchar5) 428 goto bad; 429 *p = wc; 430 return 6; 431 } 432 /* 5 bytes */ 433 wc = ((((((((c0 & Mask5) << Bitx) | 434 c1) << Bitx) | c2) << Bitx) | 435 c3) << Bitx) | c4; 436 if(wc <= Wchar4) 437 goto bad; 438 *p = wc; 439 return 5; 440 } 441 if(c0 >= T4) { 442 /* 4 bytes */ 443 if(n < 4) 444 goto bad; 445 c3 = us[3] ^ Tx; 446 if(c3 & T2) 447 goto bad; 448 wc = ((((((c0 & Mask4) << Bitx) | 449 c1) << Bitx) | c2) << Bitx) | 450 c3; 451 if(wc <= Wchar3) 452 goto bad; 453 *p = wc; 454 return 4; 455 } 456 /* 3 bytes */ 457 wc = ((((c0 & Mask3) << Bitx) | 458 c1) << Bitx) | c2; 459 if(wc <= Wchar2) 460 goto bad; 461 *p = wc; 462 return 3; 463 } 464 if(c0 >= T2) { 465 /* 2 bytes */ 466 if(n < 2) 467 goto bad; 468 c1 = us[1] ^ Tx; 469 if(c1 & T2) 470 goto bad; 471 wc = ((c0 & Mask2) << Bitx) | 472 c1; 473 if(wc <= Wchar1) 474 goto bad; 475 *p = wc; 476 return 2; 477 } 478 /* 1 byte */ 479 if(c0 >= Tx) 480 goto bad; 481 *p = c0; 482 return 1; 483 484 bad: 485 errno = EILSEQ; 486 return -1; 487 } 488