1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #else
6 #include <sys/types.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <unistd.h>
11 #include <errno.h>
12 #include "plan9.h"
13 #endif
14 #include "hdr.h"
15
/*
	The our_* routines are implementations of the corresponding library
	routines. For a while I tried to actually name them wctomb etc.,
	but stopped after I found a system that made wchar_t an
	unsigned char.
*/
22
23 int our_wctomb(char *s, unsigned long wc);
24 int our_mbtowc(unsigned long *p, char *s, unsigned n);
25 int runetoisoutf(char *str, Rune *rune);
26 int fullisorune(char *str, int n);
27 int isochartorune(Rune *rune, char *str);
28
29 void
utf_in(int fd,long * notused,struct convert * out)30 utf_in(int fd, long *notused, struct convert *out)
31 {
32 char buf[N];
33 int i, j, c, n, tot;
34 ulong l;
35
36 USED(notused);
37 tot = 0;
38 while((n = read(fd, buf+tot, N-tot)) >= 0){
39 tot += n;
40 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
41 c = our_mbtowc(&l, buf+i, tot-i);
42 if(c == -1){
43 if(squawk)
44 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
45 if(clean){
46 i++;
47 continue;
48 }
49 nerrors++;
50 l = Runeerror;
51 c = 1;
52 }
53 runes[j++] = l;
54 i += c;
55 }
56 OUT(out, runes, j);
57 tot -= i;
58 ninput += i;
59 if(tot)
60 memmove(buf, buf+i, tot);
61 if(n == 0)
62 break;
63 }
64 OUT(out, runes, 0);
65 }
66
67 void
utf_out(Rune * base,int n,long * notused)68 utf_out(Rune *base, int n, long *notused)
69 {
70 char *p;
71 Rune *r;
72
73 USED(notused);
74 nrunes += n;
75 for(r = base, p = obuf; n-- > 0; r++){
76 p += our_wctomb(p, *r);
77 }
78 noutput += p-obuf;
79 write(1, obuf, p-obuf);
80 }
81
82 void
isoutf_in(int fd,long * notused,struct convert * out)83 isoutf_in(int fd, long *notused, struct convert *out)
84 {
85 char buf[N];
86 int i, j, c, n, tot;
87
88 USED(notused);
89 tot = 0;
90 while((n = read(fd, buf+tot, N-tot)) >= 0){
91 tot += n;
92 for(i=j=0; i<tot; ){
93 if(!fullisorune(buf+i, tot-i))
94 break;
95 c = isochartorune(&runes[j], buf+i);
96 if(runes[j] == Runeerror && c == 1){
97 if(squawk)
98 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
99 if(clean){
100 i++;
101 continue;
102 }
103 nerrors++;
104 }
105 j++;
106 i += c;
107 }
108 OUT(out, runes, j);
109 tot -= i;
110 ninput += i;
111 if(tot)
112 memmove(buf, buf+i, tot);
113 if(n == 0)
114 break;
115 }
116 OUT(out, runes, 0);
117 }
118
119 void
isoutf_out(Rune * base,int n,long * notused)120 isoutf_out(Rune *base, int n, long *notused)
121 {
122 char *p;
123 Rune *r;
124
125 USED(notused);
126 nrunes += n;
127 for(r = base, p = obuf; n-- > 0; r++)
128 p += runetoisoutf(p, r);
129 noutput += p-obuf;
130 write(1, obuf, p-obuf);
131 }
132
133
134 enum
135 {
136 Char1 = Runeself, Rune1 = Runeself,
137 Char21 = 0xA1, Rune21 = 0x0100,
138 Char22 = 0xF6, Rune22 = 0x4016,
139 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
140 Esc = 0xBE, Bad = Runeerror
141 };
142
143 static uchar U[256];
144 static uchar T[256];
145
146 static
147 void
mktable(void)148 mktable(void)
149 {
150 int i, u;
151
152 for(i=0; i<256; i++) {
153 u = i + (0x5E - 0xA0);
154 if(i < 0xA0)
155 u = i + (0xDF - 0x7F);
156 if(i < 0x7F)
157 u = i + (0x00 - 0x21);
158 if(i < 0x21)
159 u = i + (0xBE - 0x00);
160 U[i] = u;
161 T[u] = i;
162 }
163 }
164
165 int
isochartorune(Rune * rune,char * str)166 isochartorune(Rune *rune, char *str)
167 {
168 int c, c1, c2;
169 long l;
170
171 if(U[0] == 0)
172 mktable();
173
174 /*
175 * one character sequence
176 * 00000-0009F => 00-9F
177 */
178 c = *(uchar*)str;
179 if(c < Char1) {
180 *rune = c;
181 return 1;
182 }
183
184 /*
185 * two character sequence
186 * 000A0-000FF => A0; A0-FF
187 */
188 c1 = *(uchar*)(str+1);
189 if(c < Char21) {
190 if(c1 >= Rune1 && c1 < Rune21) {
191 *rune = c1;
192 return 2;
193 }
194 goto bad;
195 }
196
197 /*
198 * two character sequence
199 * 00100-04015 => A1-F5; 21-7E/A0-FF
200 */
201 c1 = U[c1];
202 if(c1 >= Esc)
203 goto bad;
204 if(c < Char22) {
205 *rune = (c-Char21)*Esc + c1 + Rune21;
206 return 2;
207 }
208
209 /*
210 * three character sequence
211 * 04016-38E2D => A6-FB; 21-7E/A0-FF
212 */
213 c2 = U[*(uchar*)(str+2)];
214 if(c2 >= Esc)
215 goto bad;
216 if(c < Char3) {
217 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
218 if(l >= Rune3)
219 goto bad;
220 *rune = l;
221 return 3;
222 }
223
224 /*
225 * bad decoding
226 */
227 bad:
228 *rune = Bad;
229 return 1;
230 }
231
232 int
runetoisoutf(char * str,Rune * rune)233 runetoisoutf(char *str, Rune *rune)
234 {
235 long c;
236
237 if(T[0] == 0)
238 mktable();
239
240 /*
241 * one character sequence
242 * 00000-0009F => 00-9F
243 */
244 c = *rune;
245 if(c < Rune1) {
246 str[0] = c;
247 return 1;
248 }
249
250 /*
251 * two character sequence
252 * 000A0-000FF => A0; A0-FF
253 */
254 if(c < Rune21) {
255 str[0] = Char1;
256 str[1] = c;
257 return 2;
258 }
259
260 /*
261 * two character sequence
262 * 00100-04015 => A1-F5; 21-7E/A0-FF
263 */
264 if(c < Rune22) {
265 c -= Rune21;
266 str[0] = c/Esc + Char21;
267 str[1] = T[c%Esc];
268 return 2;
269 }
270
271 /*
272 * three character sequence
273 * 04016-38E2D => A6-FB; 21-7E/A0-FF
274 */
275 c -= Rune22;
276 str[0] = c/(Esc*Esc) + Char22;
277 str[1] = T[c/Esc%Esc];
278 str[2] = T[c%Esc];
279 return 3;
280 }
281
282 int
fullisorune(char * str,int n)283 fullisorune(char *str, int n)
284 {
285 int c;
286
287 if(n > 0) {
288 c = *(uchar*)str;
289 if(c < Char1)
290 return 1;
291 if(n > 1)
292 if(c < Char22 || n > 2)
293 return 1;
294 }
295 return 0;
296 }
297
#ifdef PLAN9
/*
 * the PLAN9 build does not include <errno.h> (see the top of this
 * file), so define the errno that our_mbtowc sets
 */
int errno;
#endif

/*
 * Constants for the multibyte encoding handled by our_wctomb and
 * our_mbtowc.
 *	Tn	tag bits of the lead byte of an n-byte sequence
 *	Tx	tag bits of a continuation byte
 *	Bitn	number of payload bits carried by that byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 */
enum
{
	T1	= 0x00,		/* 0xxxxxxx */
	Tx	= 0x80,		/* 10xxxxxx */
	T2	= 0xC0,		/* 110xxxxx */
	T3	= 0xE0,		/* 1110xxxx */
	T4	= 0xF0,		/* 11110xxx */
	T5	= 0xF8,		/* 111110xx */
	T6	= 0xFC,		/* 111111xx */

	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,		/* 2 + 5*Bitx = 32 bits, so leads run from 0xFC to 0xFF */

	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,

#ifndef EILSEQ
	EILSEQ	= 123,		/* fallback when the system headers do not define it */
#endif /* EILSEQ */
};

/*
 * Encode wc into s using the shortest sequence that holds it and
 * return the number of bytes stored (1 to 6).  s must have room for
 * 6 bytes.  Returns 0 when s is a null pointer (no shift states).
 * Only the low 32 bits of wc are representable; higher bits are
 * silently dropped by the 6-byte form.
 */
int
our_wctomb(char *s, unsigned long wc)
{
	if(s == 0)
		return 0;		/* no shift states */

	if(wc <= Wchar1){
		/* 1 byte */
		s[0] = T1 | wc;
		return 1;
	}
	if(wc <= Wchar2){
		/* 2 bytes */
		s[0] = T2 | (wc >> 1*Bitx);
		s[1] = Tx | (wc & Maskx);
		return 2;
	}
	if(wc <= Wchar3){
		/* 3 bytes */
		s[0] = T3 | (wc >> 2*Bitx);
		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[2] = Tx | (wc & Maskx);
		return 3;
	}
	if(wc <= Wchar4){
		/* 4 bytes */
		s[0] = T4 | (wc >> 3*Bitx);
		s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[3] = Tx | (wc & Maskx);
		return 4;
	}
	if(wc <= Wchar5){
		/* 5 bytes */
		s[0] = T5 | (wc >> 4*Bitx);
		s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[4] = Tx | (wc & Maskx);
		return 5;
	}
	/* 6 bytes */
	s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
	s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
	s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
	s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
	s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
	s[5] = Tx | (wc & Maskx);
	return 6;
}

/*
 * Decode one sequence from the first n bytes of s into *p and return
 * its length (1 to 6).  Returns 0 when s is a null pointer (no shift
 * states).  Returns -1 with errno set to EILSEQ when n is 0, the
 * sequence is longer than n, the lead byte is a continuation byte, a
 * continuation byte is missing its 10xxxxxx tag, or the value could
 * have been encoded in fewer bytes.  *p is left untouched on failure.
 *
 * The value is accumulated in an unsigned long: a 6-byte sequence
 * carries 32 bits, which would overflow a signed int for lead bytes
 * 0xFE and 0xFF.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	unsigned char *us;
	unsigned long wc, min, c;
	unsigned len, i;

	if(s == 0)
		return 0;		/* no shift states */

	if(n < 1)
		goto bad;
	us = (unsigned char*)s;
	wc = us[0];

	/* 1 byte */
	if(wc < Tx){
		*p = wc;
		return 1;
	}
	/* a continuation byte cannot start a sequence */
	if(wc < T2)
		goto bad;

	/*
	 * the lead byte fixes the length, the payload bits it carries
	 * and the largest value a shorter sequence could have held
	 */
	if(wc < T3){
		len = 2;
		wc &= Mask2;
		min = Wchar1;
	}else if(wc < T4){
		len = 3;
		wc &= Mask3;
		min = Wchar2;
	}else if(wc < T5){
		len = 4;
		wc &= Mask4;
		min = Wchar3;
	}else if(wc < T6){
		len = 5;
		wc &= Mask5;
		min = Wchar4;
	}else{
		len = 6;
		wc &= Mask6;
		min = Wchar5;
	}
	if(n < len)
		goto bad;

	for(i = 1; i < len; i++){
		/* after the xor a valid continuation byte has its top two bits clear */
		c = us[i] ^ Tx;
		if(c & T2)
			goto bad;
		wc = (wc << Bitx) | c;
	}

	/* reject encodings that are longer than necessary */
	if(wc <= min)
		goto bad;
	*p = wc;
	return len;

bad:
	errno = EILSEQ;
	return -1;
}
488