xref: /plan9/sys/src/cmd/tcs/conv_jis.c (revision ec46fab06dcae3e636b775c4eaa679036316e1d8)
1 #ifdef	PLAN9
2 #include	<u.h>
3 #include	<libc.h>
4 #include	<bio.h>
5 #else
6 #include	<stdio.h>
7 #include	<unistd.h>
8 #include	"plan9.h"
9 #endif
10 #include	"hdr.h"
11 #include	"conv.h"
12 #include	"kuten208.h"
13 #include	"jis.h"
14 
15 /*
16 	a state machine for interpreting all sorts of encodings
17 */
18 static void
alljis(int c,Rune ** r,long input_loc)19 alljis(int c, Rune **r, long input_loc)
20 {
21 	static enum { state0, state1, state2, state3, state4 } state = state0;
22 	static int set8 = 0;
23 	static int japan646 = 0;
24 	static int lastc;
25 	int n;
26 	long l;
27 
28 again:
29 	switch(state)
30 	{
31 	case state0:	/* idle state */
32 		if(c == ESC){ state = state1; return; }
33 		if(c < 0) return;
34 		if(!set8 && (c < 128)){
35 			if(japan646){
36 				switch(c)
37 				{
38 				case '\\':	emit(0xA5); return;	/* yen */
39 				case '~':	emit(0xAF); return;	/* spacing macron */
40 				default:	emit(c); return;
41 				}
42 			} else {
43 				emit(c);
44 				return;
45 			}
46 		}
47 		if(c < 0x21){	/* guard against bogus characters in JIS mode */
48 			if(squawk)
49 				EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
50 			emit(c);
51 			return;
52 		}
53 		lastc = c; state = state4; return;
54 
55 	case state1:	/* seen an escape */
56 		if(c == '$'){ state = state2; return; }
57 		if(c == '('){ state = state3; return; }
58 		emit(ESC); state = state0; goto again;
59 
60 	case state2:	/* may be shifting into JIS */
61 		if((c == '@') || (c == 'B')){
62 			set8 = 1; state = state0; return;
63 		}
64 		emit(ESC); emit('$'); state = state0; goto again;
65 
66 	case state3:	/* may be shifting out of JIS */
67 		if((c == 'J') || (c == 'H') || (c == 'B')){
68 			japan646 = (c == 'J');
69 			set8 = 0; state = state0; return;
70 		}
71 		emit(ESC); emit('('); state = state0; goto again;
72 
73 	case state4:	/* two part char */
74 		if(c < 0){
75 			if(squawk)
76 				EPR "%s: unexpected EOF in %s\n", argv0, file);
77 			c = 0x21 | (lastc&0x80);
78 		}
79 		if(CANS2J(lastc, c)){	/* ms dos sjis */
80 			int hi = lastc, lo = c;
81 			S2J(hi, lo);			/* convert to 208 */
82 			n = hi*100 + lo - 3232;		/* convert to kuten208 */
83 		} else
84 			n = (lastc&0x7F)*100 + (c&0x7f) - 3232;	/* kuten208 */
85 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
86 			nerrors++;
87 			if(squawk)
88 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
89 			if(!clean)
90 				emit(BADMAP);
91 		} else {
92 			if(l < 0){
93 				l = -l;
94 				if(squawk)
95 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
96 			}
97 			emit(l);
98 		}
99 		state = state0;
100 	}
101 }
102 
103 /*
104 	a state machine for interpreting ms-kanji == shift-jis.
105 */
106 static void
ms(int c,Rune ** r,long input_loc)107 ms(int c, Rune **r, long input_loc)
108 {
109 	static enum { state0, state1, state2, state3, state4 } state = state0;
110 	static int set8 = 0;
111 	static int japan646 = 0;
112 	static int lastc;
113 	int n;
114 	long l;
115 
116 again:
117 	switch(state)
118 	{
119 	case state0:	/* idle state */
120 		if(c == ESC){ state = state1; return; }
121 		if(c < 0) return;
122 		if(!set8 && (c < 128)){
123 			if(japan646){
124 				switch(c)
125 				{
126 				case '\\':	emit(0xA5); return;	/* yen */
127 				case '~':	emit(0xAF); return;	/* spacing macron */
128 				default:	emit(c); return;
129 				}
130 			} else {
131 				emit(c);
132 				return;
133 			}
134 		}
135 		lastc = c; state = state4; return;
136 
137 	case state1:	/* seen an escape */
138 		if(c == '$'){ state = state2; return; }
139 		if(c == '('){ state = state3; return; }
140 		emit(ESC); state = state0; goto again;
141 
142 	case state2:	/* may be shifting into JIS */
143 		if((c == '@') || (c == 'B')){
144 			set8 = 1; state = state0; return;
145 		}
146 		emit(ESC); emit('$'); state = state0; goto again;
147 
148 	case state3:	/* may be shifting out of JIS */
149 		if((c == 'J') || (c == 'H') || (c == 'B')){
150 			japan646 = (c == 'J');
151 			set8 = 0; state = state0; return;
152 		}
153 		emit(ESC); emit('('); state = state0; goto again;
154 
155 	case state4:	/* two part char */
156 		if(c < 0){
157 			if(squawk)
158 				EPR "%s: unexpected EOF in %s\n", argv0, file);
159 			c = 0x21 | (lastc&0x80);
160 		}
161 		if(CANS2J(lastc, c)){	/* ms dos sjis */
162 			int hi = lastc, lo = c;
163 			S2J(hi, lo);			/* convert to 208 */
164 			n = hi*100 + lo - 3232;		/* convert to kuten208 */
165 		} else {
166 			nerrors++;
167 			if(squawk)
168 				EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
169 			if(!clean)
170 				emit(BADMAP);
171 			state = state0;
172 			goto again;
173 		}
174 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
175 			nerrors++;
176 			if(squawk)
177 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
178 			if(!clean)
179 				emit(BADMAP);
180 		} else {
181 			if(l < 0){
182 				l = -l;
183 				if(squawk)
184 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
185 			}
186 			emit(l);
187 		}
188 		state = state0;
189 	}
190 }
191 
192 /*
193 	a state machine for interpreting ujis == EUC
194 */
195 static void
ujis(int c,Rune ** r,long input_loc)196 ujis(int c, Rune **r, long input_loc)
197 {
198 	static enum { state0, state1 } state = state0;
199 	static int lastc;
200 	int n;
201 	long l;
202 
203 	switch(state)
204 	{
205 	case state0:	/* idle state */
206 		if(c < 0) return;
207 		if(c < 128){
208 			emit(c);
209 			return;
210 		}
211 		if(c == 0x8e){	/* codeset 2 */
212 			nerrors++;
213 			if(squawk)
214 				EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
215 			if(!clean)
216 				emit(BADMAP);
217 			return;
218 		}
219 		if(c == 0x8f){	/* codeset 3 */
220 			nerrors++;
221 			if(squawk)
222 				EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
223 			if(!clean)
224 				emit(BADMAP);
225 			return;
226 		}
227 		lastc = c;
228 		state = state1;
229 		return;
230 
231 	case state1:	/* two part char */
232 		if(c < 0){
233 			if(squawk)
234 				EPR "%s: unexpected EOF in %s\n", argv0, file);
235 			c = 0xA1;
236 		}
237 		n = (lastc&0x7F)*100 + (c&0x7F) - 3232;	/* kuten208 */
238 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
239 			nerrors++;
240 			if(squawk)
241 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
242 			if(!clean)
243 				emit(BADMAP);
244 		} else {
245 			if(l < 0){
246 				l = -l;
247 				if(squawk)
248 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
249 			}
250 			emit(l);
251 		}
252 		state = state0;
253 	}
254 }
255 
256 /*
257 	a state machine for interpreting jis-kanji == 2022-JP
258 */
259 static void
jis(int c,Rune ** r,long input_loc)260 jis(int c, Rune **r, long input_loc)
261 {
262 	static enum { state0, state1, state2, state3, state4 } state = state0;
263 	static int set8 = 0;
264 	static int japan646 = 0;
265 	static int lastc;
266 	int n;
267 	long l;
268 
269 again:
270 	switch(state)
271 	{
272 	case state0:	/* idle state */
273 		if(c == ESC){ state = state1; return; }
274 		if(c < 0) return;
275 		if(!set8 && (c < 128)){
276 			if(japan646){
277 				switch(c)
278 				{
279 				case '\\':	emit(0xA5); return;	/* yen */
280 				case '~':	emit(0xAF); return;	/* spacing macron */
281 				default:	emit(c); return;
282 				}
283 			} else {
284 				emit(c);
285 				return;
286 			}
287 		}
288 		lastc = c; state = state4; return;
289 
290 	case state1:	/* seen an escape */
291 		if(c == '$'){ state = state2; return; }
292 		if(c == '('){ state = state3; return; }
293 		emit(ESC); state = state0; goto again;
294 
295 	case state2:	/* may be shifting into JIS */
296 		if((c == '@') || (c == 'B')){
297 			set8 = 1; state = state0; return;
298 		}
299 		emit(ESC); emit('$'); state = state0; goto again;
300 
301 	case state3:	/* may be shifting out of JIS */
302 		if((c == 'J') || (c == 'H') || (c == 'B')){
303 			japan646 = (c == 'J');
304 			set8 = 0; state = state0; return;
305 		}
306 		emit(ESC); emit('('); state = state0; goto again;
307 
308 	case state4:	/* two part char */
309 		if(c < 0){
310 			if(squawk)
311 				EPR "%s: unexpected EOF in %s\n", argv0, file);
312 			c = 0x21 | (lastc&0x80);
313 		}
314 		if((lastc&0x80) != (c&0x80)){	/* guard against latin1 in jis */
315 			emit(lastc);
316 			state = state0;
317 			goto again;
318 		}
319 		n = (lastc&0x7F)*100 + (c&0x7f) - 3232;	/* kuten208 */
320 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
321 			nerrors++;
322 			if(squawk)
323 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
324 			if(!clean)
325 				emit(BADMAP);
326 		} else {
327 			if(l < 0){
328 				l = -l;
329 				if(squawk)
330 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
331 			}
332 			emit(l);
333 		}
334 		state = state0;
335 	}
336 }
337 
338 static void
do_in(int fd,void (* procfn)(int,Rune **,long),struct convert * out)339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
340 {
341 	Rune ob[N];
342 	Rune *r, *re;
343 	uchar ibuf[N];
344 	int n, i;
345 	long nin;
346 
347 	r = ob;
348 	re = ob+N-3;
349 	nin = 0;
350 	while((n = read(fd, ibuf, sizeof ibuf)) > 0){
351 		for(i = 0; i < n; i++){
352 			(*procfn)(ibuf[i], &r, nin++);
353 			if(r >= re){
354 				OUT(out, ob, r-ob);
355 				r = ob;
356 			}
357 		}
358 		if(r > ob){
359 			OUT(out, ob, r-ob);
360 			r = ob;
361 		}
362 	}
363 	(*procfn)(-1, &r, nin);
364 	if(r > ob)
365 		OUT(out, ob, r-ob);
366 	OUT(out, ob, 0);
367 }
368 
369 void
jis_in(int fd,long * notused,struct convert * out)370 jis_in(int fd, long *notused, struct convert *out)
371 {
372 	USED(notused);
373 	do_in(fd, alljis, out);
374 }
375 
376 void
ujis_in(int fd,long * notused,struct convert * out)377 ujis_in(int fd, long *notused, struct convert *out)
378 {
379 	USED(notused);
380 	do_in(fd, ujis, out);
381 }
382 
383 void
msjis_in(int fd,long * notused,struct convert * out)384 msjis_in(int fd, long *notused, struct convert *out)
385 {
386 	USED(notused);
387 	do_in(fd, ms, out);
388 }
389 
390 void
jisjis_in(int fd,long * notused,struct convert * out)391 jisjis_in(int fd, long *notused, struct convert *out)
392 {
393 	USED(notused);
394 	do_in(fd, jis, out);
395 }
396 
397 static int first = 1;
398 
399 static void
tab_init(void)400 tab_init(void)
401 {
402 	int i;
403 	long l;
404 
405 	first = 0;
406 	for(i = 0; i < NRUNE; i++)
407 		tab[i] = -1;
408 	for(i = 0; i < KUTEN208MAX; i++)
409 		if((l = tabkuten208[i]) != -1){
410 			if(l < 0)
411 				tab[-l] = i;
412 			else
413 				tab[l] = i;
414 		}
415 }
416 
417 
418 /*	jis-kanji, or ISO 2022-JP	*/
419 void
jisjis_out(Rune * base,int n,long * notused)420 jisjis_out(Rune *base, int n, long *notused)
421 {
422 	char *p;
423 	int i;
424 	Rune r;
425 	static enum { ascii, japan646, jp2022 } state = ascii;
426 
427 	USED(notused);
428 	if(first)
429 		tab_init();
430 	nrunes += n;
431 	p = obuf;
432 	for(i = 0; i < n; i++){
433 		r = base[i];
434 		if(r < 128){
435 			if(state == jp2022){
436 				*p++ = ESC; *p++ = '('; *p++ = 'B';
437 				state = ascii;
438 			}
439 			*p++ = r;
440 		} else {
441 			if(tab[r] != -1){
442 				if(state != jp2022){
443 					*p++ = ESC; *p++ = '$'; *p++ = 'B';
444 					state = jp2022;
445 				}
446 				*p++ = tab[r]/100 + ' ';
447 				*p++ = tab[r]%100 + ' ';
448 				continue;
449 			}
450 			if(squawk)
451 				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
452 			nerrors++;
453 			if(clean)
454 				continue;
455 			*p++ = BYTEBADMAP;
456 		}
457 	}
458 	noutput += p-obuf;
459 	if(p > obuf)
460 		write(1, obuf, p-obuf);
461 }
462 
463 /*	ms-kanji, or Shift-JIS	*/
464 void
msjis_out(Rune * base,int n,long * notused)465 msjis_out(Rune *base, int n, long *notused)
466 {
467 	char *p;
468 	int i, hi, lo;
469 	Rune r;
470 
471 	USED(notused);
472 	if(first)
473 		tab_init();
474 	nrunes += n;
475 	p = obuf;
476 	for(i = 0; i < n; i++){
477 		r = base[i];
478 		if(r < 128)
479 			*p++ = r;
480 		else {
481 			if(tab[r] != -1){
482 				hi = tab[r]/100 + ' ';
483 				lo = tab[r]%100 + ' ';
484 				J2S(hi, lo);
485 				*p++ = hi;
486 				*p++ = lo;
487 				continue;
488 			}
489 			if(squawk)
490 				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
491 			nerrors++;
492 			if(clean)
493 				continue;
494 			*p++ = BYTEBADMAP;
495 		}
496 	}
497 	noutput += p-obuf;
498 	if(p > obuf)
499 		write(1, obuf, p-obuf);
500 }
501 
502 /*	ujis, or EUC	*/
503 void
ujis_out(Rune * base,int n,long * notused)504 ujis_out(Rune *base, int n, long *notused)
505 {
506 	char *p;
507 	int i;
508 	Rune r;
509 
510 	USED(notused);
511 	if(first)
512 		tab_init();
513 	nrunes += n;
514 	p = obuf;
515 	for(i = 0; i < n; i++){
516 		r = base[i];
517 		if(r < 128)
518 			*p++ = r;
519 		else {
520 			if(tab[r] != -1){
521 				*p++ = 0x80 | (tab[r]/100 + ' ');
522 				*p++ = 0x80 | (tab[r]%100 + ' ');
523 				continue;
524 			}
525 			if(squawk)
526 				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
527 			nerrors++;
528 			if(clean)
529 				continue;
530 			*p++ = BYTEBADMAP;
531 		}
532 	}
533 	noutput += p-obuf;
534 	if(p > obuf)
535 		write(1, obuf, p-obuf);
536 }
537