11 a state machine for interpreting all sorts of encodings
/*
 * alljis: advance the catch-all JIS decoder by one input byte.
 *   c          next input byte
 *   r          passed through from the caller (presumably the output
 *              cursor used by emit() -- confirm against the macro)
 *   input_loc  byte offset quoted in warning messages
 * The decoder state lives in function-local statics, so it persists
 * from one call to the next.
 */
14 alljis(int c, Rune **r, long input_loc)
16 static enum { state0, state1, state2, state3, state4 } state = state0;
18 static int japan646 = 0;
26 case state0: /* idle state */
27 if(c == ESC){ state = state1; return; }
/* 7-bit bytes outside JIS mode; the two remapped cases emit yen and macron */
29 if(!set8 && (c < 128)){
33 case '\\': emit(0xA5); return; /* yen */
34 case '~': emit(0xAF); return; /* spacing macron */
35 default: emit(c); return;
42 if(c < 0x21){ /* guard against bogus characters in JIS mode */
44 warn("non-JIS character %02x in %s near byte %ld", c, file, input_loc);
/* otherwise remember the first byte of a pair and wait for the second */
48 lastc = c; state = state4; return;
50 case state1: /* seen an escape */
51 if(c == '$'){ state = state2; return; }
52 if(c == '('){ state = state3; return; }
/* not a recognised sequence: flush the ESC and reprocess c from idle */
53 emit(ESC); state = state0; goto again;
55 case state2: /* may be shifting into JIS */
56 if((c == '@') || (c == 'B')){
57 set8 = 1; state = state0; return;
59 emit(ESC); emit('$'); state = state0; goto again;
61 case state3: /* may be shifting out of JIS */
62 if((c == 'J') || (c == 'H') || (c == 'B')){
/* only ESC ( J turns on the japan646 flag */
63 japan646 = (c == 'J');
64 set8 = 0; state = state0; return;
66 emit(ESC); emit('('); state = state0; goto again;
68 case state4: /* two part char */
71 warn("unexpected EOF in %s", file);
/* substitute a second byte that keeps the high bit of the first */
72 c = 0x21 | (lastc&0x80);
74 if(CANS2J(lastc, c)){ /* ms dos sjis */
75 int hi = lastc, lo = c;
76 S2J(hi, lo); /* convert to 208 */
77 n = hi*100 + lo - 3232; /* convert to kuten208 */
79 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
/*
 * 3232 is 32*100+32, so byte values below 0x20 after masking make n
 * negative; reject those before indexing the table.
 */
80 if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
83 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
90 warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
99 a state machine for interpreting ms-kanji == shift-jis.
/*
 * ms: advance the Shift-JIS (ms-kanji) decoder by one input byte.
 *   c          next input byte
 *   r          passed through from the caller (presumably the output
 *              cursor used by emit() -- confirm against the macro)
 *   input_loc  byte offset quoted in warning messages
 * The decoder state lives in function-local statics, so it persists
 * from one call to the next.
 */
102 ms(int c, Rune **r, long input_loc)
104 static enum { state0, state1, state2, state3, state4 } state = state0;
106 static int japan646 = 0;
114 case state0: /* idle state */
115 if(c == ESC){ state = state1; return; }
/* 7-bit bytes outside JIS mode; the two remapped cases emit yen and macron */
117 if(!set8 && (c < 128)){
121 case '\\': emit(0xA5); return; /* yen */
122 case '~': emit(0xAF); return; /* spacing macron */
123 default: emit(c); return;
/* single bytes 0xA1..0xDF get their own branch (handling not visible in this excerpt) */
130 if(!set8 && c >= 161 && c <= 223){
/* otherwise remember the first byte of a pair and wait for the second */
134 lastc = c; state = state4; return;
136 case state1: /* seen an escape */
137 if(c == '$'){ state = state2; return; }
138 if(c == '('){ state = state3; return; }
/* not a recognised sequence: flush the ESC and reprocess c from idle */
139 emit(ESC); state = state0; goto again;
141 case state2: /* may be shifting into JIS */
142 if((c == '@') || (c == 'B')){
143 set8 = 1; state = state0; return;
145 emit(ESC); emit('$'); state = state0; goto again;
147 case state3: /* may be shifting out of JIS */
148 if((c == 'J') || (c == 'H') || (c == 'B')){
/* only ESC ( J turns on the japan646 flag */
149 japan646 = (c == 'J');
150 set8 = 0; state = state0; return;
152 emit(ESC); emit('('); state = state0; goto again;
154 case state4: /* two part char */
157 warn("unexpected EOF in %s", file);
/* substitute a second byte that keeps the high bit of the first */
158 c = 0x21 | (lastc&0x80);
160 if(CANS2J(lastc, c)){ /* ms dos sjis */
161 int hi = lastc, lo = c;
162 S2J(hi, lo); /* convert to 208 */
163 n = hi*100 + lo - 3232; /* convert to kuten208 */
/* pairs that fail CANS2J are reported rather than converted */
167 warn("illegal byte pair (0x%x,0x%x) near byte %ld in %s", lastc, c, input_loc, file);
/* NOTE(review): no lower-bound check on n here; presumably S2J always yields n >= 0 -- confirm */
173 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
176 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
183 warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
192 a state machine for interpreting ujis == EUC
/*
 * ujis: advance the EUC (ujis) decoder by one input byte.
 *   c          next input byte
 *   r          passed through from the caller (presumably the output
 *              cursor used by emit() -- confirm against the macro)
 *   input_loc  byte offset quoted in warning messages
 * The decoder state lives in a function-local static, so it persists
 * from one call to the next.  state1 finishes a two-byte kuten208
 * character; state2 and state3 handle the second and third bytes of a
 * three-byte codeset 3 (kuten212) character.
 */
195 ujis(int c, Rune **r, long input_loc)
197 static enum { state0, state1, state2, state3 } state = state0;
204 case state0: /* idle state */
210 if(c == 0x8e){ /* codeset 2 */
213 warn("unknown codeset 2 near byte %ld in %s", input_loc, file);
218 if(c == 0x8f) /* codeset 3 */
226 case state1: /* two part char */
229 warn("unexpected EOF in %s", file);
232 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
/*
 * 3232 is 32*100+32, so byte values below 0x20 after masking make n
 * negative; reject those before indexing the table.
 */
233 if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
236 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
243 warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
250 case state2: /* three part char, part #2 */
/*
 * The codeset 3 warnings below take exactly the arguments their format
 * strings name, like every other warn() call in this file: no leading
 * argv0 and no trailing newline.
 */
253 warn("unexpected EOF in %s", file);
256 if(c < 0xa1 || c > 0xfe){
258 warn("invalid byte 0x%x in codeset 3", c);
266 case state3: /* three part char, part #3 */
269 warn("unexpected EOF in %s", file);
272 if(c < 0xa1 || c > 0xfe){
274 warn("invalid byte 0x%x in codeset 3", c);
279 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten212 */
280 if((n >= KUTEN212MAX) || ((l = tabkuten212[n]) == -1)){
283 warn("unknown kuten212 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
290 warn("ambiguous kuten212 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
302 a state machine for interpreting jis-kanji == 2022-JP
/*
 * jis: advance the jis-kanji (ISO 2022-JP) decoder by one input byte.
 *   c          next input byte
 *   r          passed through from the caller (presumably the output
 *              cursor used by emit() -- confirm against the macro)
 *   input_loc  byte offset quoted in warning messages
 * The decoder state lives in function-local statics, so it persists
 * from one call to the next.
 */
305 jis(int c, Rune **r, long input_loc)
307 static enum { state0, state1, state2, state3, state4 } state = state0;
309 static int japan646 = 0;
317 case state0: /* idle state */
318 if(c == ESC){ state = state1; return; }
/* 7-bit bytes outside JIS mode; the two remapped cases emit yen and macron */
320 if(!set8 && (c < 128)){
324 case '\\': emit(0xA5); return; /* yen */
325 case '~': emit(0xAF); return; /* spacing macron */
326 default: emit(c); return;
/* otherwise remember the first byte of a pair and wait for the second */
333 lastc = c; state = state4; return;
335 case state1: /* seen an escape */
336 if(c == '$'){ state = state2; return; }
337 if(c == '('){ state = state3; return; }
/* not a recognised sequence: flush the ESC and reprocess c from idle */
338 emit(ESC); state = state0; goto again;
340 case state2: /* may be shifting into JIS */
341 if((c == '@') || (c == 'B')){
342 set8 = 1; state = state0; return;
344 emit(ESC); emit('$'); state = state0; goto again;
346 case state3: /* may be shifting out of JIS */
347 if((c == 'J') || (c == 'H') || (c == 'B')){
/* only ESC ( J turns on the japan646 flag */
348 japan646 = (c == 'J');
349 set8 = 0; state = state0; return;
351 emit(ESC); emit('('); state = state0; goto again;
353 case state4: /* two part char */
356 warn("unexpected EOF in %s", file);
/* substitute a second byte that keeps the high bit of the first */
357 c = 0x21 | (lastc&0x80);
359 if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
364 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
/*
 * 3232 is 32*100+32, so byte values below 0x20 after masking make n
 * negative; reject those before indexing the table.
 */
365 if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
368 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
375 warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
/*
 * do_in: shared input driver for the decoders in this file.
 *   fd      descriptor to read from
 *   procfn  per-byte decoder step (byte, Rune cursor, byte offset)
 *   out     output converter (its use is not visible in this excerpt)
 */
384 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
/* read a buffer at a time and hand every byte to the decoder */
395 while((n = read(fd, ibuf, sizeof ibuf)) > 0){
396 for(i = 0; i < n; i++){
/* nin is the running input offset, advanced once per byte */
397 (*procfn)(ibuf[i], &r, nin++);
/* final call with -1, which appears to be the decoders' end-of-input marker */
408 (*procfn)(-1, &r, nin);
/* jis_in: input entry point that runs do_in with the catch-all alljis decoder. */
415 jis_in(int fd, long *, struct convert *out)
417 do_in(fd, alljis, out);
/* ujis_in: input entry point that runs do_in with the ujis (EUC) decoder. */
421 ujis_in(int fd, long *, struct convert *out)
423 do_in(fd, ujis, out);
427 msjis_in(int fd, long *, struct convert *out)
433 jisjis_in(int fd, long *, struct convert *out)
438 static int first = 1;
447 for(i = 0; i < NRUNE; i++)
449 for(i = 0; i < KUTEN208MAX; i++)
450 if((l = tabkuten208[i]) != -1){
459 /* jis-kanji, or ISO 2022-JP */
/*
 * jisjis_out: encode n runes starting at base and write the result to
 * file descriptor 1.  A function-local static records which character
 * set the output stream is currently in, so it carries over between calls.
 */
461 jisjis_out(Rune *base, int n, long *)
466 static enum { ascii, japan646, jp2022 } state = ascii;
472 for(i = 0; i < n; i++){
/* ESC ( B escape sequence (the condition guarding it is not visible in this excerpt) */
476 *p++ = ESC; *p++ = '('; *p++ = 'B';
/* tab[r] holds a row*100+column code, or -1 when the rune has no mapping */
481 if(r < NRUNE && tab[r] != -1){
/* ESC $ B escape sequence ahead of the two-byte character */
483 *p++ = ESC; *p++ = '$'; *p++ = 'B';
/* each half of the code is offset by ' ' (0x20) to form the output byte */
486 *p++ = tab[r]/100 + ' ';
487 *p++ = tab[r]%100 + ' ';
491 warn("rune 0x%x not in output cs", r);
/* NOTE(review): the write() result is not checked on this line */
500 write(1, obuf, p-obuf);
503 /* ms-kanji, or Shift-JIS */
/*
 * msjis_out: encode n runes starting at base and write the result to
 * file descriptor 1.
 */
505 msjis_out(Rune *base, int n, long *)
515 for(i = 0; i < n; i++){
/* tab[r] holds a row*100+column code, or -1 when the rune has no mapping */
520 if(r < NRUNE && tab[r] != -1){
/* split the code into two bytes offset by ' ' (0x20); what follows is not visible in this excerpt */
521 hi = tab[r]/100 + ' ';
522 lo = tab[r]%100 + ' ';
529 warn("rune 0x%x not in output cs", r);
/* NOTE(review): the write() result is not checked on this line */
538 write(1, obuf, p-obuf);
/*
 * ujis_out: encode n runes starting at base and write the result to
 * file descriptor 1.
 */
543 ujis_out(Rune *base, int n, long *)
553 for(i = 0; i < n; i++){
/* tab[r] holds a row*100+column code, or -1 when the rune has no mapping */
558 if(r < NRUNE && tab[r] != -1){
/* each half of the code is offset by ' ' (0x20) and has its high bit set */
559 *p++ = 0x80 | (tab[r]/100 + ' ');
560 *p++ = 0x80 | (tab[r]%100 + ' ');
564 warn("rune 0x%x not in output cs", r);
/* NOTE(review): the write() result is not checked on this line */
573 write(1, obuf, p-obuf);