]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/tcs/conv_jis.c
ip/ipconfig: use ewrite() to enable routing command for sendra
[plan9front.git] / sys / src / cmd / tcs / conv_jis.c
1 #include        <u.h>
2 #include        <libc.h>
3 #include        <bio.h>
4 #include        "hdr.h"
5 #include        "conv.h"
6 #include        "kuten208.h"
7 #include        "kuten212.h"
8 #include        "jis.h"
9
10 /*
11         a state machine for interpreting all sorts of encodings
12 */
13 static void
14 alljis(int c, Rune **r, long input_loc)
15 {
16         static enum { state0, state1, state2, state3, state4 } state = state0;
17         static int set8 = 0;
18         static int japan646 = 0;
19         static int lastc;
20         int n;
21         long l;
22
23 again:
24         switch(state)
25         {
26         case state0:    /* idle state */
27                 if(c == ESC){ state = state1; return; }
28                 if(c < 0) return;
29                 if(!set8 && (c < 128)){
30                         if(japan646){
31                                 switch(c)
32                                 {
33                                 case '\\':      emit(0xA5); return;     /* yen */
34                                 case '~':       emit(0xAF); return;     /* spacing macron */
35                                 default:        emit(c); return;
36                                 }
37                         } else {
38                                 emit(c);
39                                 return;
40                         }
41                 }
42                 if(c < 0x21){   /* guard against bogus characters in JIS mode */
43                         if(squawk)
44                                 warn("non-JIS character %02x in %s near byte %ld", c, file, input_loc);
45                         emit(c);
46                         return;
47                 }
48                 lastc = c; state = state4; return;
49
50         case state1:    /* seen an escape */
51                 if(c == '$'){ state = state2; return; }
52                 if(c == '('){ state = state3; return; }
53                 emit(ESC); state = state0; goto again;
54
55         case state2:    /* may be shifting into JIS */
56                 if((c == '@') || (c == 'B')){
57                         set8 = 1; state = state0; return;
58                 }
59                 emit(ESC); emit('$'); state = state0; goto again;
60
61         case state3:    /* may be shifting out of JIS */
62                 if((c == 'J') || (c == 'H') || (c == 'B')){
63                         japan646 = (c == 'J');
64                         set8 = 0; state = state0; return;
65                 }
66                 emit(ESC); emit('('); state = state0; goto again;
67
68         case state4:    /* two part char */
69                 if(c < 0){
70                         if(squawk)
71                                 warn("unexpected EOF in %s", file);
72                         c = 0x21 | (lastc&0x80);
73                 }
74                 if(CANS2J(lastc, c)){   /* ms dos sjis */
75                         int hi = lastc, lo = c;
76                         S2J(hi, lo);                    /* convert to 208 */
77                         n = hi*100 + lo - 3232;         /* convert to kuten208 */
78                 } else
79                         n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
80                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
81                         nerrors++;
82                         if(squawk)
83                                 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
84                         if(!clean)
85                                 emit(BADMAP);
86                 } else {
87                         if(l < 0){
88                                 l = -l;
89                                 if(squawk)
90                                         warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
91                         }
92                         emit(l);
93                 }
94                 state = state0;
95         }
96 }
97
98 /*
99         a state machine for interpreting ms-kanji == shift-jis.
100 */
101 static void
102 ms(int c, Rune **r, long input_loc)
103 {
104         static enum { state0, state1, state2, state3, state4 } state = state0;
105         static int set8 = 0;
106         static int japan646 = 0;
107         static int lastc;
108         int n;
109         long l;
110
111 again:
112         switch(state)
113         {
114         case state0:    /* idle state */
115                 if(c == ESC){ state = state1; return; }
116                 if(c < 0) return;
117                 if(!set8 && (c < 128)){
118                         if(japan646){
119                                 switch(c)
120                                 {
121                                 case '\\':      emit(0xA5); return;     /* yen */
122                                 case '~':       emit(0xAF); return;     /* spacing macron */
123                                 default:        emit(c); return;
124                                 }
125                         } else {
126                                 emit(c);
127                                 return;
128                         }
129                 }
130                 if(!set8 && c >= 161 && c <= 223){
131                         emit(0xFEC0 + c);
132                         return;
133                 }
134                 lastc = c; state = state4; return;
135
136         case state1:    /* seen an escape */
137                 if(c == '$'){ state = state2; return; }
138                 if(c == '('){ state = state3; return; }
139                 emit(ESC); state = state0; goto again;
140
141         case state2:    /* may be shifting into JIS */
142                 if((c == '@') || (c == 'B')){
143                         set8 = 1; state = state0; return;
144                 }
145                 emit(ESC); emit('$'); state = state0; goto again;
146
147         case state3:    /* may be shifting out of JIS */
148                 if((c == 'J') || (c == 'H') || (c == 'B')){
149                         japan646 = (c == 'J');
150                         set8 = 0; state = state0; return;
151                 }
152                 emit(ESC); emit('('); state = state0; goto again;
153
154         case state4:    /* two part char */
155                 if(c < 0){
156                         if(squawk)
157                                 warn("unexpected EOF in %s", file);
158                         c = 0x21 | (lastc&0x80);
159                 }
160                 if(CANS2J(lastc, c)){   /* ms dos sjis */
161                         int hi = lastc, lo = c;
162                         S2J(hi, lo);                    /* convert to 208 */
163                         n = hi*100 + lo - 3232;         /* convert to kuten208 */
164                 } else {
165                         nerrors++;
166                         if(squawk)
167                                 warn("illegal byte pair (0x%x,0x%x) near byte %ld in %s", lastc, c, input_loc, file);
168                         if(!clean)
169                                 emit(BADMAP);
170                         state = state0;
171                         goto again;
172                 }
173                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
174                         nerrors++;
175                         if(squawk)
176                                 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
177                         if(!clean)
178                                 emit(BADMAP);
179                 } else {
180                         if(l < 0){
181                                 l = -l;
182                                 if(squawk)
183                                         warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
184                         }
185                         emit(l);
186                 }
187                 state = state0;
188         }
189 }
190
191 /*
192         a state machine for interpreting ujis == EUC
193 */
194 static void
195 ujis(int c, Rune **r, long input_loc)
196 {
197         static enum { state0, state1, state2, state3 } state = state0;
198         static int lastc;
199         int n;
200         long l;
201
202         switch(state)
203         {
204         case state0:    /* idle state */
205                 if(c < 0) return;
206                 if(c < 128){
207                         emit(c);
208                         return;
209                 }
210                 if(c == 0x8e){  /* codeset 2 */
211                         nerrors++;
212                         if(squawk)
213                                 warn("unknown codeset 2 near byte %ld in %s", input_loc, file);
214                         if(!clean)
215                                 emit(BADMAP);
216                         return;
217                 }
218                 if(c == 0x8f)   /* codeset 3 */
219                         state = state2;
220                 else{
221                         lastc = c;
222                         state = state1;
223                 }
224                 return;
225
226         case state1:    /* two part char */
227                 if(c < 0){
228                         if(squawk)
229                                 warn("unexpected EOF in %s", file);
230                         c = 0xA1;
231                 }
232                 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
233                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
234                         nerrors++;
235                         if(squawk)
236                                 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
237                         if(!clean)
238                                 emit(BADMAP);
239                 } else {
240                         if(l < 0){
241                                 l = -l;
242                                 if(squawk)
243                                         warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
244                         }
245                         emit(l);
246                 }
247                 state = state0;
248                 return;
249         
250         case state2:    /* three part char, part #2 */
251                 if(c < 0){
252                         if(squawk)
253                                 warn("unexpected EOF in %s\n", argv0, file);
254                         c = 0xA1;
255                 }
256                 if(c < 0xa1 || c > 0xfe){
257                         if(squawk)
258                                 warn("invalid byte 0x%x in codeset 3\n", argv0, c);
259                         state = state0;
260                 }else{
261                         lastc = c;
262                         state = state3;
263                 }
264                 return;
265
266         case state3:    /* three part char, part #3 */
267                 if(c < 0){
268                         if(squawk)
269                                 warn("unexpected EOF in %s\n", argv0, file);
270                         c = 0xA1;
271                 }
272                 if(c < 0xa1 || c > 0xfe){
273                         if(squawk)
274                                 warn("invalid byte 0x%x in codeset 3\n", argv0, c);
275                         state = state0;
276                         return;
277                 }
278                 
279                 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten212 */
280                 if((n >= KUTEN212MAX) || ((l = tabkuten212[n]) == -1)){
281                         nerrors++;
282                         if(squawk)
283                                 warn("unknown kuten212 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
284                         if(!clean)
285                                 emit(BADMAP);
286                 } else {
287                         if(l < 0){
288                                 l = -l;
289                                 if(squawk)
290                                         warn("ambiguous kuten212 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
291                         }
292                         emit(l);
293                 }
294                 state = state0;
295                 return;
296                 
297                 
298         }
299 }
300
301 /*
302         a state machine for interpreting jis-kanji == 2022-JP
303 */
304 static void
305 jis(int c, Rune **r, long input_loc)
306 {
307         static enum { state0, state1, state2, state3, state4 } state = state0;
308         static int set8 = 0;
309         static int japan646 = 0;
310         static int lastc;
311         int n;
312         long l;
313
314 again:
315         switch(state)
316         {
317         case state0:    /* idle state */
318                 if(c == ESC){ state = state1; return; }
319                 if(c < 0) return;
320                 if(!set8 && (c < 128)){
321                         if(japan646){
322                                 switch(c)
323                                 {
324                                 case '\\':      emit(0xA5); return;     /* yen */
325                                 case '~':       emit(0xAF); return;     /* spacing macron */
326                                 default:        emit(c); return;
327                                 }
328                         } else {
329                                 emit(c);
330                                 return;
331                         }
332                 }
333                 lastc = c; state = state4; return;
334
335         case state1:    /* seen an escape */
336                 if(c == '$'){ state = state2; return; }
337                 if(c == '('){ state = state3; return; }
338                 emit(ESC); state = state0; goto again;
339
340         case state2:    /* may be shifting into JIS */
341                 if((c == '@') || (c == 'B')){
342                         set8 = 1; state = state0; return;
343                 }
344                 emit(ESC); emit('$'); state = state0; goto again;
345
346         case state3:    /* may be shifting out of JIS */
347                 if((c == 'J') || (c == 'H') || (c == 'B')){
348                         japan646 = (c == 'J');
349                         set8 = 0; state = state0; return;
350                 }
351                 emit(ESC); emit('('); state = state0; goto again;
352
353         case state4:    /* two part char */
354                 if(c < 0){
355                         if(squawk)
356                                 warn("unexpected EOF in %s", file);
357                         c = 0x21 | (lastc&0x80);
358                 }
359                 if((lastc&0x80) != (c&0x80)){   /* guard against latin1 in jis */
360                         emit(lastc);
361                         state = state0;
362                         goto again;
363                 }
364                 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
365                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
366                         nerrors++;
367                         if(squawk)
368                                 warn("unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s", n, lastc, c, input_loc, file);
369                         if(!clean)
370                                 emit(BADMAP);
371                 } else {
372                         if(l < 0){
373                                 l = -l;
374                                 if(squawk)
375                                         warn("ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s", n, l, input_loc, file);
376                         }
377                         emit(l);
378                 }
379                 state = state0;
380         }
381 }
382
383 static void
384 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
385 {
386         Rune ob[N];
387         Rune *r, *re;
388         uchar ibuf[N];
389         int n, i;
390         long nin;
391
392         r = ob;
393         re = ob+N-3;
394         nin = 0;
395         while((n = read(fd, ibuf, sizeof ibuf)) > 0){
396                 for(i = 0; i < n; i++){
397                         (*procfn)(ibuf[i], &r, nin++);
398                         if(r >= re){
399                                 OUT(out, ob, r-ob);
400                                 r = ob;
401                         }
402                 }
403                 if(r > ob){
404                         OUT(out, ob, r-ob);
405                         r = ob;
406                 }
407         }
408         (*procfn)(-1, &r, nin);
409         if(r > ob)
410                 OUT(out, ob, r-ob);
411         OUT(out, ob, 0);
412 }
413
414 void
415 jis_in(int fd, long *, struct convert *out)
416 {
417         do_in(fd, alljis, out);
418 }
419
420 void
421 ujis_in(int fd, long *, struct convert *out)
422 {
423         do_in(fd, ujis, out);
424 }
425
426 void
427 msjis_in(int fd, long *, struct convert *out)
428 {
429         do_in(fd, ms, out);
430 }
431
432 void
433 jisjis_in(int fd, long *, struct convert *out)
434 {
435         do_in(fd, jis, out);
436 }
437
438 static int first = 1;
439
440 static void
441 tab_init(void)
442 {
443         int i;
444         long l;
445
446         first = 0;
447         for(i = 0; i < NRUNE; i++)
448                 tab[i] = -1;
449         for(i = 0; i < KUTEN208MAX; i++)
450                 if((l = tabkuten208[i]) != -1){
451                         if(l < 0)
452                                 tab[-l] = i;
453                         else
454                                 tab[l] = i;
455                 }
456 }
457
458
459 /*      jis-kanji, or ISO 2022-JP       */
460 void
461 jisjis_out(Rune *base, int n, long *)
462 {
463         char *p;
464         int i;
465         Rune r;
466         static enum { ascii, japan646, jp2022 } state = ascii;
467
468         if(first)
469                 tab_init();
470         nrunes += n;
471         p = obuf;
472         for(i = 0; i < n; i++){
473                 r = base[i];
474                 if(r < 128){
475                         if(state == jp2022){
476                                 *p++ = ESC; *p++ = '('; *p++ = 'B';
477                                 state = ascii;
478                         }
479                         *p++ = r;
480                 } else {
481                         if(r < NRUNE && tab[r] != -1){
482                                 if(state != jp2022){
483                                         *p++ = ESC; *p++ = '$'; *p++ = 'B';
484                                         state = jp2022;
485                                 }
486                                 *p++ = tab[r]/100 + ' ';
487                                 *p++ = tab[r]%100 + ' ';
488                                 continue;
489                         }
490                         if(squawk)
491                                 warn("rune 0x%x not in output cs", r);
492                         nerrors++;
493                         if(clean)
494                                 continue;
495                         *p++ = BYTEBADMAP;
496                 }
497         }
498         noutput += p-obuf;
499         if(p > obuf)
500                 write(1, obuf, p-obuf);
501 }
502
503 /*      ms-kanji, or Shift-JIS  */
504 void
505 msjis_out(Rune *base, int n, long *)
506 {
507         char *p;
508         int i, hi, lo;
509         Rune r;
510
511         if(first)
512                 tab_init();
513         nrunes += n;
514         p = obuf;
515         for(i = 0; i < n; i++){
516                 r = base[i];
517                 if(r < 128)
518                         *p++ = r;
519                 else {
520                         if(r < NRUNE && tab[r] != -1){
521                                 hi = tab[r]/100 + ' ';
522                                 lo = tab[r]%100 + ' ';
523                                 J2S(hi, lo);
524                                 *p++ = hi;
525                                 *p++ = lo;
526                                 continue;
527                         }
528                         if(squawk)
529                                 warn("rune 0x%x not in output cs", r);
530                         nerrors++;
531                         if(clean)
532                                 continue;
533                         *p++ = BYTEBADMAP;
534                 }
535         }
536         noutput += p-obuf;
537         if(p > obuf)
538                 write(1, obuf, p-obuf);
539 }
540
541 /*      ujis, or EUC    */
542 void
543 ujis_out(Rune *base, int n, long *)
544 {
545         char *p;
546         int i;
547         Rune r;
548
549         if(first)
550                 tab_init();
551         nrunes += n;
552         p = obuf;
553         for(i = 0; i < n; i++){
554                 r = base[i];
555                 if(r < 128)
556                         *p++ = r;
557                 else {
558                         if(r < NRUNE && tab[r] != -1){
559                                 *p++ = 0x80 | (tab[r]/100 + ' ');
560                                 *p++ = 0x80 | (tab[r]%100 + ' ');
561                                 continue;
562                         }
563                         if(squawk)
564                                 warn("rune 0x%x not in output cs", r);
565                         nerrors++;
566                         if(clean)
567                                 continue;
568                         *p++ = BYTEBADMAP;
569                 }
570         }
571         noutput += p-obuf;
572         if(p > obuf)
573                 write(1, obuf, p-obuf);
574 }