]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/awk/lex.c
rune(2): add Runeerror reencoding considerations in BUGS section (thanks aiju)
[plan9front.git] / sys / src / cmd / awk / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "y.tab.h"
31
32 extern YYSTYPE  yylval;
33 extern int      infunc;
34
35 int     lineno  = 1;
36 int     bracecnt = 0;
37 int     brackcnt  = 0;
38 int     parencnt = 0;
39
/* one reserved word or built-in name recognized by word() */
typedef struct Keyword {
	char	*word;	/* spelling as it appears in the program text */
	int	sub;	/* value stored in yylval.i when the word is matched */
	int	type;	/* token type returned to the parser */
} Keyword;
45
46 Keyword keywords[] ={   /* keep sorted: binary searched */
47         { "BEGIN",      XBEGIN,         XBEGIN },
48         { "END",        XEND,           XEND },
49         { "NF",         VARNF,          VARNF },
50         { "atan2",      FATAN,          BLTIN },
51         { "break",      BREAK,          BREAK },
52         { "close",      CLOSE,          CLOSE },
53         { "continue",   CONTINUE,       CONTINUE },
54         { "cos",        FCOS,           BLTIN },
55         { "delete",     DELETE,         DELETE },
56         { "do",         DO,             DO },
57         { "else",       ELSE,           ELSE },
58         { "exit",       EXIT,           EXIT },
59         { "exp",        FEXP,           BLTIN },
60         { "fflush",     FFLUSH,         BLTIN },
61         { "for",        FOR,            FOR },
62         { "func",       FUNC,           FUNC },
63         { "function",   FUNC,           FUNC },
64         { "getline",    GETLINE,        GETLINE },
65         { "gsub",       GSUB,           GSUB },
66         { "if",         IF,             IF },
67         { "in",         IN,             IN },
68         { "index",      INDEX,          INDEX },
69         { "int",        FINT,           BLTIN },
70         { "length",     FLENGTH,        BLTIN },
71         { "log",        FLOG,           BLTIN },
72         { "match",      MATCHFCN,       MATCHFCN },
73         { "next",       NEXT,           NEXT },
74         { "nextfile",   NEXTFILE,       NEXTFILE },
75         { "print",      PRINT,          PRINT },
76         { "printf",     PRINTF,         PRINTF },
77         { "rand",       FRAND,          BLTIN },
78         { "return",     RETURN,         RETURN },
79         { "sin",        FSIN,           BLTIN },
80         { "split",      SPLIT,          SPLIT },
81         { "sprintf",    SPRINTF,        SPRINTF },
82         { "sqrt",       FSQRT,          BLTIN },
83         { "srand",      FSRAND,         BLTIN },
84         { "sub",        SUB,            SUB },
85         { "substr",     SUBSTR,         SUBSTR },
86         { "system",     FSYSTEM,        BLTIN },
87         { "tolower",    FTOLOWER,       BLTIN },
88         { "toupper",    FTOUPPER,       BLTIN },
89         { "utf",        FUTF,           BLTIN },
90         { "while",      WHILE,          WHILE },
91 };
92
/*
 * RET(x) returns token x from the lexer.  With DEBUG defined it
 * first prints the token name when the dbg flag is set.
 * Note that x is evaluated twice in the DEBUG form.
 */
#define	DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
99
/* peek: return the next input character without consuming it */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back so the next input() sees it again */
	return next;
}
106
/*
 * gettok: read the next raw token into the growable buffer *pbuf.
 *
 * pbuf, psz	address of the buffer and of its size; both are updated
 *		when the buffer has to be enlarged with adjbuf().
 *
 * Returns 0 at end of input, otherwise the first character of the
 * token.  A character that cannot start a name or a number is
 * returned by itself (the buffer then holds just that character).
 * Names are [A-Za-z_][A-Za-z0-9_]*.  For numbers, everything that
 * could belong to one is collected, strtod() decides how much really
 * is a number, and the unused tail is pushed back on the input.
 * The buffer is always NUL-terminated on a nonzero return.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* input() may hand back a negative char value; ctype needs unsigned char */
	if (!isalnum((unsigned char) c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char) c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* always keep room for this character and the final NUL */
			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
				FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char) c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;	/* also terminates a name that was ended by EOF */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* always keep room for this character and the final NUL */
			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
				FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char) c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {
			/*
			 * Nothing was converted (a '.' not followed by a digit).
			 * Hand back the first character on its own instead of
			 * truncating buf to "" and returning 0, which the
			 * caller would take for end of input.
			 */
			unputstr(buf+1);
			buf[1] = 0;
		} else {
			unputstr(rem);		/* put rest back for later */
			rem[0] = 0;
		}
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
160
/* one-shot requests honoured at the start of the next yylex() call */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */

/* helpers defined further down in this file */
int	word(char *);
int	string(void);
int	regexpr(void);
166
167 int yylex(void)
168 {
169         int c;
170         static char *buf = 0;
171         static int bufsize = 500;
172
173         if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
174                 FATAL( "out of space in yylex" );
175         if (sc) {
176                 sc = 0;
177                 RET('}');
178         }
179         if (reg) {
180                 reg = 0;
181                 return regexpr();
182         }
183         for (;;) {
184                 c = gettok(&buf, &bufsize);
185                 if (c == 0)
186                         return 0;
187                 if (isalpha(c) || c == '_')
188                         return word(buf);
189                 if (isdigit(c) || c == '.') {
190                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
191                         /* should this also have STR set? */
192                         RET(NUMBER);
193                 }
194         
195                 yylval.i = c;
196                 switch (c) {
197                 case '\n':      /* {EOL} */
198                         RET(NL);
199                 case '\r':      /* assume \n is coming */
200                 case ' ':       /* {WS}+ */
201                 case '\t':
202                         break;
203                 case '#':       /* #.* strip comments */
204                         while ((c = input()) != '\n' && c != 0)
205                                 ;
206                         unput(c);
207                         break;
208                 case ';':
209                         RET(';');
210                 case '\\':
211                         if (peek() == '\n') {
212                                 input();
213                         } else if (peek() == '\r') {
214                                 input(); input();       /* \n */
215                                 lineno++;
216                         } else {
217                                 RET(c);
218                         }
219                         break;
220                 case '&':
221                         if (peek() == '&') {
222                                 input(); RET(AND);
223                         } else 
224                                 RET('&');
225                 case '|':
226                         if (peek() == '|') {
227                                 input(); RET(BOR);
228                         } else
229                                 RET('|');
230                 case '!':
231                         if (peek() == '=') {
232                                 input(); yylval.i = NE; RET(NE);
233                         } else if (peek() == '~') {
234                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
235                         } else
236                                 RET(NOT);
237                 case '~':
238                         yylval.i = MATCH;
239                         RET(MATCHOP);
240                 case '<':
241                         if (peek() == '=') {
242                                 input(); yylval.i = LE; RET(LE);
243                         } else {
244                                 yylval.i = LT; RET(LT);
245                         }
246                 case '=':
247                         if (peek() == '=') {
248                                 input(); yylval.i = EQ; RET(EQ);
249                         } else {
250                                 yylval.i = ASSIGN; RET(ASGNOP);
251                         }
252                 case '>':
253                         if (peek() == '=') {
254                                 input(); yylval.i = GE; RET(GE);
255                         } else if (peek() == '>') {
256                                 input(); yylval.i = APPEND; RET(APPEND);
257                         } else {
258                                 yylval.i = GT; RET(GT);
259                         }
260                 case '+':
261                         if (peek() == '+') {
262                                 input(); yylval.i = INCR; RET(INCR);
263                         } else if (peek() == '=') {
264                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
265                         } else
266                                 RET('+');
267                 case '-':
268                         if (peek() == '-') {
269                                 input(); yylval.i = DECR; RET(DECR);
270                         } else if (peek() == '=') {
271                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
272                         } else
273                                 RET('-');
274                 case '*':
275                         if (peek() == '=') {    /* *= */
276                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
277                         } else if (peek() == '*') {     /* ** or **= */
278                                 input();        /* eat 2nd * */
279                                 if (peek() == '=') {
280                                         input(); yylval.i = POWEQ; RET(ASGNOP);
281                                 } else {
282                                         RET(POWER);
283                                 }
284                         } else
285                                 RET('*');
286                 case '/':
287                         RET('/');
288                 case '%':
289                         if (peek() == '=') {
290                                 input(); yylval.i = MODEQ; RET(ASGNOP);
291                         } else
292                                 RET('%');
293                 case '^':
294                         if (peek() == '=') {
295                                 input(); yylval.i = POWEQ; RET(ASGNOP);
296                         } else
297                                 RET(POWER);
298         
299                 case '$':
300                         /* BUG: awkward, if not wrong */
301                         c = gettok(&buf, &bufsize);
302                         if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
303                                 unputstr(buf);
304                                 RET(INDIRECT);
305                         } else if (isalpha(c)) {
306                                 if (strcmp(buf, "NF") == 0) {   /* very special */
307                                         unputstr("(NF)");
308                                         RET(INDIRECT);
309                                 }
310                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
311                                 RET(IVAR);
312                         } else {
313                                 unputstr(buf);
314                                 RET(INDIRECT);
315                         }
316         
317                 case '}':
318                         if (--bracecnt < 0)
319                                 SYNTAX( "extra }" );
320                         sc = 1;
321                         RET(';');
322                 case ']':
323                         if (--brackcnt < 0)
324                                 SYNTAX( "extra ]" );
325                         RET(']');
326                 case ')':
327                         if (--parencnt < 0)
328                                 SYNTAX( "extra )" );
329                         RET(')');
330                 case '{':
331                         bracecnt++;
332                         RET('{');
333                 case '[':
334                         brackcnt++;
335                         RET('[');
336                 case '(':
337                         parencnt++;
338                         RET('(');
339         
340                 case '"':
341                         return string();        /* BUG: should be like tran.c ? */
342         
343                 default:
344                         RET(c);
345                 }
346         }
347 }
348
349 int string(void)
350 {
351         int c, n;
352         char *s, *bp;
353         static char *buf = 0;
354         static int bufsz = 500;
355
356         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
357                 FATAL("out of space for strings");
358         for (bp = buf; (c = input()) != '"'; ) {
359                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
360                         FATAL("out of space for string %.10s...", buf);
361                 switch (c) {
362                 case '\n':
363                 case '\r':
364                 case 0:
365                         SYNTAX( "non-terminated string %.10s...", buf );
366                         lineno++;
367                         break;
368                 case '\\':
369                         c = input();
370                         switch (c) {
371                         case '"': *bp++ = '"'; break;
372                         case 'n': *bp++ = '\n'; break;  
373                         case 't': *bp++ = '\t'; break;
374                         case 'f': *bp++ = '\f'; break;
375                         case 'r': *bp++ = '\r'; break;
376                         case 'b': *bp++ = '\b'; break;
377                         case 'v': *bp++ = '\v'; break;
378                         case 'a': *bp++ = '\007'; break;
379                         case '\\': *bp++ = '\\'; break;
380
381                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
382                         case '3': case '4': case '5': case '6': case '7':
383                                 n = c - '0';
384                                 if ((c = peek()) >= '0' && c < '8') {
385                                         n = 8 * n + input() - '0';
386                                         if ((c = peek()) >= '0' && c < '8')
387                                                 n = 8 * n + input() - '0';
388                                 }
389                                 *bp++ = n;
390                                 break;
391
392                         case 'x':       /* hex  \x0-9a-fA-F + */
393                             {   char xbuf[100], *px;
394                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
395                                         if (isdigit(c)
396                                          || (c >= 'a' && c <= 'f')
397                                          || (c >= 'A' && c <= 'F'))
398                                                 *px++ = c;
399                                         else
400                                                 break;
401                                 }
402                                 *px = 0;
403                                 unput(c);
404                                 sscanf(xbuf, "%x", &n);
405                                 *bp++ = n;
406                                 break;
407                             }
408
409                         default: 
410                                 *bp++ = c;
411                                 break;
412                         }
413                         break;
414                 default:
415                         *bp++ = c;
416                         break;
417                 }
418         }
419         *bp = 0; 
420         s = tostring(buf);
421         *bp++ = ' '; *bp++ = 0;
422         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
423         RET(STRING);
424 }
425
426
427 int binsearch(char *w, Keyword *kp, int n)
428 {
429         int cond, low, mid, high;
430
431         low = 0;
432         high = n - 1;
433         while (low <= high) {
434                 mid = (low + high) / 2;
435                 if ((cond = strcmp(w, kp[mid].word)) < 0)
436                         high = mid - 1;
437                 else if (cond > 0)
438                         low = mid + 1;
439                 else
440                         return mid;
441         }
442         return -1;
443 }
444
445 int word(char *w) 
446 {
447         Keyword *kp;
448         int c, n;
449
450         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
451         kp = keywords + n;
452         if (n != -1) {  /* found in table */
453                 yylval.i = kp->sub;
454                 switch (kp->type) {     /* special handling */
455                 case FSYSTEM:
456                         if (safe)
457                                 SYNTAX( "system is unsafe" );
458                         RET(kp->type);
459                 case FUNC:
460                         if (infunc)
461                                 SYNTAX( "illegal nested function" );
462                         RET(kp->type);
463                 case RETURN:
464                         if (!infunc)
465                                 SYNTAX( "return not in function" );
466                         RET(kp->type);
467                 case VARNF:
468                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
469                         RET(VARNF);
470                 default:
471                         RET(kp->type);
472                 }
473         }
474         c = peek();     /* look for '(' */
475         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
476                 yylval.i = n;
477                 RET(ARG);
478         } else {
479                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
480                 if (c == '(') {
481                         RET(CALL);
482                 } else {
483                         RET(VAR);
484                 }
485         }
486 }
487
488 void startreg(void)     /* next call to yyles will return a regular expression */
489 {
490         reg = 1;
491 }
492
493 int regexpr(void)
494 {
495         int c;
496         static char *buf = 0;
497         static int bufsz = 500;
498         char *bp;
499
500         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
501                 FATAL("out of space for rex expr");
502         bp = buf;
503         for ( ; (c = input()) != '/' && c != 0; ) {
504                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
505                         FATAL("out of space for reg expr %.10s...", buf);
506                 if (c == '\n') {
507                         SYNTAX( "newline in regular expression %.10s...", buf ); 
508                         unput('\n');
509                         break;
510                 } else if (c == '\\') {
511                         *bp++ = '\\'; 
512                         *bp++ = input();
513                 } else {
514                         *bp++ = c;
515                 }
516         }
517         *bp = 0;
518         yylval.s = tostring(buf);
519         unput('/');
520         RET(REGEXPR);
521 }
522
523 /* low-level lexical stuff, sort of inherited from lex */
524
/* circular record of the characters handed out by input() */
char	ebuf[300];
char	*ep = ebuf;		/* next slot in ebuf; wraps around at the end */

/* characters returned with unput(), re-read by input() last-in first-out */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

FILE	*yyin = 0;
530
531 int input(void) /* get next lexical input character */
532 {
533         int c;
534         extern char *lexprog;
535
536         if (yysptr > yysbuf)
537                 c = *--yysptr;
538         else if (lexprog != NULL) {     /* awk '...' */
539                 if ((c = *lexprog) != 0)
540                         lexprog++;
541         } else                          /* awk -f ... */
542                 c = pgetc();
543         if (c == '\n')
544                 lineno++;
545         else if (c == EOF)
546                 c = 0;
547         if (ep >= ebuf + sizeof ebuf)
548                 ep = ebuf;
549         return *ep++ = c;
550 }
551
552 void unput(int c)       /* put lexical character back on input */
553 {
554         if (c == '\n')
555                 lineno--;
556         if (yysptr >= yysbuf + sizeof(yysbuf))
557                 FATAL("pushed back too much: %.20s...", yysbuf);
558         *yysptr++ = c;
559         if (--ep < ebuf)
560                 ep = ebuf + sizeof(ebuf) - 1;
561 }
562
/*
 * unputstr: put the string s back on the input.  The characters are
 * pushed last to first, so input() delivers them in their original
 * order.
 */
void unputstr(char *s)
{
	char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}