]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/awk/lex.c
merge
[plan9front.git] / sys / src / cmd / awk / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <u.h>
26 #include <libc.h>
27 #include <ctype.h>
28 #include <bio.h>
29 #include "awk.h"
30 #include "y.tab.h"
31
/* defined outside this file */
extern YYSTYPE  yylval;         /* value of the token being returned; filled in by yylex and its helpers */
extern int      infunc;         /* tested by yylex() and word() when deciding how to classify a name */

/* lexer bookkeeping */
int     lineno  = 1;            /* incremented by input() on '\n', decremented again by unput() */
int     bracecnt = 0;           /* '{' seen minus '}' seen; yylex() reports "extra }" if it goes negative */
int     brackcnt  = 0;          /* same bookkeeping for '[' and ']' */
int     parencnt = 0;           /* same bookkeeping for '(' and ')' */
39
/* One row of the reserved-word table searched by word(). */
typedef struct Keyword {
        char    *word;          /* spelling; the key binsearch() compares with strcmp */
        int     sub;            /* stored in yylval.i by word() when the name matches */
        int     type;           /* switched on by word() and returned as the token */
} Keyword;
45
/*
 * Reserved words and built-in function names.
 * binsearch() compares entries with strcmp, so rows must stay in
 * strcmp order (upper-case names sort before lower-case ones).
 * Columns are: spelling, sub (value for yylval.i), type (token returned).
 */
Keyword keywords[] ={   /* keep sorted: binary searched */
        { "BEGIN",      XBEGIN,         XBEGIN },
        { "END",        XEND,           XEND },
        { "NF",         VARNF,          VARNF },
        { "atan2",      FATAN,          BLTIN },
        { "break",      BREAK,          BREAK },
        { "close",      CLOSE,          CLOSE },
        { "continue",   CONTINUE,       CONTINUE },
        { "cos",        FCOS,           BLTIN },
        { "delete",     DELETE,         DELETE },
        { "do",         DO,             DO },
        { "else",       ELSE,           ELSE },
        { "exit",       EXIT,           EXIT },
        { "exp",        FEXP,           BLTIN },
        { "fflush",     FFLUSH,         BLTIN },
        { "for",        FOR,            FOR },
        { "func",       FUNC,           FUNC },
        { "function",   FUNC,           FUNC },
        { "getline",    GETLINE,        GETLINE },
        { "gsub",       GSUB,           GSUB },
        { "if",         IF,             IF },
        { "in",         IN,             IN },
        { "index",      INDEX,          INDEX },
        { "int",        FINT,           BLTIN },
        { "length",     FLENGTH,        BLTIN },
        { "log",        FLOG,           BLTIN },
        { "match",      MATCHFCN,       MATCHFCN },
        { "next",       NEXT,           NEXT },
        { "nextfile",   NEXTFILE,       NEXTFILE },
        { "print",      PRINT,          PRINT },
        { "printf",     PRINTF,         PRINTF },
        { "rand",       FRAND,          BLTIN },
        { "return",     RETURN,         RETURN },
        { "sin",        FSIN,           BLTIN },
        { "split",      SPLIT,          SPLIT },
        { "sprintf",    SPRINTF,        SPRINTF },
        { "sqrt",       FSQRT,          BLTIN },
        { "srand",      FSRAND,         BLTIN },
        { "sub",        SUB,            SUB },
        { "substr",     SUBSTR,         SUBSTR },
        { "system",     FSYSTEM,        BLTIN },
        { "tolower",    FTOLOWER,       BLTIN },
        { "toupper",    FTOUPPER,       BLTIN },
        { "utf",        FUTF,           BLTIN },
        { "while",      WHILE,          WHILE },
};
92
/*
 * RET(x): return token x from the lexer.
 * In a DEBUG build the token name is printed first when dbg is set.
 * The DEBUG form expands to a braced block, so it must not be used
 * as the body of an `if' that is followed by `else'.
 */
#ifdef  DEBUG
#define RET(x)  { if(dbg)print("lex %s\n", tokname(x)); return(x); }
#else
#define RET(x)  return(x)
#endif
98
/*
 * peek: report the next input character without consuming it.
 * Returns whatever input() returns, so 0 means end of input.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* push it straight back so the following input() sees it again */
	return next;
}
105
/*
 * gettok: read the next raw token into the caller's buffer.
 *
 * A token is one of:
 *   - a name: a letter or '_' followed by letters, digits and '_';
 *   - a number: the prefix accepted by strtod from a run of digits,
 *     '.', 'e', 'E', '+' and '-' (anything strtod rejects is pushed back);
 *   - any other single character.
 *
 * pbuf/psz point at the buffer and its size.  The buffer may be grown
 * (and moved) with adjbuf; for names and numbers the possibly new
 * address and size are stored back through pbuf/psz before returning.
 * The buffer must hold at least two bytes on entry.
 *
 * Returns the first character of the token (buf[0]), or 0 at end of input.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;	/* single-character token */

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character and the terminating NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		/*
		 * Terminate here rather than only on the "else" arm above,
		 * so a name that runs up to end of input is terminated too.
		 */
		*bp = 0;
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character and the terminating NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E' 
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		/*
		 * NOTE(review): if strtod consumes nothing (e.g. a '.' that is
		 * not followed by a digit) rem == buf, the whole text is pushed
		 * back and buf[0] becomes 0, so this function returns 0 -- the
		 * same value it uses for end of input.
		 */
		unputstr(rem);		/* put rest back for later */
		rem[0] = 0;
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
159
/* helpers that yylex() hands off to, defined later in this file */
int     word(char *);
int     string(void);
int     regexpr(void);

/* one-shot flags examined at the top of yylex(); each is cleared when acted on */
int     sc      = 0;    /* 1 => return a } right now (set after a '}' has been reported as ';') */
int     reg     = 0;    /* 1 => return a REGEXPR now (set by startreg()) */
165
166 int yylex(void)
167 {
168         int c;
169         static char *buf = 0;
170         static int bufsize = 500;
171
172         if (buf == 0 && (buf = (char *) malloc(bufsize)) == nil)
173                 FATAL( "out of space in yylex" );
174         if (sc) {
175                 sc = 0;
176                 RET('}');
177         }
178         if (reg) {
179                 reg = 0;
180                 return regexpr();
181         }
182         for (;;) {
183                 c = gettok(&buf, &bufsize);
184                 if (c == 0)
185                         return 0;
186                 if (isalpha(c) || c == '_')
187                         return word(buf);
188                 if (isdigit(c) || c == '.') {
189                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
190                         /* should this also have STR set? */
191                         RET(NUMBER);
192                 }
193         
194                 yylval.i = c;
195                 switch (c) {
196                 case '\n':      /* {EOL} */
197                         RET(NL);
198                 case '\r':      /* assume \n is coming */
199                 case ' ':       /* {WS}+ */
200                 case '\t':
201                         break;
202                 case '#':       /* #.* strip comments */
203                         while ((c = input()) != '\n' && c != 0)
204                                 ;
205                         unput(c);
206                         break;
207                 case ';':
208                         RET(';');
209                 case '\\':
210                         if (peek() == '\n') {
211                                 input();
212                         } else if (peek() == '\r') {
213                                 input(); input();       /* \n */
214                                 lineno++;
215                         } else {
216                                 RET(c);
217                         }
218                         break;
219                 case '&':
220                         if (peek() == '&') {
221                                 input(); RET(AND);
222                         } else 
223                                 RET('&');
224                 case '|':
225                         if (peek() == '|') {
226                                 input(); RET(BOR);
227                         } else
228                                 RET('|');
229                 case '!':
230                         if (peek() == '=') {
231                                 input(); yylval.i = NE; RET(NE);
232                         } else if (peek() == '~') {
233                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
234                         } else
235                                 RET(NOT);
236                 case '~':
237                         yylval.i = MATCH;
238                         RET(MATCHOP);
239                 case '<':
240                         if (peek() == '=') {
241                                 input(); yylval.i = LE; RET(LE);
242                         } else {
243                                 yylval.i = LT; RET(LT);
244                         }
245                 case '=':
246                         if (peek() == '=') {
247                                 input(); yylval.i = EQ; RET(EQ);
248                         } else {
249                                 yylval.i = ASSIGN; RET(ASGNOP);
250                         }
251                 case '>':
252                         if (peek() == '=') {
253                                 input(); yylval.i = GE; RET(GE);
254                         } else if (peek() == '>') {
255                                 input(); yylval.i = APPEND; RET(APPEND);
256                         } else {
257                                 yylval.i = GT; RET(GT);
258                         }
259                 case '+':
260                         if (peek() == '+') {
261                                 input(); yylval.i = INCR; RET(INCR);
262                         } else if (peek() == '=') {
263                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
264                         } else
265                                 RET('+');
266                 case '-':
267                         if (peek() == '-') {
268                                 input(); yylval.i = DECR; RET(DECR);
269                         } else if (peek() == '=') {
270                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
271                         } else
272                                 RET('-');
273                 case '*':
274                         if (peek() == '=') {    /* *= */
275                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
276                         } else if (peek() == '*') {     /* ** or **= */
277                                 input();        /* eat 2nd * */
278                                 if (peek() == '=') {
279                                         input(); yylval.i = POWEQ; RET(ASGNOP);
280                                 } else {
281                                         RET(POWER);
282                                 }
283                         } else
284                                 RET('*');
285                 case '/':
286                         RET('/');
287                 case '%':
288                         if (peek() == '=') {
289                                 input(); yylval.i = MODEQ; RET(ASGNOP);
290                         } else
291                                 RET('%');
292                 case '^':
293                         if (peek() == '=') {
294                                 input(); yylval.i = POWEQ; RET(ASGNOP);
295                         } else
296                                 RET(POWER);
297         
298                 case '$':
299                         /* BUG: awkward, if not wrong */
300                         c = gettok(&buf, &bufsize);
301                         if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
302                                 unputstr(buf);
303                                 RET(INDIRECT);
304                         } else if (isalpha(c)) {
305                                 if (strcmp(buf, "NF") == 0) {   /* very special */
306                                         unputstr("(NF)");
307                                         RET(INDIRECT);
308                                 }
309                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
310                                 RET(IVAR);
311                         } else {
312                                 unputstr(buf);
313                                 RET(INDIRECT);
314                         }
315         
316                 case '}':
317                         if (--bracecnt < 0)
318                                 SYNTAX( "extra }" );
319                         sc = 1;
320                         RET(';');
321                 case ']':
322                         if (--brackcnt < 0)
323                                 SYNTAX( "extra ]" );
324                         RET(']');
325                 case ')':
326                         if (--parencnt < 0)
327                                 SYNTAX( "extra )" );
328                         RET(')');
329                 case '{':
330                         bracecnt++;
331                         RET('{');
332                 case '[':
333                         brackcnt++;
334                         RET('[');
335                 case '(':
336                         parencnt++;
337                         RET('(');
338         
339                 case '"':
340                         return string();        /* BUG: should be like tran.c ? */
341         
342                 default:
343                         RET(c);
344                 }
345         }
346 }
347
348 int string(void)
349 {
350         int c, n;
351         char *s, *bp;
352         static char *buf = 0;
353         static int bufsz = 500;
354
355         if (buf == 0 && (buf = (char *) malloc(bufsz)) == nil)
356                 FATAL("out of space for strings");
357         for (bp = buf; (c = input()) != '"'; ) {
358                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
359                         FATAL("out of space for string %.10s...", buf);
360                 switch (c) {
361                 case '\n':
362                 case '\r':
363                 case 0:
364                         SYNTAX( "non-terminated string %.10s...", buf );
365                         lineno++;
366                         break;
367                 case '\\':
368                         c = input();
369                         switch (c) {
370                         case '"': *bp++ = '"'; break;
371                         case 'n': *bp++ = '\n'; break;  
372                         case 't': *bp++ = '\t'; break;
373                         case 'f': *bp++ = '\f'; break;
374                         case 'r': *bp++ = '\r'; break;
375                         case 'b': *bp++ = '\b'; break;
376                         case 'v': *bp++ = '\v'; break;
377                         case 'a': *bp++ = '\007'; break;
378                         case '\\': *bp++ = '\\'; break;
379
380                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
381                         case '3': case '4': case '5': case '6': case '7':
382                                 n = c - '0';
383                                 if ((c = peek()) >= '0' && c < '8') {
384                                         n = 8 * n + input() - '0';
385                                         if ((c = peek()) >= '0' && c < '8')
386                                                 n = 8 * n + input() - '0';
387                                 }
388                                 *bp++ = n;
389                                 break;
390
391                         case 'x':       /* hex  \x0-9a-fA-F + */
392                             {   char xbuf[100], *px;
393                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
394                                         if (isdigit(c)
395                                          || (c >= 'a' && c <= 'f')
396                                          || (c >= 'A' && c <= 'F'))
397                                                 *px++ = c;
398                                         else
399                                                 break;
400                                 }
401                                 *px = 0;
402                                 unput(c);
403                                 n = strtol(xbuf, nil, 16);
404                                 *bp++ = n;
405                                 break;
406                             }
407
408                         default: 
409                                 *bp++ = c;
410                                 break;
411                         }
412                         break;
413                 default:
414                         *bp++ = c;
415                         break;
416                 }
417         }
418         *bp = 0; 
419         s = tostring(buf);
420         *bp++ = ' '; *bp++ = 0;
421         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
422         RET(STRING);
423 }
424
425
426 int binsearch(char *w, Keyword *kp, int n)
427 {
428         int cond, low, mid, high;
429
430         low = 0;
431         high = n - 1;
432         while (low <= high) {
433                 mid = (low + high) / 2;
434                 if ((cond = strcmp(w, kp[mid].word)) < 0)
435                         high = mid - 1;
436                 else if (cond > 0)
437                         low = mid + 1;
438                 else
439                         return mid;
440         }
441         return -1;
442 }
443
444 int word(char *w) 
445 {
446         Keyword *kp;
447         int c, n;
448
449         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
450         kp = keywords + n;
451         if (n != -1) {  /* found in table */
452                 yylval.i = kp->sub;
453                 switch (kp->type) {     /* special handling */
454                 case FSYSTEM:
455                         if (safe)
456                                 SYNTAX( "system is unsafe" );
457                         RET(kp->type);
458                 case FUNC:
459                         if (infunc)
460                                 SYNTAX( "illegal nested function" );
461                         RET(kp->type);
462                 case RETURN:
463                         if (!infunc)
464                                 SYNTAX( "return not in function" );
465                         RET(kp->type);
466                 case VARNF:
467                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
468                         RET(VARNF);
469                 default:
470                         RET(kp->type);
471                 }
472         }
473         c = peek();     /* look for '(' */
474         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
475                 yylval.i = n;
476                 RET(ARG);
477         } else {
478                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
479                 if (c == '(') {
480                         RET(CALL);
481                 } else {
482                         RET(VAR);
483                 }
484         }
485 }
486
/* Sets the one-shot flag that yylex() checks (and clears) before reading any input. */
void startreg(void)     /* next call to yylex will return a regular expression */
{
        reg = 1;
}
491
492 int regexpr(void)
493 {
494         int c;
495         static char *buf = 0;
496         static int bufsz = 500;
497         char *bp;
498
499         if (buf == 0 && (buf = (char *) malloc(bufsz)) == nil)
500                 FATAL("out of space for rex expr");
501         bp = buf;
502         for ( ; (c = input()) != '/' && c != 0; ) {
503                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
504                         FATAL("out of space for reg expr %.10s...", buf);
505                 if (c == '\n') {
506                         SYNTAX( "newline in regular expression %.10s...", buf ); 
507                         unput('\n');
508                         break;
509                 } else if (c == '\\') {
510                         *bp++ = '\\'; 
511                         *bp++ = input();
512                 } else {
513                         *bp++ = c;
514                 }
515         }
516         *bp = 0;
517         yylval.s = tostring(buf);
518         unput('/');
519         RET(REGEXPR);
520 }
521
/* low-level lexical stuff, sort of inherited from lex */

char    ebuf[300];      /* circular record of the characters most recently returned by input() */
char    *ep = ebuf;     /* next slot in ebuf; advanced by input(), stepped back by unput() */
char    yysbuf[100];    /* pushback buffer */
char    *yysptr = yysbuf;       /* one past the top of the pushback stack; == yysbuf when empty */
Biobuf  *yyin;          /* not referenced by the code in this part of the file */
529
530 int input(void) /* get next lexical input character */
531 {
532         int c;
533         extern char *lexprog;
534
535         if (yysptr > yysbuf)
536                 c = *--yysptr;
537         else if (lexprog != nil) {      /* awk '...' */
538                 if ((c = *lexprog) != 0)
539                         lexprog++;
540         } else                          /* awk -f ... */
541                 c = pgetc();
542         if (c == '\n')
543                 lineno++;
544         else if (c == Beof)
545                 c = 0;
546         if (ep >= ebuf + sizeof ebuf)
547                 ep = ebuf;
548         return *ep++ = c;
549 }
550
551 void unput(int c)       /* put lexical character back on input */
552 {
553         if (c == '\n')
554                 lineno--;
555         if (yysptr >= yysbuf + sizeof(yysbuf))
556                 FATAL("pushed back too much: %.20s...", yysbuf);
557         *yysptr++ = c;
558         if (--ep < ebuf)
559                 ep = ebuf + sizeof(ebuf) - 1;
560 }
561
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that input() hands them
 * back in their original order.  An empty string pushes nothing.
 */
void unputstr(char *s)	/* put a string back on input */
{
	char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}