]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/awk/lex.c
merge
[plan9front.git] / sys / src / cmd / awk / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <u.h>
26 #include <libc.h>
27 #include <ctype.h>
28 #include <bio.h>
29 #include "awk.h"
30 #include "y.tab.h"
31
32 extern YYSTYPE  yylval;
33 extern int      infunc;
34
35 int     lineno  = 1;
36 int     bracecnt = 0;
37 int     brackcnt  = 0;
38 int     parencnt = 0;
39
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword Keyword;
struct Keyword {
	char	*word;	/* spelling of the reserved word */
	int	sub;	/* value word() stores in yylval.i on a match */
	int	type;	/* token code word() returns on a match */
};
45
46 Keyword keywords[] ={   /* keep sorted: binary searched */
47         { "BEGIN",      XBEGIN,         XBEGIN },
48         { "END",        XEND,           XEND },
49         { "NF",         VARNF,          VARNF },
50         { "atan2",      FATAN,          BLTIN },
51         { "break",      BREAK,          BREAK },
52         { "close",      CLOSE,          CLOSE },
53         { "continue",   CONTINUE,       CONTINUE },
54         { "cos",        FCOS,           BLTIN },
55         { "delete",     DELETE,         DELETE },
56         { "do",         DO,             DO },
57         { "else",       ELSE,           ELSE },
58         { "exit",       EXIT,           EXIT },
59         { "exp",        FEXP,           BLTIN },
60         { "fflush",     FFLUSH,         BLTIN },
61         { "for",        FOR,            FOR },
62         { "func",       FUNC,           FUNC },
63         { "function",   FUNC,           FUNC },
64         { "getline",    GETLINE,        GETLINE },
65         { "gsub",       GSUB,           GSUB },
66         { "if",         IF,             IF },
67         { "in",         IN,             IN },
68         { "index",      INDEX,          INDEX },
69         { "int",        FINT,           BLTIN },
70         { "length",     FLENGTH,        BLTIN },
71         { "log",        FLOG,           BLTIN },
72         { "match",      MATCHFCN,       MATCHFCN },
73         { "next",       NEXT,           NEXT },
74         { "nextfile",   NEXTFILE,       NEXTFILE },
75         { "print",      PRINT,          PRINT },
76         { "printf",     PRINTF,         PRINTF },
77         { "rand",       FRAND,          BLTIN },
78         { "return",     RETURN,         RETURN },
79         { "sin",        FSIN,           BLTIN },
80         { "split",      SPLIT,          SPLIT },
81         { "sprintf",    SPRINTF,        SPRINTF },
82         { "sqrt",       FSQRT,          BLTIN },
83         { "srand",      FSRAND,         BLTIN },
84         { "sub",        SUB,            SUB },
85         { "substr",     SUBSTR,         SUBSTR },
86         { "system",     FSYSTEM,        BLTIN },
87         { "tolower",    FTOLOWER,       BLTIN },
88         { "toupper",    FTOUPPER,       BLTIN },
89         { "utf",        FUTF,           BLTIN },
90         { "while",      WHILE,          WHILE },
91 };
92
/*
 * RET(x): return token x from the current function.  In a DEBUG build
 * the token's name is printed first when dbg is set.
 * The DEBUG form is wrapped in do { } while (0) so that RET(x); is a
 * single statement in both builds and stays safe under an unbraced
 * if ... else.  Note that the DEBUG form evaluates x twice.
 */
#ifdef	DEBUG
#define	RET(x)	do { if(dbg)print("lex %s\n", tokname(x)); return(x); } while (0)
#else
#define	RET(x)	return(x)
#endif
98
/*
 * peek: return the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(), so lineno is left as it was.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
105
/*
 * gettok: read the next raw token into the buffer *pbuf (size *psz).
 *
 * The buffer is grown with adjbuf() as needed; *pbuf and *psz are
 * updated whenever a name or number has been collected.
 *
 * Returns 0 at end of input (the buffer is left untouched), otherwise
 * the first character of the token:
 *   - a character that is not alphanumeric, '.' or '_' is returned
 *     alone, with the buffer holding just that character;
 *   - a letter or '_' starts a name, collected in full;
 *   - a digit or '.' starts a number: everything that could belong to
 *     one is collected, strtod() decides how much really does, and the
 *     remainder is pushed back.  A '.' that starts no number is
 *     returned alone, with the buffer holding just ".".
 * The buffer is always NUL-terminated when a non-zero value is returned.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character and the terminator */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;	/* also needed when the name runs up to end of input */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character and the terminator */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {
			/*
			 * Nothing parsed: a '.' with no digit after it.
			 * Hand back the single character; emptying the
			 * buffer here would make the return value 0,
			 * which callers take for end of input.
			 */
			unputstr(buf+1);	/* put rest back for later */
			buf[1] = 0;
		} else {
			unputstr(rem);		/* put rest back for later */
			rem[0] = 0;		/* keep only the numeric prefix */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
159
/* Helpers used by yylex() and defined further down. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot requests examined at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
165
166 int yylex(void)
167 {
168         int c;
169         static char *buf = 0;
170         static int bufsize = 500;
171
172         if (buf == 0 && (buf = (char *) malloc(bufsize)) == nil)
173                 FATAL( "out of space in yylex" );
174         if (sc) {
175                 sc = 0;
176                 RET('}');
177         }
178         if (reg) {
179                 reg = 0;
180                 return regexpr();
181         }
182         for (;;) {
183                 c = gettok(&buf, &bufsize);
184                 if (c == 0)
185                         return 0;
186                 if (isalpha(c) || c == '_')
187                         return word(buf);
188                 if (isdigit(c) || c == '.') {
189                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
190                         /* should this also have STR set? */
191                         RET(NUMBER);
192                 }
193         
194                 yylval.i = c;
195                 switch (c) {
196                 case '\n':      /* {EOL} */
197                         RET(NL);
198                 case '\r':      /* assume \n is coming */
199                 case ' ':       /* {WS}+ */
200                 case '\t':
201                         break;
202                 case '#':       /* #.* strip comments */
203                         while ((c = input()) != '\n' && c != 0)
204                                 ;
205                         unput(c);
206                         break;
207                 case ';':
208                         RET(';');
209                 case '\\':
210                         if (peek() == '\n') {
211                                 input();
212                         } else if (peek() == '\r') {
213                                 input(); input();       /* \n */
214                                 lineno++;
215                         } else {
216                                 RET(c);
217                         }
218                         break;
219                 case '&':
220                         if (peek() == '&') {
221                                 input(); RET(AND);
222                         } else 
223                                 RET('&');
224                 case '|':
225                         if (peek() == '|') {
226                                 input(); RET(BOR);
227                         } else
228                                 RET('|');
229                 case '!':
230                         if (peek() == '=') {
231                                 input(); yylval.i = NE; RET(NE);
232                         } else if (peek() == '~') {
233                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
234                         } else
235                                 RET(NOT);
236                 case '~':
237                         yylval.i = MATCH;
238                         RET(MATCHOP);
239                 case '<':
240                         if (peek() == '=') {
241                                 input(); yylval.i = LE; RET(LE);
242                         } else {
243                                 yylval.i = LT; RET(LT);
244                         }
245                 case '=':
246                         if (peek() == '=') {
247                                 input(); yylval.i = EQ; RET(EQ);
248                         } else {
249                                 yylval.i = ASSIGN; RET(ASGNOP);
250                         }
251                 case '>':
252                         if (peek() == '=') {
253                                 input(); yylval.i = GE; RET(GE);
254                         } else if (peek() == '>') {
255                                 input(); yylval.i = APPEND; RET(APPEND);
256                         } else {
257                                 yylval.i = GT; RET(GT);
258                         }
259                 case '+':
260                         if (peek() == '+') {
261                                 input(); yylval.i = INCR; RET(INCR);
262                         } else if (peek() == '=') {
263                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
264                         } else
265                                 RET('+');
266                 case '-':
267                         if (peek() == '-') {
268                                 input(); yylval.i = DECR; RET(DECR);
269                         } else if (peek() == '=') {
270                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
271                         } else
272                                 RET('-');
273                 case '*':
274                         if (peek() == '=') {    /* *= */
275                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
276                         } else if (peek() == '*') {     /* ** or **= */
277                                 input();        /* eat 2nd * */
278                                 if (peek() == '=') {
279                                         input(); yylval.i = POWEQ; RET(ASGNOP);
280                                 } else {
281                                         RET(POWER);
282                                 }
283                         } else
284                                 RET('*');
285                 case '/':
286                         RET('/');
287                 case '%':
288                         if (peek() == '=') {
289                                 input(); yylval.i = MODEQ; RET(ASGNOP);
290                         } else
291                                 RET('%');
292                 case '^':
293                         if (peek() == '=') {
294                                 input(); yylval.i = POWEQ; RET(ASGNOP);
295                         } else
296                                 RET(POWER);
297         
298                 case '$':
299                         /* BUG: awkward, if not wrong */
300                         c = gettok(&buf, &bufsize);
301                         if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
302                                 unputstr(buf);
303                                 RET(INDIRECT);
304                         } else if (isalpha(c)) {
305                                 if (strcmp(buf, "NF") == 0) {   /* very special */
306                                         unputstr("(NF)");
307                                         RET(INDIRECT);
308                                 }
309                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
310                                 RET(IVAR);
311                         } else {
312                                 unputstr(buf);
313                                 RET(INDIRECT);
314                         }
315         
316                 case '}':
317                         if (--bracecnt < 0)
318                                 SYNTAX( "extra }" );
319                         sc = 1;
320                         RET(';');
321                 case ']':
322                         if (--brackcnt < 0)
323                                 SYNTAX( "extra ]" );
324                         RET(']');
325                 case ')':
326                         if (--parencnt < 0)
327                                 SYNTAX( "extra )" );
328                         RET(')');
329                 case '{':
330                         bracecnt++;
331                         RET('{');
332                 case '[':
333                         brackcnt++;
334                         RET('[');
335                 case '(':
336                         parencnt++;
337                         RET('(');
338         
339                 case '"':
340                         return string();        /* BUG: should be like tran.c ? */
341         
342                 default:
343                         RET(c);
344                 }
345         }
346 }
347
348 int string(void)
349 {
350         int c, n;
351         char *s, *bp;
352         static char *buf = 0;
353         static int bufsz = 500;
354
355         if (buf == 0 && (buf = (char *) malloc(bufsz)) == nil)
356                 FATAL("out of space for strings");
357         for (bp = buf; (c = input()) != '"'; ) {
358                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)){
359                         *bp = 0;
360                         FATAL("out of space for string %.10s...", buf);
361                 }
362                 switch (c) {
363                 case '\n':
364                 case '\r':
365                 case 0:         
366                         *bp = 0;
367                         SYNTAX( "non-terminated string %.10s...", buf );
368                         lineno++;
369                         RET(0);
370                 case '\\':
371                         c = input();
372                         switch (c) {
373                         case '"': *bp++ = '"'; break;
374                         case 'n': *bp++ = '\n'; break;  
375                         case 't': *bp++ = '\t'; break;
376                         case 'f': *bp++ = '\f'; break;
377                         case 'r': *bp++ = '\r'; break;
378                         case 'b': *bp++ = '\b'; break;
379                         case 'v': *bp++ = '\v'; break;
380                         case 'a': *bp++ = '\007'; break;
381                         case '\\': *bp++ = '\\'; break;
382
383                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
384                         case '3': case '4': case '5': case '6': case '7':
385                                 n = c - '0';
386                                 if ((c = peek()) >= '0' && c < '8') {
387                                         n = 8 * n + input() - '0';
388                                         if ((c = peek()) >= '0' && c < '8')
389                                                 n = 8 * n + input() - '0';
390                                 }
391                                 *bp++ = n;
392                                 break;
393
394                         case 'x':       /* hex  \x0-9a-fA-F + */
395                             {   char xbuf[100], *px;
396                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
397                                         if (isdigit(c)
398                                          || (c >= 'a' && c <= 'f')
399                                          || (c >= 'A' && c <= 'F'))
400                                                 *px++ = c;
401                                         else
402                                                 break;
403                                 }
404                                 *px = 0;
405                                 unput(c);
406                                 n = strtol(xbuf, nil, 16);
407                                 *bp++ = n;
408                                 break;
409                             }
410
411                         default: 
412                                 *bp++ = c;
413                                 break;
414                         }
415                         break;
416                 default:
417                         *bp++ = c;
418                         break;
419                 }
420         }
421         *bp = 0; 
422         s = tostring(buf);
423         *bp++ = ' '; *bp++ = 0;
424         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
425         RET(STRING);
426 }
427
428
429 int binsearch(char *w, Keyword *kp, int n)
430 {
431         int cond, low, mid, high;
432
433         low = 0;
434         high = n - 1;
435         while (low <= high) {
436                 mid = (low + high) / 2;
437                 if ((cond = strcmp(w, kp[mid].word)) < 0)
438                         high = mid - 1;
439                 else if (cond > 0)
440                         low = mid + 1;
441                 else
442                         return mid;
443         }
444         return -1;
445 }
446
447 int word(char *w) 
448 {
449         Keyword *kp;
450         int c, n;
451
452         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
453         kp = keywords + n;
454         if (n != -1) {  /* found in table */
455                 yylval.i = kp->sub;
456                 switch (kp->type) {     /* special handling */
457                 case FSYSTEM:
458                         if (safe)
459                                 SYNTAX( "system is unsafe" );
460                         RET(kp->type);
461                 case FUNC:
462                         if (infunc)
463                                 SYNTAX( "illegal nested function" );
464                         RET(kp->type);
465                 case RETURN:
466                         if (!infunc)
467                                 SYNTAX( "return not in function" );
468                         RET(kp->type);
469                 case VARNF:
470                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
471                         RET(VARNF);
472                 default:
473                         RET(kp->type);
474                 }
475         }
476         c = peek();     /* look for '(' */
477         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
478                 yylval.i = n;
479                 RET(ARG);
480         } else {
481                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
482                 if (c == '(') {
483                         RET(CALL);
484                 } else {
485                         RET(VAR);
486                 }
487         }
488 }
489
490 void startreg(void)     /* next call to yyles will return a regular expression */
491 {
492         reg = 1;
493 }
494
495 int regexpr(void)
496 {
497         int c;
498         static char *buf = 0;
499         static int bufsz = 500;
500         char *bp;
501
502         if (buf == 0 && (buf = (char *) malloc(bufsz)) == nil)
503                 FATAL("out of space for rex expr");
504         bp = buf;
505         for ( ; (c = input()) != '/' && c != 0; ) {
506                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
507                         FATAL("out of space for reg expr %.10s...", buf);
508                 if (c == '\n') {
509                         SYNTAX( "newline in regular expression %.10s...", buf ); 
510                         unput('\n');
511                         break;
512                 } else if (c == '\\') {
513                         *bp++ = '\\'; 
514                         *bp++ = input();
515                 } else {
516                         *bp++ = c;
517                 }
518         }
519         *bp = 0;
520         yylval.s = tostring(buf);
521         unput('/');
522         RET(REGEXPR);
523 }
524
525 /* low-level lexical stuff, sort of inherited from lex */
526
527 char    ebuf[300];
528 char    *ep = ebuf;
529 char    yysbuf[100];    /* pushback buffer */
530 char    *yysptr = yysbuf;
531 Biobuf  *yyin;
532
533 int input(void) /* get next lexical input character */
534 {
535         int c;
536         extern char *lexprog;
537
538         if (yysptr > yysbuf)
539                 c = *--yysptr;
540         else if (lexprog != nil) {      /* awk '...' */
541                 if ((c = *lexprog) != 0)
542                         lexprog++;
543         } else                          /* awk -f ... */
544                 c = pgetc();
545         if (c == '\n')
546                 lineno++;
547         else if (c == Beof)
548                 c = 0;
549         if (ep >= ebuf + sizeof ebuf)
550                 ep = ebuf;
551         return *ep++ = c;
552 }
553
554 void unput(int c)       /* put lexical character back on input */
555 {
556         if (c == '\n')
557                 lineno--;
558         if (yysptr >= yysbuf + sizeof(yysbuf))
559                 FATAL("pushed back too much: %.20s...", yysbuf);
560         *yysptr++ = c;
561         if (--ep < ebuf)
562                 ep = ebuf + sizeof(ebuf) - 1;
563 }
564
/*
 * unputstr: put a string back on input.
 * The characters are pushed last-to-first so that input() returns
 * them again in their original order.
 */
void unputstr(char *s)
{
	char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
571 }