1 ----------------------------------------------------------------------
4 -- Summary: parser generator. Collection of higher order functors,
5 -- which allow one to build and combine parsers. Relies on a lexer
6 -- that supports the same API as the one exposed in mll.lua.
8 ----------------------------------------------------------------------
10 -- Copyright (c) 2006-2008, Fabien Fleutot <metalua@gmail.com>.
12 -- This software is released under the MIT Licence, see licence.txt
15 ----------------------------------------------------------------------
17 --------------------------------------------------------------------------------
23 -- * [gg.multisequence()]
27 -- * [gg.optkeyword()]
30 -- * [gg.parse_error()]
31 -- * [gg.make_parser()]
34 --------------------------------------------------------------------------------
36 module("gg", package.seeall)
38 -------------------------------------------------------------------------------
39 -- parser metatable, which maps __call to method parse, and adds some
40 -- error tracing boilerplate.
41 -------------------------------------------------------------------------------
-- Shared metatable for all gg-generated parsers: __call forwards to the
-- :parse() method, so a parser table can be invoked like a plain function.
42 local parser_metatable = { }
43 function parser_metatable.__call (parser, lx, ...)
44 --printf ("Call parser %q of type %q", parser.name or "?", parser.kind)
-- Fast path: simply delegate to the parse method.
46 return parser:parse (lx, ...)
47 --local x = parser:parse (lx, ...)
48 --printf ("Result of parser %q: %s",
49 -- parser.name or "?",
50 -- _G.table.tostring(x, "nohash", 80))
-- NOTE(review): the code below sits after an unconditional `return`, so it is
-- unreachable as written; it looks like an alternative, error-decorating call
-- path (pcall + rethrow with position info) kept for debugging. This listing
-- also appears elided -- closing `end`s are not visible here.
53 local li = lx:lineinfo_right() or { "?", "?", "?", "?" }
54 local status, ast = pcall (parser.parse, parser, lx, ...)
55 if status then return ast else
56 -- Try to replace the gg.lua location, in the error msg, with
57 -- the place where the current parser started handling the
59 -- Since the error is rethrown, these places are stacked.
-- Rethrow, appending "(line, column, char) in parser <name>" to the message.
60 error (string.format ("%s\n - (l.%s, c.%s, k.%s) in parser %s",
61 ast :strmatch "gg.lua:%d+: (.*)" or ast,
62 li[1], li[2], li[3], parser.name or parser.kind))
67 -------------------------------------------------------------------------------
68 -- Turn a table into a parser, mainly by setting the metatable.
69 -------------------------------------------------------------------------------
-- Turn table [p] into a parser of the given [kind]:
-- * ensure p.transformers exists, and give it an :add method which appends
--   an AST->AST function to the list;
-- * install the shared parser metatable, making [p] callable.
-- NOTE(review): this listing appears elided -- the closing `end`s and any
-- statement recording [kind] on [p] are not visible here; confirm against
-- the full source.
70 function make_parser(kind, p)
72 if not p.transformers then p.transformers = { } end
73 function p.transformers:add (x)
74 table.insert (self, x)
76 setmetatable (p, parser_metatable)
80 -------------------------------------------------------------------------------
81 -- Return true iff [x] is a parser.
82 -- If it's a gg-generated parser, return the name of its kind.
83 -------------------------------------------------------------------------------
-- A parser is either a plain function, or a table carrying the gg parser
-- metatable; in the latter case the (truthy) kind string is returned,
-- so callers can test e.g. is_parser(x) == 'sequence'.
84 function is_parser (x)
85 return type(x)=="function" or getmetatable(x)==parser_metatable and x.kind
88 -------------------------------------------------------------------------------
89 -- Parse a sequence, without applying builder nor transformers
90 -------------------------------------------------------------------------------
-- For each element [e] of sequence [p]: a string is a mandatory keyword,
-- consumed and checked against the lexer; a parser is run on [lx] and its
-- result collected; anything else is a configuration error.
-- NOTE(review): the loop header and the locals [r], [e], [i] come from lines
-- elided in this listing -- confirm against the full source.
91 local function raw_parse_sequence (lx, p)
95 if type(e) == "string" then
-- Consume the next token and require it to be keyword [e].
96 if not lx:is_keyword (lx:next(), e) then
97 parse_error (lx, "A keyword was expected, probably `%s'.", e) end
98 elseif is_parser (e) then
-- Sub-parser: run it and collect its result.
99 table.insert (r, e (lx))
101 gg.parse_error (lx,"Sequence `%s': element #%i is neither a string "..
103 p.name, i, table.tostring(e))
109 -------------------------------------------------------------------------------
110 -- Parse a multisequence, without applying multisequence transformers.
111 -- The sequences are completely parsed.
112 -------------------------------------------------------------------------------
-- Dispatch on the next keyword (peeked, not consumed): run the matching
-- sequence parser if one is registered, else fall back to [default],
-- else return false.
113 local function raw_parse_multisequence (lx, sequence_table, default)
114 local seq_parser = sequence_table[lx:is_keyword(lx:peek())]
115 if seq_parser then return seq_parser (lx)
116 elseif default then return default (lx)
117 else return false end
120 -------------------------------------------------------------------------------
121 -- Applies all transformers listed in parser on ast.
122 -------------------------------------------------------------------------------
-- Run every transformer of [parser] on [ast] (a transformer returning
-- nil/false keeps the previous ast), then refresh ast.lineinfo with the
-- first/last lineinfo [fli]/[lli] when missing or stale.
-- NOTE(review): the closing `end`s / `return ast` are elided in this listing.
123 local function transform (ast, parser, fli, lli)
124 if parser.transformers then
125 for _, t in ipairs (parser.transformers) do ast = t(ast) or ast end
127 if type(ast) == 'table'then
128 local ali = ast.lineinfo
129 if not ali or ali.first~=fli or ali.last~=lli then
130 ast.lineinfo = { first = fli, last = lli }
136 -------------------------------------------------------------------------------
137 -- Generate a traceable parsing error (not implemented yet)
138 -------------------------------------------------------------------------------
-- Raise a parse error formatted as "line <l>, char <c>: <msg>"; when a char
-- offset and source text are available, quote the offending source line with
-- a caret under the error position.
-- NOTE(review): [src] is not defined in the visible lines -- presumably
-- fetched from the lexer in an elided line; confirm against the full source.
139 function parse_error(lx, fmt, ...)
140 local li = lx:lineinfo_left() or {-1,-1,-1, "<unknown file>"}
141 local msg = string.format("line %i, char %i: "..fmt, li[1], li[2], ...)
143 if li[3]>0 and src then
-- Scan backward/forward from the error offset to the enclosing newlines.
144 local i, j = li[3], li[3]
145 while src:sub(i,i) ~= '\n' and i>=0 do i=i-1 end
146 while src:sub(j,j) ~= '\n' and j<=#src do j=j+1 end
147 local srcline = src:sub (i+1, j-1)
-- Caret marker aligned under the error column.
148 local idx = string.rep (" ", li[2]).."^"
149 msg = string.format("%s\n>>> %s\n>>> %s", msg, srcline, idx)
154 -------------------------------------------------------------------------------
156 -- Sequence parser generator
158 -------------------------------------------------------------------------------
161 -- * [builder]: how to build an AST out of sequence parts. let [x] be the list
162 -- of subparser results (keywords are simply omitted). [builder] can be:
163 -- - [nil], in which case the result of parsing is simply [x]
164 -- - a string, which is then put as a tag on [x]
165 -- - a function, which takes [x] as a parameter and returns an AST.
167 -- * [name]: the name of the parser. Used for debug messages
169 -- * [transformers]: a list of AST->AST functions, applied in order on ASTs
170 -- returned by the parser.
172 -- * Table-part entries correspond to keywords (strings) and subparsers
173 -- (function and callable objects).
175 -- After creation, the following fields are added:
176 -- * [parse] the parsing function lexer->AST
177 -- * [kind] == "sequence"
178 -- * [name] is set, if it wasn't in the input.
180 -------------------------------------------------------------------------------
-------------------------------------------------------------------------------
-- Sequence parser generator: [p]'s array part lists keywords (strings) and
-- subparsers; p:parse() consumes them in order and hands the collected
-- subparser results to the builder.
-------------------------------------------------------------------------------
181 function sequence (p)
182 make_parser ("sequence", p)
184 -------------------------------------------------------------------
-- The parsing method: read the raw sequence, then apply builder and
-- transformers, stamping the result with start/end line info.
186 -------------------------------------------------------------------
187 function p:parse (lx)
189 local fli = lx:lineinfo_right()
190 local seq = raw_parse_sequence (lx, self)
191 local lli = lx:lineinfo_left()
193 -- Builder application: a string becomes the AST tag; a function (or a
-- table with a __call field) maps the result list to the final AST;
-- nil keeps the raw list.
194 local builder, tb = self.builder, type (self.builder)
195 if tb == "string" then seq.tag = builder
196 elseif tb == "function" or builder and builder.__call then seq = builder(seq)
197 elseif builder == nil then -- nothing
198 else error ("Invalid builder of type "..tb.." in sequence") end
199 seq = transform (seq, self, fli, lli)
200 assert (not seq or seq.lineinfo)
204 -------------------------------------------------------------------
-- Derive a debug-friendly name from the sequence's keywords when the
-- caller did not provide one.
206 -------------------------------------------------------------------
207 -- Try to build a proper name
209 -- don't touch existing name
210 elseif type(p[1])=="string" then -- find name based on 1st keyword
211 if #p==1 then p.name=p[1]
212 elseif type(p[#p])=="string" then
-- FIX: was `" ... " . p[#p]` -- a single dot is not a Lua operator;
-- string concatenation requires `..`.
213 p.name = p[1] .. " ... " .. p[#p]
214 else p.name = p[1] .. " ..." end
215 else -- can't find a decent name
216 p.name = "<anonymous>"
223 -------------------------------------------------------------------------------
225 -- Multiple, keyword-driven, sequence parser generator
227 -------------------------------------------------------------------------------
228 -- in [p], useful fields are:
230 -- * [transformers]: as usual
232 -- * [name]: as usual
234 -- * Table-part entries must be sequence parsers, or tables which can
235 -- be turned into a sequence parser by [gg.sequence]. These
236 -- sequences must start with a keyword, and this initial keyword
237 -- must be different for each sequence. The table-part entries will
238 -- be removed after [gg.multisequence] returns.
240 -- * [default]: the parser to run if the next keyword in the lexer is
241 -- none of the registered initial keywords. If there's no default
242 -- parser and no suitable initial keyword, the multisequence parser
243 -- simply returns [false].
245 -- After creation, the following fields are added:
247 -- * [parse] the parsing function lexer->AST
249 -- * [sequences] the table of sequences, indexed by initial keywords.
251 -- * [add] method takes a sequence parser or a config table for
252 -- [gg.sequence], and adds/replaces the corresponding sequence
253 -- parser. If the keyword was already used, the former sequence is
254 -- removed and a warning is issued.
256 -- * [get] method returns a sequence by its initial keyword
258 -- * [kind] == "multisequence"
260 -------------------------------------------------------------------------------
261 function multisequence (p)
262 make_parser ("multisequence", p)
264 -------------------------------------------------------------------
265 -- Add a sequence (might be just a config table for [gg.sequence])
-- NOTE(review): the `function p:add (keyword, s)` header appears elided
-- from this listing; the body below references [keyword] and [s].
266 -------------------------------------------------------------------
268 -- compile if necessary:
270 if not is_parser(s) then sequence(s) end
-- Sequences not keyed by an initial keyword become the (single) default.
271 if is_parser(s) ~= 'sequence' or type(keyword) ~= "string" then
272 if self.default then -- two defaults
273 error ("In a multisequence parser, all but one sequences "..
274 "must start with a keyword")
275 else self.default = s end -- first default
276 elseif self.sequences[keyword] then -- duplicate keyword
-- Overloading an existing keyword replaces its sequence, with a warning.
277 eprintf (" *** Warning: keyword %q overloaded in multisequence ***", keyword)
278 self.sequences[keyword] = s
279 else -- newly caught keyword
280 self.sequences[keyword] = s
281 end -- </multisequence.add>
284 -------------------------------------------------------------------
285 -- Get the sequence starting with this keyword. [kw :: string]
286 -------------------------------------------------------------------
287 function p:get (kw) return self.sequences [kw] end
289 -------------------------------------------------------------------
290 -- Remove the sequence starting with keyword [kw :: string]
-- NOTE(review): the `function p:del (kw)` header and the return of
-- [removed] appear elided from this listing.
291 -------------------------------------------------------------------
293 if not self.sequences[kw] then
294 eprintf("*** Warning: trying to delete sequence starting "..
295 "with %q from a multisequence having no such "..
297 local removed = self.sequences[kw]
298 self.sequences[kw] = nil
302 -------------------------------------------------------------------
-- The parsing method: dispatch on the next keyword, then transform.
304 -------------------------------------------------------------------
305 function p:parse (lx)
306 local fli = lx:lineinfo_right()
307 local x = raw_parse_multisequence (lx, self.sequences, self.default)
308 local lli = lx:lineinfo_left()
309 return transform (x, self, fli, lli)
312 -------------------------------------------------------------------
314 -------------------------------------------------------------------
315 -- Register the sequences passed to the constructor. They're going
316 -- from the array part of the parser to the hash part of field
319 for i=1, #p do p:add (p[i]); p[i] = nil end
321 -- FIXME: why is this commented out?
322 --if p.default and not is_parser(p.default) then sequence(p.default) end
324 end --</multisequence>
327 -------------------------------------------------------------------------------
329 -- Expression parser generator
331 -------------------------------------------------------------------------------
333 -- Expression configuration relies on three tables: [prefix], [infix]
334 -- and [suffix]. Moreover, the primary parser can be replaced by a
335 -- table: in this case the [primary] table will be passed to
336 -- [gg.multisequence] to create a parser.
338 -- Each of these tables is a modified multisequence parser: the
339 -- differences with respect to regular multisequence config tables are:
341 -- * the builder takes specific parameters:
342 -- - for [prefix], it takes the result of the prefix sequence parser,
343 -- and the prefixed expression
344 -- - for [infix], it takes the left-hand-side expression, the results
345 -- of the infix sequence parser, and the right-hand-side expression.
346 -- - for [suffix], it takes the suffixed expression, and theresult
347 -- of the suffix sequence parser.
349 -- * the default field is a list, with parameters:
350 -- - [parser] the raw parsing function
351 -- - [transformers], as usual
352 -- - [prec], the operator's precedence
353 -- - [assoc] for [infix] table, the operator's associativity, which
354 -- can be "left", "right" or "flat" (default to left)
356 -- In [p], useful fields are:
357 -- * [transformers]: as usual
358 -- * [name]: as usual
359 -- * [primary]: the atomic expression parser, or a multisequence config
361 -- * [prefix]: prefix operators config table, see above.
362 -- * [infix]: infix operators config table, see above.
363 -- * [suffix]: suffix operators config table, see above.
365 -- After creation, these fields are added:
366 -- * [kind] == "expr"
367 -- * [parse] as usual
368 -- * each table is turned into a multisequence, and therefore has an
371 -------------------------------------------------------------------------------
-- NOTE(review): the `function expr (p)` header appears elided from this
-- listing; the code below is the body of the expression parser generator,
-- a precedence-climbing parser over [primary]/[prefix]/[infix]/[suffix]
-- multisequence tables.
373 make_parser ("expr", p)
375 -------------------------------------------------------------------
377 -- In addition to the lexer, it takes an optional precedence:
378 -- it won't read expressions whose precedence is lower or equal
380 -------------------------------------------------------------------
381 function p:parse (lx, prec)
384 ------------------------------------------------------
385 -- Extract the right parser and the corresponding
386 -- options table, for (pre|in|suff)fix operators.
387 -- Options include prec, assoc, transformers.
388 ------------------------------------------------------
389 local function get_parser_info (tab)
-- Look up a sequence by the peeked keyword (token not consumed).
390 local p2 = tab:get (lx:is_keyword (lx:peek()))
391 if p2 then -- keyword-based sequence found
392 local function parser(lx) return raw_parse_sequence(lx, p2) end
394 else -- Got to use the default parser
395 local d = tab.default
396 if d then return d.parse or d.parser, d
397 else return false, false end
401 ------------------------------------------------------
402 -- Look for a prefix sequence. Multiple prefixes are
403 -- handled through the recursive [p.parse] call.
404 -- Notice the double-transform: one for the primary
405 -- expr, and one for the one with the prefix op.
406 ------------------------------------------------------
407 local function handle_prefix ()
408 local fli = lx:lineinfo_right()
409 local p2_func, p2 = get_parser_info (self.prefix)
410 local op = p2_func and p2_func (lx)
411 if op then -- Keyword-based sequence found
412 local ili = lx:lineinfo_right() -- Intermediate LineInfo
-- Prefix builder receives (operator result, operand expression).
413 local e = p2.builder (op, self:parse (lx, p2.prec))
414 local lli = lx:lineinfo_left()
415 return transform (transform (e, p2, ili, lli), self, fli, lli)
416 else -- No prefix found, get a primary expression
417 local e = self.primary(lx)
418 local lli = lx:lineinfo_left()
419 return transform (e, self, fli, lli)
421 end --</expr.parse.handle_prefix>
423 ------------------------------------------------------
424 -- Look for an infix sequence+right-hand-side operand.
425 -- Return the whole binary expression result,
426 -- or false if no operator was found.
427 ------------------------------------------------------
428 local function handle_infix (e)
429 local p2_func, p2 = get_parser_info (self.infix)
430 if not p2 then return false end
432 -----------------------------------------
433 -- Handle flattening operators: gather all operands
434 -- of the series in [list]; when a different operator
435 -- is found, stop, build from [list], [transform] and
437 -----------------------------------------
438 if (not p2.prec or p2.prec>prec) and p2.assoc=="flat" then
439 local fli = lx:lineinfo_right()
440 local pflat, list = p2, { e }
-- NOTE(review): the loop header around this `break` is elided in this
-- listing; the loop gathers operands while the same flat operator repeats.
442 local op = p2_func(lx)
443 if not op then break end
444 table.insert (list, self:parse (lx, p2.prec))
445 local _ -- We only care about checking that p2==pflat
446 _, p2 = get_parser_info (self.infix)
-- Flat builder receives the whole operand list at once.
448 local e2 = pflat.builder (list)
449 local lli = lx:lineinfo_left()
450 return transform (transform (e2, pflat, fli, lli), self, fli, lli)
452 -----------------------------------------
453 -- Handle regular infix operators: [e] the LHS is known,
454 -- just gather the operator and [e2] the RHS.
455 -- Result goes in [e3].
456 -----------------------------------------
457 elseif p2.prec and p2.prec>prec or
458 p2.prec==prec and p2.assoc=="right" then
459 local fli = e.lineinfo.first -- lx:lineinfo_right()
460 local op = p2_func(lx)
461 if not op then return false end
462 local e2 = self:parse (lx, p2.prec)
463 local e3 = p2.builder (e, op, e2)
464 local lli = lx:lineinfo_left()
465 return transform (transform (e3, p2, fli, lli), self, fli, lli)
467 -----------------------------------------
468 -- Check for non-associative operators, and complain if applicable.
469 -----------------------------------------
470 elseif p2.assoc=="none" and p2.prec==prec then
471 parse_error (lx, "non-associative operator!")
473 -----------------------------------------
474 -- No infix operator suitable at that precedence
475 -----------------------------------------
476 else return false end
478 end --</expr.parse.handle_infix>
480 ------------------------------------------------------
481 -- Look for a suffix sequence.
482 -- Return the result of suffix operator on [e],
483 -- or false if no operator was found.
484 ------------------------------------------------------
485 local function handle_suffix (e)
486 -- FIXME bad fli, must take e.lineinfo.first
487 local p2_func, p2 = get_parser_info (self.suffix)
488 if not p2 then return false end
489 if not p2.prec or p2.prec>=prec then
490 --local fli = lx:lineinfo_right()
491 local fli = e.lineinfo.first
492 local op = p2_func(lx)
493 if not op then return false end
494 local lli = lx:lineinfo_left()
-- Suffix builder receives (suffixed expression, operator result).
495 e = p2.builder (e, op)
496 e = transform (transform (e, p2, fli, lli), self, fli, lli)
500 end --</expr.parse.handle_suffix>
502 ------------------------------------------------------
503 -- Parser body: read suffix and (infix+operand)
504 -- extensions as long as we're able to fetch more at
505 -- this precedence level.
506 ------------------------------------------------------
507 local e = handle_prefix()
509 local x = handle_suffix (e); e = x or e
510 local y = handle_infix (e); e = y or e
513 -- No transform: it already happened in operators handling
517 -------------------------------------------------------------------
519 -------------------------------------------------------------------
-- Constructor tail: accept the primary parser as p[1], then compile the
-- four operator tables into multisequences; :add extends the primary.
520 if not p.primary then p.primary=p[1]; p[1]=nil end
521 for _, t in ipairs{ "primary", "prefix", "infix", "suffix" } do
522 if not p[t] then p[t] = { } end
523 if not is_parser(p[t]) then multisequence(p[t]) end
525 function p:add(...) return self.primary:add(...) end
530 -------------------------------------------------------------------------------
532 -- List parser generator
534 -------------------------------------------------------------------------------
535 -- In [p], the following fields can be provided in input:
537 -- * [builder]: takes list of subparser results, returns AST
538 -- * [transformers]: as usual
539 -- * [name]: as usual
541 -- * [terminators]: list of strings representing the keywords which
542 -- might mark the end of the list. When non-empty, the list is
543 -- allowed to be empty. A string is treated as a single-element
544 -- table, whose element is that string, e.g. ["do"] is the same as
547 -- * [separators]: list of strings representing the keywords which can
548 -- separate elements of the list. When non-empty, one of these
549 -- keyword has to be found between each element. Lack of a separator
550 -- indicates the end of the list. A string is treated as a
551 -- single-element table, whose element is that string, e.g. ["do"]
552 -- is the same as [{"do"}]. If [terminators] is empty/nil, then
553 -- [separators] has to be non-empty.
555 -- After creation, the following fields are added:
556 -- * [parse] the parsing function lexer->AST
557 -- * [kind] == "list"
559 -------------------------------------------------------------------------------
-- NOTE(review): the `function list (p)` header appears elided from this
-- listing; the code below is the body of the list parser generator.
561 make_parser ("list", p)
563 -------------------------------------------------------------------
565 -------------------------------------------------------------------
566 function p:parse (lx)
568 ------------------------------------------------------
569 -- Used to quickly check whether there's a terminator
570 -- or a separator immediately ahead
571 ------------------------------------------------------
572 local function peek_is_in (keywords)
573 return keywords and lx:is_keyword(lx:peek(), unpack(keywords)) end
-- NOTE(review): the declaration of the accumulator [x] is elided here.
576 local fli = lx:lineinfo_right()
578 -- if there's a terminator to start with, don't bother trying
579 if not peek_is_in (self.terminators) then
580 repeat table.insert (x, self.primary (lx)) -- read one element
582 -- First reason to stop: There's a separator list specified,
583 -- and next token isn't one. Otherwise, consume it with [lx:next()]
584 self.separators and not(peek_is_in (self.separators) and lx:next()) or
585 -- Other reason to stop: terminator token ahead
586 peek_is_in (self.terminators) or
587 -- Last reason: end of file reached
591 local lli = lx:lineinfo_left()
593 -- Apply the builder. It can be a string, or a callable value,
594 -- or simply nothing.
595 local b = self.builder
597 if type(b)=="string" then x.tag = b -- b is a string, use it as a tag
598 elseif type(b)=="function" then x=b(x)
-- Callable table: invoke it when its metatable defines __call.
600 local bmt = getmetatable(b)
601 if bmt and bmt.__call then x=b(x) end
604 return transform (x, self, fli, lli)
607 -------------------------------------------------------------------
609 -------------------------------------------------------------------
-- Constructor tail: normalize config -- p[1] becomes [primary], and
-- terminators/separators accept a bare string or nil out when empty.
610 if not p.primary then p.primary = p[1]; p[1] = nil end
611 if type(p.terminators) == "string" then p.terminators = { p.terminators }
612 elseif p.terminators and #p.terminators == 0 then p.terminators = nil end
613 if type(p.separators) == "string" then p.separators = { p.separators }
614 elseif p.separators and #p.separators == 0 then p.separators = nil end
620 -------------------------------------------------------------------------------
622 -- Keyword-conditioned parser generator
624 -------------------------------------------------------------------------------
626 -- Only apply a parser if a given keyword is found. The result of
627 -- [gg.onkeyword] parser is the result of the subparser (modulo
628 -- [transformers] applications).
630 -- lineinfo: the keyword is *not* included in the boundaries of the
631 -- resulting lineinfo. A review of all usages of gg.onkeyword() in the
632 -- implementation of metalua has shown that it was the appropriate choice
637 -- * [name]: as usual
639 -- * [transformers]: as usual
641 -- * [peek]: if non-nil, the conditionning keyword is left in the lexeme
642 -- stream instead of being consumed.
644 -- * [primary]: the subparser.
646 -- * [keywords]: list of strings representing triggering keywords.
648 -- * Table-part entries can contain strings, and/or exactly one parser.
649 -- Strings are put in [keywords], and the parser is put in [primary].
651 -- After the call, the following fields will be set:
653 -- * [parse] the parsing method
654 -- * [kind] == "onkeyword"
658 -------------------------------------------------------------------------------
659 function onkeyword (p)
660 make_parser ("onkeyword", p)
662 -------------------------------------------------------------------
-- The parsing method: run [primary] only when one of [keywords] is ahead.
-- NOTE(review): the `function p:parse (lx)` header appears elided here.
664 -------------------------------------------------------------------
666 if lx:is_keyword (lx:peek(), unpack(self.keywords)) then
667 --local fli = lx:lineinfo_right()
-- Consume the triggering keyword unless [peek] asks to leave it in place.
668 if not self.peek then lx:next() end
669 local content = self.primary (lx)
670 --local lli = lx:lineinfo_left()
-- Lineinfo is taken from the content, so the keyword is excluded from it.
671 local fli, lli = content.lineinfo.first, content.lineinfo.last
672 return transform (content, p, fli, lli)
673 else return false end
676 -------------------------------------------------------------------
678 -------------------------------------------------------------------
-- Constructor tail: sort the array part into [keywords] (strings) and the
-- single [primary] subparser, then sanity-check both.
679 if not p.keywords then p.keywords = { } end
680 for _, x in ipairs(p) do
681 if type(x)=="string" then table.insert (p.keywords, x)
682 else assert (not p.primary and is_parser (x)); p.primary = x end
684 if not next (p.keywords) then
685 eprintf("Warning, no keyword to trigger gg.onkeyword") end
686 assert (p.primary, 'no primary parser in gg.onkeyword')
691 -------------------------------------------------------------------------------
693 -- Optional keyword consumer pseudo-parser generator
695 -------------------------------------------------------------------------------
697 -- This doesn't return a real parser, just a function. That function parses
698 -- one of the keywords passed as parameters, and returns it. It returns
699 -- [false] if no matching keyword is found.
701 -- Notice that tokens returned by lexer already carry lineinfo, therefore
702 -- there's no need to add them, as done usually through transform() calls.
703 -------------------------------------------------------------------------------
-- Returns a function (not a full gg parser) which consumes and returns one
-- of the given keywords, or returns false without consuming anything.
-- NOTE(review): this listing appears elided -- the `local args = {...}`
-- capture and the `return function (lx)` closure header are not visible.
704 function optkeyword (...)
-- Accept either optkeyword("a","b") or optkeyword{"a","b"}.
706 if type (args[1]) == "table" then
710 for _, v in ipairs(args) do assert (type(v)=="string") end
712 local x = lx:is_keyword (lx:peek(), unpack (args))
713 if x then lx:next(); return x
714 else return false end
719 -------------------------------------------------------------------------------
721 -- Run a parser with a special lexer
723 -------------------------------------------------------------------------------
725 -- This doesn't return a real parser, just a function.
726 -- First argument is the lexer class to be used with the parser,
727 -- 2nd is the parser itself.
728 -- The resulting parser returns whatever the argument parser does.
730 -------------------------------------------------------------------------------
731 function with_lexer(new_lexer, parser)
733 -------------------------------------------------------------------
734 -- Most gg functions take their parameters in a table, so it's
735 -- better to silently accept when with_lexer{ } is called with
736 -- its arguments in a list:
737 -------------------------------------------------------------------
738 if not parser and #new_lexer==2 and type(new_lexer[1])=='table' then
739 return with_lexer(unpack(new_lexer))
742 -------------------------------------------------------------------
743 -- Save the current lexer, switch it for the new one, run the parser,
744 -- restore the previous lexer, even if the parser caused an error.
745 -------------------------------------------------------------------
-- NOTE(review): [lx] is not bound in the visible lines -- presumably this
-- body sits inside an elided `return function (lx)` closure; confirm.
747 local old_lexer = getmetatable(lx)
749 setmetatable(lx, new_lexer)
-- pcall guarantees the old lexer is restored before any error propagates.
750 local status, result = pcall(parser, lx)
752 setmetatable(lx, old_lexer)
753 if status then return result else error(result) end