----------------------------------------------------------------------
-- Metalua: $Id: mll.lua,v 1.3 2006/11/15 09:07:50 fab13n Exp $
--
-- Summary: generic Lua-style lexer definition. You need this plus
-- some keyword additions to create the complete Lua lexer,
-- as is done in mlp_lexer.lua.
--
-- TODO:
--
-- * Make it possible to change lexer on the fly. This implies the
--   ability to easily undo any pre-extracted tokens;
--
-- * Make it easy to define new flavors of strings. Replacing the
--   lexer.patterns.long_string regexp by an extensible list, with
--   customizable token tag, would probably be enough. Maybe add:
--   + an index of capture for the regexp, that would specify
--     which capture holds the content of the string-like token
--   + or a string->string transformer function.
--
-- * There are some _G.table to prevent a namespace clash which has
--   now disappeared. Remove them.
----------------------------------------------------------------------
--
-- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
--
-- This software is released under the MIT Licence, see licence.txt
-- for details.
--
----------------------------------------------------------------------

module ("lexer", package.seeall)

require 'metalua.runtime'

lexer = { alpha={ }, sym={ } }
lexer.__index = lexer

local debugf = function() end
--local debugf = printf

----------------------------------------------------------------------
-- Patterns used by [lexer:extract] to decompose the raw string into
-- correctly tagged tokens.
----------------------------------------------------------------------
lexer.patterns = {
   spaces              = "^[ \r\n\t]*()",
   short_comment       = "^%-%-([^\n]*)()\n",
   final_short_comment = "^%-%-([^\n]*)()$",
   long_comment        = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
   long_string         = "^%[(=*)%[\n?(.-)%]%1%]()",
   number_mantissa     = { "^%d+%.?%d*()", "^%d*%.%d+()" },
   number_exponant     = "^[eE][%+%-]?%d+()",
   number_hex          = "^0[xX]%x+()",
   word                = "^([%a_][%w_]*)()"
}
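
-- A minimal illustration (an added note, not part of the original file):
-- each empty capture "()" in the patterns above captures the position
-- just after the matched prefix, which is what the extractors assign
-- back to [self.i] to advance through the source.
--[=[
local src = "   local x"
local i = src:match (lexer.patterns.spaces, 1)      -- i == 4, first non-space
local word, j = src:match (lexer.patterns.word, i)  -- word == "local", j == 9
]=]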

----------------------------------------------------------------------
-- unescape a whole string, applying [unesc_digits] and
-- [unesc_letter] as many times as required.
----------------------------------------------------------------------
local function unescape_string (s)

   -- Turn the digits of an escape sequence into the corresponding
   -- character, e.g. [unesc_digits("123") == string.char(123)].
   local function unesc_digits (x)
      local k, j, i = x:reverse():byte(1, 3)
      local z = _G.string.byte "0"
      return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
   end

   -- Take a letter [x], and return the character represented by the
   -- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
   local function unesc_letter(x)
      local t = {
         a = "\a", b = "\b", f = "\f",
         n = "\n", r = "\r", t = "\t", v = "\v",
         ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
      return t[x] or error([[Unknown escape sequence '\]]..x..[[']])
   end

   return s
      :gsub ("\\(%D)", unesc_letter)
      :gsub ("\\([0-9]+)", unesc_digits)
end
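
-- Behaviour sketch (an added note; [unescape_string] is local to this
-- module, so this is illustrative only): escape sequences are rewritten
-- into the characters they denote.
--[=[
assert (unescape_string [[a\ne\66c]] == "a\ne" .. string.char(66) .. "c")  -- "\66" is "B"
]=]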
91 "skip_whitespaces_and_comments",
92 "extract_short_string", "extract_word", "extract_number",
93 "extract_long_string", "extract_symbol" }

lexer.token_metatable = {
--      __tostring = function(a)
--         return string.format ("`%s{'%s'}", a.tag, a[1])
--      end
}

----------------------------------------------------------------------
-- Really extract next token from the raw string
-- (and update the index).
-- loc: offset of the position just after spaces and comments
-- previous_i: offset in src before extraction began
----------------------------------------------------------------------
function lexer:extract ()
   local previous_i = self.i
   local loc = self.i

   -- Put line info, comments and metatable around the tag and content
   -- provided by extractors, thus returning a complete lexer token.
   local function build_token (tag, content)
      assert (tag and content)
      local i, first_line, first_column_offset =
         previous_i, self.line, self.column_offset
      -- update self.line and first_line. i := indexes of '\n' chars
      while true do
         i = self.src :find ("\n", i+1, true)
         if not i then break end
         if loc and i <= loc then
            first_column_offset = i
            first_line = first_line+1
         end
         if i <= self.i then
            self.line = self.line+1
            self.column_offset = i
         else break end
      end
      local a = { --char = loc, line = self.line,
         tag      = tag,
         lineinfo = {
            name  = self.src_name,
            first = { first_line, loc - first_column_offset, loc },
            last  = { self.line, self.i - self.column_offset, self.i } },
         content }
      if #self.attached_comments > 0 then
         a.lineinfo.comments = self.attached_comments
         self.attached_comments = nil
      end
      self.attached_comments = { }
      return setmetatable (a, self.token_metatable)
   end --</function build_token>

   for ext_idx, extractor in ipairs(self.extractors) do
      -- printf("method = %s", method)
      local tag, content = self [extractor] (self)
      -- [loc] is placed just after the leading whitespaces and comments;
      -- for this to work, the whitespace extractor *must be* at index 1.
      if ext_idx==1 then loc = self.i end
      if tag then
         --printf("`%s{ %q }\t%i", tag, content, loc);
         return build_token (tag, content)
      end
   end
   error "None of the lexer extractors returned anything!"
end
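
-- Shape of a token, as built by [build_token] above (an added,
-- illustrative note): extracting "foo" from the very beginning of a
-- stream named "sample" yields roughly
--[=[
{ tag = "Id", "foo",
  lineinfo = { name  = "sample",
               first = { 1, 1, 1 },    -- line, column offset, char offset
               last  = { 1, 4, 4 } } } -- position just past the token
]=]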

----------------------------------------------------------------------
-- skip whites and comments
-- FIXME: doesn't take into account:
--        - unterminated long comments
--        - short comments at last line without a final \n
----------------------------------------------------------------------
function lexer:skip_whitespaces_and_comments()
   local table_insert = _G.table.insert
   repeat -- loop as long as a space or comment chunk is found
      local _, j
      local again = false
      local last_comment_content = nil
      -- skip spaces
      self.i = self.src:match (self.patterns.spaces, self.i)
      -- skip a long comment if any
      _, last_comment_content, j =
         self.src :match (self.patterns.long_comment, self.i)
      if j then
         table_insert(self.attached_comments,
                      {last_comment_content, self.i, j, "long"})
         self.i = j; again = true
      end
      -- skip a short comment if any
      last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
      if j then
         table_insert(self.attached_comments,
                      {last_comment_content, self.i, j, "short"})
         self.i = j; again = true
      end
      if self.i > #self.src then return "Eof", "eof" end
   until not again

   if self.src:match (self.patterns.final_short_comment, self.i) then
      return "Eof", "eof" end
   --assert (not self.src:match(self.patterns.short_comment, self.i))
   --assert (not self.src:match(self.patterns.long_comment, self.i))
   -- --assert (not self.src:match(self.patterns.spaces, self.i))
end

----------------------------------------------------------------------
-- extract a '...' or "..." short string
----------------------------------------------------------------------
function lexer:extract_short_string()
   -- [k] is the first unread char, [self.i] points to [k] in [self.src]
   local j, k = self.i, self.src :sub (self.i,self.i)
   if k~="'" and k~='"' then return end
   local i = self.i + 1
   j = i
   while true do
      -- k = opening char: either simple-quote or double-quote
      -- i = index of beginning-of-string
      -- x = next "interesting" character
      -- j = position after interesting char
      -- y = char just after x
      local x, y
      x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j)
      if x == '\\' then j=j+1  -- don't parse escaped char
      elseif x == k then break -- unescaped end of string
      else -- eof or '\r' or '\n' reached before end of string
         assert (not x or x=="\r" or x=="\n")
         error "Unterminated string"
      end
   end
   self.i = j
   return "String", unescape_string (self.src:sub (i,j-2))
end

----------------------------------------------------------------------
-- extract an Id or Keyword token
----------------------------------------------------------------------
function lexer:extract_word()
   local word, j = self.src:match (self.patterns.word, self.i)
   if word then
      self.i = j
      if self.alpha [word] then return "Keyword", word
      else return "Id", word end
   end
end

----------------------------------------------------------------------
-- extract a Number token
----------------------------------------------------------------------
function lexer:extract_number()
   local j = self.src:match(self.patterns.number_hex, self.i)
   if not j then
      j = self.src:match (self.patterns.number_mantissa[1], self.i) or
          self.src:match (self.patterns.number_mantissa[2], self.i)
      if j then
         j = self.src:match (self.patterns.number_exponant, j) or j;
      end
   end
   if not j then return end
   -- Number found, interpret with tonumber() and return it
   local n = tonumber (self.src:sub (self.i, j-1))
   self.i = j
   return "Number", n
end

----------------------------------------------------------------------
-- extract a long string
----------------------------------------------------------------------
function lexer:extract_long_string()
   local _, content, j = self.src:match (self.patterns.long_string, self.i)
   if j then self.i = j; return "String", content end
end

----------------------------------------------------------------------
-- extract a symbol
----------------------------------------------------------------------
function lexer:extract_symbol()
   local k = self.src:sub (self.i,self.i)
   local symk = self.sym [k]
   if not symk then
      self.i = self.i + 1
      return "Keyword", k
   end
   for _, sym in pairs (symk) do
      if sym == self.src:sub (self.i, self.i + #sym - 1) then
         self.i = self.i + #sym
         return "Keyword", sym
      end
   end
   -- single char symbol
   self.i = self.i + 1
   return "Keyword", k
end

----------------------------------------------------------------------
-- Add a keyword to the list of keywords recognized by the lexer.
----------------------------------------------------------------------
function lexer:add (w, ...)
   assert(not ..., "lexer:add() takes only one arg, although possibly a table")
   if type (w) == "table" then
      for _, x in ipairs (w) do self:add (x) end
   else
      if w:match (self.patterns.word .. "$") then self.alpha [w] = true
      elseif w:match "^%p%p+$" then
         local k = w:sub(1,1)
         local list = self.sym [k]
         if not list then list = { }; self.sym [k] = list end
         _G.table.insert (list, w)
      elseif w:match "^%p$" then return
      else error "Invalid keyword" end
   end
end
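
-- Illustrative sketch (added note, not from the original source):
-- keywords are classified by shape. Alphabetic words go into [alpha],
-- multi-char punctuation goes into [sym] indexed by its first char,
-- and single punctuation chars are already handled by [extract_symbol].
--[=[
local lx = lexer.lexer:clone()
lx:add "function"   -- word        -> lx.alpha["function"] = true
lx:add "=="         -- punctuation -> appended to lx.sym["="]
lx:add "+"          -- single char -> nothing to register
]=]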

----------------------------------------------------------------------
-- Return the [n]th next token, without consuming it.
-- [n] defaults to 1. If it goes past the end of the stream, an EOF
-- token is returned.
----------------------------------------------------------------------
function lexer:peek (n)
   if not n then n=1 end
   if n > #self.peeked then
      for i = #self.peeked+1, n do
         self.peeked [i] = self:extract()
      end
   end
   return self.peeked [n]
end

----------------------------------------------------------------------
-- Return the [n]th next token, removing it as well as the 0..n-1
-- previous tokens. [n] defaults to 1. If it goes past the end of the
-- stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer:next (n)
   n = n or 1
   self:peek (n)
   local a
   for i=1,n do
      a = _G.table.remove (self.peeked, 1)
      --debugf ("lexer:next() ==> %s %s",
      --        table.tostring(a), tostring(a))
      if a then self.lastline = a.lineinfo.last[1] end
   end
   return a or eof_token
end
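
-- Usage sketch (added note, not in the original file): [peek] looks
-- ahead without consuming, [next] consumes.
--[=[
local stream = lexer.lexer:clone():newstream ("x = 1", "sample")
assert (stream:peek().tag == "Id" and stream:peek()[1] == "x")
assert (stream:next()[1] == "x")        -- "x" is now consumed
assert (stream:next().tag == "Keyword") -- "=", a single-char symbol
assert (stream:next().tag == "Number")  -- 1
]=]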

----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
----------------------------------------------------------------------
-- FIXME there are more fields than that to save
function lexer:save () return { self.i; _G.table.cat(self.peeked) } end

----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
-- FIXME there are more fields than that to restore
function lexer:restore (s) self.i=s[1]; self.peeked=s[2] end

----------------------------------------------------------------------
-- Resynchronize: cancel any token in self.peeked, by emptying the
-- list and resetting the indexes
----------------------------------------------------------------------
function lexer:sync()
   local p1 = self.peeked[1]
   if p1 then
      local li = p1.lineinfo.first
      self.line, self.i = li[1], li[3]
      self.column_offset = self.i - li[2]
      self.peeked = { }
      self.attached_comments = p1.lineinfo.comments or { }
   end
end

----------------------------------------------------------------------
-- Take the source and offset of an old lexer.
----------------------------------------------------------------------
function lexer:takeover(old)
   self.line, self.column_offset, self.i, self.src, self.attached_comments =
      old.line, old.column_offset, old.i, old.src, old.attached_comments
   return self
end

function lexer:lineinfo()
   if self.peeked[1] then return self.peeked[1].lineinfo.first
   else return { self.line, self.i-self.column_offset, self.i } end
end

----------------------------------------------------------------------
-- Create a new lexstream.
----------------------------------------------------------------------
function lexer:newstream (src_or_stream, name)
   name = name or "?"
   if type(src_or_stream)=='table' then -- it's a stream
      return setmetatable ({ }, self) :takeover (src_or_stream)
   elseif type(src_or_stream)=='string' then -- it's a source string
      local src = src_or_stream
      local stream = {
         src_name      = name;   -- Name of the file
         src           = src;    -- The source, as a single string
         peeked        = { };    -- Already peeked, but not discarded yet, tokens
         i             = 1;      -- Character offset in src
         line          = 1;      -- Current line number
         column_offset = 0;      -- distance from beginning of file to last '\n'
         attached_comments = { } -- comments accumulator
      }
      setmetatable (stream, self)

      -- skip initial sharp-bang for unix scripts
      -- FIXME: redundant with mlp.chunk()
      if src and src :match "^#" then stream.i = src :find "\n" + 1 end
      return stream
   else
      assert(false, ":newstream() takes a source string or a stream, not a "..
                    type(src_or_stream))
   end
end
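
-- Usage sketch (added note, not part of the original file): tokenize a
-- source string until the Eof token.
--[=[
require 'lexer'
local lx = lexer.lexer:clone()
lx:add{ "local", "=" }
local stream = lx:newstream ("local answer = 42 -- done", "sample")
while true do
   local tok = stream:next()
   print (tok.tag, tok[1])   -- Keyword local / Id answer / Keyword = / Number 42 / Eof eof
   if tok.tag == "Eof" then break end
end
]=]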

----------------------------------------------------------------------
-- If there are no ... args, return the token [a]'s content (whose
-- truth value is true) if it's a `Keyword{ }, and false otherwise.
-- If there are ... args, they have to be strings; if the token [a] is
-- a keyword and its content is one of the ... args, return that
-- content (its truth value is true), otherwise return false.
----------------------------------------------------------------------
function lexer:is_keyword (a, ...)
   if not a or a.tag ~= "Keyword" then return false end
   local words = {...}
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
   end
   return false
end

----------------------------------------------------------------------
-- Cause an error if the next token isn't a keyword whose content
-- is listed among ... args (which have to be strings).
----------------------------------------------------------------------
function lexer:check (...)
   local words = {...}
   local a = self:next()
   local function err ()
      error ("Got " .. tostring (a) ..
             ", expected one of these keywords : '" ..
             _G.table.concat (words, "', '") .. "'") end

   if not a or a.tag ~= "Keyword" then err () end
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
   end
   err ()
end
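
-- Illustrative sketch (added note, not from the original source): how a
-- parser typically uses [is_keyword] and [check] on a stream.
--[=[
local lx = lexer.lexer:clone(); lx:add{ "then", "end" }
local stream = lx:newstream ("then end", "sample")
assert (stream:is_keyword (stream:peek(), "then") == "then")
stream:check ("then")            -- consumes "then"
stream:check ("else", "elseif")  -- raises an error: next token is "end"
]=]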

----------------------------------------------------------------------
-- Create a copy of the lexer, which new keywords can be added to
-- without affecting the original one.
----------------------------------------------------------------------
function lexer:clone()
   local clone = {
      alpha = table.deep_copy(self.alpha),
      sym   = table.deep_copy(self.sym) }
   setmetatable(clone, self)
   clone.__index = clone
   return clone
end
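
-- Usage sketch (added note, in the spirit of mlp_lexer.lua mentioned in
-- the header): the complete Lua lexer is built by cloning this generic
-- one and registering the language's keywords and symbols.
--[=[
require 'lexer'
local lua_lexer = lexer.lexer:clone()
lua_lexer:add{ "and", "break", "do", "else", "elseif", "end", "false",
               "for", "function", "if", "in", "local", "nil", "not",
               "or", "repeat", "return", "then", "true", "until", "while" }
lua_lexer:add{ "==", "~=", "<=", ">=", "..", "..." }
]=]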