----------------------------------------------------------------------
-- Metalua: $Id: mll.lua,v 1.3 2006/11/15 09:07:50 fab13n Exp $
--
-- Summary: generic Lua-style lexer definition. You need this plus
-- some keyword additions to create the complete Lua lexer,
-- as is done in mlp_lexer.lua.
--
-- TODO:
-- * Make it easy to define new flavors of strings. Replacing the
--   lexer.patterns.long_string regexp by an extensible list, with
--   customizable token tag, would probably be enough. Maybe add:
--   + an index of capture for the regexp, that would specify
--     which capture holds the content of the string-like token
--   + or a string->string transformer function.
--
-- * There are some _G.table to prevent a namespace clash which has
--   now disappeared. Remove them.
----------------------------------------------------------------------
-- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
--
-- This software is released under the MIT Licence, see licence.txt
-- for details.
----------------------------------------------------------------------
-- NOTE(review): module() is the deprecated Lua 5.0/5.1 idiom; kept because
-- the rest of Metalua expects the global `lexer` namespace it creates.
module ("lexer", package.seeall)

require 'metalua.runtime'

-- Prototype table for all lexer streams.
-- [alpha] = set of alphanumeric keywords; [sym] = lists of multi-char
-- symbols, indexed by their first character (see lexer:add / extract_symbol).
lexer = { alpha={ }, sym={ } }
-- Streams built by :newstream() use `setmetatable(stream, self)`, so method
-- lookup must go through __index on this prototype.
lexer.__index = lexer

-- Debug printer; a no-op by default.
local debugf = function() end
----------------------------------------------------------------------
-- Patterns used by [lexer:extract] to decompose the raw string into
-- correctly tagged tokens. Each pattern ends with an empty capture
-- "()" so that a successful match also yields the next read position.
----------------------------------------------------------------------
lexer.patterns = {
   spaces              = "^[ \r\n\t]*()",
   short_comment       = "^%-%-([^\n]*)()\n",
   final_short_comment = "^%-%-([^\n]*)()$",       -- short comment at EOF, no final '\n'
   long_comment        = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
   long_string         = "^%[(=*)%[\n?(.-)%]%1%]()",
   number_mantissa     = { "^%d+%.?%d*()", "^%d*%.%d+()" },
   number_exponant     = "^[eE][%+%-]?%d+()",
   number_hex          = "^0[xX]%x+()",
   word                = "^([%a_][%w_]*)()"
}
----------------------------------------------------------------------
-- Unescape a whole string, applying [unesc_digits] and
-- [unesc_letter] as many times as required.
----------------------------------------------------------------------
local function unescape_string (s)

   -- Turn the digits of an escape sequence into the corresponding
   -- character, e.g. [unesc_digits("\\", "123") == string.char(123)].
   -- [backslashes] is the run of backslashes preceding the digits.
   local function unesc_digits (backslashes, digits)
      if #backslashes%2==0 then
         -- Even number of backslashes, they escape each other, not the digits.
         -- Return them so that unesc_letter() can treat them.
         return backslashes..digits
      else
         -- Remove the odd backslash, which escapes the number sequence.
         -- The rest will be returned and parsed by unesc_letter().
         backslashes = backslashes :sub (1,-2)
         -- Decode up to three decimal digits into a character code.
         local k, j, i = digits:reverse():byte(1, 3)
         local z = _G.string.byte "0"
         local code = (k or z) + 10*(j or z) + 100*(i or z) - 111*z
         if code > 255 then
            error ("Illegal escape sequence '\\"..digits..
                   "' in string: ASCII codes must be in [0..255]")
         end
         return backslashes .. string.char (code)
      end
   end

   -- Take a letter [x], and return the character represented by the
   -- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
   -- Unknown escapes raise an error.
   local function unesc_letter(x)
      local t = {
         a = "\a", b = "\b", f = "\f",
         n = "\n", r = "\r", t = "\t", v = "\v",
         ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
      return t[x] or error([[Unknown escape sequence '\]]..x..[[']])
   end

   return s
      :gsub ("(\\+)([0-9][0-9]?[0-9]?)", unesc_digits)
      :gsub ("\\(%D)",unesc_letter)
end
----------------------------------------------------------------------
-- Ordered list of token-extraction methods tried by [lexer:extract].
-- The whitespace/comment skipper MUST stay at index 1: [extract] takes
-- the position reached after it as the token's starting offset.
----------------------------------------------------------------------
lexer.extractors = {
   "skip_whitespaces_and_comments",
   "extract_short_string", "extract_word", "extract_number",
   "extract_long_string", "extract_symbol" }
-- Metatable shared by all tokens. The custom __tostring below is kept
-- disabled, as in the original sources.
lexer.token_metatable = {
--     __tostring = function(a)
--        return string.format ("`%s{'%s'}",a.tag, a[1])
--     end
}

-- Metatable attached to lineinfo quadruplets { line, column, offset, filename }.
lexer.lineinfo_metatable = { }
----------------------------------------------------------------------
-- Really extract next token from the raw string
-- (and update the index).
-- loc: offset of the position just after spaces and comments
-- previous_i: offset in src before extraction began
----------------------------------------------------------------------
function lexer:extract ()
   local previous_i = self.i
   -- [loc] is filled in by the extractor loop below, once the
   -- whitespace/comment skipper has run; build_token guards on it being set.
   local loc

   -- Put line info, comments and metatable around the tag and content
   -- provided by extractors, thus returning a complete lexer token.
   -- first_line: line # at the beginning of token
   -- first_column_offset: char # of the last '\n' before beginning of token
   -- i: scans from beginning of prefix spaces/comments to end of token.
   local function build_token (tag, content)
      assert (tag and content)
      local i, first_line, first_column_offset, previous_line_length =
         previous_i, self.line, self.column_offset, nil

      -- update self.line and first_line. i := indexes of '\n' chars
      while true do
         i = self.src :find ("\n", i+1, true)
         if not i or i>self.i then break end -- no more '\n' until end of token
         previous_line_length = i - self.column_offset
         if loc and i <= loc then -- '\n' before beginning of token
            first_column_offset = i
            first_line = first_line+1
         end
         self.line = self.line+1
         self.column_offset = i
      end

      -- lineinfo entries: [1]=line, [2]=column, [3]=char, [4]=filename
      local fli = { first_line, loc-first_column_offset, loc, self.src_name }
      local lli = { self.line, self.i-self.column_offset-1, self.i-1, self.src_name }
      -- Pluto barfs when the metatable is set :(
      setmetatable(fli, lexer.lineinfo_metatable)
      setmetatable(lli, lexer.lineinfo_metatable)
      local a = { tag = tag, lineinfo = { first=fli, last=lli }, content }
      -- Token ending exactly on a '\n': move its last position back to the
      -- end of the previous line.
      if lli[2]==-1 then lli[1], lli[2] = lli[1]-1, previous_line_length-1 end
      -- Attach accumulated comments to this token (and to the end of the
      -- previous one), then reset the accumulator.
      if #self.attached_comments > 0 then
         a.lineinfo.comments = self.attached_comments
         fli.comments = self.attached_comments
         if self.lineinfo_last then
            self.lineinfo_last.comments = self.attached_comments
         end
      end
      self.attached_comments = { }
      return setmetatable (a, self.token_metatable)
   end --</function build_token>

   -- Try each extractor in order; the first one returning a tag wins.
   for ext_idx, extractor in ipairs(self.extractors) do
      local tag, content = self [extractor] (self)
      -- [loc] is placed just after the leading whitespaces and comments;
      -- for this to work, the whitespace extractor *must be* at index 1.
      if ext_idx==1 then loc = self.i end
      if tag then
         --printf("`%s{ %q }\t%i", tag, content, loc);
         return build_token (tag, content)
      end
   end

   error "None of the lexer extractors returned anything!"
end
----------------------------------------------------------------------
-- skip whites and comments
-- FIXME: doesn't take into account:
-- - unterminated long comments
-- - short comments at last line without a final \n
--
-- Returns nothing on success; returns "Eof","eof" when the end of the
-- source is reached. Skipped comments are accumulated in
-- self.attached_comments as {content, start, stop, kind} quadruplets.
----------------------------------------------------------------------
function lexer:skip_whitespaces_and_comments()
   local table_insert = _G.table.insert
   repeat -- loop as long as a space or comment chunk is found
      local j
      local again = false
      local last_comment_content = nil
      -- skip spaces
      self.i = self.src:match (self.patterns.spaces, self.i)
      -- skip a long comment if any
      _, last_comment_content, j =
         self.src :match (self.patterns.long_comment, self.i)
      if j then
         table_insert(self.attached_comments,
                      {last_comment_content, self.i, j, "long"})
         self.i = j; again = true
      end
      -- skip a short comment if any
      last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
      if j then
         table_insert(self.attached_comments,
                      {last_comment_content, self.i, j, "short"})
         self.i = j; again = true
      end
      if self.i>#self.src then return "Eof", "eof" end
   until not again

   -- Source may end with a short comment that has no final '\n'.
   if self.src:match (self.patterns.final_short_comment, self.i) then
      return "Eof", "eof" end
   --assert (not self.src:match(self.patterns.short_comment, self.i))
   --assert (not self.src:match(self.patterns.long_comment, self.i))
   -- --assert (not self.src:match(self.patterns.spaces, self.i))
end
----------------------------------------------------------------------
-- extract a '...' or "..." short string
----------------------------------------------------------------------
function lexer:extract_short_string()
   -- [k] is the first unread char, [self.i] points to [k] in [self.src]
   local k = self.src :sub (self.i,self.i)
   if k~="'" and k~='"' then return end
   -- [i] = index of the first content character (just after the quote);
   -- [j] = scanning cursor.
   local i, j = self.i+1, self.i+1
   while true do
      -- k = opening char: either simple-quote or double-quote
      -- i = index of beginning-of-string
      -- x = next "interesting" character
      -- j = position after interesting char
      -- y = char just after x
      local x, y
      x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j)
      if x == '\\' then j=j+1 -- don't parse escaped char
      elseif x == k then break -- unescaped end of string
      else -- eof or '\r' or '\n' reached before end of string
         assert (not x or x=="\r" or x=="\n")
         error "Unterminated string"
      end
   end
   -- [j] now points just after the closing quote.
   self.i = j

   return "String", unescape_string (self.src:sub (i,j-2))
end
----------------------------------------------------------------------
-- extract an identifier or an alphanumeric keyword
----------------------------------------------------------------------
function lexer:extract_word()
   -- Id / keyword
   local word, j = self.src:match (self.patterns.word, self.i)
   if word then
      self.i = j
      -- Words registered through lexer:add() are keywords, others are Ids.
      if self.alpha [word] then return "Keyword", word
      else return "Id", word end
   end
end
----------------------------------------------------------------------
-- extract a number token (hexadecimal, or decimal with optional
-- fractional part and exponent)
----------------------------------------------------------------------
function lexer:extract_number()
   -- Number
   local j = self.src:match (self.patterns.number_hex, self.i)
   if not j then
      -- Decimal mantissa: "123", "123.45" or ".45".
      j = self.src:match (self.patterns.number_mantissa[1], self.i) or
          self.src:match (self.patterns.number_mantissa[2], self.i)
      if j then
         -- Optional exponent, only meaningful after a decimal mantissa.
         j = self.src:match (self.patterns.number_exponant, j) or j;
      end
   end
   if not j then return end
   -- Number found, interpret with tonumber() and return it
   local n = tonumber (self.src:sub (self.i, j-1))
   self.i = j
   return "Number", n
end
----------------------------------------------------------------------
-- extract a [[...]] or [=*[...]=*] long string
----------------------------------------------------------------------
function lexer:extract_long_string()
   -- Long string; first capture is the "=" padding, ignored here.
   local _, content, j = self.src:match (self.patterns.long_string, self.i)
   if j then self.i = j; return "String", content end
end
----------------------------------------------------------------------
-- extract a symbol: either a multi-char keyword registered in
-- [self.sym] (e.g. "==", "..."), or a single punctuation character.
----------------------------------------------------------------------
function lexer:extract_symbol()
   -- compound symbol
   local k = self.src:sub (self.i,self.i)
   local symk = self.sym [k]
   -- No multi-char symbol starts with [k]: return it as a 1-char keyword.
   if not symk then
      self.i = self.i + 1
      return "Keyword", k
   end
   -- Try each registered multi-char symbol beginning with [k].
   for _, sym in pairs (symk) do
      if sym == self.src:sub (self.i, self.i + #sym - 1) then
         self.i = self.i + #sym;
         return "Keyword", sym
      end
   end
   -- single char symbol
   self.i = self.i+1
   return "Keyword", k
end
----------------------------------------------------------------------
-- Add a keyword to the list of keywords recognized by the lexer.
-- [w] is either a single keyword string or a list of them.
-- Alphanumeric keywords go in [self.alpha]; multi-char punctuation
-- goes in [self.sym], indexed by first character; single punctuation
-- chars need no registration (extract_symbol handles them anyway).
----------------------------------------------------------------------
function lexer:add (w, ...)
   assert(not ..., "lexer:add() takes only one arg, although possibly a table")
   if type (w) == "table" then
      for _, x in ipairs (w) do self:add (x) end
   else
      if w:match (self.patterns.word .. "$") then self.alpha [w] = true
      elseif w:match "^%p%p+$" then
         -- Multi-char symbol: index it under its first character.
         local k = w:sub(1,1)
         local list = self.sym [k]
         if not list then list = { }; self.sym [k] = list end
         _G.table.insert (list, w)
      elseif w:match "^%p$" then return
      else error "Invalid keyword" end
   end
end
----------------------------------------------------------------------
-- Return the [n]th next token, without consuming it.
-- [n] defaults to 1. If it goes past the end of the stream, an EOF
-- token is returned.
----------------------------------------------------------------------
function lexer:peek (n)
   if not n then n=1 end
   -- Extract and buffer tokens until the n-th one is available.
   if n > #self.peeked then
      for i = #self.peeked+1, n do
         self.peeked [i] = self:extract()
      end
   end
   return self.peeked [n]
end
----------------------------------------------------------------------
-- Return the [n]th next token, removing it as well as the 0..n-1
-- previous tokens. [n] defaults to 1. If it goes past the end of the
-- stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer:next (n)
   n = n or 1
   self:peek (n) -- make sure the peeked buffer holds n tokens
   local a
   for i=1,n do
      a = _G.table.remove (self.peeked, 1)
      --debugf ("lexer:next() ==> %s %s",
      --        table.tostring(a), tostring(a))
      if a then self.lastline = a.lineinfo.last[1] end
   end
   self.lineinfo_last = a.lineinfo.last
   return a or eof_token
end
----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
-- NOTE(review): table.cat is presumably a metalua.runtime list-copy
-- helper — TODO confirm against the runtime library.
----------------------------------------------------------------------
-- FIXME there are more fields than that to save
function lexer:save () return { self.i; _G.table.cat(self.peeked) } end
----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
-- FIXME there are more fields than that to restore
function lexer:restore (s) self.i=s[1]; self.peeked=s[2] end
----------------------------------------------------------------------
-- Resynchronize: cancel any token in self.peeked, by emptying the
-- list and resetting the indexes
----------------------------------------------------------------------
function lexer:sync()
   local p1 = self.peeked[1]
   if p1 then
      -- Rewind position/line counters to the start of the first peeked token.
      local li = p1.lineinfo.first -- (was an accidental global in the original)
      self.line, self.i = li[1], li[3]
      self.column_offset = self.i - li[2]
      self.peeked = { }
      self.attached_comments = p1.lineinfo.first.comments or { }
   end
end
----------------------------------------------------------------------
-- Take the source and offset of an old lexer.
-- Returns self, so that :newstream() can chain on the result.
----------------------------------------------------------------------
function lexer:takeover(old)
   self.line, self.column_offset, self.i, self.src, self.attached_comments =
      old.line, old.column_offset, old.i, old.src, old.attached_comments
   return self
end

-- function lexer:lineinfo()
--    if self.peeked[1] then return self.peeked[1].lineinfo.first
--    else return { self.line, self.i-self.column_offset, self.i } end
-- end
----------------------------------------------------------------------
-- Return the current position in the sources. This position is between
-- two tokens, and can be within a space / comment area, and therefore
-- have a non-null width. :lineinfo_left() returns the beginning of the
-- separation area, :lineinfo_right() returns the end of that area.
--
--    ____ last consumed token ____ first unconsumed token
--    XXXXX <spaces and comments> YYYYY
--        ^                       ^
--        :lineinfo_left()        :lineinfo_right()
----------------------------------------------------------------------
function lexer:lineinfo_right()
   return self:peek(1).lineinfo.first
end
-- See the diagram above :lineinfo_right(): beginning of the current
-- inter-token separation area.
function lexer:lineinfo_left()
   return self.lineinfo_last
end
----------------------------------------------------------------------
-- Create a new lexstream, either from a source string or by taking
-- over an existing stream. [name] is the source's display name,
-- recorded in every token's lineinfo.
----------------------------------------------------------------------
function lexer:newstream (src_or_stream, name)
   name = name or "?" -- default display name when none is given
   if type(src_or_stream)=='table' then -- it's a stream
      return setmetatable ({ }, self) :takeover (src_or_stream)
   elseif type(src_or_stream)=='string' then -- it's a source string
      local src = src_or_stream
      local stream = {
         src_name = name; -- Name of the file
         src = src; -- The source, as a single string
         peeked = { }; -- Already peeked, but not discarded yet, tokens
         i = 1; -- Character offset in src
         line = 1; -- Current line number
         column_offset = 0; -- distance from beginning of file to last '\n'
         attached_comments = { },-- comments accumulator
         lineinfo_last = { 1, 1, 1, name }
      }
      setmetatable (stream, self)

      -- skip initial sharp-bang for unix scripts
      -- FIXME: redundant with mlp.chunk()
      if src and src :match "^#" then stream.i = src :find "\n" + 1 end
      return stream
   else
      assert(false, ":newstream() takes a source string or a stream, not a "..
                    type(src_or_stream))
   end
end
----------------------------------------------------------------------
-- if there's no ... args, return the token a's content (whose truth
-- value is true) if it's a `Keyword{ }, or false. If there are ...
-- args, they have to be strings: if the token a is a keyword, and its
-- content is one of the ... args, then return it (its truth value is
-- true). If not a keyword, or not in ..., return false.
----------------------------------------------------------------------
function lexer:is_keyword (a, ...)
   if not a or a.tag ~= "Keyword" then return false end
   local words = {...}
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
   end
   return false
end
----------------------------------------------------------------------
-- Cause an error if the next token isn't a keyword whose content
-- is listed among ... args (which have to be strings).
-- On success, consume the token and return the matching word
-- (or the keyword's content when no ... args are given).
----------------------------------------------------------------------
function lexer:check (...)
   local words = {...}
   local a = self:next()
   local function err ()
      error ("Got " .. tostring (a) ..
             ", expected one of these keywords : '" ..
             _G.table.concat (words,"', '") .. "'") end

   if not a or a.tag ~= "Keyword" then err () end
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
   end
   err ()
end
500 ----------------------------------------------------------------------
502 ----------------------------------------------------------------------
503 function lexer:clone()
-- NOTE(review): builds an independent copy of this lexer: the keyword
-- tables [alpha] and [sym] are deep-copied so the clone can be extended
-- without affecting the original, and method lookup is chained back to
-- self via the metatable. The constructor line opening the [clone]
-- table and the definition's tail (presumably `return clone` and `end`)
-- lie beyond this excerpt — TODO confirm against the full file.
505 alpha = table.deep_copy(self.alpha),
506 sym = table.deep_copy(self.sym) }
507 setmetatable(clone, self)
508 clone.__index = clone