1 ----------------------------------------------------------------------
2 -- Metalua: $Id: mll.lua,v 1.3 2006/11/15 09:07:50 fab13n Exp $
4 -- Summary: generic Lua-style lexer definition. You need this plus
5 -- some keyword additions to create the complete Lua lexer,
6 -- as is done in mlp_lexer.lua.
10 -- * Make it possible to change lexer on the fly. This implies the
11 -- ability to easily undo any pre-extracted tokens;
13 -- * Make it easy to define new flavors of strings. Replacing the
14 -- lexer.patterns.long_string regexp by an extensible list, with
15 -- customizable token tag, would probably be enough. Maybe add:
16 -- + an index of capture for the regexp, that would specify
17 -- which capture holds the content of the string-like token
19 -- + or a string->string transformer function.
20 ----------------------------------------------------------------------
22 -- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
24 -- This software is released under the MIT Licence, see licence.txt
27 ----------------------------------------------------------------------
-- NOTE(review): module() is deprecated in Lua 5.2+; kept as-is since this
-- file targets the Lua 5.1-era Metalua runtime.
module ("lexer", package.seeall)

-- The lexer prototype: [alpha] holds alphanumeric keywords, [sym] maps a
-- first character to the list of compound symbols starting with it
-- (both tables are filled by [lexer:add] below).
lexer = { alpha={ }, sym={ } }

-- Debug printer; a no-op by default.
local debugf = function() end

----------------------------------------------------------------------
-- Patterns used by [lexer:extract] to decompose the raw string into
-- correctly tagged tokens.  In Lua patterns, "()" is a position
-- capture: it yields the index where it matches, which the extractors
-- use as the new value of [self.i].
-- NOTE(review): the following key/value pairs are entries of the
-- lexer's pattern table (referenced below as [self.patterns.*]); the
-- constructor's opening line is not visible in this excerpt.
----------------------------------------------------------------------
spaces = "^[ \r\n\t]*()",
short_comment = "^%-%-([^\n]*)()\n",
final_short_comment = "^%-%-([^\n]*)()$",
long_comment = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
long_string = "^%[(=*)%[\n?(.-)%]%1%]()",
number_exponant = "^[eE][%+%-]?%d+()",
word = "^([%a_][%w_]*)()"
----------------------------------------------------------------------
-- Takes a letter [x] and returns the character represented by the
-- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
-- Raises an error for an unknown escape letter.
----------------------------------------------------------------------
local function unesc_letter(x)
   -- Map from single-letter escape names to the denoted character.
   local t = {
      a = "\a", b = "\b", f = "\f",
      n = "\n", r = "\r", t = "\t", v = "\v",
      ["\\"] = "\\", ["'"] = "'", ['"'] = '"' }
   return t[x] or error("Unknown escape sequence \\"..x)
end
----------------------------------------------------------------------
-- Turn the digits of an escape sequence into the corresponding
-- character, e.g. [unesc_digits("123") == string.char(123)].
-- Handles 1-, 2- or 3-digit sequences; values above 255 are rejected
-- by string.char itself.
----------------------------------------------------------------------
local function unesc_digits (x)
   -- Read up to three digits, least-significant first.
   local k, j, i = x:reverse():byte(1, 3)
   local z = _G.string.byte "0"
   -- Value = (k-z) + 10*(j-z) + 100*(i-z); absent digits default to
   -- [z] so they contribute zero once 111*z is subtracted.
   return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
end
----------------------------------------------------------------------
-- Unescape a whole string, applying [unesc_digits] and [unesc_letter]
-- as many times as required.
-- NOTE(review): the two gsub passes run in sequence, so a backslash
-- produced by the numeric pass can be re-interpreted by the letter
-- pass — confirm this matches the intended escape semantics.
----------------------------------------------------------------------
local function unescape_string (s)
   -- The outer parentheses drop gsub's second return value (the match
   -- count), so callers receive exactly one string.
   return (s:gsub("\\([0-9]+)", unesc_digits):gsub("\\(.)",unesc_letter))
end
-- Ordered list of extractor method names tried by [lexer:extract].
-- Order matters: the whitespace/comment skipper must stay at index 1
-- (see the [ext_idx==1] test in [lexer:extract]).
-- NOTE(review): the 'extractors = {' opening of this list is not
-- visible in this excerpt.
"skip_whitespaces_and_comments",
"extract_short_string", "extract_word", "extract_number",
"extract_long_string", "extract_symbol" }
----------------------------------------------------------------------
-- Really extract next token from the raw string
-- (and update the index).
----------------------------------------------------------------------
function lexer:extract ()
   local previous_i = self.i
   local loc, eof, token = self.i
   -------------------------------------------------------------------
   -- Build the token table for [tag]/[content]: compute its line
   -- number by scanning newlines since the previous token, record the
   -- position [loc] of its first significant char, and attach any
   -- comments collected by the whitespace skipper.
   -------------------------------------------------------------------
   local function tk (tag, content)
      assert (tag and content)
      local i, ln = previous_i, self.line
      -- update line numbers
      -- NOTE(review): the loop header enclosing the next four lines
      -- (with its matching "end") is not visible in this excerpt; the
      -- two "break" statements imply one exists.
      i = self.src:find("\n", i+1, true)
      if not i then break end
      if loc and i <= loc then ln = ln+1 end
      if i <= self.i then self.line = self.line+1 else break end
      -- Token shape: { tag=<tag>, char=<offset>, line=<line>, <content> }.
      local a = { tag = tag, char=loc, line=ln, content }
      if #self.attached_comments > 0 then
         a.comments = self.attached_comments
         self.attached_comments = nil
      -- Pretty-printer for dumped tokens, e.g. `Id{'x'}.
      -- NOTE(review): the 'amt = {' opening of this metatable
      -- constructor is not visible in this excerpt.
      __tostring = function()
         return _G.string.format ("`%s{'%s'}",a.tag, a[1]) end }
      setmetatable (a, amt)
   -- Reset the comment accumulator before scanning a new token.
   self.attached_comments = { }
   -- Try each extractor in order until one recognizes a token.
   for ext_idx, extractor in ipairs(self.extractors) do
      -- printf("method = %s", method)
      local tag, content = self[extractor](self)
      -- [loc] is placed just after the leading whitespaces and comments,
      -- and the whitespace extractor is at index 1.
      if ext_idx==1 then loc = self.i end
      -- NOTE(review): a guard such as 'if tag then' presumably precedes
      -- this return (otherwise the first extractor would always win);
      -- it is not visible in this excerpt.
      --printf("`%s{ %q }\t%i", tag, content, loc);
      return tk (tag, content)
   error "Cant extract anything!"
----------------------------------------------------------------------
-- skip whites and comments
-- FIXME: doesn't take into account:
--    - unterminated long comments
--    - short comments without a final \n
----------------------------------------------------------------------
function lexer:skip_whitespaces_and_comments()
   -- Comments found while skipping, to be attached to the next token.
   local attached_comments = { }
   local last_comment_content = nil
   -- Eat leading whitespace; [spaces] ends in a position capture, so
   -- match() returns the index of the first non-blank character.
   self.i = self.src:match (self.patterns.spaces, self.i)
   -- skip a long comment if any
   -- NOTE(review): [_] and [j] are accidental globals here — no
   -- 'local' declaration for them is visible in this excerpt; the
   -- 'if j then' guards and the enclosing skip loop are also missing
   -- from view.
   _, last_comment_content, j = self.src:match (self.patterns.long_comment, self.i)
      _G.table.insert(self.attached_comments,
                      {last_comment_content, self.i, j, "long"})
   -- skip a short comment if any
   last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
      -- NOTE(review): inconsistency — the long-comment branch above
      -- inserts into [self.attached_comments], while this branch
      -- inserts into the local [attached_comments]; one of the two is
      -- probably wrong.  Confirm against the full file.
      _G.table.insert(attached_comments,
                      {last_comment_content, self.i, j, "short"})
   if self.i>#self.src then return "Eof", "eof" end
   -- A final short comment with no trailing newline also ends the file.
   if self.src:match (self.patterns.final_short_comment, self.i) then
      return "Eof", "eof" end
   --assert (not self.src:match(self.patterns.short_comment, self.i))
   --assert (not self.src:match(self.patterns.long_comment, self.i))
   -- --assert (not self.src:match(self.patterns.spaces, self.i))
----------------------------------------------------------------------
-- Extraction of a short string, delimited by ' or ".
----------------------------------------------------------------------
function lexer:extract_short_string()
   -- [k] is the first unread char, [self.i] points to [k] in [self.src]
   local j, k = self.i, self.src:sub (self.i,self.i)
   if k=="'" or k=='"' then
      -- Walk through the string; a backslash escapes the next char.
      -- NOTE(review): the 'repeat' header and the index increments
      -- belonging to this loop are not visible in this excerpt.
      local kk = self.src:sub (self.i, self.i)
         kk = self.src:sub (self.i, self.i)
      if self.i > #self.src then error "Unterminated string" end
      -- NOTE(review): bug — [self.i] is a numeric index, so comparing
      -- it to "\r"/"\n" is always false; this was presumably meant to
      -- test the current character [kk], leaving the embedded-newline
      -- check dead code as written.
      if self.i == "\r" or self.i == "\n" then error "no \\n in short strings!" end
      -- Stop on a closing quote that is not backslash-escaped (only
      -- the two preceding chars are inspected, so '\\' before the
      -- quote does not count as an escape).
      until self.src:sub (self.i, self.i) == k
      and ( self.src:sub (self.i-1, self.i-1) ~= '\\'
            or self.src:sub (self.i-2, self.i-2) == '\\')
      -- Strip the surrounding quotes and decode backslash escapes.
      return "String", unescape_string (self.src:sub (j+1,self.i-2))
----------------------------------------------------------------------
-- Extraction of an alphanumeric word: a keyword if it is listed in
-- [self.alpha], an identifier otherwise.  Returns nothing when no
-- word starts at the current position.
----------------------------------------------------------------------
function lexer:extract_word()
   -- [word] is the matched identifier, [j] the position just past it.
   local word, j = self.src:match (self.patterns.word, self.i)
   if word then
      self.i = j  -- consume the word
      if self.alpha [word] then return "Keyword", word
      else return "Id", word end
   end
end
----------------------------------------------------------------------
-- Extraction of a number literal.
----------------------------------------------------------------------
function lexer:extract_number()
   -- Try both mantissa shapes; [number_mantissa] is defined in the
   -- pattern table, outside this excerpt.
   local j = self.src:match (self.patterns.number_mantissa[1], self.i) or
   self.src:match (self.patterns.number_mantissa[2], self.i)
   -- NOTE(review): a 'if not j then return end' guard presumably sits
   -- here in the full file (not visible in this excerpt); otherwise a
   -- nil [j] would crash the exponent match below.
   j = self.src:match (self.patterns.number_exponant, j) or j;
   local n = tonumber (self.src:sub (self.i, j-1))
   -- NOTE(review): the index advance (self.i = j) and the
   -- 'return "Number", n' are not visible in this excerpt.
----------------------------------------------------------------------
-- Extraction of a long string, [[...]] or [=[...]=] style.
----------------------------------------------------------------------
function lexer:extract_long_string()
   -- In [long_string], capture 1 is the "=" padding (discarded here),
   -- capture 2 the content, capture 3 the position just past the
   -- closing bracket.
   local _, content, j = self.src:match (self.patterns.long_string, self.i)
   if j then self.i = j; return "String", content end
end
----------------------------------------------------------------------
-- Extraction of a symbol: a declared compound symbol starting at
-- [self.i], otherwise the single current character.
----------------------------------------------------------------------
function lexer:extract_symbol()
   local k = self.src:sub (self.i,self.i)
   -- Multi-char symbols beginning with [k], filled by [lexer:add].
   -- NOTE(review): the 'if not symk then ...' guard and the
   -- single-char fallback return are not visible in this excerpt.
   local symk = self.sym [k]
   -- NOTE(review): [pairs] iteration order is unspecified, so when one
   -- declared symbol is a prefix of another, which matches first is
   -- not guaranteed — confirm intent.
   for _, sym in pairs (symk) do
      if sym == self.src:sub (self.i, self.i + #sym - 1) then
         self.i = self.i + #sym;
         return "Keyword", sym
   -- single char symbol
----------------------------------------------------------------------
-- Add a keyword to the list of keywords recognized by the lexer.
-- [w] may also be a list of keywords, added recursively.
----------------------------------------------------------------------
function lexer:add (w)
   if type (w) == "table" then
      for _, x in ipairs (w) do self:add (x) end
   -- Alphanumeric keywords go into [self.alpha] ...
   if w:match (self.patterns.word .. "$") then self.alpha [w] = true
   -- ... multi-char punctuation symbols are indexed by first char ...
   elseif w:match "^%p%p+$" then
      -- NOTE(review): [k] (the symbol's first character) is not
      -- declared in this excerpt; expect a 'local k = w:sub(1,1)'
      -- just above this line in the full file.
      local list = self.sym [k]
      if not list then list = { }; self.sym [k] = list end
      _G.table.insert (list, w)
   -- ... and single punctuation chars need no declaration.
   elseif w:match "^%p$" then return
   else error "Invalid keyword" end
----------------------------------------------------------------------
-- Return the [n]th next token, without consuming it.
-- [n] defaults to 1.  If it goes past the end of the stream, an EOF
-- token is returned.
----------------------------------------------------------------------
function lexer:peek (n)
   if not n then n=1 end
   -- Buffer tokens until at least [n] are available in [self.peeked].
   if n > #self.peeked then
      for i = #self.peeked+1, n do
         self.peeked [i] = self:extract()
      end
   end
   return self.peeked [n]
end
----------------------------------------------------------------------
-- Return the [n]th next token, removing it as well as the 0..n-1
-- previous tokens. [n] defaults to 1. If it goes past the end of the
-- stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer:next (n)
   if not n then n=1 end
   -- NOTE(review): [a] reads as an accidental global here — its
   -- 'local' declaration, the self:peek(n) call, and the removal loop
   -- that should surround this line are not visible in this excerpt.
   a = _G.table.remove (self.peeked, 1)
   if a then debugf ("[L:%i K:%i T:%s %q]", a.line or -1, a.char or -1, a.tag or '<none>', a[1]) end
   -- [eof_token] is defined elsewhere in the full file.
   return a or eof_token
----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
-- State = { current offset; copy of the pending token queue }.
-- NOTE(review): [table.cat] is a Metalua table extension, not stock
-- Lua — presumably it copies/concatenates the list; confirm.
----------------------------------------------------------------------
function lexer:save () return { self.i; _G.table.cat(self.peeked) } end
----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save]:
-- s[1] is the character offset, s[2] the pending-token queue.
----------------------------------------------------------------------
function lexer:restore (s)
   self.i      = s[1]
   self.peeked = s[2]
end
----------------------------------------------------------------------
-- Create a new lexstream.
----------------------------------------------------------------------
function lexer:newstream (src)
   -- NOTE(review): the following 'field = value;' pairs are entries of
   -- a stream-table constructor whose 'local stream = {' opening line
   -- (and its closing brace / 'return stream') are not visible in this
   -- excerpt.
   src = src; -- The source, as a single string
   peeked = { }; -- Already peeked, but not discarded yet, tokens
   i = 1; -- Character offset in src
   line = 1; -- current line number
   -- The stream delegates method lookups to the lexer prototype.
   setmetatable (stream, self)
   -- skip initial sharp-bang for unix scripts
   if src:match "^#!" then stream.i = src:find "\n" + 1 end
----------------------------------------------------------------------
-- If there are no ... args, return the token [a] (whose truth value
-- is true) if it's a `Keyword{ }, or nil. If there are ... args, they
-- have to be strings: if the token [a] is a keyword and its content
-- is one of the ... args, return that content (its truth value is
-- true). If not a keyword, or not listed in ..., return nil.
----------------------------------------------------------------------
function lexer:is_keyword (a, ...)
   if not a or a.tag ~= "Keyword" then return false end
   -- NOTE(review): [words] is not declared in this excerpt — expect a
   -- 'local words = {...}' capturing the varargs in the full file.
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
----------------------------------------------------------------------
-- Cause an error if the next token isn't a keyword whose content
-- is listed among ... args (which have to be strings).
----------------------------------------------------------------------
function lexer:check (...)
   -- NOTE(review): [words] is not declared in this excerpt — expect a
   -- 'local words = {...}' capturing the varargs in the full file.
   local a = self:next()
   -- [err] never returns: error() raises.
   local function err ()
      error ("Got " .. tostring (a) ..
             ", expected one of these keywords : '" ..
             _G.table.concat (words,"', '") .. "'") end
   if not a or a.tag ~= "Keyword" then err () end
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
----------------------------------------------------------------------
-- Duplicate the lexer: deep-copies the keyword/symbol tables so the
-- copy can be extended without affecting the original, and delegates
-- everything else back to [self] via the metatable.
----------------------------------------------------------------------
function lexer:clone()
   -- NOTE(review): the 'local clone = {' opening of this constructor
   -- and the final 'return clone' are not visible in this excerpt;
   -- [table.deep_copy] is a Metalua table extension, not stock Lua.
      alpha = table.deep_copy(self.alpha),
      sym = table.deep_copy(self.sym) }
   setmetatable(clone, self)
   clone.__index = clone
----------------------------------------------------------------------
-- Test whether [x] is a lexer stream, i.e. whether its metatable is
-- the [lexer] prototype (set by [lexer:newstream]).
----------------------------------------------------------------------
function is_stream (x)
   -- Fixed typo: the original called the nonexistent 'getmetable',
   -- which raised "attempt to call a nil value" on every call.
   return getmetatable (x) == lexer
end