1 -------------------------------------------------------------------------------
2 -- Copyright (c) 2006-2013 Fabien Fleutot and others.
4 -- All rights reserved.
6 -- This program and the accompanying materials are made available
7 -- under the terms of the Eclipse Public License v1.0 which
8 -- accompanies this distribution, and is available at
9 -- http://www.eclipse.org/legal/epl-v10.html
11 -- This program and the accompanying materials are also made available
12 -- under the terms of the MIT public license which accompanies this
13 -- distribution, and is available at http://www.lua.org/license.html
16 -- Fabien Fleutot - API and implementation
18 -------------------------------------------------------------------------------
-- NOTE(review): this file is a numbered listing with dropped lines; the
-- nearby original lines 25 and 27-30 are missing here (presumably
-- `lexer.__index = lexer`, the module table `local M = { }` and
-- `M.lexer = lexer`) -- TODO restore from the upstream source.
-- `alpha` holds alphanumeric keywords, `sym` holds symbol keywords
-- indexed by their first character (see lexer :add below).
24 local lexer = { alpha={ }, sym={ } }
26 lexer.__type='lexer.stream'
-- Debug printer: a no-op by default; swap in a printf to trace the lexer.
31 local debugf = function() end
32 -- local debugf=printf
34 ----------------------------------------------------------------------
35 -- Some locale settings produce bad results, e.g. French locale
36 -- expect float numbers to use commas instead of periods.
37 -- TODO: change number parser into something locale-independent,
39 ----------------------------------------------------------------------
46 ----------------------------------------------------------------------
47 -- Create a new metatable, for a new class of objects.
48 ----------------------------------------------------------------------
-- Create a metatable for a new class of lexer objects and register it
-- under `name` in the module-local MT registry; instances get the type
-- tag 'lexer.<name>'.
-- NOTE(review): orig. lines 51-55 (the `__index` setup, the MT
-- registration and the closing `end`) were missing from this listing;
-- restored here.  The `MT` registry itself (`local MT = { }`) is also
-- missing from this listing and must be declared above this function.
local function new_metatable(name)
    local mt = { __type = 'lexer.'..name };
    mt.__index = mt
    MT[name] = mt
end
56 ----------------------------------------------------------------------
57 -- Position: represent a point in a source file.
58 ----------------------------------------------------------------------
-- Position objects: { line, column, offset, source, id } records.
59 new_metatable 'position'
-- NOTE(review): orig. lines 60-62 missing from this listing --
-- presumably `local position_idx = 1`, which the id counter below uses.
-- Build a new position; `offset` is the 1-based byte offset in `source`.
63 function M.new_position(line, column, offset, source)
64 checks('number', 'number', 'number', 'string')
-- Give every position a unique id (useful when tracing/debugging).
65 local id = position_idx; position_idx = position_idx+1
66 return setmetatable({line=line, column=column, offset=offset,
67 source=source, id=id}, MT.position)
-- NOTE(review): closing `end` (orig. line 68) missing from this listing.
-- Render a position as "<C|source|Lnn|Cnn|Knn>"; the leading "C|" marks
-- positions that carry attached comments.
70 function MT.position :__tostring()
71 return string.format("<%s%s|L%d|C%d|K%d>",
72 self.comments and "C|" or "",
73 self.source, self.line, self.column, self.offset)
-- NOTE(review): closing `end` (orig. line 74) missing from this listing.
78 ----------------------------------------------------------------------
79 -- Position factory: convert offsets into line/column/offset positions.
80 ----------------------------------------------------------------------
81 new_metatable 'position_factory'
-- Precompute, for every line of `src`, the offset of its first character,
-- so :get_position() can translate byte offsets into line/column pairs.
83 function M.new_position_factory(src, src_name)
84 -- assert(type(src)=='string')
85 -- assert(type(src_name)=='string')
-- NOTE(review): orig. line 86 missing -- presumably `local lines = { 1 }`
-- (line 1 starts at offset 1) -- TODO confirm against upstream.
-- Every '\n' marks the beginning of the following line.
87 for offset in src :gmatch '\n()' do table.insert(lines, offset) end
-- NOTE(review): orig. line 88 missing -- presumably `local max = #src+1`
-- (so the Eof position at #src+1 passes the :get_position assert).
89 table.insert(lines, max+1) -- +1 includes Eof
90 return setmetatable({ src_name=src_name, line2offset=lines, max=max },
-- NOTE(review): orig. lines 91-92 missing -- presumably
-- `MT.position_factory)` and the closing `end`.
-- Convert a byte offset into a full position (line/column/offset).
-- Binary search over line2offset, warm-started from the line found by
-- the previous call (self.last_left), since successive queries are
-- usually close together.
94 function MT.position_factory :get_position (offset)
95 -- assert(type(offset)=='number')
96 assert(offset<=self.max)
97 local line2offset = self.line2offset
98 local left = self.last_left or 1
-- Drop the warm start if the cached line begins after `offset`.
99 if offset<line2offset[left] then left=1 end
-- NOTE(review): orig. line 100 missing -- presumably `local right = left+1`.
-- Grow the right bound until it lies past `offset`.
101 if line2offset[right]<=offset then right = right+1 end
102 if line2offset[right]<=offset then right = #line2offset end
-- NOTE(review): orig. line 103 missing -- presumably `while true do`,
-- the loop that the `break` below belongs to.
104 -- print (" trying lines "..left.."/"..right..", offsets "..line2offset[left]..
105 -- "/"..line2offset[right].." for offset "..offset)
106 -- assert(line2offset[left]<=offset)
107 -- assert(offset<line2offset[right])
108 -- assert(left<right)
-- Loop invariant: line2offset[left] <= offset < line2offset[right].
109 if left+1==right then break end
110 local middle = math.floor((left+right)/2)
111 if line2offset[middle]<=offset then left=middle else right=middle end
-- NOTE(review): orig. line 112 missing -- presumably the loop's `end`.
113 -- assert(left+1==right)
114 -- printf("found that offset %d is between %d and %d, hence on line %d",
115 -- offset, line2offset[left], line2offset[right], left)
-- NOTE(review): orig. line 116 missing -- presumably `local line = left`.
117 local column = offset - line2offset[line] + 1
-- Remember where we found the answer to warm-start the next call.
118 self.last_left = left
119 return M.new_position(line, column, offset, self.src_name)
-- NOTE(review): closing `end` (orig. line 120) missing from this listing.
124 ----------------------------------------------------------------------
125 -- Lineinfo: represent a node's range in a source file;
126 -- embed information about prefix and suffix comments.
127 ----------------------------------------------------------------------
-- Lineinfo: a pair of positions { first, last } delimiting a token or
-- AST node in the sources.
128 new_metatable 'lineinfo'
130 function M.new_lineinfo(first, last)
131 checks('lexer.position', 'lexer.position')
132 return setmetatable({first=first, last=last}, MT.lineinfo)
-- NOTE(review): closing `end` (orig. line 133) missing from this listing.
-- Render a lineinfo as "<C|source|Lf-l|Cf-l|Kf-l|C>"; each range
-- collapses to a single figure when first==last, and "C|" / "|C" flag
-- prefix / suffix comments respectively.
135 function MT.lineinfo :__tostring()
136 local fli, lli = self.first, self.last
137 local line = fli.line; if line~=lli.line then line =line ..'-'..lli.line end
138 local column = fli.column; if column~=lli.column then column=column..'-'..lli.column end
139 local offset = fli.offset; if offset~=lli.offset then offset=offset..'-'..lli.offset end
140 return string.format("<%s%s|L%s|C%s|K%s%s>",
141 fli.comments and "C|" or "",
142 fli.source, line, column, offset,
143 lli.comments and "|C" or "")
-- NOTE(review): closing `end` (orig. line 144) missing from this listing.
146 ----------------------------------------------------------------------
147 -- Token: atomic Lua language element, with a category, a content,
148 -- and some lineinfo relating it to its original source.
149 ----------------------------------------------------------------------
-- Token: { tag=..., lineinfo=..., [1]=content }; tags seen in this file
-- are 'Keyword', 'Id', 'String', 'Number', 'Comment' and 'Eof'.
150 new_metatable 'token'
152 function M.new_token(tag, content, lineinfo)
153 --printf("TOKEN `%s{ %q, lineinfo = %s} boundaries %d, %d",
154 -- tag, content, tostring(lineinfo), lineinfo.first.id, lineinfo.last.id)
-- `content` lands in the array part of the table, i.e. token[1].
155 return setmetatable({tag=tag, lineinfo=lineinfo, content}, MT.token)
-- NOTE(review): closing `end` (orig. line 156) missing from this listing.
-- Short rendering `Tag "content" (the lineinfo variant is kept
-- commented out below).
158 function MT.token :__tostring()
159 --return string.format("`%s{ %q, %s }", self.tag, self[1], tostring(self.lineinfo))
160 return string.format("`%s %q", self.tag, self[1])
-- NOTE(review): closing `end` (orig. line 161) missing from this listing.
164 ----------------------------------------------------------------------
165 -- Comment: series of comment blocks with associated lineinfo.
166 -- To be attached to the tokens just before and just after them.
167 ----------------------------------------------------------------------
-- Comment object: an array of comment lines whose lineinfo spans from
-- the first line's start to the last line's end.
168 new_metatable 'comment'
170 function M.new_comment(lines)
171 local first = lines[1].lineinfo.first
172 local last = lines[#lines].lineinfo.last
173 local lineinfo = M.new_lineinfo(first, last)
-- `unpack(lines)` copies the comment lines into the array part.
174 return setmetatable({lineinfo=lineinfo, unpack(lines)}, MT.comment)
-- NOTE(review): closing `end` (orig. line 175) missing from this listing.
-- Reassemble the comment's text, reinserting one "\n" per source line
-- skipped between consecutive comment blocks.
177 function MT.comment :text()
178 local last_line = self[1].lineinfo.last.line
-- NOTE(review): orig. line 179 missing -- presumably `local acc = { }`,
-- the accumulator used below.
180 for i, line in ipairs(self) do
181 local nreturns = line.lineinfo.first.line - last_line
182 table.insert(acc, ("\n"):rep(nreturns))
183 table.insert(acc, line[1])
-- NOTE(review): orig. line 184 missing -- presumably updates
-- `last_line = line.lineinfo.last.line` and closes the loop.
185 return table.concat(acc)
-- NOTE(review): the function's closing `end` (orig. line 186) is missing.
-- A single comment line: text in [1], the long-bracket '=' count in [2]
-- (nil for short "--" comments; :extract uses [2] to tell them apart).
188 function M.new_comment_line(text, lineinfo, nequals)
189 checks('string', 'lexer.lineinfo', '?number')
190 return { lineinfo = lineinfo, text, nequals }
-- NOTE(review): closing `end` (orig. line 191) missing from this listing.
195 ----------------------------------------------------------------------
196 -- Patterns used by [lexer :extract] to decompose the raw string into
197 -- correctly tagged tokens.
198 ----------------------------------------------------------------------
-- NOTE(review): orig. line 199 missing -- presumably the table opener
-- `lexer.patterns = {` that these fields belong to.
-- Every pattern ends with `()` so a successful match also yields the
-- offset right after the matched text.
200 spaces = "^[ \r\n\t]*()",
201 short_comment = "^%-%-([^\n]*)\n?()",
202 --final_short_comment = "^%-%-([^\n]*)()$",
203 long_comment = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
204 long_string = "^%[(=*)%[\n?(.-)%]%1%]()",
205 number_mantissa = { "^%d+%.?%d*()", "^%d*%.%d+()" },
206 number_mantissa_hex = { "^%x+%.?%x*()", "^%x*%.%x+()" }, --Lua5.1 and Lua5.2
207 number_exponant = "^[eE][%+%-]?%d+()",
208 number_exponant_hex = "^[pP][%+%-]?%d+()", --Lua5.2
209 number_hex = "^0[xX]()",
210 word = "^([%a_][%w_]*)()"
-- NOTE(review): the table's closing `}` (orig. line 211) is missing.
213 ----------------------------------------------------------------------
214 -- unescape a whole string, applying [unesc_digits] and
215 -- [unesc_letter] as many times as required.
216 ----------------------------------------------------------------------
-- Unescape a short-string body: process `\z`, numeric (`\ddd`), hex
-- (`\xhh`) and single-letter (`\n`, `\t`, ...) escape sequences, in that
-- order.  Runs of backslashes are captured so that an even-length run
-- (backslashes escaping each other) protects whatever follows it.
-- NOTE(review): several structural lines (the `end`s closing the
-- even-backslash branches, the `if code > 255 then` guard, the
-- unesc_letter lookup table and the final `return s`) were dropped by
-- this listing; restored here.
local function unescape_string (s)

   -- Turn the digits of an escape sequence into the corresponding
   -- character, e.g. [unesc_digits("123") == string.char(123)].
   local function unesc_digits (backslashes, digits)
      if #backslashes%2==0 then
         -- Even number of backslashes, they escape each other, not the digits.
         -- Return them so that unesc_letter() can treat them
         return backslashes..digits
      end
      -- Remove the odd backslash, which escapes the number sequence.
      -- The rest will be returned and parsed by unesc_letter()
      backslashes = backslashes :sub (1,-2)
      -- Decode up to three decimal digits, least significant first.
      local k, j, i = digits :reverse() :byte(1, 3)
      local z = string.byte "0"
      local code = (k or z) + 10*(j or z) + 100*(i or z) - 111*z
      if code > 255 then
         error ("Illegal escape sequence '\\"..digits..
                "' in string: ASCII codes must be in [0..255]")
      end
      local c = string.char (code)
      if c == '\\' then c = '\\\\' end -- parsed by unesc_letter (test: "\092b" --> "\\b")
      return backslashes..c
   end

   -- Turn hex digits of escape sequence into char.
   local function unesc_hex(backslashes, digits)
      if #backslashes%2==0 then
         return backslashes..'x'..digits
      end
      backslashes = backslashes :sub (1,-2)
      local c = string.char(tonumber(digits,16))
      if c == '\\' then c = '\\\\' end -- parsed by unesc_letter (test: "\x5cb" --> "\\b")
      return backslashes..c
   end

   -- Handle Lua 5.2 \z sequences: an odd backslash swallows the 'z' and
   -- any whitespace that follows it; an even run keeps the text as-is.
   local function unesc_z(backslashes, more)
      if #backslashes%2==0 then
         return backslashes..more
      end
      return backslashes :sub (1,-2)
   end

   -- Take a letter [x], and returns the character represented by the
   -- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
   local function unesc_letter(x)
      local t = {
         a = "\a", b = "\b", f = "\f",
         n = "\n", r = "\r", t = "\t", v = "\v",
         ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
      return t[x] or error([[Unknown escape sequence '\]]..x..[[']])
   end

   s = s: gsub ("(\\+)(z%s*)", unesc_z) -- Lua 5.2
   s = s: gsub ("(\\+)([0-9][0-9]?[0-9]?)", unesc_digits)
   s = s: gsub ("(\\+)x([0-9a-fA-F][0-9a-fA-F])", unesc_hex) -- Lua 5.2
   s = s: gsub ("\\(%D)",unesc_letter)
   return s
end
-- Ordered list of the extraction methods tried by :extract(): comments
-- are tried before strings/words/numbers, and symbols last (so "--" is
-- never read as two '-' symbols).
-- NOTE(review): orig. line 281 missing -- presumably the opener
-- `lexer.extractors = {`.
282 "extract_long_comment", "extract_short_comment",
283 "extract_short_string", "extract_word", "extract_number",
284 "extract_long_string", "extract_symbol" }
288 ----------------------------------------------------------------------
289 -- Really extract next token from the raw string
290 -- (and update the index).
291 -- loc: offset of the position just after spaces and comments
292 -- previous_i: offset in src before extraction began
293 ----------------------------------------------------------------------
-- Extract the next non-comment token from self.src, updating self.i.
-- Comments encountered on the way are accumulated and attached to the
-- returned token's first position.
294 function lexer :extract ()
295 local attached_comments = { }
-- Wrap M.new_token: attach accumulated comments to the new token's
-- first position and cross-link it ("facing") with the lineinfo of the
-- previously extracted token.
296 local function gen_token(...)
297 local token = M.new_token(...)
298 if #attached_comments>0 then -- attach previous comments to token
299 local comments = M.new_comment(attached_comments)
300 token.lineinfo.first.comments = comments
301 if self.lineinfo_last_extracted then
302 self.lineinfo_last_extracted.comments = comments
-- NOTE(review): orig. line 303 missing -- presumably `end`.
304 attached_comments = { }
-- NOTE(review): orig. line 305 missing -- presumably `end`.
306 token.lineinfo.first.facing = self.lineinfo_last_extracted
307 self.lineinfo_last_extracted.facing = assert(token.lineinfo.first)
308 self.lineinfo_last_extracted = assert(token.lineinfo.last)
-- NOTE(review): orig. lines 309-310 missing -- presumably `return token`
-- and the closing `end` of gen_token.
311 while true do -- loop until a non-comment token is found
-- NOTE(review): orig. lines 312-313 missing from this listing.
-- Skip whitespace; patterns.spaces yields the offset after the blanks.
314 self.i = self.src:match (self.patterns.spaces, self.i)
315 if self.i>#self.src then
-- Past the end of source: emit the Eof token.
316 local fli = self.posfact :get_position (#self.src+1)
317 local lli = self.posfact :get_position (#self.src+1) -- ok?
318 local tok = gen_token("Eof", "eof", M.new_lineinfo(fli, lli))
319 tok.lineinfo.last.facing = lli
-- NOTE(review): orig. lines 320-321 missing -- presumably `return tok`
-- and the `end` of the Eof branch.
322 local i_first = self.i -- loc = position after whitespaces
-- NOTE(review): orig. line 323 missing from this listing.
324 -- try every extractor until a token is found
325 for _, extractor in ipairs(self.extractors) do
326 local tag, content, xtra = self [extractor] (self)
-- NOTE(review): orig. line 327 missing -- presumably `if tag then`.
328 local fli = self.posfact :get_position (i_first)
329 local lli = self.posfact :get_position (self.i-1)
330 local lineinfo = M.new_lineinfo(fli, lli)
331 if tag=='Comment' then
-- Adjacent short comments are merged into a single comment block.
332 local prev_comment = attached_comments[#attached_comments]
333 if not xtra -- new comment is short
334 and prev_comment and not prev_comment[2] -- prev comment is short
335 and prev_comment.lineinfo.last.line+1==fli.line then -- adjascent lines
336 -- concat with previous comment
337 prev_comment[1] = prev_comment[1].."\n"..content -- TODO quadratic, BAD!
338 prev_comment.lineinfo.last = lli
339 else -- accumulate comment
340 local comment = M.new_comment_line(content, lineinfo, xtra)
341 table.insert(attached_comments, comment)
-- NOTE(review): orig. line 342 missing -- presumably `end`.
343 break -- back to skipping spaces
344 else -- not a comment: real token, then
345 return gen_token(tag, content, lineinfo)
346 end -- if token is a comment
347 end -- if token found
348 end -- for each extractor
349 end -- while token is a comment
-- NOTE(review): the function's closing `end` (orig. line ~350) is missing.
355 ----------------------------------------------------------------------
356 -- Extract a short comment.
357 ----------------------------------------------------------------------
-- Extract a short comment ("--..." up to end of line).  Returns tag
-- 'Comment', the comment text, and nil as third result (no long-bracket
-- level: that nil is how callers tell short comments from long ones).
-- NOTE(review): the closing `end` (orig. line 362) was missing from this
-- listing; restored.
function lexer :extract_short_comment()
    -- TODO: handle final_short_comment
    local content, j = self.src :match (self.patterns.short_comment, self.i)
    if content then self.i=j; return 'Comment', content, nil end
end
364 ----------------------------------------------------------------------
365 -- Extract a long comment.
366 ----------------------------------------------------------------------
-- Extract a long comment --[==[ ... ]==].  Returns 'Comment', the body,
-- and the number of '=' signs in the brackets (a non-nil third result
-- marks the comment as long for downstream merging logic).
-- NOTE(review): the closing `end` (orig. line 370) was missing from this
-- listing; restored.
function lexer :extract_long_comment()
    local equals, content, j = self.src:match (self.patterns.long_comment, self.i)
    if j then self.i = j; return "Comment", content, #equals end
end
372 ----------------------------------------------------------------------
373 -- Extract a '...' or "..." short string.
374 ----------------------------------------------------------------------
-- Extract a '...' or "..." short string, handling escaped quotes and the
-- Lua 5.2 `\z` whitespace-eater; rejects unescaped newlines.
375 function lexer :extract_short_string()
376 local k = self.src :sub (self.i,self.i) -- first char
377 if k~=[[']] and k~=[["]] then return end -- no match'
-- NOTE(review): orig. lines 378-380 missing -- presumably
-- `local i = self.i + 1`, `local j = i` and `while true do`.
381 local x,y; x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j) -- next interesting char
-- NOTE(review): orig. line 382 missing -- presumably `if x == '\\' then`.
383 if y == 'z' then -- Lua 5.2 \z
384 j = self.src :match ("^%s*()", j+1)
-- NOTE(review): orig. line 385 missing -- presumably `else`.
386 j=j+1 -- escaped char
-- NOTE(review): orig. line 387 missing -- presumably `end`.
388 elseif x == k then break -- end of string
-- NOTE(review): orig. line 389 missing -- presumably `else`.
390 assert (not x or x=='\r' or x=='\n')
391 return nil, 'Unterminated string'
-- NOTE(review): orig. lines 392-395 missing -- the branch/loop `end`s
-- and, presumably, the update of self.i past the closing quote.
396 return 'String', unescape_string (self.src :sub (i,j-2))
-- NOTE(review): the function's closing `end` (orig. line 397) is missing.
399 ----------------------------------------------------------------------
400 -- Extract Id or Keyword.
401 ----------------------------------------------------------------------
-- Extract an identifier or keyword; words registered in self.alpha via
-- :add() are tagged 'Keyword', everything else 'Id'.
-- NOTE(review): orig. lines 404-405 (`if word then` / `self.i = j`) and
-- the closing `end`s were missing from this listing; restored.
function lexer :extract_word()
    local word, j = self.src:match (self.patterns.word, self.i)
    if word then
        self.i = j
        return (self.alpha [word] and 'Keyword' or 'Id'), word
    end
end
410 ----------------------------------------------------------------------
412 ----------------------------------------------------------------------
-- Extract a number literal: 0x-prefixed hex (with optional hex mantissa
-- and p/P binary exponent, Lua 5.2) or decimal (with optional fraction
-- and e/E exponent).  Returns 'Number' plus the tonumber() value.
-- NOTE(review): the if/else skeleton around the hex vs. decimal branches
-- and the final `self.i = j; return 'Number', n` (orig. lines 415, 418,
-- 420-421, 424, 426-427, 434-436) were missing from this listing;
-- restored here.
function lexer :extract_number()
    local j = self.src:match(self.patterns.number_hex, self.i)
    if j then
        -- "0x" seen: hex mantissa, then optional binary exponent.
        j = self.src:match (self.patterns.number_mantissa_hex[1], j) or
            self.src:match (self.patterns.number_mantissa_hex[2], j)
        if j then
            j = self.src:match (self.patterns.number_exponant_hex, j) or j
        end
    else
        -- Decimal mantissa, then optional e/E exponent.
        j = self.src:match (self.patterns.number_mantissa[1], self.i) or
            self.src:match (self.patterns.number_mantissa[2], self.i)
        if j then
            j = self.src:match (self.patterns.number_exponant, j) or j
        end
    end
    if not j then return end
    -- Number found, interpret with tonumber() and return it
    local str = self.src:sub (self.i, j-1)
    -- :TODO: tonumber on Lua5.2 floating hex may or may not work on Lua5.1
    local n = tonumber (str)
    if not n then error(str.." is not a valid number according to tonumber()") end
    self.i = j
    return 'Number', n
end
438 ----------------------------------------------------------------------
439 -- Extract long string.
440 ----------------------------------------------------------------------
-- Extract a long string [==[ ... ]==]; a leading newline is stripped by
-- the pattern, and the '=' count capture is discarded.
-- NOTE(review): the closing `end` (orig. line 444) was missing from this
-- listing; restored.
function lexer :extract_long_string()
    local _, content, j = self.src :match (self.patterns.long_string, self.i)
    if j then self.i = j; return 'String', content end
end
446 ----------------------------------------------------------------------
448 ----------------------------------------------------------------------
-- Extract a symbol / operator keyword.  Multi-char symbols registered in
-- self.sym (indexed by first char) are tried first; failing that, the
-- single punctuation char itself is returned as a keyword.
-- NOTE(review): the `if not symk then` shortcut and the single-char
-- fallback after the loop (orig. lines 452-455 and 460-464) were missing
-- from this listing; restored here -- TODO confirm against upstream.
function lexer :extract_symbol()
    local k = self.src:sub (self.i,self.i)
    local symk = self.sym [k] -- symbols starting with `k`
    if not symk then
        self.i = self.i + 1
        return 'Keyword', k
    end
    -- Try every registered multi-char symbol beginning with `k`,
    -- longest registration order as inserted by :add().
    for _, sym in pairs (symk) do
        if sym == self.src:sub (self.i, self.i + #sym - 1) then
            self.i = self.i + #sym
            return 'Keyword', sym
        end
    end
    -- No multi-char match: the single char is the symbol.
    self.i = self.i + 1
    return 'Keyword', k
end
466 ----------------------------------------------------------------------
467 -- Add a keyword to the list of keywords recognized by the lexer.
468 ----------------------------------------------------------------------
-- Add a keyword (or a table of keywords) to the lexer.  Alphanumeric
-- keywords go into self.alpha; multi-char punctuation symbols are
-- indexed in self.sym by their first char; single punctuation chars need
-- no registration (extract_symbol accepts them by default).
-- NOTE(review): the `else` wrapping the non-table branch, the
-- `local k = w:sub(1,1)` that the self.sym lookup below needs, and the
-- closing `end`s (orig. lines 473, 476, 482-483) were missing from this
-- listing; restored here.
function lexer :add (w, ...)
    assert(not ..., "lexer :add() takes only one arg, although possibly a table")
    if type (w) == "table" then
        for _, x in ipairs (w) do self :add (x) end
    else
        if w:match (self.patterns.word .. "$") then self.alpha [w] = true
        elseif w:match "^%p%p+$" then
            local k = w :sub (1,1)
            local list = self.sym [k]
            if not list then list = { }; self.sym [k] = list end
            table.insert (list, w)
        elseif w:match "^%p$" then return
        else error "Invalid keyword" end
    end
end
485 ----------------------------------------------------------------------
486 -- Return the [n]th next token, without consuming it.
487 -- [n] defaults to 1. If it goes pass the end of the stream, an EOF
488 -- token is returned.
489 ----------------------------------------------------------------------
-- Return the [n]th next token without consuming it (n defaults to 1),
-- extracting and caching tokens in self.peeked on demand.  Past the end
-- of the stream this keeps returning the Eof token.
-- NOTE(review): the two `end`s closing the if/for (orig. lines 495-496)
-- were missing from this listing; restored.
function lexer :peek (n)
    if not n then n=1 end
    if n > #self.peeked then
        for i = #self.peeked+1, n do
            self.peeked [i] = self :extract()
        end
    end
    return self.peeked [n]
end
500 ----------------------------------------------------------------------
501 -- Return the [n]th next token, removing it as well as the 0..n-1
502 -- previous tokens. [n] defaults to 1. If it goes pass the end of the
503 -- stream, an EOF token is returned.
504 ----------------------------------------------------------------------
-- Consume and return the [n]th token (default 1), discarding the tokens
-- before it; records the consumed token's end position in
-- lineinfo_last_consumed (read back by :lineinfo_left()).
505 function lexer :next (n)
-- NOTE(review): orig. lines 506-509 missing -- presumably the `n` default,
-- a priming `self :peek (n)` call, `local a` and the consuming loop
-- header that the statements below belong to.
510 a = table.remove (self.peeked, 1)
511 -- TODO: is this used anywhere? I think not. a.lineinfo.last may be nil.
512 --self.lastline = a.lineinfo.last.line
-- NOTE(review): orig. line 513 missing -- presumably a guard on `a`.
514 self.lineinfo_last_consumed = a.lineinfo.last
-- NOTE(review): orig. lines 515-517 missing -- presumably the loop end,
-- `return a` and the function's closing `end`.
518 ----------------------------------------------------------------------
519 -- Returns an object which saves the stream's current state.
520 ----------------------------------------------------------------------
521 -- FIXME there are more fields than that to save
-- Snapshot the stream's state: the current source offset plus a shallow
-- copy of the already-peeked (not yet consumed) tokens.
-- FIXME there are more fields than that to save
function lexer :save ()
   local pending = { }
   for idx, tok in ipairs (self.peeked) do pending[idx] = tok end
   return { self.i; pending }
end
524 ----------------------------------------------------------------------
525 -- Restore the stream's state, as saved by method [save].
526 ----------------------------------------------------------------------
527 -- FIXME there are more fields than that to restore
-- Restore the stream's state from a snapshot produced by :save().
-- FIXME there are more fields than that to restore
function lexer :restore (s)
   self.i      = s[1]
   self.peeked = s[2]
end
530 ----------------------------------------------------------------------
531 -- Resynchronize: cancel any token in self.peeked, by emptying the
532 -- list and resetting the indexes
533 ----------------------------------------------------------------------
-- Cancel every peeked token and rewind self.i to the first of them (or
-- to the start of its attached comments), so the next :extract() rereads
-- them from the source.
534 function lexer :sync()
535 local p1 = self.peeked[1]
-- NOTE(review): orig. line 536 missing -- presumably `if p1 then`, since
-- the fields below would fail when nothing was peeked.
537 local li_first = p1.lineinfo.first
-- Rewind to the start of the token's leading comments, if any.
538 if li_first.comments then li_first=li_first.comments.lineinfo.first end
539 self.i = li_first.offset
540 self.column_offset = self.i - li_first.column
-- NOTE(review): orig. line 541 missing -- presumably `self.peeked = { }`.
542 self.attached_comments = p1.lineinfo.first.comments or { }
-- NOTE(review): orig. lines 543-545 missing -- the closing `end`s.
546 ----------------------------------------------------------------------
547 -- Take the source and offset of an old lexer.
548 ----------------------------------------------------------------------
-- Take over the source and reading state of lexer `old`; both streams
-- are :sync()'ed first so no peeked token is lost.  Returns self --
-- :newstream() chains on this return value.
-- NOTE(review): the loop `end` and `return self` (orig. lines 553-555)
-- were missing from this listing; restored (the `return setmetatable(...)
-- :takeover(...)` call in :newstream requires the return value).
function lexer :takeover(old)
    self :sync(); old :sync()
    for _, field in ipairs{ 'i', 'src', 'attached_comments', 'posfact' } do
        self[field] = old[field]
    end
    return self
end
557 ----------------------------------------------------------------------
558 -- Return the current position in the sources. This position is between
559 -- two tokens, and can be within a space / comment area, and therefore
560 -- have a non-null width. :lineinfo_left() returns the beginning of the
561 -- separation area, :lineinfo_right() returns the end of that area.
563 -- ____ last consummed token ____ first unconsummed token
565 -- XXXXX <spaces and comments> YYYYY
567 -- :lineinfo_left() :lineinfo_right()
568 ----------------------------------------------------------------------
-- Position at the right of the current inter-token gap: the first
-- position of the next, not-yet-consumed token.
-- NOTE(review): the closing `end` (orig. line 571) was missing from this
-- listing; restored.
function lexer :lineinfo_right()
    return self :peek(1).lineinfo.first
end
-- Position at the left of the current inter-token gap: the last position
-- of the most recently consumed token (set by :next()).
-- NOTE(review): the closing `end` (orig. line 575) was missing from this
-- listing; restored.
function lexer :lineinfo_left()
    return self.lineinfo_last_consumed
end
577 ----------------------------------------------------------------------
578 -- Create a new lexstream.
579 ----------------------------------------------------------------------
-- Create a new lexstream from a source string, or share state with an
-- existing stream (table argument).
580 function lexer :newstream (src_or_stream, name)
-- NOTE(review): orig. line 581 missing -- presumably a default for `name`.
582 if type(src_or_stream)=='table' then -- it's a stream
-- Clone-and-takeover: the new stream shares source/position state.
583 return setmetatable ({ }, self) :takeover (src_or_stream)
584 elseif type(src_or_stream)=='string' then -- it's a source string
585 local src = src_or_stream
586 local pos1 = M.new_position(1, 1, 1, name)
-- NOTE(review): orig. line 587 missing -- presumably `local stream = {`,
-- the table the fields below belong to.
588 src_name = name; -- Name of the file
589 src = src; -- The source, as a single string
590 peeked = { }; -- Already peeked, but not discarded yet, tokens
591 i = 1; -- Character offset in src
592 attached_comments = { },-- comments accumulator
593 lineinfo_last_extracted = pos1,
594 lineinfo_last_consumed = pos1,
595 posfact = M.new_position_factory (src_or_stream, name)
-- NOTE(review): orig. line 596 missing -- presumably the closing `}`.
597 setmetatable (stream, self)
-- NOTE(review): orig. line 598 missing from this listing.
599 -- Skip initial sharp-bang for Unix scripts
600 -- FIXME: redundant with mlp.chunk()
601 if src and src :match "^#!" then
602 local endofline = src :find "\n"
603 stream.i = endofline and (endofline + 1) or #src
-- NOTE(review): orig. lines 604-606 missing -- presumably `end`,
-- `return stream` and the `else` of the outer type dispatch.
607 assert(false, ":newstream() takes a source string or a stream, not a "..
-- NOTE(review): orig. lines 608-610 missing -- presumably the
-- `type(src_or_stream))` argument and the closing `end`s.
612 ----------------------------------------------------------------------
613 -- If there's no ... args, return the token a (whose truth value is
614 -- true) if it's a `Keyword{ }, or nil. If there are ... args, they
615 -- have to be strings. if the token a is a keyword, and it's content
616 -- is one of the ... args, then returns it (it's truth value is
617 -- true). If no a keyword or not in ..., return nil.
618 ----------------------------------------------------------------------
-- If no ... args: return the keyword token's content (truthy) or false
-- if `a` is not a Keyword token.  With ... args (strings): return the
-- content iff it equals one of them, false otherwise.
-- NOTE(review): `local words = {...}` and the trailing `return false`
-- plus closing `end` (orig. lines 621, 625-628) were missing from this
-- listing; restored.
function lexer :is_keyword (a, ...)
    if not a or a.tag ~= "Keyword" then return false end
    local words = {...}
    if #words == 0 then return a[1] end
    for _, w in ipairs (words) do
        if w == a[1] then return w end
    end
    return false
end
629 ----------------------------------------------------------------------
630 -- Cause an error if the next token isn't a keyword whose content
631 -- is listed among ... args (which have to be strings).
632 ----------------------------------------------------------------------
-- Consume the next token and require it to be a Keyword whose content
-- is among the ... args (all strings); return the matched content, or
-- raise a descriptive error.  With no ... args any keyword is accepted.
-- NOTE(review): `local words = {...}` and the final fallthrough `err()`
-- plus closing `end` (orig. lines 634, 644-646) were missing from this
-- listing; restored.
function lexer :check (...)
    local words = {...}
    local a = self :next()
    local function err ()
        error ("Got " .. tostring (a) ..
               ", expected one of these keywords : '" ..
               table.concat (words,"', '") .. "'") end
    if not a or a.tag ~= "Keyword" then err () end
    if #words == 0 then return a[1] end
    for _, w in ipairs (words) do
        if w == a[1] then return w end
    end
    err ()
end
648 ----------------------------------------------------------------------
650 ----------------------------------------------------------------------
-- Create an independent copy of this lexer generator: the keyword and
-- symbol tables are copied so additions to the clone don't leak into the
-- original; the clone chains to self via its metatable and serves as
-- __index for its own streams.
-- NOTE(review): `return clone` and the closing `end` (orig. lines
-- 658-659) were missing from this listing; restored.
function lexer :clone()
    local alpha_clone, sym_clone = { }, { }
    for word in pairs(self.alpha) do alpha_clone[word]=true end
    for letter, list in pairs(self.sym) do sym_clone[letter] = { unpack(list) } end
    local clone = { alpha=alpha_clone, sym=sym_clone }
    setmetatable(clone, self)
    clone.__index = clone
    return clone
end
661 ----------------------------------------------------------------------
662 -- Cancel everything left in a lexer, all subsequent attempts at
663 -- `:peek()` or `:next()` will return `Eof`.
664 ----------------------------------------------------------------------
665 function lexer :kill()
668 self.attached_comments = { }
669 self.lineinfo_last = self.posfact :get_position (#self.src+1)