--
-- TODO:
--
--- * Make it possible to change lexer on the fly. This implies the
--- ability to easily undo any pre-extracted tokens;
---
-- * Make it easy to define new flavors of strings. Replacing the
-- lexer.patterns.long_string regexp by an extensible list, with
-- customizable token tag, would probably be enough. Maybe add:
-- + an index of capture for the regexp, that would specify
-- which capture holds the content of the string-like token
-- + a token tag
-- + or a string->string transformer function.
+--
+-- * There are some _G.table references, added to prevent a namespace
+-- clash which has now disappeared; remove them.
----------------------------------------------------------------------
--
-- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
final_short_comment = "^%-%-([^\n]*)()$",
long_comment = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
long_string = "^%[(=*)%[\n?(.-)%]%1%]()",
- number_mantissa = {
- "^%d+%.?%d*()",
- "^%d*%d%.%d+()" },
- number_exponant = "^[eE][%+%-]?%d+()",
- word = "^([%a_][%w_]*)()"
+ number_mantissa = { "^%d+%.?%d*()", "^%d*%.%d+()" },
+ number_exponant = "^[eE][%+%-]?%d+()",
+ number_hex = "^0[xX]%x+()",
+ word = "^([%a_][%w_]*)()"
}
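+
+-- Illustrative probe of the patterns above (not part of the module;
+-- the number patterns end with an empty capture `()`, so a successful
+-- match returns the offset just past the number):
+--
+--   ("0xFF"):match (lexer.patterns.number_hex)         -- => 5
+--   (".5") :match (lexer.patterns.number_mantissa[2])  -- => 3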
----------------------------------------------------------------------
--- Take a letter [x], and returns the character represented by the
--- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
+-- Unescape a whole string, applying [unesc_digits] and
+-- [unesc_letter] as many times as required.
----------------------------------------------------------------------
-local function unesc_letter(x)
- local t = {
- a = "\a", b = "\b", f = "\f",
- n = "\n", r = "\r", t = "\t", v = "\v",
- ["\\"] = "\\", ["'"] = "'", ['"'] = '"' }
- return t[x] or error("Unknown escape sequence \\"..x)
-end
+local function unescape_string (s)
-----------------------------------------------------------------------
--- Turn the digits of an escape sequence into the corresponding
--- character, e.g. [unesc_digits("123") == string.char(123)].
-----------------------------------------------------------------------
-local function unesc_digits (x)
- local k, j, i = x:reverse():byte(1, 3)
- local z = _G.string.byte "0"
- return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
-end
+ -- Turn the digits of an escape sequence into the corresponding
+ -- character, e.g. [unesc_digits("123") == string.char(123)].
+ local function unesc_digits (x)
+ local k, j, i = x:reverse():byte(1, 3)
+ local z = _G.string.byte "0"
+ return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
+ end
-----------------------------------------------------------------------
--- unescape a whole string, applying [unesc_digits] and [unesc_letter]
--- as many times as required.
-----------------------------------------------------------------------
-local function unescape_string (s)
- return s:gsub("\\([0-9]+)", unesc_digits):gsub("\\(.)",unesc_letter)
+ -- Take a letter [x] and return the character represented by the
+ -- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
+ local function unesc_letter(x)
+ local t = {
+ a = "\a", b = "\b", f = "\f",
+ n = "\n", r = "\r", t = "\t", v = "\v",
+ ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
+ return t[x] or error([[Unknown escape sequence '\]]..x..[[']])
+ end
+
+ return s
+ :gsub ("\\(%D)",unesc_letter)
+ :gsub ("\\([0-9][0-9]?[0-9]?)", unesc_digits)
end
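+
+-- For instance (illustrative only), [unescape_string [[A\tB\10C]]]
+-- returns "A<TAB>B<NEWLINE>C": the first gsub pass expands the letter
+-- escape \t, the second pass decodes the decimal escape \10.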
lexer.extractors = {
-- __tostring = function(a)
-- return string.format ("`%s{'%s'}",a.tag, a[1])
-- end
- }
+}
+
+lexer.lineinfo_metatable = { }
----------------------------------------------------------------------
-- Really extract next token from the raw string
-- (and update the index).
+-- loc: offset of the position just after spaces and comments
+-- previous_i: offset in src before extraction began
----------------------------------------------------------------------
function lexer:extract ()
local previous_i = self.i
- local loc, eof, token = self.i
-
- local function tk (tag, content)
+ local loc = self.i
+ local eof, token
+
+ -- Put line info, comments and metatable around the tag and content
+ -- provided by extractors, thus returning a complete lexer token.
+ -- first_line: line # at the beginning of token
+ -- first_column_offset: char # of the last '\n' before beginning of token
+ -- i: scans from beginning of prefix spaces/comments to end of token.
+ local function build_token (tag, content)
assert (tag and content)
- local i, ln = previous_i, self.line
- -- update line numbers
+ local i, first_line, first_column_offset, previous_line_length =
+ previous_i, self.line, self.column_offset, nil
+
+ -- update self.line and first_line. i := indexes of '\n' chars
while true do
- i = self.src:find("\n", i+1, true)
- if not i then break end
- if loc and i <= loc then ln = ln+1 end
- if i <= self.i then self.line = self.line+1 else break end
+ i = self.src :find ("\n", i+1, true)
+ if not i or i>self.i then break end -- no more '\n' until end of token
+ previous_line_length = i - self.column_offset
+ if loc and i <= loc then -- '\n' before beginning of token
+ first_column_offset = i
+ first_line = first_line+1
+ end
+ self.line = self.line+1
+ self.column_offset = i
end
- local a = { tag = tag, char=loc, line=ln, content }
+
+ -- lineinfo entries: [1]=line, [2]=column, [3]=char, [4]=filename
+ local fli = { first_line, loc-first_column_offset, loc, self.src_name }
+ local lli = { self.line, self.i-self.column_offset-1, self.i-1, self.src_name }
+ -- Pluto barfs when the metatable is set :(
+ setmetatable(fli, lexer.lineinfo_metatable)
+ setmetatable(lli, lexer.lineinfo_metatable)
+ local a = { tag = tag, lineinfo = { first=fli, last=lli }, content }
+ if lli[2]==-1 then lli[1], lli[2] = lli[1]-1, previous_line_length-1 end
if #self.attached_comments > 0 then
- a.comments = self.attached_comments
- self.attached_comments = nil
+ a.lineinfo.comments = self.attached_comments
+ fli.comments = self.attached_comments
+ if self.lineinfo_last then
+ self.lineinfo_last.comments = self.attached_comments
+ end
end
+ self.attached_comments = { }
return setmetatable (a, self.token_metatable)
- end
+ end --</function build_token>
- self.attached_comments = { }
-
for ext_idx, extractor in ipairs(self.extractors) do
-- printf("method = %s", method)
- local tag, content = self[extractor](self)
- -- [loc] is placed just after the leading whitespaces and comments,
- -- and the whitespace extractor is at index 1.
+ local tag, content = self [extractor] (self)
+ -- [loc] is placed just after the leading whitespaces and comments;
+ -- for this to work, the whitespace extractor *must be* at index 1.
if ext_idx==1 then loc = self.i end
if tag then
--printf("`%s{ %q }\t%i", tag, content, loc);
- return tk (tag, content)
+ return build_token (tag, content)
end
end
- error "Cant extract anything!"
+ error "None of the lexer extractors returned anything!"
end
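+
+-- Shape of the tokens built above, as an illustration (hypothetical
+-- source name, and assuming "local" has been registered as a keyword
+-- with lexer:add): extracting "local" at the very start of "foo.lua"
+-- yields
+--
+--   { tag = "Keyword", "local",
+--     lineinfo = { first = { 1, 1, 1, "foo.lua" },
+--                  last  = { 1, 5, 5, "foo.lua" } } }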
----------------------------------------------------------------------
-- skip whites and comments
-- FIXME: doesn't take into account:
-- - unterminated long comments
--- - short comments without a final \n
+-- - short comments on the last line, without a final \n
----------------------------------------------------------------------
function lexer:skip_whitespaces_and_comments()
- local attached_comments = { }
- repeat
+ local table_insert = _G.table.insert
+ repeat -- loop as long as a space or comment chunk is found
local _, j
local again = false
local last_comment_content = nil
-- skip spaces
self.i = self.src:match (self.patterns.spaces, self.i)
-- skip a long comment if any
- _, last_comment_content, j = self.src:match (self.patterns.long_comment, self.i)
+ _, last_comment_content, j =
+ self.src :match (self.patterns.long_comment, self.i)
if j then
- _G.table.insert(self.attached_comments,
+ table_insert(self.attached_comments,
{last_comment_content, self.i, j, "long"})
self.i=j; again=true
end
-- skip a short comment if any
last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
if j then
- _G.table.insert(attached_comments,
+ table_insert(self.attached_comments,
{last_comment_content, self.i, j, "short"})
self.i=j; again=true
end
end
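+
+-- Each attached comment is recorded as { comment_text, start_offset,
+-- end_offset, "short" | "long" }; build_token() then hangs the
+-- accumulated list onto the surrounding tokens' lineinfo. E.g. lexing
+-- "--hi\nx" would attach roughly { "hi", 1, 6, "short" } to the token
+-- for [x] (exact offsets depend on the short_comment pattern).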
----------------------------------------------------------------------
---
+-- Extract a '...' or "..." short string.
----------------------------------------------------------------------
function lexer:extract_short_string()
-- [k] is the first unread char, [self.i] points to [k] in [self.src]
- local j, k = self.i, self.src:sub (self.i,self.i)
- if k=="'" or k=='"' then
- -- short string
- repeat
- self.i=self.i+1;
- local kk = self.src:sub (self.i, self.i)
- if kk=="\\" then
- self.i=self.i+1;
- kk = self.src:sub (self.i, self.i)
- end
- if self.i > #self.src then error "Unterminated string" end
- if self.i == "\r" or self.i == "\n" then error "no \\n in short strings!" end
- until self.src:sub (self.i, self.i) == k
- and ( self.src:sub (self.i-1, self.i-1) ~= '\\'
- or self.src:sub (self.i-2, self.i-2) == '\\')
- self.i=self.i+1
- return "String", unescape_string (self.src:sub (j+1,self.i-2))
- end
+ local j, k = self.i, self.src :sub (self.i,self.i)
+ if k~="'" and k~='"' then return end
+ local i = self.i + 1
+ local j = i
+ while true do
+ -- k = opening char: either simple-quote or double-quote
+ -- i = index of beginning-of-string
+ -- x = next "interesting" character
+ -- j = position after interesting char
+ -- y = char just after x
+ local x, y
+ x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j)
+ if x == '\\' then j=j+1 -- don't parse escaped char
+ elseif x == k then break -- unescaped end of string
+ else -- eof or '\r' or '\n' reached before end of string
+ assert (not x or x=="\r" or x=="\n")
+ error "Unterminated string"
+ end
+ end
+ self.i = j
+
+ return "String", unescape_string (self.src:sub (i,j-2))
end
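+
+-- Behavioral sketch (hypothetical input): with self.src = [["a\tb" ..]]
+-- and self.i = 1, this returns "String", "a<TAB>b" and leaves self.i on
+-- the first character after the closing double-quote.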
----------------------------------------------------------------------
----------------------------------------------------------------------
function lexer:extract_number()
-- Number
- local j = self.src:match (self.patterns.number_mantissa[1], self.i) or
- self.src:match (self.patterns.number_mantissa[2], self.i)
- if j then
- j = self.src:match (self.patterns.number_exponant, j) or j;
- local n = tonumber (self.src:sub (self.i, j-1))
- self.i = j
- return "Number", n
+ local j = self.src:match(self.patterns.number_hex, self.i)
+ if not j then
+ j = self.src:match (self.patterns.number_mantissa[1], self.i) or
+ self.src:match (self.patterns.number_mantissa[2], self.i)
+ if j then
+ j = self.src:match (self.patterns.number_exponant, j) or j;
+ end
end
+ if not j then return end
+ -- Number found, interpret with tonumber() and return it
+ local n = tonumber (self.src:sub (self.i, j-1))
+ self.i = j
+ return "Number", n
end
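+
+-- Illustrative results (hypothetical inputs starting at self.i):
+--   "0x1F"  => "Number", 31    (via the new number_hex pattern)
+--   "1.5e3" => "Number", 1500  (mantissa + exponent patterns)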
----------------------------------------------------------------------
----------------------------------------------------------------------
-- Add a keyword to the list of keywords recognized by the lexer.
----------------------------------------------------------------------
-function lexer:add (w)
+function lexer:add (w, ...)
+ assert(not ..., "lexer:add() takes only one arg, although possibly a table")
if type (w) == "table" then
for _, x in ipairs (w) do self:add (x) end
else
-- token is returned.
----------------------------------------------------------------------
function lexer:peek (n)
- assert(self)
if not n then n=1 end
if n > #self.peeked then
for i = #self.peeked+1, n do
-- stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer:next (n)
- if not n then n=1 end
+ n = n or 1
self:peek (n)
local a
for i=1,n do
a = _G.table.remove (self.peeked, 1)
- if a then debugf ("[L:%i K:%i T:%s %q]", a.line or -1, a.char or -1, a.tag or '<none>', a[1]) end
+ if a then
+ --debugf ("lexer:next() ==> %s %s",
+ -- table.tostring(a), tostring(a))
+ end
+ self.lastline = a.lineinfo.last[1]
end
+ self.lineinfo_last = a.lineinfo.last
return a or eof_token
end
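+
+-- Usage sketch (hypothetical source string): peek() inspects without
+-- consuming, next() consumes and updates lineinfo_last:
+--
+--   local s = lexer:newstream ("x = 1", "sketch.lua")
+--   print (s:peek().tag) -- "Id"; the stream has not advanced
+--   print (s:next().tag) -- "Id"; the stream is now past [x]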
----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
----------------------------------------------------------------------
+-- FIXME there are more fields than that to save
function lexer:save () return { self.i; _G.table.cat(self.peeked) } end
----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
+-- FIXME there are more fields than that to restore
function lexer:restore (s) self.i=s[1]; self.peeked=s[2] end
+----------------------------------------------------------------------
+-- Resynchronize: cancel any token in self.peeked by emptying the
+-- list and resetting the source position indexes.
+----------------------------------------------------------------------
+function lexer:sync()
+ local p1 = self.peeked[1]
+ if p1 then
+ local li = p1.lineinfo.first
+ self.line, self.i = li[1], li[3]
+ self.column_offset = self.i - li[2]
+ self.peeked = { }
+ self.attached_comments = p1.lineinfo.first.comments or { }
+ end
+end
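+
+-- Note: sync() rewinds the source position to the beginning of the
+-- first peeked token, leading comments included, so that another lexer
+-- can re-tokenize from there; see takeover() below.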
+
+----------------------------------------------------------------------
+-- Take over the source, position and pending comments of an old lexer.
+----------------------------------------------------------------------
+function lexer:takeover(old)
+ self:sync()
+ self.line, self.column_offset, self.i, self.src, self.attached_comments =
+ old.line, old.column_offset, old.i, old.src, old.attached_comments
+ return self
+end
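+
+-- Together with sync(), this is what allows changing the lexer on the
+-- fly (the TODO item crossed off above). A minimal sketch, with
+-- hypothetical lexer tables [lexer_a] and [lexer_b]:
+--
+--   local s1 = lexer_a:newstream ("some source code", "f.lua")
+--   -- ... consume a few tokens with s1:next() ...
+--   local s2 = lexer_b:newstream (s1) -- s2 takes over s1's position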
+
+-- function lexer:lineinfo()
+-- if self.peeked[1] then return self.peeked[1].lineinfo.first
+-- else return { self.line, self.i-self.column_offset, self.i } end
+-- end
+
+
+----------------------------------------------------------------------
+-- Return the current position in the sources. This position lies
+-- between two tokens; since it may span a spaces/comments area, it can
+-- have a non-zero width. :lineinfo_left() returns the beginning of the
+-- separation area, :lineinfo_right() returns the end of that area.
+--
+--        ____ last consumed token        ____ first unconsumed token
+--       /                               /
+--    XXXXX   <spaces and comments>   YYYYY
+--        \____                      \____
+--             :lineinfo_left()           :lineinfo_right()
+----------------------------------------------------------------------
+function lexer:lineinfo_right()
+ return self:peek(1).lineinfo.first
+end
+
+function lexer:lineinfo_left()
+ return self.lineinfo_last
+end
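+
+-- These are convenient for error reporting between tokens, e.g.
+-- (sketch; lineinfo entries are [1]=line, [2]=column, [3]=char,
+-- [4]=filename):
+--
+--   local li = stream:lineinfo_right()
+--   error (string.format ("%s:%i: unexpected token", li[4], li[1]))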
+
----------------------------------------------------------------------
-- Create a new lexstream.
----------------------------------------------------------------------
-function lexer:newstream (src)
- local stream = {
- src = src; -- The source, as a single string
- peeked = { }; -- Already peeked, but not discarded yet, tokens
- i = 1; -- Character offset in src
- line = 1; -- current line number
- }
- setmetatable (stream, self)
-
- -- skip initial sharp-bang for unix scripts
- if src:match "^#!" then stream.i = src:find "\n" + 1 end
- return stream
+function lexer:newstream (src_or_stream, name)
+ name = name or "?"
+ if type(src_or_stream)=='table' then -- it's a stream
+ return setmetatable ({ }, self) :takeover (src_or_stream)
+ elseif type(src_or_stream)=='string' then -- it's a source string
+ local src = src_or_stream
+ local stream = {
+ src_name = name; -- Name of the file
+ src = src; -- The source, as a single string
+ peeked = { }; -- Already peeked, but not discarded yet, tokens
+ i = 1; -- Character offset in src
+ line = 1; -- Current line number
+ column_offset = 0; -- distance from beginning of file to last '\n'
+ attached_comments = { }; -- comments accumulator
+ lineinfo_last = { 1, 1, 1, name }
+ }
+ setmetatable (stream, self)
+
+ -- skip initial sharp-bang for unix scripts
+ -- FIXME: redundant with mlp.chunk()
+ if src and src :match "^#" then stream.i = src :find "\n" + 1 end
+ return stream
+ else
+ assert(false, ":newstream() takes a source string or a stream, not a "..
+ type(src_or_stream))
+ end
end
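+
+-- Minimal construction sketch (hypothetical chunk name); keywords must
+-- still be registered with lexer:add() before any `Keyword` token can
+-- be produced:
+--
+--   local stream = lexer:newstream ("local x = 1", "sketch.lua")
+--   repeat
+--     local t = stream:next()
+--     print (t.tag, t[1])
+--   until t.tag == "Eof"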
----------------------------------------------------------------------
clone.__index = clone
return clone
end
-
-----------------------------------------------------------------------
---
-----------------------------------------------------------------------
-function is_stream (x)
- return getmetable(x) == lexer
-end