--
-- TODO:
--
--- * Make it possible to change lexer on the fly. This implies the
--- ability to easily undo any pre-extracted tokens;
---
-- * Make it easy to define new flavors of strings. Replacing the
-- lexer.patterns.long_string regexp by an extensible list, with
-- customizable token tag, would probably be enough. Maybe add:
final_short_comment = "^%-%-([^\n]*)()$",
long_comment = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
long_string = "^%[(=*)%[\n?(.-)%]%1%]()",
- number_mantissa = {
- "^%d+%.?%d*()",
- "^%d*%d%.%d+()" },
- number_exponant = "^[eE][%+%-]?%d+()",
- number_hex = "^0[xX]%x+()",
- word = "^([%a_][%w_]*)()"
+ number_mantissa = { "^%d+%.?%d*()", "^%d*%.%d+()" },
+ number_exponant = "^[eE][%+%-]?%d+()",
+ number_hex = "^0[xX]%x+()",
+ word = "^([%a_][%w_]*)()"
}
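
-- A quick sketch of how the patterns above behave, assuming they live in
-- [lexer.patterns] as mentioned in the TODO. The empty '()' captures
-- return the position just past the match:
--
--   ("[==[hi]==]") :match (lexer.patterns.long_string)
--      --> "==", "hi", 11   (level, content, position past the match)
--   ("0xFF")  :match (lexer.patterns.number_hex)  --> 5
--   ("foo_1") :match (lexer.patterns.word)        --> "foo_1", 6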
-
-
----------------------------------------------------------------------
-- unescape a whole string, applying [unesc_digits] and
-- [unesc_letter] as many times as required.
a = "\a", b = "\b", f = "\f",
n = "\n", r = "\r", t = "\t", v = "\v",
["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
- return t[x] or error("Unknown escape sequence \\"..x)
+ return t[x] or error([[Unknown escape sequence '\]]..x..[[']])
end
return s
- :gsub ("\\([0-9]+)", unesc_digits)
- :gsub ("\\(.)",unesc_letter)
+ :gsub ("\\(%D)",unesc_letter)
+ :gsub ("\\([0-9][0-9]?[0-9]?)", unesc_digits)
end
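
-- A sketch of the two-pass unescaping above ([unescape] stands for the
-- enclosing function, whatever its actual name; [unesc_digits] is assumed
-- to turn decimal codes into characters, as in Lua's native escapes):
--
--   unescape [[\ndone]]    --> "\ndone"  (a real newline: letter escape)
--   unescape [[\104\105]]  --> "hi"      (chars 104 and 105: digit escapes)
--
-- The non-digit pass runs first, and '[0-9][0-9]?[0-9]?' caps a decimal
-- escape at three digits, as Lua itself does.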
lexer.extractors = {
-- __tostring = function(a)
-- return string.format ("`%s{'%s'}",a.tag, a[1])
-- end
- }
+}
+
+lexer.lineinfo_metatable = { }
----------------------------------------------------------------------
-- Really extract next token from the raw string
local loc = self.i
local eof, token
- -- Put line info, comments and metatable arount the tag and content
+ -- Put line info, comments and metatable around the tag and content
-- provided by extractors, thus returning a complete lexer token.
+ -- first_line: line # at the beginning of token
+ -- first_column_offset: char # of the last '\n' before beginning of token
+ -- i: scans from beginning of prefix spaces/comments to end of token.
local function build_token (tag, content)
assert (tag and content)
- local i, first_line, first_column_offset =
- previous_i, self.line, self.column_offset
+ local i, first_line, first_column_offset, previous_line_length =
+ previous_i, self.line, self.column_offset, nil
+
-- update self.line and first_line. i := indexes of '\n' chars
while true do
i = self.src :find ("\n", i+1, true)
- if not i then break end
- if loc and i <= loc then
+ if not i or i>self.i then break end -- no more '\n' until end of token
+ previous_line_length = i - self.column_offset
+ if loc and i <= loc then -- '\n' before beginning of token
first_column_offset = i
first_line = first_line+1
end
- if i <= self.i then
- self.line = self.line+1
- self.column_offset = i
- else break end
+ self.line = self.line+1
+ self.column_offset = i
end
- local a = { --char = loc, line = self.line,
- tag = tag,
- lineinfo = {
- first = { first_line, loc - first_column_offset, loc },
- last = { self.line, self.i - self.column_offset, self.i } },
- content }
+
+ -- lineinfo entries: [1]=line, [2]=column, [3]=char, [4]=filename
+ local fli = { first_line, loc-first_column_offset, loc, self.src_name }
+ local lli = { self.line, self.i-self.column_offset-1, self.i-1, self.src_name }
+ -- Pluto barfs when the metatable is set :(
+ setmetatable(fli, lexer.lineinfo_metatable)
+ setmetatable(lli, lexer.lineinfo_metatable)
+ local a = { tag = tag, lineinfo = { first=fli, last=lli }, content }
+ -- a column of -1 means the '\n' at [self.i] was already counted: move
+ -- [last] back to the end of the previous line
+ if lli[2]==-1 then lli[1], lli[2] = lli[1]-1, previous_line_length-1 end
if #self.attached_comments > 0 then
a.lineinfo.comments = self.attached_comments
- self.attached_comments = nil
+ fli.comments = self.attached_comments
+ if self.lineinfo_last then
+ self.lineinfo_last.comments = self.attached_comments
+ end
end
+ self.attached_comments = { }
return setmetatable (a, self.token_metatable)
end --</function build_token>
- self.attached_comments = { }
-
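
-- A sketch of the token built above: for an identifier "foo" opening a
-- file named "sample.lua", build_token ("Id", "foo") returns roughly:
--
--   { tag      = "Id",
--     lineinfo = { first = { 1, 1, 1, "sample.lua" },
--                  last  = { 1, 3, 3, "sample.lua" } },
--     "foo" }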
for ext_idx, extractor in ipairs(self.extractors) do
-- printf("extractor = %s", extractor)
local tag, content = self [extractor] (self)
----------------------------------------------------------------------
function lexer:skip_whitespaces_and_comments()
local table_insert = _G.table.insert
- local attached_comments = { }
repeat -- loop as long as a space or comment chunk is found
local _, j
local again = false
-- skip a short comment if any
last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
if j then
- table_insert(attached_comments,
+ table_insert(self.attached_comments,
{last_comment_content, self.i, j, "short"})
self.i=j; again=true
end
-- y = char just after x
local x, y
x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j)
- if x == '\\' then j=j+1 -- don't parse escaped char
+ if x == '\\' then j=j+1 -- don't parse escaped char
elseif x == k then break -- unescaped end of string
- else -- end of source or \r/\n before end of string
+ else -- eof or '\r' or '\n' reached before end of string
assert (not x or x=="\r" or x=="\n")
error "Unterminated string"
end
-- token is returned.
----------------------------------------------------------------------
function lexer:peek (n)
- assert(self)
if not n then n=1 end
if n > #self.peeked then
for i = #self.peeked+1, n do
-- stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer:next (n)
- if not n then n=1 end
+ n = n or 1
self:peek (n)
local a
for i=1,n do
a = _G.table.remove (self.peeked, 1)
if a then
- debugf ("lexer:next() ==> %s",
- table.tostring(a))
+ --debugf ("lexer:next() ==> %s %s",
+ -- table.tostring(a), tostring(a))
end
self.lastline = a.lineinfo.last[1]
end
+ self.lineinfo_last = a.lineinfo.last
return a or eof_token
end
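
-- A sketch of [peek] vs [next], assuming [stream] was built with
-- :newstream() on a concrete lexer derived from this module:
--
--   local tok = stream:peek ()     -- next token, not consumed yet
--   assert (tok == stream:peek ()) -- peeking again yields the same token
--   tok = stream:next ()           -- same token, now consumed
--   tok = stream:next (2)          -- consume two more, return the second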
----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
----------------------------------------------------------------------
+-- FIXME there are more fields than that to save
function lexer:save () return { self.i; _G.table.cat(self.peeked) } end
----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
+-- FIXME there are more fields than that to restore
function lexer:restore (s) self.i=s[1]; self.peeked=s[2] end
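
-- A sketch of save/restore for speculative parsing (keeping the FIXMEs
-- above in mind: only [i] and the peeked tokens are handled for now):
--
--   local saved = stream:save ()
--   local tok   = stream:next ()   -- consume a token speculatively
--   stream:restore (saved)         -- rewind: [tok] will be returned again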
----------------------------------------------------------------------
local p1 = self.peeked[1]
if p1 then
li = p1.lineinfo.first
- self.line, self.column_offset, self.i, self.peeked =
- li[1], li[2], li[3], { }
+ self.line, self.i = li[1], li[3]
+ self.column_offset = self.i - li[2]
+ self.peeked = { }
+ self.attached_comments = p1.lineinfo.first.comments or { }
end
end
----------------------------------------------------------------------
function lexer:takeover(old)
self:sync()
- self.line, self.column_offset, self.i, self.src =
- old.line, old.column_offset, old.i, old.src
+ self.line, self.column_offset, self.i, self.src, self.attached_comments =
+ old.line, old.column_offset, old.i, old.src, old.attached_comments
return self
end
-function lexer:lineinfo()
- if self.peeked[1] then return self.peeked[1].lineinfo.first
- else return { self.line, self.i-self.column_offset, self.i } end
+-- function lexer:lineinfo()
+-- if self.peeked[1] then return self.peeked[1].lineinfo.first
+-- else return { self.line, self.i-self.column_offset, self.i } end
+-- end
+
+
+----------------------------------------------------------------------
+-- Return the current position in the source. This position lies between
+-- two tokens, and can fall inside a space / comment area, which can
+-- therefore have a non-zero width. :lineinfo_left() returns the beginning
+-- of the separation area, :lineinfo_right() returns its end.
+--
+--   ____ last consumed token  ____ first unconsumed token
+--  /                           \
+-- XXXXX  <spaces and comments>  YYYYY
+--      \____                     \____
+--           :lineinfo_left()          :lineinfo_right()
+----------------------------------------------------------------------
+function lexer:lineinfo_right()
+ return self:peek(1).lineinfo.first
+end
+
+function lexer:lineinfo_left()
+ return self.lineinfo_last
end
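
-- A sketch of the intended use, delimiting the exact source span covered
-- by a syntax node ([parse_expression] is a hypothetical parser function):
--
--   local first = stream:lineinfo_right ()
--   local node  = parse_expression (stream)
--   local last  = stream:lineinfo_left ()
--   node.lineinfo = { first = first, last = last }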
----------------------------------------------------------------------
-- Create a new lexstream.
----------------------------------------------------------------------
-function lexer:newstream (src_or_stream)
+function lexer:newstream (src_or_stream, name)
+ name = name or "?"
if type(src_or_stream)=='table' then -- it's a stream
return setmetatable ({ }, self) :takeover (src_or_stream)
elseif type(src_or_stream)=='string' then -- it's a source string
local src = src_or_stream
local stream = {
- src = src; -- The source, as a single string
- peeked = { }; -- Already peeked, but not discarded yet, tokens
- i = 1; -- Character offset in src
- line = 1; -- Current line number
- column_offset = 0; -- distance from beginning of file to last '\n'
+ src_name = name; -- Name of the file
+ src = src; -- The source, as a single string
+ peeked = { }; -- Already peeked, but not discarded yet, tokens
+ i = 1; -- Character offset in src
+ line = 1; -- Current line number
+ column_offset = 0; -- distance from beginning of file to last '\n'
attached_comments = { }; -- comments accumulator
lineinfo_last = { 1, 1, 1, name };
}
setmetatable (stream, self)
-- skip initial sharp-bang for unix scripts
- if src and src :match "^#!" then stream.i = src :find "\n" + 1 end
+ -- FIXME: redundant with mlp.chunk()
+ if src and src :match "^#" then stream.i = src :find "\n" + 1 end
return stream
else
assert(false, ":newstream() takes a source string or a stream, not a "..