handling of string escape sequences in the lexer: not being fooled by double-backslas...

[metalua.git] / src / compiler / lexer.lua
diff --git a/src/compiler/lexer.lua b/src/compiler/lexer.lua

index 51879d2305bfe732efac5ebaaee3f9125049b59e..698e5ddccf49b6382554e190f80a899f493e1348 100644 (file)
--- a/src/compiler/lexer.lua
+++ b/src/compiler/lexer.lua
@@ -17,6 +17,9 @@
  --     which capture holds the content of the string-like token
  --   + a token tag
  --   + or a string->string transformer function.
+--
+-- * There are some _G.table references, kept to prevent a namespace
+--   clash which has now disappeared. Remove them.
  ----------------------------------------------------------------------
  --
  -- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
@@ -49,39 +52,44 @@ lexer.patterns = {
     long_string         = "^%[(=*)%[\n?(.-)%]%1%]()",
     number_mantissa     = {
        "^%d+%.?%d*()",
-      "^%d*%d%.%d+()" },
+      "^%d*%.%d+()" },
     number_exponant = "^[eE][%+%-]?%d+()",
+   number_hex      = "^0[xX]%x+()",
     word            = "^([%a_][%w_]*)()"
  }
  
  ----------------------------------------------------------------------
--- Take a letter [x], and returns the character represented by the 
--- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
+-- unescape a whole string, applying [unesc_digits] and
+-- [unesc_letter] as many times as required.
  ----------------------------------------------------------------------
-local function unesc_letter(x)
-   local t = { 
-      a = "\a", b = "\b", f = "\f",
-      n = "\n", r = "\r", t = "\t", v = "\v",
-      ["\\"] = "\\", ["'"] = "'", ['"'] = '"' }
-   return t[x] or error("Unknown escape sequence \\"..x)
-end
+local function unescape_string (s)
  
-----------------------------------------------------------------------
--- Turn the digits of an escape sequence into the corresponding
--- character, e.g. [unesc_digits("123") == string.char(123)].
-----------------------------------------------------------------------
-local function unesc_digits (x)
-   local k, j, i = x:reverse():byte(1, 3)
-   local z = _G.string.byte "0"
-   return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
-end
+   -- Turn the digits of an escape sequence into the corresponding
+   -- character, e.g. [unesc_digits("123") == string.char(123)].
+   local function unesc_digits (x)
+      if x:sub(1,1)=="\\" then return x end -- Hack to parse correctly "\\123"
+      local k, j, i = x:reverse():byte(1, 3)
+      local z = _G.string.byte "0"
+      local code = (k or z) + 10*(j or z) + 100*(i or z) - 111*z
+      if code > 255 then 
+        error ("Illegal escape sequence '\\"..x.."' in string: ASCII codes must be in [0..255]") 
+      end
+      return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
+   end
  
-----------------------------------------------------------------------
--- unescape a whole string, applying [unesc_digits] and [unesc_letter]
--- as many times as required.
-----------------------------------------------------------------------
-local function unescape_string (s)
-   return s:gsub("\\([0-9]+)", unesc_digits):gsub("\\(.)",unesc_letter)
+   -- Take a letter [x], and returns the character represented by the 
+   -- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
+   local function unesc_letter(x)
+      local t = { 
+         a = "\a", b = "\b", f = "\f",
+         n = "\n", r = "\r", t = "\t", v = "\v",
+         ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
+      return t[x] or error([[Unknown escape sequence '\]]..x..[[']])
+   end
+
+   return s
+      :gsub ("\\(%D)",unesc_letter)
+      :gsub ("\\(\\?[0-9][0-9]?[0-9]?)", unesc_digits)
  end
  
  lexer.extractors = {
@@ -93,77 +101,121 @@ lexer.token_metatable = {
  --         __tostring = function(a) 
  --            return string.format ("`%s{'%s'}",a.tag, a[1]) 
  --         end 
-      } 
+} 
+      
+lexer.lineinfo_metatable = { }
+--[[ 
+-- The presence of this function prevents serialization by Pluto, 
+-- I can't figure out why :(
+function lexer.lineinfo_metatable:__tostring()
+   local txt = string.format("%s:%i(%i,%i)", self[4], self[3], self[1], self[2])
+   if self.comments then 
+      acc = { }
+      for comment in ivalues(self.comments) do
+        local content, loc1, loc2, kind = unpack(comment)
+        table.insert (acc, string.format ("%s@%i..%i:%q", kind, loc1, loc2, content))
+      end
+      txt = txt.."["..table.concat(acc,"; ").."]"
+   end
+   return txt
+end
+--]]
  
  ----------------------------------------------------------------------
  -- Really extract next token fron the raw string 
  -- (and update the index).
+-- loc: offset of the position just after spaces and comments
+-- previous_i: offset in src before extraction began
  ----------------------------------------------------------------------
  function lexer:extract ()
     local previous_i = self.i
-   local loc, eof, token = self.i
-
-   local function tk (tag, content)
+   local loc = self.i
+   local eof, token
+
+   -- Put line info, comments and metatable arount the tag and content
+   -- provided by extractors, thus returning a complete lexer token.
+   -- first_line: line # at the beginning of token
+   -- first_column_offset: char # of the last '\n' before beginning of token
+   -- i: scans from beginning of prefix spaces/comments to end of token.
+   local function build_token (tag, content)
        assert (tag and content)
-      local i, ln = previous_i, self.line
-      -- update line numbers
+      local i, first_line, first_column_offset, previous_line_length =
+         previous_i, self.line, self.column_offset, nil
+
+      -- update self.line and first_line. i := indexes of '\n' chars
        while true do
-         i = self.src:find("\n", i+1, true)
-         if not i then break end
-         if loc and i <= loc then ln = ln+1 end
-         if i <= self.i then self.line = self.line+1 else break end
+         i = self.src :find ("\n", i+1, true)
+         if not i or i>self.i then break end -- no more '\n' until end of token
+         previous_line_length = i - self.column_offset
+         if loc and i <= loc then -- '\n' before beginning of token
+            first_column_offset = i
+            first_line = first_line+1 
+         end
+         self.line   = self.line+1 
+         self.column_offset = i 
        end
-      local a = { tag = tag, char=loc, line=ln, content }
+
+      -- lineinfo entries: [1]=line, [2]=column, [3]=char, [4]=filename
+      local fli = { first_line, loc-first_column_offset, loc, self.src_name }
+      local lli = { self.line, self.i-self.column_offset-1, self.i-1, self.src_name }
+      --Pluto barfes when the metatable is set:(
+      setmetatable(fli, lexer.lineinfo_metatable)
+      setmetatable(lli, lexer.lineinfo_metatable)
+      local a = { tag = tag, lineinfo = { first=fli, last=lli }, content } 
+      if lli[2]==-1 then lli[1], lli[2] = lli[1]-1, previous_line_length-1 end
        if #self.attached_comments > 0 then 
-         a.comments = self.attached_comments 
-         self.attached_comments = nil
+         a.lineinfo.comments = self.attached_comments 
+         fli.comments = self.attached_comments
+         if self.lineinfo_last then
+            self.lineinfo_last.comments = self.attached_comments
+         end
        end
+      self.attached_comments = { }
        return setmetatable (a, self.token_metatable)
-   end
+   end --</function build_token>
  
-   self.attached_comments = { }
-   
     for ext_idx, extractor in ipairs(self.extractors) do
        -- printf("method = %s", method)
-      local tag, content = self[extractor](self)
-      -- [loc] is placed just after the leading whitespaces and comments,
-      -- and the whitespace extractor is at index 1.
+      local tag, content = self [extractor] (self)
+      -- [loc] is placed just after the leading whitespaces and comments;
+      -- for this to work, the whitespace extractor *must be* at index 1.
        if ext_idx==1 then loc = self.i end
  
        if tag then 
           --printf("`%s{ %q }\t%i", tag, content, loc);
-         return tk (tag, content) 
+         return build_token (tag, content) 
        end
     end
  
-   error "Cant extract anything!"
+   error "None of the lexer extractors returned anything!"
  end   
  
  ----------------------------------------------------------------------
  -- skip whites and comments
  -- FIXME: doesn't take into account:
  -- - unterminated long comments
--- - short comments without a final \n
+-- - short comments at last line without a final \n
  ----------------------------------------------------------------------
  function lexer:skip_whitespaces_and_comments()
-   local attached_comments = { }
-   repeat
+   local table_insert = _G.table.insert
+   repeat -- loop as long as a space or comment chunk is found
        local _, j
        local again = false
        local last_comment_content = nil
        -- skip spaces
        self.i = self.src:match (self.patterns.spaces, self.i)
        -- skip a long comment if any
-      _, last_comment_content, j = self.src:match (self.patterns.long_comment, self.i)
+      _, last_comment_content, j = 
+         self.src :match (self.patterns.long_comment, self.i)
        if j then 
-         _G.table.insert(self.attached_comments, 
+         table_insert(self.attached_comments, 
                           {last_comment_content, self.i, j, "long"})
           self.i=j; again=true 
        end
        -- skip a short comment if any
        last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
        if j then
-         _G.table.insert(attached_comments, 
+         table_insert(self.attached_comments, 
                           {last_comment_content, self.i, j, "short"})
           self.i=j; again=true 
        end
@@ -179,28 +231,32 @@ function lexer:skip_whitespaces_and_comments()
  end
  
  ----------------------------------------------------------------------
---
+-- extract a '...' or "..." short string
  ----------------------------------------------------------------------
  function lexer:extract_short_string()
     -- [k] is the first unread char, [self.i] points to [k] in [self.src]
-   local j, k = self.i, self.src:sub (self.i,self.i)
-   if k=="'" or k=='"' then
-      -- short string
-      repeat
-         self.i=self.i+1; 
-         local kk = self.src:sub (self.i, self.i)
-         if kk=="\\" then 
-            self.i=self.i+1; 
-            kk = self.src:sub (self.i, self.i)
-         end
-         if self.i > #self.src then error "Unterminated string" end
-         if self.i == "\r" or self.i == "\n" then error "no \\n in short strings!" end
-      until self.src:sub (self.i, self.i) == k 
-         and ( self.src:sub (self.i-1, self.i-1) ~= '\\' 
-         or self.src:sub (self.i-2, self.i-2) == '\\')
-      self.i=self.i+1
-      return "String", unescape_string (self.src:sub (j+1,self.i-2))
-   end   
+   local j, k = self.i, self.src :sub (self.i,self.i)
+   if k~="'" and k~='"' then return end
+   local i = self.i + 1
+   local j = i
+   while true do
+      -- k = opening char: either simple-quote or double-quote
+      -- i = index of beginning-of-string
+      -- x = next "interesting" character
+      -- j = position after interesting char
+      -- y = char just after x
+      local x, y
+      x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j)
+      if x == '\\' then j=j+1  -- don't parse escaped char
+      elseif x == k then break -- unescaped end of string
+      else -- eof or '\r' or '\n' reached before end of string
+         assert (not x or x=="\r" or x=="\n")
+         error "Unterminated string"
+      end
+   end
+   self.i = j
+
+   return "String", unescape_string (self.src:sub (i,j-2))
  end
  
  ----------------------------------------------------------------------
@@ -221,14 +277,19 @@ end
  ----------------------------------------------------------------------
  function lexer:extract_number()
     -- Number
-   local j = self.src:match (self.patterns.number_mantissa[1], self.i) or
-             self.src:match (self.patterns.number_mantissa[2], self.i)
-   if j then 
-      j = self.src:match (self.patterns.number_exponant, j) or j;
-      local n = tonumber (self.src:sub (self.i, j-1))
-      self.i = j
-      return "Number", n
+   local j = self.src:match(self.patterns.number_hex, self.i)
+   if not j then
+      j = self.src:match (self.patterns.number_mantissa[1], self.i) or
+          self.src:match (self.patterns.number_mantissa[2], self.i)
+      if j then
+         j = self.src:match (self.patterns.number_exponant, j) or j;
+      end
     end
+   if not j then return end
+   -- Number found, interpret with tonumber() and return it
+   local n = tonumber (self.src:sub (self.i, j-1))
+   self.i = j
+   return "Number", n
  end
  
  ----------------------------------------------------------------------
@@ -265,7 +326,8 @@ end
  ----------------------------------------------------------------------
  -- Add a keyword to the list of keywords recognized by the lexer.
  ----------------------------------------------------------------------
-function lexer:add (w)
+function lexer:add (w, ...)
+   assert(not ..., "lexer:add() takes only one arg, although possibly a table")
     if type (w) == "table" then
        for _, x in ipairs (w) do self:add (x) end
     else
@@ -286,7 +348,6 @@ end
  -- token is returned.
  ----------------------------------------------------------------------
  function lexer:peek (n)
-   assert(self)
     if not n then n=1 end
     if n > #self.peeked then
        for i = #self.peeked+1, n do
@@ -302,41 +363,113 @@ end
  -- stream, an EOF token is returned.
  ----------------------------------------------------------------------
  function lexer:next (n)
-   if not n then n=1 end
+   n = n or 1
     self:peek (n)
     local a
     for i=1,n do 
        a = _G.table.remove (self.peeked, 1) 
-      if a then debugf ("[L:%i K:%i T:%s %q]", a.line or -1, a.char or -1, a.tag or '<none>', a[1]) end
+      if a then 
+         --debugf ("lexer:next() ==> %s %s",
+         --        table.tostring(a), tostring(a))
+      end
+      self.lastline = a.lineinfo.last[1]
     end
+   self.lineinfo_last = a.lineinfo.last
     return a or eof_token
  end
  
  ----------------------------------------------------------------------
  -- Returns an object which saves the stream's current state.
  ----------------------------------------------------------------------
+-- FIXME there are more fields than that to save
  function lexer:save () return { self.i; _G.table.cat(self.peeked) } end
  
  ----------------------------------------------------------------------
  -- Restore the stream's state, as saved by method [save].
  ----------------------------------------------------------------------
+-- FIXME there are more fields than that to restore
  function lexer:restore (s) self.i=s[1]; self.peeked=s[2] end
  
+----------------------------------------------------------------------
+-- Resynchronize: drop every token in self.peeked and rewind self.line,
+-- self.i, self.column_offset to the start of the first peeked token.
+----------------------------------------------------------------------
+function lexer:sync()
+   local p1 = self.peeked[1]
+   if p1 then 
+      local li = p1.lineinfo.first -- [1]=line, [2]=column, [3]=char offset
+      self.line, self.i = li[1], li[3]
+      self.column_offset = self.i - li[2]
+      self.peeked = { }
+      self.attached_comments = li.comments or { }
+   end
+end
+
+----------------------------------------------------------------------
+-- Take the source and offset of an old lexer.
+----------------------------------------------------------------------
+function lexer:takeover(old)
+   self:sync()
+   self.line, self.column_offset, self.i, self.src, self.attached_comments =
+      old.line, old.column_offset, old.i, old.src, old.attached_comments
+   return self
+end
+
+-- function lexer:lineinfo()
+--     if self.peeked[1] then return self.peeked[1].lineinfo.first
+--     else return { self.line, self.i-self.column_offset, self.i } end
+-- end
+
+
+----------------------------------------------------------------------
+-- Return the current position in the sources. This position is between
+-- two tokens, and can be within a space / comment area, and therefore
+-- have a non-null width. :lineinfo_left() returns the beginning of the
+-- separation area, :lineinfo_right() returns the end of that area.
+--
+--    ____ last consumed token     ____ first unconsumed token
+--   /                            /
+-- XXXXX  <spaces and comments> YYYYY
+--      \____                    \____
+--           :lineinfo_left()         :lineinfo_right()
+----------------------------------------------------------------------
+function lexer:lineinfo_right()
+   return self:peek(1).lineinfo.first
+end
+
+function lexer:lineinfo_left()
+   return self.lineinfo_last
+end
+
  ----------------------------------------------------------------------
  -- Create a new lexstream.
  ----------------------------------------------------------------------
-function lexer:newstream (src)
-   local stream = { 
-      src    = src; -- The source, as a single string
-      peeked = { }; -- Already peeked, but not discarded yet, tokens
-      i      = 1;   -- Character offset in src
-      line   = 1;   -- current line number
-   }
-   setmetatable (stream, self)
-
-   -- skip initial sharp-bang for unix scripts
-   if src:match "^#!" then stream.i = src:find "\n" + 1 end
-   return stream
+function lexer:newstream (src_or_stream, name)
+   name = name or "?"
+   if type(src_or_stream)=='table' then -- it's a stream
+      return setmetatable ({ }, self) :takeover (src_or_stream)
+   elseif type(src_or_stream)=='string' then -- it's a source string
+      local src = src_or_stream
+      local stream = { 
+         src_name      = name;   -- Name of the file
+         src           = src;    -- The source, as a single string
+         peeked        = { };    -- Already peeked, but not discarded yet, tokens
+         i             = 1;      -- Character offset in src
+         line          = 1;      -- Current line number
+         column_offset = 0;      -- distance from beginning of file to last '\n'
+         attached_comments = { },-- comments accumulator
+         lineinfo_last = { 1, 1, 1, name }
+      }
+      setmetatable (stream, self)
+
+      -- skip initial sharp-bang for unix scripts
+      -- FIXME: redundant with mlp.chunk()
+      if src and src :match "^#" then stream.i = src :find "\n" + 1 end
+      return stream
+   else
+      assert(false, ":newstream() takes a source string or a stream, not a "..
+                    type(src_or_stream))
+   end
  end
  
  ----------------------------------------------------------------------
@@ -387,10 +520,3 @@ function lexer:clone()
     clone.__index = clone
     return clone
  end
-
-----------------------------------------------------------------------
--- 
-----------------------------------------------------------------------
-function is_stream (x)
-   return getmetable(x) == lexer
-end