runtime/plugins/autoclose/autoclose.lua

   1 -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
   2 --
   3 -- Provides UTF-8 aware string functions implemented in pure lua:
   4 -- * utf8len(s)
   5 -- * utf8sub(s, i, j)
   6 -- * utf8reverse(s)
   7 -- * utf8char(unicode)
   8 -- * utf8unicode(s, i, j)
   9 -- * utf8gensub(s, sub_len)
  10 -- * utf8find(str, regex, init, plain)
  11 -- * utf8match(str, regex, init)
  12 -- * utf8gmatch(str, regex, all)
  13 -- * utf8gsub(str, regex, repl, limit)
  14 --
  15 -- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
  16 -- additional functions are available:
  17 -- * utf8upper(s)
  18 -- * utf8lower(s)
  19 --
  20 -- All functions behave as their non UTF-8 aware counterparts with the exception
  21 -- that UTF-8 characters are used instead of bytes for all units.
  22
  23 --[[
  24 Copyright (c) 2006-2007, Kyle Smith
  25 All rights reserved.
  26
  27 Contributors:
  28     Alimov Stepan
  29
  30 Redistribution and use in source and binary forms, with or without
  31 modification, are permitted provided that the following conditions are met:
  32
  33     * Redistributions of source code must retain the above copyright notice,
  34       this list of conditions and the following disclaimer.
  35     * Redistributions in binary form must reproduce the above copyright
  36       notice, this list of conditions and the following disclaimer in the
  37       documentation and/or other materials provided with the distribution.
  38     * Neither the name of the author nor the names of its contributors may be
  39       used to endorse or promote products derived from this software without
  40       specific prior written permission.
  41
  42 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  43 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  45 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  46 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  47 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  48 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  49 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  50 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  51 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  52 --]]
  53
  54 -- ABNF from RFC 3629
  55 --
  56 -- UTF8-octets = *( UTF8-char )
  57 -- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  58 -- UTF8-1      = %x00-7F
  59 -- UTF8-2      = %xC2-DF UTF8-tail
  60 -- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
  61 --               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
  62 -- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
  63 --               %xF4 %x80-8F 2( UTF8-tail )
  64 -- UTF8-tail   = %x80-BF
  65 --
  66
  67 local byte    = string.byte
  68 local char    = string.char
  69 local dump    = string.dump
  70 local find    = string.find
  71 local format  = string.format
  72 local len     = string.len
  73 local lower   = string.lower
  74 local rep     = string.rep
  75 local sub     = string.sub
  76 local upper   = string.upper
  77
  78 -- returns the number of bytes used by the UTF-8 character at byte i in s
  79 -- also doubles as a UTF-8 character validator
  80 local function utf8charbytes (s, i)
  81     -- argument defaults
  82     i = i or 1
  83
  84     -- argument checking
  85     if type(s) ~= "string" then
  86         error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
  87     end
  88     if type(i) ~= "number" then
  89         error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
  90     end
  91
  92     local c = byte(s, i)
  93
  94     -- determine bytes needed for character, based on RFC 3629
  95     -- validate byte 1
  96     if c > 0 and c <= 127 then
  97         -- UTF8-1
  98         return 1
  99
 100     elseif c >= 194 and c <= 223 then
 101         -- UTF8-2
 102         local c2 = byte(s, i + 1)
 103
 104         if not c2 then
 105             error("UTF-8 string terminated early")
 106         end
 107
 108         -- validate byte 2
 109         if c2 < 128 or c2 > 191 then
 110             error("Invalid UTF-8 character")
 111         end
 112
 113         return 2
 114
 115     elseif c >= 224 and c <= 239 then
 116         -- UTF8-3
 117         local c2 = byte(s, i + 1)
 118         local c3 = byte(s, i + 2)
 119
 120         if not c2 or not c3 then
 121             error("UTF-8 string terminated early")
 122         end
 123
 124         -- validate byte 2
 125         if c == 224 and (c2 < 160 or c2 > 191) then
 126             error("Invalid UTF-8 character")
 127         elseif c == 237 and (c2 < 128 or c2 > 159) then
 128             error("Invalid UTF-8 character")
 129         elseif c2 < 128 or c2 > 191 then
 130             error("Invalid UTF-8 character")
 131         end
 132
 133         -- validate byte 3
 134         if c3 < 128 or c3 > 191 then
 135             error("Invalid UTF-8 character")
 136         end
 137
 138         return 3
 139
 140     elseif c >= 240 and c <= 244 then
 141         -- UTF8-4
 142         local c2 = byte(s, i + 1)
 143         local c3 = byte(s, i + 2)
 144         local c4 = byte(s, i + 3)
 145
 146         if not c2 or not c3 or not c4 then
 147             error("UTF-8 string terminated early")
 148         end
 149
 150         -- validate byte 2
 151         if c == 240 and (c2 < 144 or c2 > 191) then
 152             error("Invalid UTF-8 character")
 153         elseif c == 244 and (c2 < 128 or c2 > 143) then
 154             error("Invalid UTF-8 character")
 155         elseif c2 < 128 or c2 > 191 then
 156             error("Invalid UTF-8 character")
 157         end
 158
 159         -- validate byte 3
 160         if c3 < 128 or c3 > 191 then
 161             error("Invalid UTF-8 character")
 162         end
 163
 164         -- validate byte 4
 165         if c4 < 128 or c4 > 191 then
 166             error("Invalid UTF-8 character")
 167         end
 168
 169         return 4
 170
 171     else
 172         error("Invalid UTF-8 character")
 173     end
 174 end
 175
 176 -- returns the number of characters in a UTF-8 string
 177 local function utf8len (s)
 178     -- argument checking
 179     if type(s) ~= "string" then
 180         for k,v in pairs(s) do print('"',tostring(k),'"',tostring(v),'"') end
 181         error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
 182     end
 183
 184     local pos = 1
 185     local bytes = len(s)
 186     local length = 0
 187
 188     while pos <= bytes do
 189         length = length + 1
 190         pos = pos + utf8charbytes(s, pos)
 191     end
 192
 193     return length
 194 end
 195
 196 -- functions identically to string.sub except that i and j are UTF-8 characters
 197 -- instead of bytes
 198 local function utf8sub (s, i, j)
 199     -- argument defaults
 200     j = j or -1
 201
 202     local pos = 1
 203     local bytes = len(s)
 204     local length = 0
 205
 206     -- only set l if i or j is negative
 207     local l = (i >= 0 and j >= 0) or utf8len(s)
 208     local startChar = (i >= 0) and i or l + i + 1
 209     local endChar   = (j >= 0) and j or l + j + 1
 210
 211     -- can't have start before end!
 212     if startChar > endChar then
 213         return ""
 214     end
 215
 216     -- byte offsets to pass to string.sub
 217     local startByte,endByte = 1,bytes
 218
 219     while pos <= bytes do
 220         length = length + 1
 221
 222         if length == startChar then
 223             startByte = pos
 224         end
 225
 226         pos = pos + utf8charbytes(s, pos)
 227
 228         if length == endChar then
 229             endByte = pos - 1
 230             break
 231         end
 232     end
 233
 234     if startChar > length then startByte = bytes+1   end
 235     if endChar   < 1      then endByte   = 0         end
 236
 237     return sub(s, startByte, endByte)
 238 end
 239
 240 --[[
 241 -- replace UTF-8 characters based on a mapping table
 242 local function utf8replace (s, mapping)
 243     -- argument checking
 244     if type(s) ~= "string" then
 245         error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
 246     end
 247     if type(mapping) ~= "table" then
 248         error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
 249     end
 250
 251     local pos = 1
 252     local bytes = len(s)
 253     local charbytes
 254     local newstr = ""
 255
 256     while pos <= bytes do
 257         charbytes = utf8charbytes(s, pos)
 258         local c = sub(s, pos, pos + charbytes - 1)
 259
 260         newstr = newstr .. (mapping[c] or c)
 261
 262         pos = pos + charbytes
 263     end
 264
 265     return newstr
 266 end
 267
 268
 269 -- identical to string.upper except it knows about unicode simple case conversions
 270 local function utf8upper (s)
 271     return utf8replace(s, utf8_lc_uc)
 272 end
 273
 274 -- identical to string.lower except it knows about unicode simple case conversions
 275 local function utf8lower (s)
 276     return utf8replace(s, utf8_uc_lc)
 277 end
 278 ]]
 279
 280 -- identical to string.reverse except that it supports UTF-8
 281 local function utf8reverse (s)
 282     -- argument checking
 283     if type(s) ~= "string" then
 284         error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
 285     end
 286
 287     local bytes = len(s)
 288     local pos = bytes
 289     local charbytes
 290     local newstr = ""
 291
 292     while pos > 0 do
 293         local c = byte(s, pos)
 294         while c >= 128 and c <= 191 do
 295             pos = pos - 1
 296             c = byte(s, pos)
 297         end
 298
 299         charbytes = utf8charbytes(s, pos)
 300
 301         newstr = newstr .. sub(s, pos, pos + charbytes - 1)
 302
 303         pos = pos - 1
 304     end
 305
 306     return newstr
 307 end
 308
 309 -- http://en.wikipedia.org/wiki/Utf8
 310 -- http://developer.coronalabs.com/code/utf-8-conversion-utility
 311 local function utf8char(unicode)
 312     if unicode <= 0x7F then return char(unicode) end
 313
 314     if (unicode <= 0x7FF) then
 315         local Byte0 = 0xC0 + math.floor(unicode / 0x40);
 316         local Byte1 = 0x80 + (unicode % 0x40);
 317         return char(Byte0, Byte1);
 318     end;
 319
 320     if (unicode <= 0xFFFF) then
 321         local Byte0 = 0xE0 +  math.floor(unicode / 0x1000);
 322         local Byte1 = 0x80 + (math.floor(unicode / 0x40) % 0x40);
 323         local Byte2 = 0x80 + (unicode % 0x40);
 324         return char(Byte0, Byte1, Byte2);
 325     end;
 326
 327     if (unicode <= 0x10FFFF) then
 328         local code = unicode
 329         local Byte3= 0x80 + (code % 0x40);
 330         code       = math.floor(code / 0x40)
 331         local Byte2= 0x80 + (code % 0x40);
 332         code       = math.floor(code / 0x40)
 333         local Byte1= 0x80 + (code % 0x40);
 334         code       = math.floor(code / 0x40)
 335         local Byte0= 0xF0 + code;
 336
 337         return char(Byte0, Byte1, Byte2, Byte3);
 338     end;
 339
 340     error 'Unicode cannot be greater than U+10FFFF!'
 341 end
 342
 343 local shift_6  = 2^6
 344 local shift_12 = 2^12
 345 local shift_18 = 2^18
 346
 347 local utf8unicode
 348 utf8unicode = function(str, i, j, byte_pos)
 349     i = i or 1
 350     j = j or i
 351
 352     if i > j then return end
 353
 354     local ch,bytes
 355
 356     if byte_pos then
 357         bytes = utf8charbytes(str,byte_pos)
 358         ch  = sub(str,byte_pos,byte_pos-1+bytes)
 359     else
 360         ch,byte_pos = utf8sub(str,i,i), 0
 361         bytes       = #ch
 362     end
 363
 364     local unicode
 365
 366     if bytes == 1 then unicode = byte(ch) end
 367     if bytes == 2 then
 368         local byte0,byte1 = byte(ch,1,2)
 369         local code0,code1 = byte0-0xC0,byte1-0x80
 370         unicode = code0*shift_6 + code1
 371     end
 372     if bytes == 3 then
 373         local byte0,byte1,byte2 = byte(ch,1,3)
 374         local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
 375         unicode = code0*shift_12 + code1*shift_6 + code2
 376     end
 377     if bytes == 4 then
 378         local byte0,byte1,byte2,byte3 = byte(ch,1,4)
 379         local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
 380         unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
 381     end
 382
 383     return unicode,utf8unicode(str, i+1, j, byte_pos+bytes)
 384 end
 385
 386 -- Returns an iterator which returns the next substring and its byte interval
 387 local function utf8gensub(str, sub_len)
 388     sub_len        = sub_len or 1
 389     local byte_pos = 1
 390     local length   = #str
 391     return function(skip)
 392         if skip then byte_pos = byte_pos + skip end
 393         local char_count = 0
 394         local start      = byte_pos
 395         repeat
 396             if byte_pos > length then return end
 397             char_count  = char_count + 1
 398             local bytes = utf8charbytes(str,byte_pos)
 399             byte_pos    = byte_pos+bytes
 400
 401         until char_count == sub_len
 402
 403         local last  = byte_pos-1
 404         local slice = sub(str,start,last)
 405         return slice, start, last
 406     end
 407 end
 408
 409 local function binsearch(sortedTable, item, comp)
 410     local head, tail = 1, #sortedTable
 411     local mid = math.floor((head + tail)/2)
 412     if not comp then
 413         while (tail - head) > 1 do
 414             if sortedTable[tonumber(mid)] > item then
 415                 tail = mid
 416             else
 417                 head = mid
 418             end
 419             mid = math.floor((head + tail)/2)
 420         end
 421     end
 422     if sortedTable[tonumber(head)] == item then
 423         return true, tonumber(head)
 424     elseif sortedTable[tonumber(tail)] == item then
 425         return true, tonumber(tail)
 426     else
 427         return false
 428     end
 429 end
 430 local function classMatchGenerator(class, plain)
 431     local codes = {}
 432     local ranges = {}
 433     local ignore = false
 434     local range = false
 435     local firstletter = true
 436     local unmatch = false
 437
 438     local it = utf8gensub(class)
 439
 440     local skip
 441     for c, _, be in it do
 442         skip = be
 443         if not ignore and not plain then
 444             if c == "%" then
 445                 ignore = true
 446             elseif c == "-" then
 447                 table.insert(codes, utf8unicode(c))
 448                 range = true
 449             elseif c == "^" then
 450                 if not firstletter then
 451                     error('!!!')
 452                 else
 453                     unmatch = true
 454                 end
 455             elseif c == ']' then
 456                 break
 457             else
 458                 if not range then
 459                     table.insert(codes, utf8unicode(c))
 460                 else
 461                     table.remove(codes) -- removing '-'
 462                     table.insert(ranges, {table.remove(codes), utf8unicode(c)})
 463                     range = false
 464                 end
 465             end
 466         elseif ignore and not plain then
 467             if c == 'a' then -- %a: represents all letters. (ONLY ASCII)
 468                 table.insert(ranges, {65, 90}) -- A - Z
 469                 table.insert(ranges, {97, 122}) -- a - z
 470             elseif c == 'c' then -- %c: represents all control characters.
 471                 table.insert(ranges, {0, 31})
 472                 table.insert(codes, 127)
 473             elseif c == 'd' then -- %d: represents all digits.
 474                 table.insert(ranges, {48, 57}) -- 0 - 9
 475             elseif c == 'g' then -- %g: represents all printable characters except space.
 476                 table.insert(ranges, {1, 8})
 477                 table.insert(ranges, {14, 31})
 478                 table.insert(ranges, {33, 132})
 479                 table.insert(ranges, {134, 159})
 480                 table.insert(ranges, {161, 5759})
 481                 table.insert(ranges, {5761, 8191})
 482                 table.insert(ranges, {8203, 8231})
 483                 table.insert(ranges, {8234, 8238})
 484                 table.insert(ranges, {8240, 8286})
 485                 table.insert(ranges, {8288, 12287})
 486             elseif c == 'l' then -- %l: represents all lowercase letters. (ONLY ASCII)
 487                 table.insert(ranges, {97, 122}) -- a - z
 488             elseif c == 'p' then -- %p: represents all punctuation characters. (ONLY ASCII)
 489                 table.insert(ranges, {33, 47})
 490                 table.insert(ranges, {58, 64})
 491                 table.insert(ranges, {91, 96})
 492                 table.insert(ranges, {123, 126})
 493             elseif c == 's' then -- %s: represents all space characters.
 494                 table.insert(ranges, {9, 13})
 495                 table.insert(codes, 32)
 496                 table.insert(codes, 133)
 497                 table.insert(codes, 160)
 498                 table.insert(codes, 5760)
 499                 table.insert(ranges, {8192, 8202})
 500                 table.insert(codes, 8232)
 501                 table.insert(codes, 8233)
 502                 table.insert(codes, 8239)
 503                 table.insert(codes, 8287)
 504                 table.insert(codes, 12288)
 505             elseif c == 'u' then -- %u: represents all uppercase letters. (ONLY ASCII)
 506                 table.insert(ranges, {65, 90}) -- A - Z
 507             elseif c == 'w' then -- %w: represents all alphanumeric characters. (ONLY ASCII)
 508                 table.insert(ranges, {48, 57}) -- 0 - 9
 509                 table.insert(ranges, {65, 90}) -- A - Z
 510                 table.insert(ranges, {97, 122}) -- a - z
 511             elseif c == 'x' then -- %x: represents all hexadecimal digits.
 512                 table.insert(ranges, {48, 57}) -- 0 - 9
 513                 table.insert(ranges, {65, 70}) -- A - F
 514                 table.insert(ranges, {97, 102}) -- a - f
 515             else
 516                 if not range then
 517                     table.insert(codes, utf8unicode(c))
 518                 else
 519                     table.remove(codes) -- removing '-'
 520                     table.insert(ranges, {table.remove(codes), utf8unicode(c)})
 521                     range = false
 522                 end
 523             end
 524             ignore = false
 525         else
 526             if not range then
 527                 table.insert(codes, utf8unicode(c))
 528             else
 529                 table.remove(codes) -- removing '-'
 530                 table.insert(ranges, {table.remove(codes), utf8unicode(c)})
 531                 range = false
 532             end
 533             ignore = false
 534         end
 535
 536         firstletter = false
 537     end
 538
 539     table.sort(codes)
 540
 541     local function inRanges(charCode)
 542         for _,r in ipairs(ranges) do
 543             if r[1] <= charCode and charCode <= r[2] then
 544                 return true
 545             end
 546         end
 547         return false
 548     end
 549     if not unmatch then
 550         return function(charCode)
 551             return binsearch(codes, charCode) or inRanges(charCode)
 552         end, skip
 553     else
 554         return function(charCode)
 555             return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
 556         end, skip
 557     end
 558 end
 559
 560 --[[
 561 -- utf8sub with extra argument, and extra result value
 562 local function utf8subWithBytes (s, i, j, sb)
 563     -- argument defaults
 564     j = j or -1
 565
 566     local pos = sb or 1
 567     local bytes = len(s)
 568     local length = 0
 569
 570     -- only set l if i or j is negative
 571     local l = (i >= 0 and j >= 0) or utf8len(s)
 572     local startChar = (i >= 0) and i or l + i + 1
 573     local endChar   = (j >= 0) and j or l + j + 1
 574
 575     -- can't have start before end!
 576     if startChar > endChar then
 577         return ""
 578     end
 579
 580     -- byte offsets to pass to string.sub
 581     local startByte,endByte = 1,bytes
 582
 583     while pos <= bytes do
 584         length = length + 1
 585
 586         if length == startChar then
 587             startByte = pos
 588         end
 589
 590         pos = pos + utf8charbytes(s, pos)
 591
 592         if length == endChar then
 593             endByte = pos - 1
 594             break
 595         end
 596     end
 597
 598     if startChar > length then startByte = bytes+1   end
 599     if endChar   < 1      then endByte   = 0         end
 600
 601     return sub(s, startByte, endByte), endByte + 1
 602 end
 603 ]]
 604
 605 local cache = setmetatable({},{
 606     __mode = 'kv'
 607 })
 608 local cachePlain = setmetatable({},{
 609     __mode = 'kv'
 610 })
 611 local function matcherGenerator(regex, plain)
 612     local matcher = {
 613         functions = {},
 614         captures = {}
 615     }
 616     if not plain then
 617         cache[regex] =  matcher
 618     else
 619         cachePlain[regex] = matcher
 620     end
 621     local function simple(func)
 622         return function(cC)
 623             if func(cC) then
 624                 matcher:nextFunc()
 625                 matcher:nextStr()
 626             else
 627                 matcher:reset()
 628             end
 629         end
 630     end
 631     local function star(func)
 632         return function(cC)
 633             if func(cC) then
 634                 matcher:fullResetOnNextFunc()
 635                 matcher:nextStr()
 636             else
 637                 matcher:nextFunc()
 638             end
 639         end
 640     end
 641     local function minus(func)
 642         return function(cC)
 643             if func(cC) then
 644                 matcher:fullResetOnNextStr()
 645             end
 646             matcher:nextFunc()
 647         end
 648     end
 649     local function question(func)
 650         return function(cC)
 651             if func(cC) then
 652                 matcher:fullResetOnNextFunc()
 653                 matcher:nextStr()
 654             end
 655             matcher:nextFunc()
 656         end
 657     end
 658
 659     local function capture(id)
 660         return function(_)
 661             local l = matcher.captures[id][2] - matcher.captures[id][1]
 662             local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
 663             local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
 664             if captured == check then
 665                 for _ = 0, l do
 666                     matcher:nextStr()
 667                 end
 668                 matcher:nextFunc()
 669             else
 670                 matcher:reset()
 671             end
 672         end
 673     end
 674     local function captureStart(id)
 675         return function(_)
 676             matcher.captures[id][1] = matcher.str
 677             matcher:nextFunc()
 678         end
 679     end
 680     local function captureStop(id)
 681         return function(_)
 682             matcher.captures[id][2] = matcher.str - 1
 683             matcher:nextFunc()
 684         end
 685     end
 686
 687     local function balancer(str)
 688         local sum = 0
 689         local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
 690         local skip = len(bc) + len(ec)
 691         bc, ec = utf8unicode(bc), utf8unicode(ec)
 692         return function(cC)
 693             if cC == ec and sum > 0 then
 694                 sum = sum - 1
 695                 if sum == 0 then
 696                     matcher:nextFunc()
 697                 end
 698                 matcher:nextStr()
 699             elseif cC == bc then
 700                 sum = sum + 1
 701                 matcher:nextStr()
 702             else
 703                 if sum == 0 or cC == -1 then
 704                     sum = 0
 705                     matcher:reset()
 706                 else
 707                     matcher:nextStr()
 708                 end
 709             end
 710         end, skip
 711     end
 712
 713     matcher.functions[1] = function(_)
 714         matcher:fullResetOnNextStr()
 715         matcher.seqStart = matcher.str
 716         matcher:nextFunc()
 717         if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
 718             matcher.stop = true
 719             matcher.seqStart = nil
 720         end
 721     end
 722
 723     local lastFunc
 724     local ignore = false
 725     local skip = nil
 726     local it = (function()
 727         local gen = utf8gensub(regex)
 728         return function()
 729             return gen(skip)
 730         end
 731     end)()
 732     local cs = {}
 733     for c, bs, be in it do
 734         skip = nil
 735         if plain then
 736             table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
 737         else
 738             if ignore then
 739                 if find('123456789', c, 1, true) then
 740                     if lastFunc then
 741                         table.insert(matcher.functions, simple(lastFunc))
 742                         lastFunc = nil
 743                     end
 744                     table.insert(matcher.functions, capture(tonumber(c)))
 745                 elseif c == 'b' then
 746                     if lastFunc then
 747                         table.insert(matcher.functions, simple(lastFunc))
 748                         lastFunc = nil
 749                     end
 750                     local b
 751                     b, skip = balancer(sub(regex, be + 1, be + 9))
 752                     table.insert(matcher.functions, b)
 753                 else
 754                     lastFunc = classMatchGenerator('%' .. c)
 755                 end
 756                 ignore = false
 757             else
 758                 if c == '*' then
 759                     if lastFunc then
 760                         table.insert(matcher.functions, star(lastFunc))
 761                         lastFunc = nil
 762                     else
 763                         error('invalid regex after ' .. sub(regex, 1, bs))
 764                     end
 765                 elseif c == '+' then
 766                     if lastFunc then
 767                         table.insert(matcher.functions, simple(lastFunc))
 768                         table.insert(matcher.functions, star(lastFunc))
 769                         lastFunc = nil
 770                     else
 771                         error('invalid regex after ' .. sub(regex, 1, bs))
 772                     end
 773                 elseif c == '-' then
 774                     if lastFunc then
 775                         table.insert(matcher.functions, minus(lastFunc))
 776                         lastFunc = nil
 777                     else
 778                         error('invalid regex after ' .. sub(regex, 1, bs))
 779                     end
 780                 elseif c == '?' then
 781                     if lastFunc then
 782                         table.insert(matcher.functions, question(lastFunc))
 783                         lastFunc = nil
 784                     else
 785                         error('invalid regex after ' .. sub(regex, 1, bs))
 786                     end
 787                 elseif c == '^' then
 788                     if bs == 1 then
 789                         matcher.fromStart = true
 790                     else
 791                         error('invalid regex after ' .. sub(regex, 1, bs))
 792                     end
 793                 elseif c == '$' then
 794                     if be == len(regex) then
 795                         matcher.toEnd = true
 796                     else
 797                         error('invalid regex after ' .. sub(regex, 1, bs))
 798                     end
 799                 elseif c == '[' then
 800                     if lastFunc then
 801                         table.insert(matcher.functions, simple(lastFunc))
 802                     end
 803                     lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
 804                 elseif c == '(' then
 805                     if lastFunc then
 806                         table.insert(matcher.functions, simple(lastFunc))
 807                         lastFunc = nil
 808                     end
 809                     table.insert(matcher.captures, {})
 810                     table.insert(cs, #matcher.captures)
 811                     table.insert(matcher.functions, captureStart(cs[#cs]))
 812                     if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
 813                 elseif c == ')' then
 814                     if lastFunc then
 815                         table.insert(matcher.functions, simple(lastFunc))
 816                         lastFunc = nil
 817                     end
 818                     local cap = table.remove(cs)
 819                     if not cap then
 820                         error('invalid capture: "(" missing')
 821                     end
 822                     table.insert(matcher.functions, captureStop(cap))
 823                 elseif c == '.' then
 824                     if lastFunc then
 825                         table.insert(matcher.functions, simple(lastFunc))
 826                     end
 827                     lastFunc = function(cC) return cC ~= -1 end
 828                 elseif c == '%' then
 829                     ignore = true
 830                 else
 831                     if lastFunc then
 832                         table.insert(matcher.functions, simple(lastFunc))
 833                     end
 834                     lastFunc = classMatchGenerator(c)
 835                 end
 836             end
 837         end
 838     end
 839     if #cs > 0 then
 840         error('invalid capture: ")" missing')
 841     end
 842     if lastFunc then
 843         table.insert(matcher.functions, simple(lastFunc))
 844     end
 845
 846     table.insert(matcher.functions, function()
 847         if matcher.toEnd and matcher.str ~= matcher.stringLen then
 848             matcher:reset()
 849         else
 850             matcher.stop = true
 851         end
 852     end)
 853
 854     matcher.nextFunc = function(self)
 855         self.func = self.func + 1
 856     end
 857     matcher.nextStr = function(self)
 858         self.str = self.str + 1
 859     end
 860     matcher.strReset = function(self)
 861         local oldReset = self.reset
 862         local str = self.str
 863         self.reset = function(s)
 864             s.str = str
 865             s.reset = oldReset
 866         end
 867     end
 868     matcher.fullResetOnNextFunc = function(self)
 869         local oldReset = self.reset
 870         local func = self.func +1
 871         local str = self.str
 872         self.reset = function(s)
 873             s.func = func
 874             s.str = str
 875             s.reset = oldReset
 876         end
 877     end
 878     matcher.fullResetOnNextStr = function(self)
 879         local oldReset = self.reset
 880         local str = self.str + 1
 881         local func = self.func
 882         self.reset = function(s)
 883             s.func = func
 884             s.str = str
 885             s.reset = oldReset
 886         end
 887     end
 888
 889     matcher.process = function(self, str, start)
 890
 891         self.func = 1
 892         start = start or 1
 893         self.startStr = (start >= 0) and start or utf8len(str) + start + 1
 894         self.seqStart = self.startStr
 895         self.str = self.startStr
 896         self.stringLen = utf8len(str) + 1
 897         self.string = str
 898         self.stop = false
 899
 900         self.reset = function(s)
 901             s.func = 1
 902         end
 903
 904         -- local lastPos = self.str
 905         -- local lastByte
 906         local ch
 907         while not self.stop do
 908             if self.str < self.stringLen then
 909                 --[[ if lastPos < self.str then
 910                     print('last byte', lastByte)
 911                     ch, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
 912                     ch, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
 913                     lastByte = lastByte - 1
 914                 else
 915                     ch, lastByte = utf8subWithBytes(str, self.str, self.str)
 916                 end
 917                 lastPos = self.str ]]
 918                 ch = utf8sub(str, self.str,self.str)
 919                 --print('char', ch, utf8unicode(ch))
 920                 self.functions[self.func](utf8unicode(ch))
 921             else
 922                 self.functions[self.func](-1)
 923             end
 924         end
 925
 926         if self.seqStart then
 927             local captures = {}
 928             for _,pair in pairs(self.captures) do
 929                 if pair.empty then
 930                     table.insert(captures, pair[1])
 931                 else
 932                     table.insert(captures, utf8sub(str, pair[1], pair[2]))
 933                 end
 934             end
 935             return self.seqStart, self.str - 1, unpack(captures)
 936         end
 937     end
 938
 939     return matcher
 940 end
 941
 942 -- string.find
 943 local function utf8find(str, regex, init, plain)
 944     local matcher = cache[regex] or matcherGenerator(regex, plain)
 945     return matcher:process(str, init)
 946 end
 947
 948 -- string.match
 949 local function utf8match(str, regex, init)
 950     init = init or 1
 951     local found = {utf8find(str, regex, init)}
 952     if found[1] then
 953         if found[3] then
 954             return unpack(found, 3)
 955         end
 956         return utf8sub(str, found[1], found[2])
 957     end
 958 end
 959
 960 -- string.gmatch
 961 local function utf8gmatch(str, regex, all)
 962     regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
 963     local lastChar = 1
 964     return function()
 965         local found = {utf8find(str, regex, lastChar)}
 966         if found[1] then
 967             lastChar = found[2] + 1
 968             if found[all and 1 or 3] then
 969                 return unpack(found, all and 1 or 3)
 970             end
 971             return utf8sub(str, found[1], found[2])
 972         end
 973     end
 974 end
 975
 976 local function replace(repl, args)
 977     local ret = ''
 978     if type(repl) == 'string' then
 979         local ignore = false
 980         local num
 981         for c in utf8gensub(repl) do
 982             if not ignore then
 983                 if c == '%' then
 984                     ignore = true
 985                 else
 986                     ret = ret .. c
 987                 end
 988             else
 989                 num = tonumber(c)
 990                 if num then
 991                     ret = ret .. args[num]
 992                 else
 993                     ret = ret .. c
 994                 end
 995                 ignore = false
 996             end
 997         end
 998     elseif type(repl) == 'table' then
 999         ret = repl[args[1] or args[0]] or ''
1000     elseif type(repl) == 'function' then
1001         if #args > 0 then
1002             ret = repl(unpack(args, 1)) or ''
1003         else
1004             ret = repl(args[0]) or ''
1005         end
1006     end
1007     return ret
1008 end
-- string.gsub
-- Replaces matches of `regex` in `str` using `repl` (string, table or
-- function, see replace). At most `limit` matches are replaced; without a
-- limit (or with a negative one) every match is.
-- Returns the resulting string and the number of replacements made.
local function utf8gsub(str, regex, repl, limit)
    limit = limit or -1

    local pieces = {}
    local copyFrom = 1
    local count = 0
    local nextMatch = utf8gmatch(str, regex, true)

    local found = {nextMatch()}
    while #found > 0 and count ~= limit do
        local first, last = found[1], found[2]
        -- args[0] is the whole match, args[1..] are the captures
        local args = {[0] = utf8sub(str, first, last), unpack(found, 3)}

        -- untouched text before the match, then its replacement
        pieces[#pieces + 1] = utf8sub(str, copyFrom, first - 1)
        pieces[#pieces + 1] = replace(repl, args)

        copyFrom = last + 1
        count = count + 1
        found = {nextMatch()}
    end

    -- remainder of the subject after the last replaced match
    pieces[#pieces + 1] = utf8sub(str, copyFrom)
    return table.concat(pieces), count
end
1027
-- Table bundling the UTF-8 aware functions under string-library style names.
-- `byte` is an alias of `unicode`.
local utf8 = {
    len     = utf8len,
    sub     = utf8sub,
    reverse = utf8reverse,
    char    = utf8char,
    unicode = utf8unicode,
    gensub  = utf8gensub,
    byte    = utf8unicode,
    find    = utf8find,
    match   = utf8match,
    gmatch  = utf8gmatch,
    gsub    = utf8gsub,
    dump    = dump,
    format  = format,
    lower   = lower,
    upper   = upper,
    rep     = rep,
}
1045
-- Returns the i-th character (not byte) of str, or "" when i lies beyond the
-- last character.
function charAt(str, i)
    if i > utf8.len(str) then
        return ""
    end
    return utf8.sub(str, i, i)
end
1053
-- Register the "autoclose" option with the value true unless it already has
-- a value.
if nil == GetOption("autoclose") then
    AddOption("autoclose", true)
end

-- Two-character strings: first character is the opener, second the closer.
-- autoclosePairs drives onRune and preBackspace.
local autoclosePairs = {'""', "''", '``', '()', '{}', '[]'}
-- autoNewlinePairs drives preInsertNewline.
local autoNewlinePairs = {'()', '{}', '[]'}
1060
-- Rune hook for the autoclose feature (presumably invoked by the editor after
-- rune `r` was typed into view `v` -- confirm against the plugin API).
-- Does nothing unless the "autoclose" option is truthy.
--   * r is a closer and the same closer already follows the cursor:
--     Backspace, then move the cursor right (type over the existing closer).
--   * r is a closer preceded by a word character or by its opener: stop.
--   * r is an opener at the end of the line or before a non-word character:
--     insert the matching closer and move the cursor back before it.
function onRune(r, v)
    if not GetOption("autoclose") then
        return
    end

    for _, pair in ipairs(autoclosePairs) do
        local opener, closer = charAt(pair, 1), charAt(pair, 2)

        if r == closer then
            local line = v.Buf:Line(v.Cursor.Y)
            local x = v.Cursor.X

            if charAt(line, x + 1) == closer then
                v:Backspace(false)
                v:CursorRight(false)
                return
            end

            if x > 1 then
                local before = charAt(line, x - 1)
                if IsWordChar(before) or before == opener then
                    return
                end
            end
        end

        -- for quote pairs opener == closer, so this may run after the
        -- closer branch above fell through
        if r == opener then
            local line = v.Buf:Line(v.Cursor.Y)
            local x = v.Cursor.X

            if x == utf8.len(line) or not IsWordChar(charAt(line, x + 1)) then
                -- the '-' here is to dereference the pointer to v.Cursor.Loc
                -- which is automatically made when converting go structs to
                -- lua; it needs to be dereferenced because the function
                -- expects a non pointer struct
                v.Buf:Insert(-v.Cursor.Loc, closer)
                v:CursorLeft(false)
                return
            end
        end
    end
end
1094
-- Newline hook for the autoclose feature.
-- When the cursor sits between the opener and closer of one of the
-- autoNewlinePairs, this inserts a newline and a tab, then a second newline
-- followed by the current line's leading whitespace, moves the cursor left
-- and returns false. In every other case it returns true, or nothing at all
-- when the "autoclose" option is off.
function preInsertNewline(v)
    if not GetOption("autoclose") then
        return
    end

    local line = v.Buf:Line(v.Cursor.Y)
    local x = v.Cursor.X
    local before, after = charAt(line, x), charAt(line, x + 1)
    local indent = GetLeadingWhitespace(line)

    for _, pair in ipairs(autoNewlinePairs) do
        if before == charAt(pair, 1) and after == charAt(pair, 2) then
            v:InsertNewline(false)
            v:InsertTab(false)
            -- the '-' dereferences the pointer to v.Cursor.Loc
            v.Buf:Insert(-v.Cursor.Loc, "\n" .. indent)
            v:CursorLeft(false)
            return false
        end
    end

    return true
end
1119
-- Backspace hook for the autoclose feature.
-- For each autoclose pair, when the character at the cursor column is the
-- opener and the one after it is the matching closer, the view's Delete
-- action is invoked as well. Returns true, or nothing when the "autoclose"
-- option is off.
function preBackspace(v)
    if not GetOption("autoclose") then
        return
    end

    for _, pair in ipairs(autoclosePairs) do
        -- the line is re-read on every pass, as a Delete may have changed it
        local line = v.Buf:Line(v.Cursor.Y)
        local x = v.Cursor.X
        local closerAhead = charAt(line, x + 1) == charAt(pair, 2)
        if closerAhead and charAt(line, x) == charAt(pair, 1) then
            v:Delete(false)
        end
    end

    return true
end