utf8.lua

   1 -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
   2 --
   3 -- Provides UTF-8 aware string functions implemented in pure lua:
   4 -- * utf8len(s)
   5 -- * utf8sub(s, i, j)
   6 -- * utf8reverse(s)
   7 -- * utf8char(unicode)
   8 -- * utf8unicode(s, i, j)
   9 -- * utf8gensub(s, sub_len)
  10 -- * utf8find(str, regex, init, plain)
  11 -- * utf8match(str, regex, init)
  12 -- * utf8gmatch(str, regex, all)
  13 -- * utf8gsub(str, regex, repl, limit)
  14 --
  15 -- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
  16 -- additional functions are available:
  17 -- * utf8upper(s)
  18 -- * utf8lower(s)
  19 --
  20 -- All functions behave as their non UTF-8 aware counterparts with the exception
  21 -- that UTF-8 characters are used instead of bytes for all units.
  22
  23 --[[
  24 Copyright (c) 2006-2007, Kyle Smith
  25 All rights reserved.
  26
  27 Contributors:
  28         Alimov Stepan
  29
  30 Redistribution and use in source and binary forms, with or without
  31 modification, are permitted provided that the following conditions are met:
  32
  33     * Redistributions of source code must retain the above copyright notice,
  34       this list of conditions and the following disclaimer.
  35     * Redistributions in binary form must reproduce the above copyright
  36       notice, this list of conditions and the following disclaimer in the
  37       documentation and/or other materials provided with the distribution.
  38     * Neither the name of the author nor the names of its contributors may be
  39       used to endorse or promote products derived from this software without
  40       specific prior written permission.
  41
  42 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  43 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  45 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  46 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  47 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  48 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  49 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  50 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  51 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  52 --]]
  53
  54 -- ABNF from RFC 3629
  55 --
  56 -- UTF8-octets = *( UTF8-char )
  57 -- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  58 -- UTF8-1      = %x00-7F
  59 -- UTF8-2      = %xC2-DF UTF8-tail
  60 -- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
  61 --               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
  62 -- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
  63 --               %xF4 %x80-8F 2( UTF8-tail )
  64 -- UTF8-tail   = %x80-BF
  65 --
  66
  67 local byte    = string.byte
  68 local char    = string.char
  69 local dump    = string.dump
  70 local find    = string.find
  71 local format  = string.format
  72 local gmatch  = string.gmatch
  73 local gsub    = string.gsub
  74 local len     = string.len
  75 local lower   = string.lower
  76 local match   = string.match
  77 local rep     = string.rep
  78 local reverse = string.reverse
  79 local sub     = string.sub
  80 local upper   = string.upper
  81
  82 -- returns the number of bytes used by the UTF-8 character at byte i in s
  83 -- also doubles as a UTF-8 character validator
  84 local function utf8charbytes (s, i)
  85         -- argument defaults
  86         i = i or 1
  87
  88         -- argument checking
  89         if type(s) ~= "string" then
  90                 error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
  91         end
  92         if type(i) ~= "number" then
  93                 error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
  94         end
  95
  96         local c = byte(s, i)
  97
  98         -- determine bytes needed for character, based on RFC 3629
  99         -- validate byte 1
 100         if c > 0 and c <= 127 then
 101                 -- UTF8-1
 102                 return 1
 103
 104         elseif c >= 194 and c <= 223 then
 105                 -- UTF8-2
 106                 local c2 = byte(s, i + 1)
 107
 108                 if not c2 then
 109                         error("UTF-8 string terminated early")
 110                 end
 111
 112                 -- validate byte 2
 113                 if c2 < 128 or c2 > 191 then
 114                         error("Invalid UTF-8 character")
 115                 end
 116
 117                 return 2
 118
 119         elseif c >= 224 and c <= 239 then
 120                 -- UTF8-3
 121                 local c2 = byte(s, i + 1)
 122                 local c3 = byte(s, i + 2)
 123
 124                 if not c2 or not c3 then
 125                         error("UTF-8 string terminated early")
 126                 end
 127
 128                 -- validate byte 2
 129                 if c == 224 and (c2 < 160 or c2 > 191) then
 130                         error("Invalid UTF-8 character")
 131                 elseif c == 237 and (c2 < 128 or c2 > 159) then
 132                         error("Invalid UTF-8 character")
 133                 elseif c2 < 128 or c2 > 191 then
 134                         error("Invalid UTF-8 character")
 135                 end
 136
 137                 -- validate byte 3
 138                 if c3 < 128 or c3 > 191 then
 139                         error("Invalid UTF-8 character")
 140                 end
 141
 142                 return 3
 143
 144         elseif c >= 240 and c <= 244 then
 145                 -- UTF8-4
 146                 local c2 = byte(s, i + 1)
 147                 local c3 = byte(s, i + 2)
 148                 local c4 = byte(s, i + 3)
 149
 150                 if not c2 or not c3 or not c4 then
 151                         error("UTF-8 string terminated early")
 152                 end
 153
 154                 -- validate byte 2
 155                 if c == 240 and (c2 < 144 or c2 > 191) then
 156                         error("Invalid UTF-8 character")
 157                 elseif c == 244 and (c2 < 128 or c2 > 143) then
 158                         error("Invalid UTF-8 character")
 159                 elseif c2 < 128 or c2 > 191 then
 160                         error("Invalid UTF-8 character")
 161                 end
 162
 163                 -- validate byte 3
 164                 if c3 < 128 or c3 > 191 then
 165                         error("Invalid UTF-8 character")
 166                 end
 167
 168                 -- validate byte 4
 169                 if c4 < 128 or c4 > 191 then
 170                         error("Invalid UTF-8 character")
 171                 end
 172
 173                 return 4
 174
 175         else
 176                 error("Invalid UTF-8 character")
 177         end
 178 end
 179
 180 -- returns the number of characters in a UTF-8 string
 181 local function utf8len (s)
 182         -- argument checking
 183         if type(s) ~= "string" then
 184                 for k,v in pairs(s) do print('"',tostring(k),'"',tostring(v),'"') end
 185                 error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
 186         end
 187
 188         local pos = 1
 189         local bytes = len(s)
 190         local len = 0
 191
 192         while pos <= bytes do
 193                 len = len + 1
 194                 pos = pos + utf8charbytes(s, pos)
 195         end
 196
 197         return len
 198 end
 199
 200 -- functions identically to string.sub except that i and j are UTF-8 characters
 201 -- instead of bytes
 202 local function utf8sub (s, i, j)
 203         -- argument defaults
 204         j = j or -1
 205
 206         local pos = 1
 207         local bytes = len(s)
 208         local len = 0
 209
 210         -- only set l if i or j is negative
 211         local l = (i >= 0 and j >= 0) or utf8len(s)
 212         local startChar = (i >= 0) and i or l + i + 1
 213         local endChar   = (j >= 0) and j or l + j + 1
 214
 215         -- can't have start before end!
 216         if startChar > endChar then
 217                 return ""
 218         end
 219
 220         -- byte offsets to pass to string.sub
 221         local startByte,endByte = 1,bytes
 222
 223         while pos <= bytes do
 224                 len = len + 1
 225
 226                 if len == startChar then
 227                         startByte = pos
 228                 end
 229
 230                 pos = pos + utf8charbytes(s, pos)
 231
 232                 if len == endChar then
 233                         endByte = pos - 1
 234                         break
 235                 end
 236         end
 237
 238         if startChar > len then startByte = bytes+1   end
 239         if endChar   < 1   then endByte   = 0         end
 240
 241         return sub(s, startByte, endByte)
 242 end
 243
 244
 245 -- replace UTF-8 characters based on a mapping table
 246 local function utf8replace (s, mapping)
 247         -- argument checking
 248         if type(s) ~= "string" then
 249                 error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
 250         end
 251         if type(mapping) ~= "table" then
 252                 error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
 253         end
 254
 255         local pos = 1
 256         local bytes = len(s)
 257         local charbytes
 258         local newstr = ""
 259
 260         while pos <= bytes do
 261                 charbytes = utf8charbytes(s, pos)
 262                 local c = sub(s, pos, pos + charbytes - 1)
 263
 264                 newstr = newstr .. (mapping[c] or c)
 265
 266                 pos = pos + charbytes
 267         end
 268
 269         return newstr
 270 end
 271
 272
 273 -- identical to string.upper except it knows about unicode simple case conversions
 274 local function utf8upper (s)
 275         return utf8replace(s, utf8_lc_uc)
 276 end
 277
 278 -- identical to string.lower except it knows about unicode simple case conversions
 279 local function utf8lower (s)
 280         return utf8replace(s, utf8_uc_lc)
 281 end
 282
 283 -- identical to string.reverse except that it supports UTF-8
 284 local function utf8reverse (s)
 285         -- argument checking
 286         if type(s) ~= "string" then
 287                 error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
 288         end
 289
 290         local bytes = len(s)
 291         local pos = bytes
 292         local charbytes
 293         local newstr = ""
 294
 295         while pos > 0 do
 296                 c = byte(s, pos)
 297                 while c >= 128 and c <= 191 do
 298                         pos = pos - 1
 299                         c = byte(s, pos)
 300                 end
 301
 302                 charbytes = utf8charbytes(s, pos)
 303
 304                 newstr = newstr .. sub(s, pos, pos + charbytes - 1)
 305
 306                 pos = pos - 1
 307         end
 308
 309         return newstr
 310 end
 311
 312 -- http://en.wikipedia.org/wiki/Utf8
 313 -- http://developer.coronalabs.com/code/utf-8-conversion-utility
 314 local function utf8char(unicode)
 315         if unicode <= 0x7F then return char(unicode) end
 316
 317         if (unicode <= 0x7FF) then
 318                 local Byte0 = 0xC0 + math.floor(unicode / 0x40);
 319                 local Byte1 = 0x80 + (unicode % 0x40);
 320                 return char(Byte0, Byte1);
 321         end;
 322
 323         if (unicode <= 0xFFFF) then
 324                 local Byte0 = 0xE0 +  math.floor(unicode / 0x1000);
 325                 local Byte1 = 0x80 + (math.floor(unicode / 0x40) % 0x40);
 326                 local Byte2 = 0x80 + (unicode % 0x40);
 327                 return char(Byte0, Byte1, Byte2);
 328         end;
 329
 330         if (unicode <= 0x10FFFF) then
 331                 local code = unicode
 332                 local Byte3= 0x80 + (code % 0x40);
 333                 code       = math.floor(code / 0x40)
 334                 local Byte2= 0x80 + (code % 0x40);
 335                 code       = math.floor(code / 0x40)
 336                 local Byte1= 0x80 + (code % 0x40);
 337                 code       = math.floor(code / 0x40)
 338                 local Byte0= 0xF0 + code;
 339
 340                 return char(Byte0, Byte1, Byte2, Byte3);
 341         end;
 342
 343         error 'Unicode cannot be greater than U+10FFFF!'
 344 end
 345
 346 local shift_6  = 2^6
 347 local shift_12 = 2^12
 348 local shift_18 = 2^18
 349
 350 local utf8unicode
 351 utf8unicode = function(str, i, j, byte_pos)
 352         i = i or 1
 353         j = j or i
 354
 355         if i > j then return end
 356
 357         local char,bytes
 358
 359         if byte_pos then
 360                 bytes = utf8charbytes(str,byte_pos)
 361                 char  = sub(str,byte_pos,byte_pos-1+bytes)
 362         else
 363                 char,byte_pos = utf8sub(str,i,i), 0
 364                 bytes         = #char
 365         end
 366
 367         local unicode
 368
 369         if bytes == 1 then unicode = byte(char) end
 370         if bytes == 2 then
 371                 local byte0,byte1 = byte(char,1,2)
 372                 local code0,code1 = byte0-0xC0,byte1-0x80
 373                 unicode = code0*shift_6 + code1
 374         end
 375         if bytes == 3 then
 376                 local byte0,byte1,byte2 = byte(char,1,3)
 377                 local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
 378                 unicode = code0*shift_12 + code1*shift_6 + code2
 379         end
 380         if bytes == 4 then
 381                 local byte0,byte1,byte2,byte3 = byte(char,1,4)
 382                 local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
 383                 unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
 384         end
 385
 386         return unicode,utf8unicode(str, i+1, j, byte_pos+bytes)
 387 end
 388
 389 -- Returns an iterator which returns the next substring and its byte interval
 390 local function utf8gensub(str, sub_len)
 391         sub_len        = sub_len or 1
 392         local byte_pos = 1
 393         local len      = #str
 394         return function(skip)
 395                 if skip then byte_pos = byte_pos + skip end
 396                 local char_count = 0
 397                 local start      = byte_pos
 398                 repeat
 399                         if byte_pos > len then return end
 400                         char_count  = char_count + 1
 401                         local bytes = utf8charbytes(str,byte_pos)
 402                         byte_pos    = byte_pos+bytes
 403
 404                 until char_count == sub_len
 405
 406                 local last  = byte_pos-1
 407                 local sub   = sub(str,start,last)
 408                 return sub, start, last
 409         end
 410 end
 411
 412 local function binsearch(sortedTable, item, comp)
 413         local head, tail = 1, #sortedTable
 414         local mid = math.floor((head + tail)/2)
 415         if not comp then
 416                 while (tail - head) > 1 do
 417                         if sortedTable[tonumber(mid)] > item then
 418                                 tail = mid
 419                         else
 420                                 head = mid
 421                         end
 422                         mid = math.floor((head + tail)/2)
 423                 end
 424         else
 425         end
 426         if sortedTable[tonumber(head)] == item then
 427                 return true, tonumber(head)
 428         elseif sortedTable[tonumber(tail)] == item then
 429                 return true, tonumber(tail)
 430         else
 431                 return false
 432         end
 433 end
 434 local function classMatchGenerator(class, plain)
 435         local codes = {}
 436         local ranges = {}
 437         local ignore = false
 438         local range = false
 439         local firstletter = true
 440         local unmatch = false
 441
 442         local it = utf8gensub(class)
 443
 444         local skip
 445         for c,bs,be in it do
 446                 skip = be
 447                 if not ignore and not plain then
 448                         if c == "%" then
 449                                 ignore = true
 450                         elseif c == "-" then
 451                                 table.insert(codes, utf8unicode(c))
 452                                 range = true
 453                         elseif c == "^" then
 454                                 if not firstletter then
 455                                         error('!!!')
 456                                 else
 457                                         unmatch = true
 458                                 end
 459                         elseif c == ']' then
 460                                 break
 461                         else
 462                                 if not range then
 463                                         table.insert(codes, utf8unicode(c))
 464                                 else
 465                                         table.remove(codes) -- removing '-'
 466                                         table.insert(ranges, {table.remove(codes), utf8unicode(c)})
 467                                         range = false
 468                                 end
 469                         end
 470                 elseif ignore and not plain then
 471                         if c == 'a' then -- %a: represents all letters. (ONLY ASCII)
 472                                 table.insert(ranges, {65, 90}) -- A - Z
 473                                 table.insert(ranges, {97, 122}) -- a - z
 474                         elseif c == 'c' then -- %c: represents all control characters.
 475                                 table.insert(ranges, {0, 31})
 476                                 table.insert(codes, 127)
 477                         elseif c == 'd' then -- %d: represents all digits.
 478                                 table.insert(ranges, {48, 57}) -- 0 - 9
 479                         elseif c == 'g' then -- %g: represents all printable characters except space.
 480                                 table.insert(ranges, {1, 8})
 481                                 table.insert(ranges, {14, 31})
 482                                 table.insert(ranges, {33, 132})
 483                                 table.insert(ranges, {134, 159})
 484                                 table.insert(ranges, {161, 5759})
 485                                 table.insert(ranges, {5761, 8191})
 486                                 table.insert(ranges, {8203, 8231})
 487                                 table.insert(ranges, {8234, 8238})
 488                                 table.insert(ranges, {8240, 8286})
 489                                 table.insert(ranges, {8288, 12287})
 490                         elseif c == 'l' then -- %l: represents all lowercase letters. (ONLY ASCII)
 491                                 table.insert(ranges, {97, 122}) -- a - z
 492                         elseif c == 'p' then -- %p: represents all punctuation characters. (ONLY ASCII)
 493                                 table.insert(ranges, {33, 47})
 494                                 table.insert(ranges, {58, 64})
 495                                 table.insert(ranges, {91, 96})
 496                                 table.insert(ranges, {123, 126})
 497                         elseif c == 's' then -- %s: represents all space characters.
 498                                 table.insert(ranges, {9, 13})
 499                                 table.insert(codes, 32)
 500                                 table.insert(codes, 133)
 501                                 table.insert(codes, 160)
 502                                 table.insert(codes, 5760)
 503                                 table.insert(ranges, {8192, 8202})
 504                                 table.insert(codes, 8232)
 505                                 table.insert(codes, 8233)
 506                                 table.insert(codes, 8239)
 507                                 table.insert(codes, 8287)
 508                                 table.insert(codes, 12288)
 509                         elseif c == 'u' then -- %u: represents all uppercase letters. (ONLY ASCII)
 510                                 table.insert(ranges, {65, 90}) -- A - Z
 511                         elseif c == 'w' then -- %w: represents all alphanumeric characters. (ONLY ASCII)
 512                                 table.insert(ranges, {48, 57}) -- 0 - 9
 513                                 table.insert(ranges, {65, 90}) -- A - Z
 514                                 table.insert(ranges, {97, 122}) -- a - z
 515                         elseif c == 'x' then -- %x: represents all hexadecimal digits.
 516                                 table.insert(ranges, {48, 57}) -- 0 - 9
 517                                 table.insert(ranges, {65, 70}) -- A - F
 518                                 table.insert(ranges, {97, 102}) -- a - f
 519                         else
 520                                 if not range then
 521                                         table.insert(codes, utf8unicode(c))
 522                                 else
 523                                         table.remove(codes) -- removing '-'
 524                                         table.insert(ranges, {table.remove(codes), utf8unicode(c)})
 525                                         range = false
 526                                 end
 527                         end
 528                         ignore = false
 529                 else
 530                         if not range then
 531                                 table.insert(codes, utf8unicode(c))
 532                         else
 533                                 table.remove(codes) -- removing '-'
 534                                 table.insert(ranges, {table.remove(codes), utf8unicode(c)})
 535                                 range = false
 536                         end
 537                         ignore = false
 538                 end
 539
 540                 firstletter = false
 541         end
 542
 543         table.sort(codes)
 544
 545         local function inRanges(charCode)
 546                 for _,r in ipairs(ranges) do
 547                         if r[1] <= charCode and charCode <= r[2] then
 548                                 return true
 549                         end
 550                 end
 551                 return false
 552         end
 553         if not unmatch then
 554                 return function(charCode)
 555                         return binsearch(codes, charCode) or inRanges(charCode)
 556                 end, skip
 557         else
 558                 return function(charCode)
 559                         return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
 560                 end, skip
 561         end
 562 end
 563
 564 -- utf8sub with extra argument, and extra result value
 565 local function utf8subWithBytes (s, i, j, sb)
 566         -- argument defaults
 567         j = j or -1
 568
 569         local pos = sb or 1
 570         local bytes = len(s)
 571         local len = 0
 572
 573         -- only set l if i or j is negative
 574         local l = (i >= 0 and j >= 0) or utf8len(s)
 575         local startChar = (i >= 0) and i or l + i + 1
 576         local endChar   = (j >= 0) and j or l + j + 1
 577
 578         -- can't have start before end!
 579         if startChar > endChar then
 580                 return ""
 581         end
 582
 583         -- byte offsets to pass to string.sub
 584         local startByte,endByte = 1,bytes
 585
 586         while pos <= bytes do
 587                 len = len + 1
 588
 589                 if len == startChar then
 590                         startByte = pos
 591                 end
 592
 593                 pos = pos + utf8charbytes(s, pos)
 594
 595                 if len == endChar then
 596                         endByte = pos - 1
 597                         break
 598                 end
 599         end
 600
 601         if startChar > len then startByte = bytes+1   end
 602         if endChar   < 1   then endByte   = 0         end
 603
 604         return sub(s, startByte, endByte), endByte + 1
 605 end
 606
 607 local cache = setmetatable({},{
 608         __mode = 'kv'
 609 })
 610 local cachePlain = setmetatable({},{
 611         __mode = 'kv'
 612 })
 613 local function matcherGenerator(regex, plain)
 614         local matcher = {
 615                 functions = {},
 616                 captures = {}
 617         }
 618         if not plain then
 619                 cache[regex] =  matcher
 620         else
 621                 cachePlain[regex] = matcher
 622         end
 623         local function simple(func)
 624                 return function(cC)
 625                         if func(cC) then
 626                                 matcher:nextFunc()
 627                                 matcher:nextStr()
 628                         else
 629                                 matcher:reset()
 630                         end
 631                 end
 632         end
 633         local function star(func)
 634                 return function(cC)
 635                         if func(cC) then
 636                                 matcher:fullResetOnNextFunc()
 637                                 matcher:nextStr()
 638                         else
 639                                 matcher:nextFunc()
 640                         end
 641                 end
 642         end
 643         local function minus(func)
 644                 return function(cC)
 645                         if func(cC) then
 646                                 matcher:fullResetOnNextStr()
 647                         end
 648                         matcher:nextFunc()
 649                 end
 650         end
 651         local function question(func)
 652                 return function(cC)
 653                         if func(cC) then
 654                                 matcher:fullResetOnNextFunc()
 655                                 matcher:nextStr()
 656                         end
 657                         matcher:nextFunc()
 658                 end
 659         end
 660
 661         local function capture(id)
 662                 return function(cC)
 663                         local l = matcher.captures[id][2] - matcher.captures[id][1]
 664                         local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
 665                         local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
 666                         if captured == check then
 667                                 for i = 0, l do
 668                                         matcher:nextStr()
 669                                 end
 670                                 matcher:nextFunc()
 671                         else
 672                                 matcher:reset()
 673                         end
 674                 end
 675         end
 676         local function captureStart(id)
 677                 return function(cC)
 678                         matcher.captures[id][1] = matcher.str
 679                         matcher:nextFunc()
 680                 end
 681         end
 682         local function captureStop(id)
 683                 return function(cC)
 684                         matcher.captures[id][2] = matcher.str - 1
 685                         matcher:nextFunc()
 686                 end
 687         end
 688
 689         local function balancer(str)
 690                 local sum = 0
 691                 local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
 692                 local skip = len(bc) + len(ec)
 693                 bc, ec = utf8unicode(bc), utf8unicode(ec)
 694                 return function(cC)
 695                         if cC == ec and sum > 0 then
 696                                 sum = sum - 1
 697                                 if sum == 0 then
 698                                         matcher:nextFunc()
 699                                 end
 700                                 matcher:nextStr()
 701                         elseif cC == bc then
 702                                 sum = sum + 1
 703                                 matcher:nextStr()
 704                         else
 705                                 if sum == 0 or cC == -1 then
 706                                         sum = 0
 707                                         matcher:reset()
 708                                 else
 709                                         matcher:nextStr()
 710                                 end
 711                         end
 712                 end, skip
 713         end
 714
 715         matcher.functions[1] = function(cC)
 716                 matcher:fullResetOnNextStr()
 717                 matcher.seqStart = matcher.str
 718                 matcher:nextFunc()
 719                 if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
 720                         matcher.stop = true
 721                         matcher.seqStart = nil
 722                 end
 723         end
 724
 725         local lastFunc
 726         local ignore = false
 727         local skip = nil
 728         local it = (function()
 729                 local gen = utf8gensub(regex)
 730                 return function()
 731                         return gen(skip)
 732                 end
 733         end)()
 734         local cs = {}
 735         for c, bs, be in it do
 736                 skip = nil
 737                 if plain then
 738                         table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
 739                 else
 740                         if ignore then
 741                                 if find('123456789', c, 1, true) then
 742                                         if lastFunc then
 743                                                 table.insert(matcher.functions, simple(lastFunc))
 744                                                 lastFunc = nil
 745                                         end
 746                                         table.insert(matcher.functions, capture(tonumber(c)))
 747                                 elseif c == 'b' then
 748                                         if lastFunc then
 749                                                 table.insert(matcher.functions, simple(lastFunc))
 750                                                 lastFunc = nil
 751                                         end
 752                                         local b
 753                                         b, skip = balancer(sub(regex, be + 1, be + 9))
 754                                         table.insert(matcher.functions, b)
 755                                 else
 756                                         lastFunc = classMatchGenerator('%' .. c)
 757                                 end
 758                                 ignore = false
 759                         else
 760                                 if c == '*' then
 761                                         if lastFunc then
 762                                                 table.insert(matcher.functions, star(lastFunc))
 763                                                 lastFunc = nil
 764                                         else
 765                                                 error('invalid regex after ' .. sub(regex, 1, bs))
 766                                         end
 767                                 elseif c == '+' then
 768                                         if lastFunc then
 769                                                 table.insert(matcher.functions, simple(lastFunc))
 770                                                 table.insert(matcher.functions, star(lastFunc))
 771                                                 lastFunc = nil
 772                                         else
 773                                                 error('invalid regex after ' .. sub(regex, 1, bs))
 774                                         end
 775                                 elseif c == '-' then
 776                                         if lastFunc then
 777                                                 table.insert(matcher.functions, minus(lastFunc))
 778                                                 lastFunc = nil
 779                                         else
 780                                                 error('invalid regex after ' .. sub(regex, 1, bs))
 781                                         end
 782                                 elseif c == '?' then
 783                                         if lastFunc then
 784                                                 table.insert(matcher.functions, question(lastFunc))
 785                                                 lastFunc = nil
 786                                         else
 787                                                 error('invalid regex after ' .. sub(regex, 1, bs))
 788                                         end
 789                                 elseif c == '^' then
 790                                         if bs == 1 then
 791                                                 matcher.fromStart = true
 792                                         else
 793                                                 error('invalid regex after ' .. sub(regex, 1, bs))
 794                                         end
 795                                 elseif c == '$' then
 796                                         if be == len(regex) then
 797                                                 matcher.toEnd = true
 798                                         else
 799                                                 error('invalid regex after ' .. sub(regex, 1, bs))
 800                                         end
 801                                 elseif c == '[' then
 802                                         if lastFunc then
 803                                                 table.insert(matcher.functions, simple(lastFunc))
 804                                         end
 805                                         lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
 806                                 elseif c == '(' then
 807                                         if lastFunc then
 808                                                 table.insert(matcher.functions, simple(lastFunc))
 809                                                 lastFunc = nil
 810                                         end
 811                                         table.insert(matcher.captures, {})
 812                                         table.insert(cs, #matcher.captures)
 813                                         table.insert(matcher.functions, captureStart(cs[#cs]))
 814                                         if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
 815                                 elseif c == ')' then
 816                                         if lastFunc then
 817                                                 table.insert(matcher.functions, simple(lastFunc))
 818                                                 lastFunc = nil
 819                                         end
 820                                         local cap = table.remove(cs)
 821                                         if not cap then
 822                                                 error('invalid capture: "(" missing')
 823                                         end
 824                                         table.insert(matcher.functions, captureStop(cap))
 825                                 elseif c == '.' then
 826                                         if lastFunc then
 827                                                 table.insert(matcher.functions, simple(lastFunc))
 828                                         end
 829                                         lastFunc = function(cC) return cC ~= -1 end
 830                                 elseif c == '%' then
 831                                         ignore = true
 832                                 else
 833                                         if lastFunc then
 834                                                 table.insert(matcher.functions, simple(lastFunc))
 835                                         end
 836                                         lastFunc = classMatchGenerator(c)
 837                                 end
 838                         end
 839                 end
 840         end
 841         if #cs > 0 then
 842                 error('invalid capture: ")" missing')
 843         end
 844         if lastFunc then
 845                 table.insert(matcher.functions, simple(lastFunc))
 846         end
 847         lastFunc = nil
 848         ignore = nil
 849
 850         table.insert(matcher.functions, function()
 851                 if matcher.toEnd and matcher.str ~= matcher.stringLen then
 852                         matcher:reset()
 853                 else
 854                         matcher.stop = true
 855                 end
 856         end)
 857
 858         matcher.nextFunc = function(self)
 859                 self.func = self.func + 1
 860         end
 861         matcher.nextStr = function(self)
 862                 self.str = self.str + 1
 863         end
 864         matcher.strReset = function(self)
 865                 local oldReset = self.reset
 866                 local str = self.str
 867                 self.reset = function(s)
 868                         s.str = str
 869                         s.reset = oldReset
 870                 end
 871         end
 872         matcher.fullResetOnNextFunc = function(self)
 873                 local oldReset = self.reset
 874                 local func = self.func +1
 875                 local str = self.str
 876                 self.reset = function(s)
 877                         s.func = func
 878                         s.str = str
 879                         s.reset = oldReset
 880                 end
 881         end
 882         matcher.fullResetOnNextStr = function(self)
 883                 local oldReset = self.reset
 884                 local str = self.str + 1
 885                 local func = self.func
 886                 self.reset = function(s)
 887                         s.func = func
 888                         s.str = str
 889                         s.reset = oldReset
 890                 end
 891         end
 892
 893         matcher.process = function(self, str, start)
 894
 895                 self.func = 1
 896                 start = start or 1
 897                 self.startStr = (start >= 0) and start or utf8len(str) + start + 1
 898                 self.seqStart = self.startStr
 899                 self.str = self.startStr
 900                 self.stringLen = utf8len(str) + 1
 901                 self.string = str
 902                 self.stop = false
 903
 904                 self.reset = function(s)
 905                         s.func = 1
 906                 end
 907
 908                 local lastPos = self.str
 909                 local lastByte
 910                 local char
 911                 while not self.stop do
 912                         if self.str < self.stringLen then
 913                                 --[[ if lastPos < self.str then
 914                                         print('last byte', lastByte)
 915                                         char, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
 916                                         char, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
 917                                         lastByte = lastByte - 1
 918                                 else
 919                                         char, lastByte = utf8subWithBytes(str, self.str, self.str)
 920                                 end
 921                                 lastPos = self.str ]]
 922                                 char = utf8sub(str, self.str,self.str)
 923                                 --print('char', char, utf8unicode(char))
 924                                 self.functions[self.func](utf8unicode(char))
 925                         else
 926                                 self.functions[self.func](-1)
 927                         end
 928                 end
 929
 930                 if self.seqStart then
 931                         local captures = {}
 932                         for _,pair in pairs(self.captures) do
 933                                 if pair.empty then
 934                                         table.insert(captures, pair[1])
 935                                 else
 936                                         table.insert(captures, utf8sub(str, pair[1], pair[2]))
 937                                 end
 938                         end
 939                         return self.seqStart, self.str - 1, unpack(captures)
 940                 end
 941         end
 942
 943         return matcher
 944 end
 945
 946 -- string.find
 947 local function utf8find(str, regex, init, plain)
 948         local matcher = cache[regex] or matcherGenerator(regex, plain)
 949         return matcher:process(str, init)
 950 end
 951
 952 -- string.match
 953 local function utf8match(str, regex, init)
 954         init = init or 1
 955         local found = {utf8find(str, regex, init)}
 956         if found[1] then
 957                 if found[3] then
 958                         return unpack(found, 3)
 959                 end
 960                 return utf8sub(str, found[1], found[2])
 961         end
 962 end
 963
 964 -- string.gmatch
 965 local function utf8gmatch(str, regex, all)
 966         regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
 967         local lastChar = 1
 968         return function()
 969                 local found = {utf8find(str, regex, lastChar)}
 970                 if found[1] then
 971                         lastChar = found[2] + 1
 972                         if found[all and 1 or 3] then
 973                                 return unpack(found, all and 1 or 3)
 974                         end
 975                         return utf8sub(str, found[1], found[2])
 976                 end
 977         end
 978 end
 979
 980 local function replace(repl, args)
 981         local ret = ''
 982         if type(repl) == 'string' then
 983                 local ignore = false
 984                 local num = 0
 985                 for c in utf8gensub(repl) do
 986                         if not ignore then
 987                                 if c == '%' then
 988                                         ignore = true
 989                                 else
 990                                         ret = ret .. c
 991                                 end
 992                         else
 993                                 num = tonumber(c)
 994                                 if num then
 995                                         ret = ret .. args[num]
 996                                 else
 997                                         ret = ret .. c
 998                                 end
 999                                 ignore = false
1000                         end
1001                 end
1002         elseif type(repl) == 'table' then
1003                 ret = repl[args[1] or args[0]] or ''
1004         elseif type(repl) == 'function' then
1005                 if #args > 0 then
1006                         ret = repl(unpack(args, 1)) or ''
1007                 else
1008                         ret = repl(args[0]) or ''
1009                 end
1010         end
1011         return ret
1012 end
-- string.gsub
-- Replaces matches of `regex` in `str`, counting positions in UTF-8
-- characters.
-- @param str    subject string
-- @param regex  pattern; a leading '^' anchors it, so at most one match at
--               the start of `str` is replaced
-- @param repl   string, table or function describing the replacement
--               (interpreted by replace)
-- @param limit  maximum number of replacements; unlimited when omitted
-- @return the resulting string and the number of replacements made
local function utf8gsub(str, regex, repl, limit)
        limit = limit or -1
        -- Search with utf8find directly: utf8gmatch escapes a leading '^',
        -- which would turn an anchored pattern into a literal one.
        local anchored = utf8sub(regex, 1, 1) == '^'
        -- one past the last character: an empty match may still start there
        local lastStart = utf8len(str) + 1
        local pieces = {}
        local prevEnd = 1
        local from = 1
        local n = 0
        while limit ~= n and from <= lastStart do
                local found = {utf8find(str, regex, from)}
                local first, last = found[1], found[2]
                if not first then
                        break
                end
                local args = {[0] = utf8sub(str, first, last), unpack(found, 3)}
                pieces[#pieces + 1] = utf8sub(str, prevEnd, first - 1)
                pieces[#pieces + 1] = replace(repl, args)
                prevEnd = last + 1
                n = n + 1
                if anchored then
                        break
                end
                if last < first then
                        -- empty match: move on by one character, otherwise the
                        -- same position would be matched again forever
                        from = first + 1
                else
                        from = last + 1
                end
        end
        pieces[#pieces + 1] = utf8sub(str, prevEnd)
        return table.concat(pieces), n
end
1031
-- Public module table returned to whoever loads this file.
-- `byte` is an alias of `unicode`. `dump`, `format`, `lower`, `upper` and
-- `rep` are exported from the same-named values in scope (presumably the
-- string library functions localized earlier in the file - confirm).
local utf8 = {
        len     = utf8len,
        sub     = utf8sub,
        reverse = utf8reverse,
        char    = utf8char,
        unicode = utf8unicode,
        gensub  = utf8gensub,
        byte    = utf8unicode,
        find    = utf8find,
        match   = utf8match,
        gmatch  = utf8gmatch,
        gsub    = utf8gsub,
        dump    = dump,
        format  = format,
        lower   = lower,
        upper   = upper,
        rep     = rep,
}
return utf8