]> git.lizzy.rs Git - furrybot.git/blob - utf8.lua
Japanese waifu names
[furrybot.git] / utf8.lua
1 -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
2 --
3 -- Provides UTF-8 aware string functions implemented in pure lua:
4 -- * utf8len(s)
5 -- * utf8sub(s, i, j)
6 -- * utf8reverse(s)
7 -- * utf8char(unicode)
8 -- * utf8unicode(s, i, j)
9 -- * utf8gensub(s, sub_len)
10 -- * utf8find(str, regex, init, plain)
11 -- * utf8match(str, regex, init)
12 -- * utf8gmatch(str, regex, all)
13 -- * utf8gsub(str, regex, repl, limit)
14 --
15 -- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
16 -- additional functions are available:
17 -- * utf8upper(s)
18 -- * utf8lower(s)
19 --
20 -- All functions behave as their non UTF-8 aware counterparts with the exception
21 -- that UTF-8 characters are used instead of bytes for all units.
22
23 --[[
24 Copyright (c) 2006-2007, Kyle Smith
25 All rights reserved.
26
27 Contributors:
28         Alimov Stepan
29
30 Redistribution and use in source and binary forms, with or without
31 modification, are permitted provided that the following conditions are met:
32
33     * Redistributions of source code must retain the above copyright notice,
34       this list of conditions and the following disclaimer.
35     * Redistributions in binary form must reproduce the above copyright
36       notice, this list of conditions and the following disclaimer in the
37       documentation and/or other materials provided with the distribution.
38     * Neither the name of the author nor the names of its contributors may be
39       used to endorse or promote products derived from this software without
40       specific prior written permission.
41
42 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
43 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
46 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
48 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
49 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
50 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
52 --]]
53
54 -- ABNF from RFC 3629
55 -- 
56 -- UTF8-octets = *( UTF8-char )
57 -- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
58 -- UTF8-1      = %x00-7F
59 -- UTF8-2      = %xC2-DF UTF8-tail
60 -- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
61 --               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
62 -- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
63 --               %xF4 %x80-8F 2( UTF8-tail )
64 -- UTF8-tail   = %x80-BF
65 -- 
66
67 local byte    = string.byte
68 local char    = string.char
69 local dump    = string.dump
70 local find    = string.find
71 local format  = string.format
72 local gmatch  = string.gmatch
73 local gsub    = string.gsub
74 local len     = string.len
75 local lower   = string.lower
76 local match   = string.match
77 local rep     = string.rep
78 local reverse = string.reverse
79 local sub     = string.sub
80 local upper   = string.upper
81
-- Returns the number of bytes used by the UTF-8 character that starts at
-- byte i of s. Doubles as a validator: the lead byte and every trailing
-- byte are checked against the RFC 3629 grammar.
--
-- s: string to inspect
-- i: byte position of the lead byte (default 1)
--
-- Returns 1, 2, 3 or 4.
-- Raises an error when an argument has the wrong type, when i does not
-- address a byte of s, when the sequence is cut short by the end of the
-- string ("UTF-8 string terminated early") or when a byte is outside the
-- range the grammar allows ("Invalid UTF-8 character").
local function utf8charbytes (s, i)
	-- argument defaults
	i = i or 1

	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
	end
	if type(i) ~= "number" then
		error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
	end

	local c = byte(s, i)

	-- string.byte yields nothing for a position outside the string; report
	-- that explicitly instead of failing on a nil comparison below
	if not c then
		error("bad argument #2 to 'utf8charbytes' (position out of range)")
	end

	-- UTF8-1 = %x00-7F, so the NUL byte is a valid one-byte character
	if c <= 127 then
		return 1
	end

	-- Pick the sequence length from the lead byte. low/high is the range
	-- allowed for byte 2, which RFC 3629 narrows for some lead bytes.
	local size
	local low, high = 128, 191

	if c >= 194 and c <= 223 then
		-- UTF8-2
		size = 2
	elseif c >= 224 and c <= 239 then
		-- UTF8-3
		size = 3
		if c == 224 then
			low = 160
		elseif c == 237 then
			high = 159
		end
	elseif c >= 240 and c <= 244 then
		-- UTF8-4
		size = 4
		if c == 240 then
			low = 144
		elseif c == 244 then
			high = 143
		end
	else
		error("Invalid UTF-8 character")
	end

	-- every trailing byte must exist before any of them is range-checked
	for offset = 1, size - 1 do
		if not byte(s, i + offset) then
			error("UTF-8 string terminated early")
		end
	end

	for offset = 1, size - 1 do
		local tail = byte(s, i + offset)
		if tail < low or tail > high then
			error("Invalid UTF-8 character")
		end
		-- bytes 3 and 4 are plain UTF8-tail = %x80-BF
		low, high = 128, 191
	end

	return size
end
179
-- Returns the number of UTF-8 characters in s.
--
-- s: string to measure
--
-- Raises an error when s is not a string, and propagates the errors of
-- utf8charbytes when s is not valid UTF-8.
local function utf8len (s)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
	end

	local pos = 1
	local bytes = len(s)
	local count = 0

	-- hop from lead byte to lead byte, counting one character per hop
	while pos <= bytes do
		count = count + 1
		pos = pos + utf8charbytes(s, pos)
	end

	return count
end
199
-- UTF-8 aware counterpart of string.sub: i and j are character positions
-- instead of byte positions.
--
-- s: string to slice
-- i: position of the first character; a negative value counts back from
--    the end of the string
-- j: position of the last character (default -1, i.e. the last one)
--
-- Returns the selected substring, or "" when the range is empty.
local function utf8sub (s, i, j)
	-- argument defaults
	j = j or -1

	local byteCount = len(s)

	-- the character count is only needed to resolve a negative index
	local charCount = (i >= 0 and j >= 0) or utf8len(s)

	local firstChar, lastChar
	if i >= 0 then
		firstChar = i
	else
		firstChar = charCount + i + 1
	end
	if j >= 0 then
		lastChar = j
	else
		lastChar = charCount + j + 1
	end

	-- can't have start before end!
	if firstChar > lastChar then
		return ""
	end

	-- byte offsets to pass to string.sub
	local firstByte, lastByte = 1, byteCount

	local cursor = 1
	local seen = 0
	while cursor <= byteCount do
		seen = seen + 1

		if seen == firstChar then
			firstByte = cursor
		end

		cursor = cursor + utf8charbytes(s, cursor)

		if seen == lastChar then
			lastByte = cursor - 1
			break
		end
	end

	-- start lies behind the last character: make string.sub return ""
	if firstChar > seen then
		firstByte = byteCount + 1
	end
	-- end lies before the first character: likewise
	if lastChar < 1 then
		lastByte = 0
	end

	return sub(s, firstByte, lastByte)
end
243
244
-- Replaces UTF-8 characters based on a mapping table.
--
-- s:       string to transform
-- mapping: table indexed by single UTF-8 characters (as strings); a
--          character without an entry is copied through unchanged
--
-- Returns the transformed string.
-- Raises an error when s is not a string or mapping is not a table, and
-- propagates the errors of utf8charbytes when s is not valid UTF-8.
local function utf8replace (s, mapping)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
	end
	if type(mapping) ~= "table" then
		error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
	end

	local pos = 1
	local bytes = len(s)

	-- collect the pieces and join them once at the end; appending to a
	-- string inside the loop would copy the whole result on every pass
	local pieces = {}
	local count = 0

	while pos <= bytes do
		local charbytes = utf8charbytes(s, pos)
		local c = sub(s, pos, pos + charbytes - 1)

		count = count + 1
		pieces[count] = mapping[c] or c

		pos = pos + charbytes
	end

	return table.concat(pieces)
end
271
272
-- Counterpart of string.upper that applies the Unicode simple case
-- conversions. The lower -> upper mapping is read from the global
-- utf8_lc_uc, which this file does not define itself.
local function utf8upper (s)
	local lowerToUpper = utf8_lc_uc
	return utf8replace(s, lowerToUpper)
end
277
-- Counterpart of string.lower that applies the Unicode simple case
-- conversions. The upper -> lower mapping is read from the global
-- utf8_uc_lc, which this file does not define itself.
local function utf8lower (s)
	local upperToLower = utf8_uc_lc
	return utf8replace(s, upperToLower)
end
282
-- Counterpart of string.reverse that keeps every multi-byte UTF-8
-- character intact.
--
-- s: string to reverse
--
-- Returns a string holding the characters of s in reverse order.
-- Raises an error when s is not a string, when s begins with a
-- continuation byte that has no lead byte, and propagates the errors of
-- utf8charbytes for a character that fails validation.
local function utf8reverse (s)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
	end

	local pos = len(s)

	-- collect the characters and join them once at the end
	local pieces = {}
	local count = 0

	while pos > 0 do
		-- walk backwards over continuation bytes (%x80-BF) to the lead byte
		local c = byte(s, pos)
		while c >= 128 and c <= 191 do
			pos = pos - 1
			c = byte(s, pos)
			if not c then
				-- ran off the front of the string without finding a lead byte
				error("Invalid UTF-8 character")
			end
		end

		local charbytes = utf8charbytes(s, pos)

		count = count + 1
		pieces[count] = sub(s, pos, pos + charbytes - 1)

		pos = pos - 1
	end

	return table.concat(pieces)
end
311
-- Encodes one Unicode code point as a UTF-8 string.
-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
--
-- unicode: code point, at most 0x10FFFF
--
-- Returns a string of one to four bytes.
-- Raises an error when unicode is greater than 0x10FFFF.
local function utf8char(unicode)
	local floor = math.floor

	if unicode <= 0x7F then
		-- one byte: the code point itself
		return char(unicode)
	elseif unicode <= 0x7FF then
		-- two bytes: 5 + 6 payload bits
		return char(
			0xC0 + floor(unicode / 0x40),
			0x80 + unicode % 0x40
		)
	elseif unicode <= 0xFFFF then
		-- three bytes: 4 + 6 + 6 payload bits
		return char(
			0xE0 + floor(unicode / 0x1000),
			0x80 + floor(unicode / 0x40) % 0x40,
			0x80 + unicode % 0x40
		)
	elseif unicode <= 0x10FFFF then
		-- four bytes: 3 + 6 + 6 + 6 payload bits
		return char(
			0xF0 + floor(unicode / 0x40000),
			0x80 + floor(unicode / 0x1000) % 0x40,
			0x80 + floor(unicode / 0x40) % 0x40,
			0x80 + unicode % 0x40
		)
	end

	error('Unicode cannot be greater than U+10FFFF!')
end
345
-- multipliers that move a 6-bit payload group to its place in the code point
local shift_6  = 2^6
local shift_12 = 2^12
local shift_18 = 2^18

-- UTF-8 aware counterpart of string.byte: returns the code points of
-- characters i..j of str.
--
-- str:      string to decode
-- i:        position of the first character (default 1)
-- j:        position of the last character (default i)
-- byte_pos: optional byte offset at which character i starts. When it is
--           given, i and j only act as a counter and the string is not
--           scanned from its beginning; the recursion uses this to hand
--           the position of the next character to itself.
--
-- Without byte_pos, negative positions count back from the end of the
-- string and positions before the first character are clamped to 1, the
-- way string.byte treats its indices.
--
-- Returns one number per decoded character; nothing when the range is
-- empty, and the range is cut off at the end of the string.
-- Propagates the errors of utf8charbytes for invalid UTF-8.
local utf8unicode
utf8unicode = function(str, i, j, byte_pos)
	i = i or 1
	j = j or i

	local total = len(str)

	if not byte_pos then
		-- resolve negative positions against the character count
		if i < 0 or j < 0 then
			local chars = utf8len(str)
			if i < 0 then i = chars + i + 1 end
			if j < 0 then j = chars + j + 1 end
		end
		if i < 1 then i = 1 end

		if i > j then return end

		-- find the byte at which character i starts
		byte_pos = 1
		for _ = 2, i do
			if byte_pos > total then return end
			byte_pos = byte_pos + utf8charbytes(str, byte_pos)
		end
	elseif i > j then
		return
	end

	-- nothing left to decode
	if byte_pos > total then return end

	local bytes = utf8charbytes(str, byte_pos)
	local byte0, byte1, byte2, byte3 = byte(str, byte_pos, byte_pos + bytes - 1)

	-- strip the length marker from the lead byte and the 10xxxxxx marker
	-- from every trailing byte, then stitch the payload bits together
	local unicode
	if bytes == 1 then
		unicode = byte0
	elseif bytes == 2 then
		unicode = (byte0 - 0xC0) * shift_6 + (byte1 - 0x80)
	elseif bytes == 3 then
		unicode = (byte0 - 0xE0) * shift_12 + (byte1 - 0x80) * shift_6 + (byte2 - 0x80)
	else
		unicode = (byte0 - 0xF0) * shift_18 + (byte1 - 0x80) * shift_12
			+ (byte2 - 0x80) * shift_6 + (byte3 - 0x80)
	end

	return unicode, utf8unicode(str, i + 1, j, byte_pos + bytes)
end
388
-- Returns an iterator over consecutive substrings of str that are
-- sub_len UTF-8 characters long (default 1).
--
-- Each call of the iterator returns the next substring together with the
-- byte positions of its first and last byte. It returns nothing once the
-- end of the string is reached before sub_len characters were collected.
-- The iterator accepts an optional number of bytes to skip before the
-- next substring is read.
local function utf8gensub(str, sub_len)
	sub_len = sub_len or 1

	local cursor = 1
	local total = #str

	return function(skip)
		if skip then
			cursor = cursor + skip
		end

		local first = cursor
		local taken = 0

		while true do
			-- out of input before the chunk is complete: iteration ends
			if cursor > total then
				return
			end

			cursor = cursor + utf8charbytes(str, cursor)
			taken = taken + 1

			if taken == sub_len then
				break
			end
		end

		local last = cursor - 1
		return sub(str, first, last), first, last
	end
end
411
-- Looks item up in the ascending array sortedTable.
--
-- sortedTable: array sorted in ascending order
-- item:        value to look for
-- comp:        when given, the halving search is skipped and only the
--              first and the last element are compared with item
--
-- Returns true and the index of the hit, or false when item is not found.
local function binsearch(sortedTable, item, comp)
	local low, high = 1, #sortedTable

	if not comp then
		-- narrow the window until it holds at most two candidates
		while high - low > 1 do
			local middle = math.floor((low + high) / 2)
			if sortedTable[middle] > item then
				high = middle
			else
				low = middle
			end
		end
	end

	if sortedTable[low] == item then
		return true, low
	end
	if sortedTable[high] == item then
		return true, high
	end
	return false
end
-- Code points covered by the %-classes that classMatchGenerator expands.
-- "ranges" lists inclusive {first, last} pairs, "codes" single code points.
local utf8ClassExpansions = {
	-- %a: represents all letters. (ONLY ASCII)
	a = { ranges = { {65, 90}, {97, 122} } },
	-- %c: represents all control characters.
	c = { ranges = { {0, 31} }, codes = { 127 } },
	-- %d: represents all digits.
	d = { ranges = { {48, 57} } },
	-- %g: represents all printable characters except space.
	g = { ranges = {
		{1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191},
		{8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287},
	} },
	-- %l: represents all lowercase letters. (ONLY ASCII)
	l = { ranges = { {97, 122} } },
	-- %p: represents all punctuation characters. (ONLY ASCII)
	p = { ranges = { {33, 47}, {58, 64}, {91, 96}, {123, 126} } },
	-- %s: represents all space characters.
	s = {
		ranges = { {9, 13}, {8192, 8202} },
		codes = { 32, 133, 160, 5760, 8232, 8233, 8239, 8287, 12288 },
	},
	-- %u: represents all uppercase letters. (ONLY ASCII)
	u = { ranges = { {65, 90} } },
	-- %w: represents all alphanumeric characters. (ONLY ASCII)
	w = { ranges = { {48, 57}, {65, 90}, {97, 122} } },
	-- %x: represents all hexadecimal digits.
	x = { ranges = { {48, 57}, {65, 70}, {97, 102} } },
}

-- Builds a predicate for one character class.
--
-- class: the class text: literal characters, "a-z" style ranges,
--        %-escapes (the class letters a c d g l p s u w x are expanded,
--        any other escaped character is taken literally), an optional
--        leading "^" that negates the class, and an optional closing "]"
--        that ends parsing
-- plain: when true every character of class is taken literally
--
-- Returns the predicate and the byte position of the last character of
-- class that was read. The predicate takes a code point and returns a
-- true value when it belongs to the class; a negated class never accepts
-- the code -1.
-- Raises error('!!!') when "^" appears anywhere but in first place.
local function classMatchGenerator(class, plain)
	local codes = {}
	local ranges = {}
	local ignore = false
	local range = false
	local firstletter = true
	local unmatch = false

	-- Records the literal character c: as a single code point, or as the
	-- upper bound of the range opened by a preceding '-'.
	local function addLiteral(c)
		local code = utf8unicode(c)

		if not range then
			table.insert(codes, code)
			return
		end

		range = false
		table.remove(codes) -- removing '-'
		local lower = table.remove(codes)

		if lower == nil then
			-- nothing came before the '-' (as in "-a"): both characters are
			-- literals; a range without a lower bound would make the
			-- predicate fail on a nil comparison
			table.insert(codes, 45) -- '-'
			table.insert(codes, code)
		else
			table.insert(ranges, {lower, code})
		end
	end

	local skip
	for c, _, be in utf8gensub(class) do
		skip = be

		if plain then
			addLiteral(c)
			ignore = false
		elseif ignore then
			-- the character right after '%'
			local expansion = utf8ClassExpansions[c]
			if expansion then
				for _, r in ipairs(expansion.ranges) do
					table.insert(ranges, {r[1], r[2]})
				end
				if expansion.codes then
					for _, code in ipairs(expansion.codes) do
						table.insert(codes, code)
					end
				end
			else
				addLiteral(c)
			end
			ignore = false
		elseif c == "%" then
			ignore = true
		elseif c == "-" then
			-- kept in codes as a marker until the upper bound arrives; it
			-- stays there as a literal when no upper bound follows
			table.insert(codes, utf8unicode(c))
			range = true
		elseif c == "^" then
			if not firstletter then
				error('!!!')
			end
			unmatch = true
		elseif c == ']' then
			break
		else
			addLiteral(c)
		end

		firstletter = false
	end

	-- binsearch needs the single code points in ascending order
	table.sort(codes)

	local function inRanges(charCode)
		for _, r in ipairs(ranges) do
			if r[1] <= charCode and charCode <= r[2] then
				return true
			end
		end
		return false
	end

	if not unmatch then
		return function(charCode)
			return binsearch(codes, charCode) or inRanges(charCode)
		end, skip
	else
		return function(charCode)
			return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
		end, skip
	end
end
563
-- utf8sub with an extra argument and an extra result value.
--
-- s:  string to slice
-- i:  position of the first character, counted from the scan start
-- j:  position of the last character (default -1)
-- sb: byte offset at which the scan starts (default 1)
--
-- Returns the substring and the byte offset right behind it. When the
-- resolved start lies behind the resolved end only "" is returned.
-- Negative positions are resolved with utf8len(s), i.e. against the
-- character count of the whole string.
local function utf8subWithBytes (s, i, j, sb)
	-- argument defaults
	j = j or -1

	local cursor = sb or 1
	local byteCount = len(s)

	-- the character count is only needed to resolve a negative index
	local charCount = (i >= 0 and j >= 0) or utf8len(s)

	local firstChar = i
	if not (i >= 0) then
		firstChar = charCount + i + 1
	end
	local lastChar = j
	if not (j >= 0) then
		lastChar = charCount + j + 1
	end

	-- can't have start before end!
	if firstChar > lastChar then
		return ""
	end

	-- byte offsets to pass to string.sub
	local firstByte = 1
	local lastByte = byteCount

	local seen = 0
	while cursor <= byteCount do
		seen = seen + 1

		if seen == firstChar then
			firstByte = cursor
		end

		cursor = cursor + utf8charbytes(s, cursor)

		if seen == lastChar then
			lastByte = cursor - 1
			break
		end
	end

	if firstChar > seen then
		firstByte = byteCount + 1
	end
	if lastChar < 1 then
		lastByte = 0
	end

	return sub(s, firstByte, lastByte), lastByte + 1
end
606
-- Matchers built by matcherGenerator, stored under their pattern string:
-- "cache" for regular patterns, "cachePlain" for plain ones. Both tables
-- are created with weak keys and weak values (__mode = 'kv').
local cache      = setmetatable({}, { __mode = 'kv' })
local cachePlain = setmetatable({}, { __mode = 'kv' })
613 local function matcherGenerator(regex, plain)
614         local matcher = {
615                 functions = {},
616                 captures = {}
617         }
618         if not plain then
619                 cache[regex] =  matcher
620         else
621                 cachePlain[regex] = matcher
622         end
623         local function simple(func)
624                 return function(cC) 
625                         if func(cC) then
626                                 matcher:nextFunc()
627                                 matcher:nextStr()
628                         else
629                                 matcher:reset()
630                         end
631                 end
632         end
633         local function star(func)
634                 return function(cC)
635                         if func(cC) then
636                                 matcher:fullResetOnNextFunc()
637                                 matcher:nextStr()
638                         else
639                                 matcher:nextFunc()
640                         end
641                 end
642         end
643         local function minus(func)
644                 return function(cC)
645                         if func(cC) then
646                                 matcher:fullResetOnNextStr()
647                         end
648                         matcher:nextFunc()
649                 end
650         end
651         local function question(func)
652                 return function(cC)
653                         if func(cC) then
654                                 matcher:fullResetOnNextFunc()
655                                 matcher:nextStr()
656                         end
657                         matcher:nextFunc()
658                 end
659         end
660         
661         local function capture(id)
662                 return function(cC)
663                         local l = matcher.captures[id][2] - matcher.captures[id][1]
664                         local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
665                         local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
666                         if captured == check then
667                                 for i = 0, l do
668                                         matcher:nextStr()
669                                 end
670                                 matcher:nextFunc()
671                         else
672                                 matcher:reset()
673                         end
674                 end
675         end
676         local function captureStart(id)
677                 return function(cC)
678                         matcher.captures[id][1] = matcher.str
679                         matcher:nextFunc()
680                 end
681         end
682         local function captureStop(id)
683                 return function(cC)
684                         matcher.captures[id][2] = matcher.str - 1
685                         matcher:nextFunc()
686                 end
687         end
688         
689         local function balancer(str)
690                 local sum = 0
691                 local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
692                 local skip = len(bc) + len(ec)
693                 bc, ec = utf8unicode(bc), utf8unicode(ec)
694                 return function(cC)
695                         if cC == ec and sum > 0 then
696                                 sum = sum - 1
697                                 if sum == 0 then
698                                         matcher:nextFunc()
699                                 end
700                                 matcher:nextStr()
701                         elseif cC == bc then
702                                 sum = sum + 1
703                                 matcher:nextStr()
704                         else
705                                 if sum == 0 or cC == -1 then
706                                         sum = 0
707                                         matcher:reset()
708                                 else
709                                         matcher:nextStr()
710                                 end
711                         end
712                 end, skip
713         end
714         
715         matcher.functions[1] = function(cC)
716                 matcher:fullResetOnNextStr()
717                 matcher.seqStart = matcher.str
718                 matcher:nextFunc()
719                 if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
720                         matcher.stop = true
721                         matcher.seqStart = nil
722                 end
723         end
724         
725         local lastFunc
726         local ignore = false
727         local skip = nil
728         local it = (function()
729                 local gen = utf8gensub(regex)
730                 return function()
731                         return gen(skip)
732                 end
733         end)()
734         local cs = {}
735         for c, bs, be in it do
736                 skip = nil
737                 if plain then
738                         table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
739                 else
740                         if ignore then
741                                 if find('123456789', c, 1, true) then
742                                         if lastFunc then
743                                                 table.insert(matcher.functions, simple(lastFunc))
744                                                 lastFunc = nil
745                                         end
746                                         table.insert(matcher.functions, capture(tonumber(c)))
747                                 elseif c == 'b' then
748                                         if lastFunc then
749                                                 table.insert(matcher.functions, simple(lastFunc))
750                                                 lastFunc = nil
751                                         end
752                                         local b
753                                         b, skip = balancer(sub(regex, be + 1, be + 9))
754                                         table.insert(matcher.functions, b)
755                                 else
756                                         lastFunc = classMatchGenerator('%' .. c)
757                                 end
758                                 ignore = false
759                         else
760                                 if c == '*' then
761                                         if lastFunc then
762                                                 table.insert(matcher.functions, star(lastFunc))
763                                                 lastFunc = nil
764                                         else
765                                                 error('invalid regex after ' .. sub(regex, 1, bs))
766                                         end
767                                 elseif c == '+' then
768                                         if lastFunc then
769                                                 table.insert(matcher.functions, simple(lastFunc))
770                                                 table.insert(matcher.functions, star(lastFunc))
771                                                 lastFunc = nil
772                                         else
773                                                 error('invalid regex after ' .. sub(regex, 1, bs))
774                                         end
775                                 elseif c == '-' then
776                                         if lastFunc then
777                                                 table.insert(matcher.functions, minus(lastFunc))
778                                                 lastFunc = nil
779                                         else
780                                                 error('invalid regex after ' .. sub(regex, 1, bs))
781                                         end
782                                 elseif c == '?' then
783                                         if lastFunc then
784                                                 table.insert(matcher.functions, question(lastFunc))
785                                                 lastFunc = nil
786                                         else
787                                                 error('invalid regex after ' .. sub(regex, 1, bs))
788                                         end
789                                 elseif c == '^' then
790                                         if bs == 1 then
791                                                 matcher.fromStart = true
792                                         else
793                                                 error('invalid regex after ' .. sub(regex, 1, bs))
794                                         end
795                                 elseif c == '$' then
796                                         if be == len(regex) then
797                                                 matcher.toEnd = true
798                                         else
799                                                 error('invalid regex after ' .. sub(regex, 1, bs))
800                                         end
801                                 elseif c == '[' then
802                                         if lastFunc then
803                                                 table.insert(matcher.functions, simple(lastFunc))
804                                         end
805                                         lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
806                                 elseif c == '(' then
807                                         if lastFunc then
808                                                 table.insert(matcher.functions, simple(lastFunc))
809                                                 lastFunc = nil
810                                         end
811                                         table.insert(matcher.captures, {})
812                                         table.insert(cs, #matcher.captures)
813                                         table.insert(matcher.functions, captureStart(cs[#cs]))
814                                         if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
815                                 elseif c == ')' then
816                                         if lastFunc then
817                                                 table.insert(matcher.functions, simple(lastFunc))
818                                                 lastFunc = nil
819                                         end
820                                         local cap = table.remove(cs)
821                                         if not cap then
822                                                 error('invalid capture: "(" missing')
823                                         end
824                                         table.insert(matcher.functions, captureStop(cap))
825                                 elseif c == '.' then
826                                         if lastFunc then
827                                                 table.insert(matcher.functions, simple(lastFunc))
828                                         end
829                                         lastFunc = function(cC) return cC ~= -1 end
830                                 elseif c == '%' then
831                                         ignore = true
832                                 else
833                                         if lastFunc then
834                                                 table.insert(matcher.functions, simple(lastFunc))
835                                         end
836                                         lastFunc = classMatchGenerator(c)
837                                 end
838                         end
839                 end
840         end
841         if #cs > 0 then
842                 error('invalid capture: ")" missing')
843         end
844         if lastFunc then
845                 table.insert(matcher.functions, simple(lastFunc))
846         end
847         lastFunc = nil
848         ignore = nil
849         
850         table.insert(matcher.functions, function()
851                 if matcher.toEnd and matcher.str ~= matcher.stringLen then
852                         matcher:reset()
853                 else
854                         matcher.stop = true
855                 end
856         end)
857         
858         matcher.nextFunc = function(self)
859                 self.func = self.func + 1
860         end
861         matcher.nextStr = function(self)
862                 self.str = self.str + 1
863         end
864         matcher.strReset = function(self)
865                 local oldReset = self.reset
866                 local str = self.str
867                 self.reset = function(s)
868                         s.str = str
869                         s.reset = oldReset
870                 end
871         end
872         matcher.fullResetOnNextFunc = function(self)
873                 local oldReset = self.reset
874                 local func = self.func +1
875                 local str = self.str
876                 self.reset = function(s)
877                         s.func = func
878                         s.str = str
879                         s.reset = oldReset
880                 end
881         end
882         matcher.fullResetOnNextStr = function(self)
883                 local oldReset = self.reset
884                 local str = self.str + 1
885                 local func = self.func
886                 self.reset = function(s)
887                         s.func = func
888                         s.str = str
889                         s.reset = oldReset
890                 end
891         end
892         
893         matcher.process = function(self, str, start)
894                 
895                 self.func = 1
896                 start = start or 1
897                 self.startStr = (start >= 0) and start or utf8len(str) + start + 1
898                 self.seqStart = self.startStr
899                 self.str = self.startStr
900                 self.stringLen = utf8len(str) + 1
901                 self.string = str
902                 self.stop = false
903                 
904                 self.reset = function(s)
905                         s.func = 1
906                 end
907
908                 local lastPos = self.str
909                 local lastByte
910                 local char
911                 while not self.stop do
912                         if self.str < self.stringLen then
913                                 --[[ if lastPos < self.str then
914                                         print('last byte', lastByte)
915                                         char, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
916                                         char, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
917                                         lastByte = lastByte - 1
918                                 else
919                                         char, lastByte = utf8subWithBytes(str, self.str, self.str)
920                                 end
921                                 lastPos = self.str ]]
922                                 char = utf8sub(str, self.str,self.str)
923                                 --print('char', char, utf8unicode(char))
924                                 self.functions[self.func](utf8unicode(char))
925                         else
926                                 self.functions[self.func](-1)
927                         end
928                 end
929                 
930                 if self.seqStart then
931                         local captures = {}
932                         for _,pair in pairs(self.captures) do
933                                 if pair.empty then
934                                         table.insert(captures, pair[1])
935                                 else
936                                         table.insert(captures, utf8sub(str, pair[1], pair[2]))
937                                 end
938                         end
939                         return self.seqStart, self.str - 1, unpack(captures)
940                 end
941         end
942         
943         return matcher
944 end
945
-- string.find
-- UTF-8 aware counterpart of string.find: positions are counted in characters
-- rather than bytes.
-- @param str    subject string
-- @param regex  pattern text (handed to matcherGenerator as plain text when
--               `plain` is truthy)
-- @param init   optional character index to start from; matcher:process
--               defaults it to 1 and accepts negative values counted from the
--               end of `str`
-- @param plain  optional flag forwarded to matcherGenerator
-- @return whatever matcher:process returns: the start and end character
--         indices of the match followed by its captures, or nothing when the
--         matcher reports no match
local function utf8find(str, regex, init, plain)
        -- `cache` is keyed by the pattern text alone, so an entry found there says
        -- nothing about the mode it was compiled in. Never reuse it for a plain
        -- search; build a dedicated matcher instead.
        -- NOTE(review): assumes `cache` only holds pattern-mode matchers; confirm
        -- against matcherGenerator.
        local matcher = (not plain and cache[regex]) or matcherGenerator(regex, plain)
        return matcher:process(str, init)
end
951
-- string.match
-- Runs utf8find and returns what was matched instead of where.
-- @param str    subject string
-- @param regex  pattern text
-- @param init   optional character index to start from (defaults to 1)
-- @return the captures when the pattern produced any, otherwise the whole
--         matched substring; nothing when there is no match
local function utf8match(str, regex, init)
        local result = {utf8find(str, regex, init or 1)}
        local first, last = result[1], result[2]
        if not first then
                return
        end
        -- Slots 3 and up hold the captures reported by utf8find.
        if result[3] then
                return unpack(result, 3)
        end
        return utf8sub(str, first, last)
end
963
-- string.gmatch
-- Returns an iterator that walks over successive matches of `regex` in `str`.
-- @param str    subject string
-- @param regex  pattern text; a leading '^' is escaped with '%' before use
-- @param all    when truthy each step yields the start index, the end index
--               and then the captures (utf8gsub relies on this form);
--               otherwise each step yields the captures, or the whole match
--               when there are none
-- @return iterator function; it returns nothing once no further match exists
local function utf8gmatch(str, regex, all)
        if utf8sub(regex, 1, 1) == '^' then
                regex = '%' .. regex
        end
        local firstResult = all and 1 or 3
        local strLen = utf8len(str)
        local lastChar = 1
        return function()
                -- Position strLen + 1 is still searched so that a pattern able to
                -- match the empty string can do so at the very end; beyond that
                -- there is nothing left to scan.
                if lastChar > strLen + 1 then
                        return
                end
                local found = {utf8find(str, regex, lastChar)}
                if not found[1] then
                        return
                end
                if found[2] < found[1] then
                        -- Empty match: resuming at found[2] + 1 would retry the same
                        -- position forever, so step over one character instead.
                        lastChar = found[1] + 1
                else
                        lastChar = found[2] + 1
                end
                if found[firstResult] then
                        return unpack(found, firstResult)
                end
                return utf8sub(str, found[1], found[2])
        end
end
979
-- Computes the replacement for a single match on behalf of utf8gsub.
-- @param repl  replacement specification:
--              * string:   '%' followed by a digit inserts that capture
--                          ('%0' is the whole match, and '%1' also means the
--                          whole match when the pattern has no captures);
--                          '%' followed by any other character inserts that
--                          character
--              * table:    looked up with the first capture, or with the whole
--                          match when there are no captures
--              * function: called with all captures, or with the whole match
--                          when there are no captures
-- @param args  args[0] is the whole match, args[1..n] are the captures
-- @return the replacement value. As with string.gsub, a nil/false result from
--         a table lookup or function call keeps the original match. Any other
--         type of `repl` yields ''.
-- Raises an error when a string `repl` names a capture that does not exist.
local function replace(repl, args)
        local whole = args[0]
        local kind = type(repl)

        if kind == 'string' then
                -- Pieces are collected and joined once to avoid repeated
                -- re-allocation of a growing string.
                local out = {}
                local escaped = false
                for c in utf8gensub(repl) do
                        if escaped then
                                local num = tonumber(c)
                                if num then
                                        local cap = args[num]
                                        if cap == nil and num == 1 then
                                                cap = whole
                                        end
                                        if cap == nil then
                                                error('invalid capture index %' .. c .. ' in replacement string')
                                        end
                                        out[#out + 1] = cap
                                else
                                        out[#out + 1] = c
                                end
                                escaped = false
                        elseif c == '%' then
                                escaped = true
                        else
                                out[#out + 1] = c
                        end
                end
                return table.concat(out)
        end

        local value
        if kind == 'table' then
                value = repl[args[1] or whole]
        elseif kind == 'function' then
                if #args > 0 then
                        value = repl(unpack(args, 1))
                else
                        value = repl(whole)
                end
        else
                return ''
        end
        -- nil/false means "no replacement": keep the matched text.
        return value or whole
end
-- string.gsub
-- Replaces matches of `regex` in `str` using `repl` (see replace()).
-- @param str    subject string
-- @param regex  pattern text
-- @param repl   string, table or function describing the replacement
-- @param limit  optional maximum number of substitutions; unlimited when
--               omitted
-- @return the resulting string and the number of substitutions performed
local function utf8gsub(str, regex, repl, limit)
        limit = limit or -1

        local nextMatch
        if utf8sub(regex, 1, 1) == '^' then
                -- utf8gmatch escapes a leading '^' into a literal caret, which would
                -- silently disable anchoring. An anchored pattern can match at most
                -- once, at the start, so ask utf8find directly.
                -- NOTE(review): relies on utf8find honouring matcher.fromStart.
                local tried = false
                nextMatch = function()
                        if tried then
                                return
                        end
                        tried = true
                        return utf8find(str, regex, 1)
                end
        else
                nextMatch = utf8gmatch(str, regex, true)
        end

        -- Collect the output in pieces and join once at the end.
        local pieces = {}
        local prevEnd = 1
        local n = 0
        local found = {nextMatch()}
        while #found > 0 and limit ~= n do
                -- found = { startIndex, endIndex, captures... }
                local args = {[0] = utf8sub(str, found[1], found[2]), unpack(found, 3)}
                pieces[#pieces + 1] = utf8sub(str, prevEnd, found[1] - 1)
                pieces[#pieces + 1] = replace(repl, args)
                prevEnd = found[2] + 1
                n = n + 1
                found = {nextMatch()}
        end
        pieces[#pieces + 1] = utf8sub(str, prevEnd)
        return table.concat(pieces), n
end
1031
-- Public module table returned to whoever loads this file.
-- `byte` is an alias of `unicode`.
-- NOTE(review): dump, format, lower, upper and rep are not defined in this part
-- of the file; presumably they are file-level aliases of the string library.
-- Confirm at the top of the file.
local utf8 = {
        len     = utf8len,
        sub     = utf8sub,
        reverse = utf8reverse,
        char    = utf8char,
        unicode = utf8unicode,
        gensub  = utf8gensub,
        byte    = utf8unicode,
        find    = utf8find,
        match   = utf8match,
        gmatch  = utf8gmatch,
        gsub    = utf8gsub,
        dump    = dump,
        format  = format,
        lower   = lower,
        upper   = upper,
        rep     = rep,
}
return utf8