1 -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
3 -- Provides UTF-8 aware string functions implemented in pure lua:
8 -- * utf8unicode(s, i, j)
9 -- * utf8gensub(s, sub_len)
10 -- * utf8find(str, regex, init, plain)
11 -- * utf8match(str, regex, init)
12 -- * utf8gmatch(str, regex, all)
13 -- * utf8gsub(str, regex, repl, limit)
15 -- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
16 -- additional functions are available:
20 -- All functions behave as their non UTF-8 aware counterparts with the exception
21 -- that UTF-8 characters are used instead of bytes for all units.
24 Copyright (c) 2006-2007, Kyle Smith
30 Redistribution and use in source and binary forms, with or without
31 modification, are permitted provided that the following conditions are met:
33 * Redistributions of source code must retain the above copyright notice,
34 this list of conditions and the following disclaimer.
35 * Redistributions in binary form must reproduce the above copyright
36 notice, this list of conditions and the following disclaimer in the
37 documentation and/or other materials provided with the distribution.
38 * Neither the name of the author nor the names of its contributors may be
39 used to endorse or promote products derived from this software without
40 specific prior written permission.
42 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
43 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
46 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
48 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
49 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
50 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-- ABNF from RFC 3629
--
-- UTF8-octets = *( UTF8-char )
-- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
-- UTF8-1      = %x00-7F
-- UTF8-2      = %xC2-DF UTF8-tail
-- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
--               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
--               %xF4 %x80-8F 2( UTF8-tail )
-- UTF8-tail   = %x80-BF
67 local byte = string.byte
68 local char = string.char
69 local dump = string.dump
70 local find = string.find
71 local format = string.format
72 local gmatch = string.gmatch
73 local gsub = string.gsub
74 local len = string.len
75 local lower = string.lower
76 local match = string.match
77 local rep = string.rep
78 local reverse = string.reverse
79 local sub = string.sub
80 local upper = string.upper
-- Returns the number of bytes used by the UTF-8 character that starts at
-- byte i of s. Doubles as a validator: the sequence is checked against the
-- RFC 3629 grammar (no overlong forms, no surrogates, nothing above U+10FFFF).
--
-- @param s  string to inspect
-- @param i  byte offset of the character's first byte (defaults to 1)
-- @return   1, 2, 3 or 4
--
-- Raises an error when an argument has the wrong type, when i does not
-- address a byte of s, when s ends in the middle of a sequence
-- ("UTF-8 string terminated early"), or when the bytes are not well-formed
-- ("Invalid UTF-8 character").
local function utf8charbytes (s, i)
	-- argument defaults
	i = i or 1

	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
	end
	if type(i) ~= "number" then
		error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
	end

	local c = byte(s, i)

	-- string.byte returns nothing for an offset outside the string; report
	-- that explicitly instead of failing on a nil comparison below
	if not c then
		error("bad argument #2 to 'utf8charbytes' (position out of range)")
	end

	-- determine bytes needed for character, based on RFC 3629
	-- validate byte 1
	if c <= 127 then
		-- UTF8-1 = %x00-7F; NUL (0) is a legal one-byte character
		return 1

	elseif c >= 194 and c <= 223 then
		-- UTF8-2 = %xC2-DF UTF8-tail
		local c2 = byte(s, i + 1)

		if not c2 then
			error("UTF-8 string terminated early")
		end

		-- validate byte 2
		if c2 < 128 or c2 > 191 then
			error("Invalid UTF-8 character")
		end

		return 2

	elseif c >= 224 and c <= 239 then
		-- UTF8-3
		local c2 = byte(s, i + 1)
		local c3 = byte(s, i + 2)

		if not c2 or not c3 then
			error("UTF-8 string terminated early")
		end

		-- validate byte 2: E0 forbids overlong forms, ED forbids surrogates
		if c == 224 and (c2 < 160 or c2 > 191) then
			error("Invalid UTF-8 character")
		elseif c == 237 and (c2 < 128 or c2 > 159) then
			error("Invalid UTF-8 character")
		elseif c2 < 128 or c2 > 191 then
			error("Invalid UTF-8 character")
		end

		-- validate byte 3
		if c3 < 128 or c3 > 191 then
			error("Invalid UTF-8 character")
		end

		return 3

	elseif c >= 240 and c <= 244 then
		-- UTF8-4
		local c2 = byte(s, i + 1)
		local c3 = byte(s, i + 2)
		local c4 = byte(s, i + 3)

		if not c2 or not c3 or not c4 then
			error("UTF-8 string terminated early")
		end

		-- validate byte 2: F0 forbids overlong forms, F4 caps at U+10FFFF
		if c == 240 and (c2 < 144 or c2 > 191) then
			error("Invalid UTF-8 character")
		elseif c == 244 and (c2 < 128 or c2 > 143) then
			error("Invalid UTF-8 character")
		elseif c2 < 128 or c2 > 191 then
			error("Invalid UTF-8 character")
		end

		-- validate byte 3
		if c3 < 128 or c3 > 191 then
			error("Invalid UTF-8 character")
		end

		-- validate byte 4
		if c4 < 128 or c4 > 191 then
			error("Invalid UTF-8 character")
		end

		return 4

	else
		-- stray continuation byte, C0/C1 overlong lead, or F5..FF
		error("Invalid UTF-8 character")
	end
end
-- Returns the number of characters in a UTF-8 string.
--
-- @param s  string to measure
-- @return   character count (0 for the empty string)
--
-- Raises "bad argument #1 ..." when s is not a string, and propagates the
-- errors of utf8charbytes when s is not well-formed UTF-8.
local function utf8len (s)
	-- argument checking
	-- (the former debugging dump via pairs(s) is gone: for nil or number
	-- arguments pairs() itself threw and hid the message below)
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
	end

	local pos = 1
	local bytes = len(s)
	local count = 0

	-- hop from lead byte to lead byte, counting one character per hop
	while pos <= bytes do
		count = count + 1
		pos = pos + utf8charbytes(s, pos)
	end

	return count
end
200 -- functions identically to string.sub except that i and j are UTF-8 characters
-- Character-indexed counterpart of string.sub.
--   s     UTF-8 string
--   i, j  character positions; a negative value is resolved against
--         utf8len(s), so -1 addresses the last character
-- Returns the result of string.sub on the computed byte offsets.
-- NOTE(review): the initialisation of pos, bytes and len, the bodies of the
-- branches and the closing `end`s are not visible in this view; `len` is
-- compared with numbers below, so it is presumably a local character counter
-- shadowing the file-level string.len alias -- confirm against the full file.
202 local function utf8sub (s, i, j)
210 -- only set l if i or j is negative
-- when both indices are non-negative l is the boolean `true`; that is
-- harmless because l only takes part in arithmetic in the negative branches
211 local l = (i >= 0 and j >= 0) or utf8len(s)
212 local startChar = (i >= 0) and i or l + i + 1
213 local endChar = (j >= 0) and j or l + j + 1
215 -- can't have start before end!
216 if startChar > endChar then
220 -- byte offsets to pass to string.sub
221 local startByte,endByte = 1,bytes
-- walk the string one character at a time (utf8charbytes gives the width of
-- the character at pos) looking for the start and end character indices
223 while pos <= bytes do
226 if len == startChar then
230 pos = pos + utf8charbytes(s, pos)
232 if len == endChar then
-- start beyond the last character: make string.sub return ""
238 if startChar > len then startByte = bytes+1 end
-- end before the first character: likewise an empty result
239 if endChar < 1 then endByte = 0 end
241 return sub(s, startByte, endByte)
245 -- replace UTF-8 characters based on a mapping table
-- Rebuilds s character by character, substituting every character that has an
-- entry in `mapping`.
--   s        UTF-8 string
--   mapping  table keyed by single UTF-8 characters (as strings); the value
--            is the replacement text
-- Raises "bad argument" errors for wrong argument types and propagates the
-- validation errors of utf8charbytes.
-- NOTE(review): the initialisation of pos, bytes, charbytes and newstr and the
-- return statement are not visible in this view.
246 local function utf8replace (s, mapping)
248 if type(s) ~= "string" then
249 error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
251 if type(mapping) ~= "table" then
252 error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
260 while pos <= bytes do
-- width in bytes of the character starting at pos
261 charbytes = utf8charbytes(s, pos)
262 local c = sub(s, pos, pos + charbytes - 1)
-- characters without a mapping entry are copied unchanged
-- NOTE(review): repeated `..` in a loop is quadratic for long inputs
264 newstr = newstr .. (mapping[c] or c)
266 pos = pos + charbytes
-- Upper-cases s like string.upper, but works on whole UTF-8 characters by
-- sending each one through the global utf8_lc_uc mapping table (Unicode
-- simple case conversions). The table is read at call time.
local function utf8upper (s)
	local lower_to_upper = utf8_lc_uc
	return utf8replace(s, lower_to_upper)
end
-- Lower-cases s like string.lower, but works on whole UTF-8 characters by
-- sending each one through the global utf8_uc_lc mapping table (Unicode
-- simple case conversions). The table is read at call time.
local function utf8lower (s)
	local upper_to_lower = utf8_uc_lc
	return utf8replace(s, upper_to_lower)
end
283 -- identical to string.reverse except that it supports UTF-8
-- Reverses a string character by character rather than byte by byte.
--   s  UTF-8 string
-- Raises a "bad argument" error when s is not a string and propagates the
-- validation errors of utf8charbytes.
-- NOTE(review): most of the body (initialisation of pos, c, charbytes and
-- newstr, the outer loop, the return) is not visible in this view.
284 local function utf8reverse (s)
286 if type(s) ~= "string" then
287 error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
-- 128..191 is the UTF8-tail (continuation byte) range, so this loop runs
-- while c is not the first byte of a character
-- NOTE(review): presumably it steps pos backwards to the lead byte -- confirm
297 while c >= 128 and c <= 191 do
302 charbytes = utf8charbytes(s, pos)
-- append the whole character found at pos to the result
304 newstr = newstr .. sub(s, pos, pos + charbytes - 1)
-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
--
-- Encodes one Unicode code point as a UTF-8 string.
--   unicode  code point as a number
-- Returns a string of 1 to 4 bytes.
-- Raises 'Unicode cannot be greater than U+10FFFF!' for larger values.
local function utf8char(unicode)
	-- 7-bit values are their own encoding
	if unicode <= 0x7F then return char(unicode) end

	-- pick the lead-byte marker and the number of continuation bytes
	local lead, tails
	if unicode <= 0x7FF then
		lead, tails = 0xC0, 1
	elseif unicode <= 0xFFFF then
		lead, tails = 0xE0, 2
	elseif unicode <= 0x10FFFF then
		lead, tails = 0xF0, 3
	else
		error 'Unicode cannot be greater than U+10FFFF!'
	end

	-- peel off six payload bits per continuation byte, lowest bits first,
	-- prepending so the bytes end up in transmission order
	local rest = unicode
	local encoded = ""
	for _ = 1, tails do
		encoded = char(0x80 + (rest % 0x40)) .. encoded
		rest = math.floor(rest / 0x40)
	end

	-- whatever is left belongs in the lead byte
	return char(lead + rest) .. encoded
end
-- Positional weights used when reassembling a code point from its bytes:
-- every continuation byte carries six payload bits.
-- NOTE(review): shift_6 is used below but its definition is not visible here.
347 local shift_12 = 2^12
348 local shift_18 = 2^18
-- Returns the Unicode code points of characters i..j of str as multiple
-- return values (the UTF-8 analogue of string.byte).
--   str       UTF-8 string
--   i, j      character positions
--   byte_pos  byte offset of character i; supplied by the recursive call so
--             the character can be read directly instead of via utf8sub
-- NOTE(review): the defaults for i and j, the declaration of `char`/`bytes`
-- and the if/elseif lines choosing between the branches are not visible in
-- this view; `char` may be a local or the file-level string.char alias.
351 utf8unicode = function(str, i, j, byte_pos)
-- empty range: return nothing, which also ends the recursion
355 if i > j then return end
-- known byte offset: measure the character there and slice it out
360 bytes = utf8charbytes(str,byte_pos)
361 char = sub(str,byte_pos,byte_pos-1+bytes)
-- no byte offset: locate character i by scanning from the start
363 char,byte_pos = utf8sub(str,i,i), 0
-- one byte: the byte value is the code point
369 if bytes == 1 then unicode = byte(char) end
-- two bytes: strip the 0xC0 lead marker and the 0x80 continuation marker
371 local byte0,byte1 = byte(char,1,2)
372 local code0,code1 = byte0-0xC0,byte1-0x80
373 unicode = code0*shift_6 + code1
-- three bytes: lead marker is 0xE0
376 local byte0,byte1,byte2 = byte(char,1,3)
377 local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
378 unicode = code0*shift_12 + code1*shift_6 + code2
-- four bytes: lead marker is 0xF0
381 local byte0,byte1,byte2,byte3 = byte(char,1,4)
382 local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
383 unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
-- this code point followed by those of the remaining characters; not a tail
-- call, so the recursion depth grows with the number of characters requested
386 return unicode,utf8unicode(str, i+1, j, byte_pos+bytes)
389 -- Returns an iterator which returns the next substring and its byte interval
-- Builds an iterator over consecutive substrings of str, each sub_len
-- characters long.
--   str      UTF-8 string
--   sub_len  characters per substring (defaults to 1)
-- Each call of the iterator returns: substring, first byte offset, last byte
-- offset. It accepts an optional `skip` argument, a number of bytes to jump
-- over before reading the next substring.
-- NOTE(review): the initialisation of byte_pos, len and char_count and the
-- `repeat` line opening the loop are not visible in this view.
390 local function utf8gensub(str, sub_len)
391 sub_len = sub_len or 1
394 return function(skip)
395 if skip then byte_pos = byte_pos + skip end
397 local start = byte_pos
-- returning nothing ends a generic `for` loop driven by this iterator
399 if byte_pos > len then return end
400 char_count = char_count + 1
401 local bytes = utf8charbytes(str,byte_pos)
402 byte_pos = byte_pos+bytes
404 until char_count == sub_len
406 local last = byte_pos-1
-- the local `sub` shadows the string.sub alias from here on; the right-hand
-- side is evaluated first, so it still calls string.sub
407 local sub = sub(str,start,last)
408 return sub, start, last
-- Binary search for `item` in the array `sortedTable`.
--   sortedTable  array whose elements can be compared with `>`
--   item         value to look for
--   comp         not referenced on any visible line
-- Returns true and the index when item is found.
-- NOTE(review): the branch bodies that move head/tail and the not-found
-- return are not visible in this view; the search only makes sense if
-- sortedTable is sorted ascending -- confirm the callers guarantee that.
412 local function binsearch(sortedTable, item, comp)
413 local head, tail = 1, #sortedTable
414 local mid = math.floor((head + tail)/2)
-- narrow [head, tail] until at most two candidates remain
416 while (tail - head) > 1 do
417 if sortedTable[tonumber(mid)] > item then
422 mid = math.floor((head + tail)/2)
-- the loop stops with head and tail adjacent (or equal): test both
426 if sortedTable[tonumber(head)] == item then
427 return true, tonumber(head)
428 elseif sortedTable[tonumber(tail)] == item then
429 return true, tonumber(tail)
-- Compiles a character class of the pattern language into a predicate over
-- code points.
--   class  text of the class, iterated one UTF-8 character at a time
--   plain  when truthy, the '%' class escapes below are not interpreted
-- Returns a function(charCode) that reports whether the code point belongs to
-- the class. Two tables are filled while parsing: `codes` (single code
-- points, looked up with binsearch) and `ranges` ({first, last} pairs,
-- scanned linearly by inRanges).
-- NOTE(review): the declarations of codes, ranges and ignore, the loop header,
-- several branch conditions and any sorting of `codes` are not visible in
-- this view; binsearch presupposes a sorted `codes` table -- confirm.
434 local function classMatchGenerator(class, plain)
439 local firstletter = true
440 local unmatch = false
442 local it = utf8gensub(class)
-- ordinary (unescaped) character while pattern syntax is active
447 if not ignore and not plain then
451 table.insert(codes, utf8unicode(c))
454 if not firstletter then
463 table.insert(codes, utf8unicode(c))
-- a range "a-b": drop the '-' and the start character from codes and record
-- the pair as a range instead
465 table.remove(codes) -- removing '-'
466 table.insert(ranges, {table.remove(codes), utf8unicode(c)})
-- character following an escape: expand the predefined classes
470 elseif ignore and not plain then
471 if c == 'a' then -- %a: represents all letters. (ONLY ASCII)
472 table.insert(ranges, {65, 90}) -- A - Z
473 table.insert(ranges, {97, 122}) -- a - z
474 elseif c == 'c' then -- %c: represents all control characters.
475 table.insert(ranges, {0, 31})
476 table.insert(codes, 127)
477 elseif c == 'd' then -- %d: represents all digits.
478 table.insert(ranges, {48, 57}) -- 0 - 9
479 elseif c == 'g' then -- %g: represents all printable characters except space.
-- the gaps between these ranges are exactly the code points listed for %s
480 table.insert(ranges, {1, 8})
481 table.insert(ranges, {14, 31})
482 table.insert(ranges, {33, 132})
483 table.insert(ranges, {134, 159})
484 table.insert(ranges, {161, 5759})
485 table.insert(ranges, {5761, 8191})
486 table.insert(ranges, {8203, 8231})
487 table.insert(ranges, {8234, 8238})
488 table.insert(ranges, {8240, 8286})
489 table.insert(ranges, {8288, 12287})
490 elseif c == 'l' then -- %l: represents all lowercase letters. (ONLY ASCII)
491 table.insert(ranges, {97, 122}) -- a - z
492 elseif c == 'p' then -- %p: represents all punctuation characters. (ONLY ASCII)
493 table.insert(ranges, {33, 47})
494 table.insert(ranges, {58, 64})
495 table.insert(ranges, {91, 96})
496 table.insert(ranges, {123, 126})
497 elseif c == 's' then -- %s: represents all space characters.
-- ASCII whitespace plus a set of non-ASCII space code points
498 table.insert(ranges, {9, 13})
499 table.insert(codes, 32)
500 table.insert(codes, 133)
501 table.insert(codes, 160)
502 table.insert(codes, 5760)
503 table.insert(ranges, {8192, 8202})
504 table.insert(codes, 8232)
505 table.insert(codes, 8233)
506 table.insert(codes, 8239)
507 table.insert(codes, 8287)
508 table.insert(codes, 12288)
509 elseif c == 'u' then -- %u: represents all uppercase letters. (ONLY ASCII)
510 table.insert(ranges, {65, 90}) -- A - Z
511 elseif c == 'w' then -- %w: represents all alphanumeric characters. (ONLY ASCII)
512 table.insert(ranges, {48, 57}) -- 0 - 9
513 table.insert(ranges, {65, 90}) -- A - Z
514 table.insert(ranges, {97, 122}) -- a - z
515 elseif c == 'x' then -- %x: represents all hexadecimal digits.
516 table.insert(ranges, {48, 57}) -- 0 - 9
517 table.insert(ranges, {65, 70}) -- A - F
518 table.insert(ranges, {97, 102}) -- a - f
521 table.insert(codes, utf8unicode(c))
523 table.remove(codes) -- removing '-'
524 table.insert(ranges, {table.remove(codes), utf8unicode(c)})
531 table.insert(codes, utf8unicode(c))
533 table.remove(codes) -- removing '-'
534 table.insert(ranges, {table.remove(codes), utf8unicode(c)})
-- true when charCode falls inside any recorded {first, last} range
545 local function inRanges(charCode)
546 for _,r in ipairs(ranges) do
547 if r[1] <= charCode and charCode <= r[2] then
-- positive class: member of codes or of any range
554 return function(charCode)
555 return binsearch(codes, charCode) or inRanges(charCode)
-- complemented class: anything that is not a member, except -1, which is
-- never accepted
-- NOTE(review): presumably selected when `unmatch` is set -- the condition is
-- not visible here
558 return function(charCode)
559 return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
564 -- utf8sub with extra argument, and extra result value
-- Variant of utf8sub that takes an extra argument and returns an extra value.
--   s     UTF-8 string
--   i, j  character positions; negative values are resolved against utf8len(s)
--   sb    extra argument
--         NOTE(review): not referenced on any visible line; presumably a
--         starting byte offset -- confirm against the full file
-- Returns the substring and endByte + 1, the byte offset just past it.
-- NOTE(review): the initialisation of pos, bytes and len and the branch
-- bodies are not visible in this view.
565 local function utf8subWithBytes (s, i, j, sb)
573 -- only set l if i or j is negative
-- l is the boolean `true` when both indices are non-negative; it only takes
-- part in arithmetic in the negative branches
574 local l = (i >= 0 and j >= 0) or utf8len(s)
575 local startChar = (i >= 0) and i or l + i + 1
576 local endChar = (j >= 0) and j or l + j + 1
578 -- can't have start before end!
579 if startChar > endChar then
583 -- byte offsets to pass to string.sub
584 local startByte,endByte = 1,bytes
-- advance one character per iteration, watching for the start and end indices
586 while pos <= bytes do
589 if len == startChar then
593 pos = pos + utf8charbytes(s, pos)
595 if len == endChar then
-- out-of-range indices collapse to an empty slice
601 if startChar > len then startByte = bytes+1 end
602 if endChar < 1 then endByte = 0 end
604 return sub(s, startByte, endByte), endByte + 1
-- Compiled matchers keyed by pattern text: `cache` for matchers built as
-- patterns, `cachePlain` for matchers built with plain = true (both are
-- filled by matcherGenerator).
-- NOTE(review): the metatable contents are not visible in this view.
607 local cache = setmetatable({},{
610 local cachePlain = setmetatable({},{
-- Compiles `regex` (Lua pattern syntax, UTF-8 aware) into a matcher object.
--   regex  pattern text
--   plain  when truthy, the text is matched without pattern syntax
-- The matcher holds a list `functions` of step functions. matcher.process
-- feeds the subject string to the current step one code point at a time,
-- with -1 standing for "end of string". Steps move the machine with
-- nextFunc / nextStr and register backtracking points by wrapping
-- matcher.reset.
-- NOTE(review): a large share of this function (the matcher table
-- constructor, step-function bodies, the branch conditions of the parsing
-- loop, closing `end`s and the final return) is not visible in this view;
-- the comments below describe only what the visible lines show.
613 local function matcherGenerator(regex, plain)
-- the finished matcher is filed under its pattern text in one of two caches
619 cache[regex] = matcher
621 cachePlain[regex] = matcher
-- step wrappers around a class predicate `func`: one for a single
-- occurrence and one each for the '*', '-' and '?' quantifiers
623 local function simple(func)
633 local function star(func)
636 matcher:fullResetOnNextFunc()
643 local function minus(func)
646 matcher:fullResetOnNextStr()
651 local function question(func)
654 matcher:fullResetOnNextFunc()
-- back-reference step: compares the text recorded for capture `id` with the
-- text of the same length at the current position
661 local function capture(id)
663 local l = matcher.captures[id][2] - matcher.captures[id][1]
664 local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
665 local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
666 if captured == check then
-- capture bookkeeping: slot 1 is the first character position, slot 2 the
-- last one (the position before the current one)
676 local function captureStart(id)
678 matcher.captures[id][1] = matcher.str
682 local function captureStop(id)
684 matcher.captures[id][2] = matcher.str - 1
-- balanced-pair step: the first two characters of `str` are the opening and
-- closing delimiters; `skip` is their combined length in bytes
689 local function balancer(str)
691 local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
692 local skip = len(bc) + len(ec)
693 bc, ec = utf8unicode(bc), utf8unicode(ec)
695 if cC == ec and sum > 0 then
705 if sum == 0 or cC == -1 then
-- entry step: remembers where this match attempt starts and arranges a retry
-- from the next character
715 matcher.functions[1] = function(cC)
716 matcher:fullResetOnNextStr()
717 matcher.seqStart = matcher.str
-- for an anchored pattern that has moved past the start, or at the end of the
-- string, the visible body clears seqStart (no match)
719 if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
721 matcher.seqStart = nil
728 local it = (function()
729 local gen = utf8gensub(regex)
-- parse the pattern one character at a time; bs and be are the byte offsets
-- of the current character within regex
735 for c, bs, be in it do
738 table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
-- a digit 1..9: back-reference to an earlier capture
741 if find('123456789', c, 1, true) then
743 table.insert(matcher.functions, simple(lastFunc))
746 table.insert(matcher.functions, capture(tonumber(c)))
749 table.insert(matcher.functions, simple(lastFunc))
-- balanced match: the balancer reads its two delimiters from the text right
-- after the current character
753 b, skip = balancer(sub(regex, be + 1, be + 9))
754 table.insert(matcher.functions, b)
756 lastFunc = classMatchGenerator('%' .. c)
762 table.insert(matcher.functions, star(lastFunc))
765 error('invalid regex after ' .. sub(regex, 1, bs))
-- one mandatory occurrence followed by a star step
769 table.insert(matcher.functions, simple(lastFunc))
770 table.insert(matcher.functions, star(lastFunc))
773 error('invalid regex after ' .. sub(regex, 1, bs))
777 table.insert(matcher.functions, minus(lastFunc))
780 error('invalid regex after ' .. sub(regex, 1, bs))
784 table.insert(matcher.functions, question(lastFunc))
787 error('invalid regex after ' .. sub(regex, 1, bs))
791 matcher.fromStart = true
793 error('invalid regex after ' .. sub(regex, 1, bs))
796 if be == len(regex) then
799 error('invalid regex after ' .. sub(regex, 1, bs))
803 table.insert(matcher.functions, simple(lastFunc))
805 lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
808 table.insert(matcher.functions, simple(lastFunc))
-- opening a capture: allocate a slot and push its id on the stack `cs`
811 table.insert(matcher.captures, {})
812 table.insert(cs, #matcher.captures)
813 table.insert(matcher.functions, captureStart(cs[#cs]))
-- "()" is flagged as an empty capture
814 if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
817 table.insert(matcher.functions, simple(lastFunc))
-- closing a capture: pop the most recently opened id
820 local cap = table.remove(cs)
822 error('invalid capture: "(" missing')
824 table.insert(matcher.functions, captureStop(cap))
827 table.insert(matcher.functions, simple(lastFunc))
-- predicate accepting any character but not the end-of-string marker
829 lastFunc = function(cC) return cC ~= -1 end
834 table.insert(matcher.functions, simple(lastFunc))
836 lastFunc = classMatchGenerator(c)
842 error('invalid capture: ")" missing')
845 table.insert(matcher.functions, simple(lastFunc))
-- final step: an end-anchored pattern must have consumed the whole string
850 table.insert(matcher.functions, function()
851 if matcher.toEnd and matcher.str ~= matcher.stringLen then
-- advance to the next step function / the next character of the subject
858 matcher.nextFunc = function(self)
859 self.func = self.func + 1
861 matcher.nextStr = function(self)
862 self.str = self.str + 1
-- each helper below saves the current reset handler in oldReset and installs
-- a new one
864 matcher.strReset = function(self)
865 local oldReset = self.reset
867 self.reset = function(s)
872 matcher.fullResetOnNextFunc = function(self)
873 local oldReset = self.reset
874 local func = self.func +1
876 self.reset = function(s)
882 matcher.fullResetOnNextStr = function(self)
883 local oldReset = self.reset
884 local str = self.str + 1
885 local func = self.func
886 self.reset = function(s)
-- Runs the compiled matcher over `str` beginning at character `start`.
-- Returns: first and last character index of the match followed by the
-- captures.
893 matcher.process = function(self, str, start)
-- a negative start counts from the end of the string
897 self.startStr = (start >= 0) and start or utf8len(str) + start + 1
898 self.seqStart = self.startStr
899 self.str = self.startStr
-- one past the last character index
900 self.stringLen = utf8len(str) + 1
904 self.reset = function(s)
908 local lastPos = self.str
911 while not self.stop do
912 if self.str < self.stringLen then
913 --[[ if lastPos < self.str then
914 print('last byte', lastByte)
915 char, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte)
916 char, lastByte = utf8subWithBytes(str, 1, 1, lastByte)
917 lastByte = lastByte - 1
919 char, lastByte = utf8subWithBytes(str, self.str, self.str)
921 lastPos = self.str ]]
-- NOTE(review): utf8sub scans from the start of str on every step
922 char = utf8sub(str, self.str,self.str)
923 --print('char', char, utf8unicode(char))
924 self.functions[self.func](utf8unicode(char))
-- past the last character: feed the end-of-string marker
926 self.functions[self.func](-1)
-- seqStart is still set only when a match was found
930 if self.seqStart then
932 for _,pair in pairs(self.captures) do
-- NOTE(review): presumably the position form is used for empty captures "()"
-- and the substring form otherwise -- the condition is not visible here
934 table.insert(captures, pair[1])
936 table.insert(captures, utf8sub(str, pair[1], pair[2]))
939 return self.seqStart, self.str - 1, unpack(captures)
-- UTF-8 aware counterpart of string.find.
--   str    subject string
--   regex  pattern text (or literal text when plain is truthy)
--   init   character position to start from, handed to matcher:process
--   plain  when truthy, regex is matched without pattern syntax
-- Returns whatever matcher:process returns: the first and last character
-- index of the match followed by any captures.
--
-- Compiled matchers are reused. Plain matchers are looked up in cachePlain
-- and pattern matchers in cache, so a plain search never picks up a matcher
-- that was compiled for the same text as a pattern.
local function utf8find(str, regex, init, plain)
	local store = plain and cachePlain or cache
	local matcher = store[regex] or matcherGenerator(regex, plain)
	return matcher:process(str, init)
end
-- UTF-8 aware counterpart of string.match.
--   str    subject string
--   regex  pattern text
--   init   character position to start from
-- Returns the captures of the first match, or the whole matched text when
-- there are no captures.
-- NOTE(review): the default for init and the conditions selecting between the
-- two returns are not visible in this view.
953 local function utf8match(str, regex, init)
-- found = { first index, last index, capture 1, capture 2, ... }
955 local found = {utf8find(str, regex, init)}
-- everything from slot 3 onwards is a capture
958 return unpack(found, 3)
-- no captures: slice the whole match out of the subject
960 return utf8sub(str, found[1], found[2])
-- UTF-8 aware counterpart of string.gmatch: returns an iterator over
-- successive matches of regex in str.
--   str    subject string
--   regex  pattern text
--   all    when truthy, each result also carries the first and last
--          character index of the match ahead of the captures
-- NOTE(review): the initialisation of lastChar, the iterator's function line
-- and the end-of-iteration check are not visible in this view.
965 local function utf8gmatch(str, regex, all)
-- a leading '^' gets a '%' put in front of it, i.e. it is escaped
966 regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
969 local found = {utf8find(str, regex, lastChar)}
-- the next search resumes right after this match
971 lastChar = found[2] + 1
-- slot 1 always exists for a match, so with `all` this branch is taken;
-- without it the branch depends on there being at least one capture
972 if found[all and 1 or 3] then
973 return unpack(found, all and 1 or 3)
-- no captures requested or present: return the matched text itself
975 return utf8sub(str, found[1], found[2])
-- Computes the replacement text for one match on behalf of utf8gsub.
--   repl  replacement: a string, a table or a function
--   args  table with the whole match at index 0 and the captures from
--         index 1 upwards
-- NOTE(review): the declaration and return of `ret`, the escape handling
-- inside the string branch and the condition choosing between the two
-- function-call forms are not visible in this view.
980 local function replace(repl, args)
982 if type(repl) == 'string' then
-- walk the replacement text one UTF-8 character at a time
985 for c in utf8gensub(repl) do
-- insert the capture (or whole match, for index 0) with number `num`
995 ret = ret .. args[num]
1002 elseif type(repl) == 'table' then
-- look up by the first capture, or by the whole match when there is none;
-- a missing entry yields the empty string
1003 ret = repl[args[1] or args[0]] or ''
1004 elseif type(repl) == 'function' then
-- call with the captures ...
1006 ret = repl(unpack(args, 1)) or ''
-- ... or with the whole match; a nil/false result yields the empty string
1008 ret = repl(args[0]) or ''
-- UTF-8 aware counterpart of string.gsub.
--   str    subject string
--   regex  pattern text
--   repl   replacement: string, table or function (see replace)
--   limit  the loop stops once the counter n equals this value
-- Returns the resulting string and n.
-- NOTE(review): the initialisation of ret, prevEnd and n and the loop lines
-- that fetch the next match and bump n are not visible in this view.
1014 local function utf8gsub(str, regex, repl, limit)
-- all = true makes every result start with the match's first and last
-- character index
1018 local it = utf8gmatch(str, regex, true)
1019 local found = {it()}
-- an empty `found` table means the iterator produced no further match
1021 while #found > 0 and limit ~= n do
-- index 0: whole match; indices 1..: captures
1022 local args = {[0] = utf8sub(str, found[1], found[2]), unpack(found, 3)}
-- copy the text between the previous match and this one, then the replacement
1023 ret = ret .. utf8sub(str, prevEnd, found[1] - 1)
1024 .. replace(repl, args)
1025 prevEnd = found[2] + 1
-- append whatever follows the last match
1029 return ret .. utf8sub(str, prevEnd), n
-- Public interface: the local implementations above are attached to the
-- `utf8` table.
-- NOTE(review): the creation of the `utf8` table and any further assignments
-- are not visible in this view.
1035 utf8.reverse = utf8reverse
1036 utf8.char = utf8char
1037 utf8.unicode = utf8unicode
1038 utf8.gensub = utf8gensub
-- utf8.byte is an alias of utf8.unicode
1039 utf8.byte = utf8unicode
1040 utf8.find = utf8find
1041 utf8.match = utf8match
1042 utf8.gmatch = utf8gmatch
1043 utf8.gsub = utf8gsub
-- plain string.format (the alias declared at the top of the file), exposed
-- under the utf8 table unchanged
1045 utf8.format = format