package util
-import (
- "unicode/utf8"
-)
-
// LuaRuneAt is a helper function for lua plugins to return the rune
// at an index within a string
func LuaRuneAt(str string, runeidx int) string {
i := 0
for len(str) > 0 {
- r, size := utf8.DecodeRuneInString(str)
+ r, _, size := DecodeCharacterInString(str)
str = str[size:]
func LuaGetLeadingWhitespace(s string) string {
ws := []byte{}
for len(s) > 0 {
- r, size := utf8.DecodeRuneInString(s)
+ r, _, size := DecodeCharacterInString(s)
if r == ' ' || r == '\t' {
ws = append(ws, byte(r))
} else {
// LuaIsWordChar reports whether the first rune of s is a word character, as decided by IsWordChar
func LuaIsWordChar(s string) bool {
- r, _ := utf8.DecodeRuneInString(s)
+ r, _, _ := DecodeCharacterInString(s) // keep only the leading rune; the combining runes and the byte size are discarded
return IsWordChar(r)
}
"unicode/utf8"
)
+// Unicode is annoying. A "code point" (rune in Go-speak) may need up to
+// 4 bytes to represent it. In general, a code point will represent a
+// complete character, but this is not always the case. A character with
+// accents may be made up of multiple code points (the code point for the
+// original character, and additional code points for each accent/marking).
+// The functions below are meant to help deal with these additional "combining"
+// code points. In underlying operations (search, replace, etc.), micro will
+// treat a character with combining code points as just the original code point.
+// For rendering, micro will display the combining characters. It's not perfect,
+// but it's pretty good.
+
// combining character range table
var combining = &unicode.RangeTable{
R16: []unicode.Range16{
return r, combc, size
}
+// DecodeCharacterInString returns the next character from a string: the leading rune, the combining runes that directly follow it, and the total number of bytes they occupy.
+// A character is a rune along with any accompanying combining runes; the returned slice is nil when no combining rune follows.
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+ r, size := utf8.DecodeRuneInString(str) // leading rune, taken as-is without checking it against the combining table
+ str = str[size:]
+ c, s := utf8.DecodeRuneInString(str) // look ahead at the rune after the leading one
+
+ var combc []rune
+ for unicode.In(c, combining) { // stops as soon as the next rune is not in the combining table
+ combc = append(combc, c)
+ size += s // grow the reported size so it spans the whole character
+
+ str = str[s:]
+ c, s = utf8.DecodeRuneInString(str)
+ }
+
+ return r, combc, size
+}
+
// CharacterCount returns the number of characters in a byte array
// Similar to utf8.RuneCount but for unicode characters
func CharacterCount(b []byte) int {
"strings"
"time"
"unicode"
- "unicode/utf8"
"github.com/blang/semver"
runewidth "github.com/mattn/go-runewidth"
return str[totalSize:]
}
- _, size := utf8.DecodeRuneInString(str[totalSize:])
+ _, _, size := DecodeCharacterInString(str[totalSize:])
totalSize += size
i++
}
return str[:totalSize]
}
- _, size := utf8.DecodeRuneInString(str[totalSize:])
+ _, _, size := DecodeCharacterInString(str[totalSize:])
totalSize += size
i++
}
import (
"regexp"
"strings"
- "unicode/utf8"
)
func sliceStart(slc []byte, index int) []byte {
return slc[totalSize:]
}
- _, size := utf8.DecodeRune(slc[totalSize:])
+ _, _, size := DecodeCharacter(slc[totalSize:])
totalSize += size
i++
}
return slc[:totalSize]
}
- _, size := utf8.DecodeRune(slc[totalSize:])
+ _, _, size := DecodeCharacter(slc[totalSize:])
totalSize += size
i++
}
return 0
}
if p >= len(str) {
- return utf8.RuneCount(str)
+ return CharacterCount(str)
}
- return utf8.RuneCount(str[:p])
+ return CharacterCount(str[:p])
}
func combineLineMatch(src, dst LineMatch) LineMatch {
var strbytes []byte
if skip != nil {
strbytes = skip.ReplaceAllFunc(str, func(match []byte) []byte {
- res := make([]byte, utf8.RuneCount(match))
+ res := make([]byte, CharacterCount(match))
return res
})
} else {
}
func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, curRegion *region, statesOnly bool) LineMatch {
- lineLen := utf8.RuneCount(line)
+ lineLen := CharacterCount(line)
if start == 0 {
if !statesOnly {
if _, ok := highlights[0]; !ok {
}
func (h *Highlighter) highlightEmptyRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, statesOnly bool) LineMatch {
- lineLen := utf8.RuneCount(line)
+ lineLen := CharacterCount(line)
if lineLen == 0 {
if canMatchEnd {
h.lastRegion = nil
--- /dev/null
+package highlight
+
+import (
+ "unicode"
+ "unicode/utf8"
+)
+
+// combining is the range table of code points treated as combining marks; the character helpers below test runes against it with unicode.In
+var combining = &unicode.RangeTable{
+ R16: []unicode.Range16{ // each entry is {Lo, Hi, Stride}; a stride of 1 includes every code point from Lo to Hi
+ {0x0300, 0x036f, 1}, // combining diacritical marks
+ {0x1ab0, 0x1aff, 1}, // combining diacritical marks extended
+ {0x1dc0, 0x1dff, 1}, // combining diacritical marks supplement
+ {0x20d0, 0x20ff, 1}, // combining diacritical marks for symbols
+ {0xfe20, 0xfe2f, 1}, // combining half marks
+ },
+}
+
+// DecodeCharacter returns the next character from an array of bytes: the leading rune, the combining runes that directly follow it, and the total number of bytes they occupy.
+// A character is a rune along with any accompanying combining runes; the returned slice is nil when no combining rune follows.
+func DecodeCharacter(b []byte) (rune, []rune, int) {
+ r, size := utf8.DecodeRune(b) // leading rune, taken as-is even if it is itself in the combining table
+ b = b[size:]
+ c, s := utf8.DecodeRune(b) // look ahead at the rune after the leading one
+
+ var combc []rune
+ for unicode.In(c, combining) { // ends at the first non-combining rune; at end of input DecodeRune yields utf8.RuneError, which is outside the table
+ combc = append(combc, c)
+ size += s // grow the reported size so it spans the whole character
+
+ b = b[s:]
+ c, s = utf8.DecodeRune(b)
+ }
+
+ return r, combc, size
+}
+
+// DecodeCharacterInString returns the next character from a string: the leading rune, the combining runes that directly follow it, and the total number of bytes they occupy.
+// A character is a rune along with any accompanying combining runes; the returned slice is nil when no combining rune follows.
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+ r, size := utf8.DecodeRuneInString(str) // leading rune, taken as-is even if it is itself in the combining table
+ str = str[size:]
+ c, s := utf8.DecodeRuneInString(str) // look ahead at the rune after the leading one
+
+ var combc []rune
+ for unicode.In(c, combining) { // ends at the first non-combining rune; at end of input the decoder yields utf8.RuneError, which is outside the table
+ combc = append(combc, c)
+ size += s // grow the reported size so it spans the whole character
+
+ str = str[s:]
+ c, s = utf8.DecodeRuneInString(str)
+ }
+
+ return r, combc, size
+}
+
+// CharacterCount returns the number of characters in a byte array, i.e. the number of decoded runes that are not in the combining table
+// Similar to utf8.RuneCount but for unicode characters. NOTE(review): a combining rune at the very start of b is not counted here, although DecodeCharacter would return it as a leading rune; confirm the difference is intended
+func CharacterCount(b []byte) int {
+ s := 0
+
+ for len(b) > 0 {
+ r, size := utf8.DecodeRune(b) // an invalid byte decodes to utf8.RuneError with size 1, so the loop always advances
+ if !unicode.In(r, combining) { // combining runes are attributed to the preceding character and add nothing to the count
+ s++
+ }
+
+ b = b[size:]
+ }
+
+ return s
+}
+
+// CharacterCountInString returns the number of characters in a string, i.e. the number of runes that are not in the combining table
+// Similar to utf8.RuneCountInString but for unicode characters
+func CharacterCountInString(str string) int {
+ s := 0
+
+ for _, r := range str { // range decodes str one rune at a time
+ if !unicode.In(r, combining) { // combining runes are attributed to the preceding character and add nothing to the count
+ s++
+ }
+ }
+
+ return s
+}