internal/util/unicode.go

   1 package util
   2
   3 import (
   4         "unicode"
   5         "unicode/utf8"
   6 )
   7
   8 // Unicode is annoying. A "code point" (rune in Go-speak) may need up to
   9 // 4 bytes to represent it. In general, a code point will represent a
  10 // complete character, but this is not always the case. A character with
  11 // accents may be made up of multiple code points (the code point for the
  12 // original character, and additional code points for each accent/marking).
  13 // The functions below are meant to help deal with these additional "combining"
  14 // code points. In underlying operations (search, replace, etc...), micro will
  15 // treat a character with combining code points as just the original code point.
  16 // For rendering, micro will display the combining characters. It's not perfect
  17 // but it's pretty good.
  18
  19 var minMark = rune(unicode.Mark.R16[0].Lo)
  20
  21 func isMark(r rune) bool {
  22         // Fast path
  23         if r < minMark {
  24                 return false
  25         }
  26         return unicode.In(r, unicode.Mark)
  27 }
  28
  29 // DecodeCharacter returns the next character from an array of bytes
  30 // A character is a rune along with any accompanying combining runes
  31 func DecodeCharacter(b []byte) (rune, []rune, int) {
  32         r, size := utf8.DecodeRune(b)
  33         b = b[size:]
  34         c, s := utf8.DecodeRune(b)
  35
  36         var combc []rune
  37         for isMark(c) {
  38                 combc = append(combc, c)
  39                 size += s
  40
  41                 b = b[s:]
  42                 c, s = utf8.DecodeRune(b)
  43         }
  44
  45         return r, combc, size
  46 }
  47
  48 // DecodeCharacterInString returns the next character from a string
  49 // A character is a rune along with any accompanying combining runes
  50 func DecodeCharacterInString(str string) (rune, []rune, int) {
  51         r, size := utf8.DecodeRuneInString(str)
  52         str = str[size:]
  53         c, s := utf8.DecodeRuneInString(str)
  54
  55         var combc []rune
  56         for isMark(c) {
  57                 combc = append(combc, c)
  58                 size += s
  59
  60                 str = str[s:]
  61                 c, s = utf8.DecodeRuneInString(str)
  62         }
  63
  64         return r, combc, size
  65 }
  66
  67 // CharacterCount returns the number of characters in a byte array
  68 // Similar to utf8.RuneCount but for unicode characters
  69 func CharacterCount(b []byte) int {
  70         s := 0
  71
  72         for len(b) > 0 {
  73                 r, size := utf8.DecodeRune(b)
  74                 if !isMark(r) {
  75                         s++
  76                 }
  77
  78                 b = b[size:]
  79         }
  80
  81         return s
  82 }
  83
  84 // CharacterCount returns the number of characters in a string
  85 // Similar to utf8.RuneCountInString but for unicode characters
  86 func CharacterCountInString(str string) int {
  87         s := 0
  88
  89         for _, r := range str {
  90                 if !isMark(r) {
  91                         s++
  92                 }
  93         }
  94
  95         return s
  96 }