From: Steve Bennett Date: Fri, 8 Oct 2010 12:55:20 +0000 (+1000) Subject: Add utf-8 support to linenoise.c X-Git-Url: https://git.lizzy.rs/?a=commitdiff_plain;h=d384215c5aba7374c1bb2b543faedab30267906f;p=linenoise.git Add utf-8 support to linenoise.c Plus general improvements, including: - Allow pasting newlines to linenoise. Use TCSADRAIN, not TCAFLUSH so that unused input is not flushed Signed-off-by: Steve Bennett --- diff --git a/Makefile b/Makefile index a285410..7e50716 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,10 @@ -linenoise_example: linenoise.h linenoise.c +all: linenoise_example linenoise_utf8_example -linenoise_example: linenoise.c example.c - $(CC) -Wall -W -Os -g -o linenoise_example linenoise.c example.c +linenoise_example: linenoise.h linenoise.c example.c + $(CC) -Wall -W -Os -g -o $@ linenoise.c example.c + +linenoise_utf8_example: linenoise.c utf8.c example.c + $(CC) -DNO_COMPLETION -DUSE_UTF8 -Wall -W -Os -g -o $@ linenoise.c utf8.c example.c clean: - rm -f linenoise_example + rm -f linenoise_example linenoise_utf8_example diff --git a/linenoise.c b/linenoise.c index d1d32ca..bce5a4e 100644 --- a/linenoise.c +++ b/linenoise.c @@ -45,8 +45,8 @@ * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html * * Todo list: - * - Switch to gets() if $TERM is something we can't support. * - Win32 support + * - Save and load history containing newlines * * Bloat: * - Completion? 
@@ -106,10 +106,12 @@ #include #include #include +#include #include #include "linenoise.h" #include "linenoise.h" +#include "utf8.h" #define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100 #define LINENOISE_MAX_LINE 4096 @@ -171,7 +173,7 @@ static int enableRawMode(int fd) { raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */ /* put terminal in raw mode after flushing */ - if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal; + if (tcsetattr(fd,TCSADRAIN,&raw) < 0) goto fatal; rawmode = 1; return 0; @@ -182,7 +184,7 @@ fatal: static void disableRawMode(int fd) { /* Don't even check the return value as it's too late. */ - if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) + if (rawmode && tcsetattr(fd,TCSADRAIN,&orig_termios) != -1) rawmode = 0; } @@ -205,6 +207,7 @@ struct current { char *buf; /* Current buffer. Always null terminated */ int bufmax; /* Size of the buffer, including space for the null termination */ int len; /* Number of bytes in 'buf' */ + int chars; /* Number of chars in 'buf' (utf-8 chars) */ int pos; /* Cursor position, measured in chars */ int cols; /* Size of the window, in chars */ }; @@ -224,47 +227,122 @@ static void fd_printf(int fd, const char *format, ...) write(fd, buf, n); } -static void refreshLine(const char *prompt, struct current *c) { - size_t plen = strlen(prompt); - int extra = 0; - size_t i, p; - const char *buf = c->buf; - int len = c->len; - int pos = c->pos; +static int utf8_getchars(char *buf, int c) +{ +#ifdef USE_UTF8 + return utf8_fromunicode(buf, c); +#else + *buf = c; + return 1; +#endif +} + +/** + * Returns the unicode character at the given offset, + * or -1 if none. 
+ */ +static int get_char(struct current *current, int pos) +{ + if (pos >= 0 && pos < current->chars) { + int c; + int i = utf8_index(current->buf, pos); + utf8_tounicode(current->buf + i, &c); + return c; + } + return -1; +} + +static void refreshLine(const char *prompt, struct current *current) { + int plen; + int pchars; + int backup = 0; + int i; + const char *buf = current->buf; + int chars = current->chars; + int pos = current->pos; + int b; + int ch; + int n; - //fprintf(stderr, "\nrefreshLine: prompt=<<%s>>, buf=<<%s>>\n", prompt, c->buf); - //fprintf(stderr, "pos=%d, len=%d, cols=%d\n", pos, len, c->cols); + /* Should intercept SIGWINCH. For now, just get the size every time */ + current->cols = getColumns(); + + plen = strlen(prompt); + pchars = utf8_strlen(prompt, plen); + + /* Account for a line which is too long to fit in the window. + * Note that control chars require an extra column + */ + + /* How many cols are required to the left of 'pos'? + * The prompt, plus one extra for each control char + */ + n = pchars + utf8_strlen(buf, current->len); + b = 0; + for (i = 0; i < pos; i++) { + b += utf8_tounicode(buf + b, &ch); + if (ch < ' ') { + n++; + } + } - while((plen+pos) >= c->cols) { - buf++; - len--; - pos--; + /* If too many are need, strip chars off the front of 'buf' + * until it fits. Note that if the current char is a control character, + * we need one extra col. 
+ */ + if (current->pos < current->chars && get_char(current, current->pos) < ' ') { + n++; } - while (plen+len > c->cols) { - len--; + + while (n >= current->cols) { + b = utf8_tounicode(buf, &ch); + if (ch < ' ') { + n--; + } + n--; + buf += b; + pos--; + chars--; } /* Cursor to left edge, then the prompt */ - fd_printf(c->fd, "\x1b[0G"); - write(c->fd, prompt, strlen(prompt)); + fd_printf(current->fd, "\x1b[0G"); + write(current->fd, prompt, plen); /* Now the current buffer content */ - /* Need special handling for control characters */ - p = 0; - for (i = 0; i < len; i++) { - if (buf[i] >= 0 && buf[i] < ' ') { - write(c->fd, buf + p, i - p); - p = i + 1; - fd_printf(c->fd, "\033[7m^%c\033[0m", buf[i] + '@'); + + /* Need special handling for control characters. + * If we hit 'cols', stop. + */ + b = 0; /* unwritted bytes */ + n = 0; /* How many control chars were written */ + for (i = 0; i < chars; i++) { + int ch; + int w = utf8_tounicode(buf + b, &ch); + if (ch < ' ') { + n++; + } + if (pchars + i + n >= current->cols) { + break; + } + if (ch < ' ') { + /* A control character, so write the buffer so far */ + write(current->fd, buf, b); + buf += b + w; + b = 0; + fd_printf(current->fd, "\033[7m^%c\033[0m", ch + '@'); if (i < pos) { - extra++; + backup++; } } + else { + b += w; + } } - write(c->fd, buf + p, i - p); + write(current->fd, buf, b); /* Erase to right, move cursor to original position */ - fd_printf(c->fd, "\x1b[0K" "\x1b[0G\x1b[%dC", (int)(pos+plen+extra)); + fd_printf(current->fd, "\x1b[0K" "\x1b[0G\x1b[%dC", pos + pchars + backup); } static void set_current(struct current *current, const char *str) @@ -272,24 +350,27 @@ static void set_current(struct current *current, const char *str) strncpy(current->buf, str, current->bufmax); current->buf[current->bufmax - 1] = 0; current->len = strlen(current->buf); - current->pos = current->len; + current->pos = current->chars = utf8_strlen(current->buf, current->len); } -static int has_room(struct current 
*current, int chars) +static int has_room(struct current *current, int bytes) { - return current->len + chars < current->bufmax - 1; + return current->len + bytes < current->bufmax - 1; } static int remove_char(struct current *current, int pos) { - //fprintf(stderr, "Trying to remove char at %d (pos=%d, len=%d)\n", pos, current->pos, current->len); - if (pos >= 0 && pos < current->len) { + if (pos >= 0 && pos < current->chars) { + int p1, p2; + p1 = utf8_index(current->buf, pos); + p2 = p1 + utf8_index(current->buf + p1, 1); + /* Move the null char too */ + memmove(current->buf + p1, current->buf + p2, current->len - p2 + 1); + current->len -= (p2 - p1); + current->chars--; if (current->pos > pos) { current->pos--; } - /* Move the null char too */ - memmove(current->buf + pos, current->buf + pos + 1, current->len - pos); - current->len--; return 1; } return 0; @@ -297,10 +378,17 @@ static int remove_char(struct current *current, int pos) static int insert_char(struct current *current, int pos, int ch) { - if (has_room(current, 1) && pos >= 0 && pos <= current->len) { - memmove(current->buf+pos+1, current->buf + pos, current->len - pos); - current->buf[pos] = ch; - current->len++; + char buf[3]; + int n = utf8_getchars(buf, ch); + + if (has_room(current, n) && pos >= 0 && pos <= current->chars) { + int p1, p2; + p1 = utf8_index(current->buf, pos); + p2 = p1 + n; + memmove(current->buf + p2, current->buf + p1, current->len - p1); + memcpy(current->buf + p1, buf, n); + current->len += n; + current->chars++; if (current->pos >= pos) { current->pos++; } @@ -340,6 +428,7 @@ static int completeLine(const char *prompt, struct current *current) { struct current tmp = *current; tmp.buf = lc.cvec[i]; tmp.pos = tmp.len = strlen(tmp.buf); + tmp.chars = utf8_strlen(tmp.buf, tmp.len); refreshLine(prompt, &tmp); } else { refreshLine(prompt, current); @@ -399,21 +488,128 @@ static int remove_chars(struct current *current, int pos, int n) return removed; } -static int fd_read(int 
fd) +/** + * Reads a char from 'fd', waiting at most 'timeout' milliseconds. + * + * A timeout of -1 means to wait forever. + * + * Returns -1 if no char is received within the time or an error occurs. + */ +static int fd_read_char(int fd, int timeout) { + struct pollfd p; unsigned char c; + + p.fd = fd; + p.events = POLLIN; + + if (poll(&p, 1, timeout) == 0) { + /* timeout */ + return -1; + } if (read(fd, &c, 1) != 1) { return -1; } return c; } -#ifndef ctrl -#define ctrl(C) ((C) - '@') +/** + * Reads a complete utf-8 character + * and returns the unicode value, or -1 on error. + */ +static int fd_read(int fd) +{ +#ifdef USE_UTF8 + char buf[4]; + int n; + int i; + int c; + + if (read(fd, &buf[0], 1) != 1) { + return -1; + } + n = utf8_charlen(buf[0]); + if (n < 1 || n > 3) { + return -1; + } + for (i = 1; i < n; i++) { + if (read(fd, &buf[i], 1) != 1) { + return -1; + } + } + buf[n] = 0; + /* decode and return the character */ + utf8_tounicode(buf, &c); + return c; +#else + return fd_read_char(fd, -1); #endif +} + +/* Use -ve numbers here to co-exist with normal unicode chars */ +enum { + SPECIAL_NONE, + SPECIAL_UP = -20, + SPECIAL_DOWN = -21, + SPECIAL_LEFT = -22, + SPECIAL_RIGHT = -23, + SPECIAL_DELETE = -24, +}; + +/** + * If escape (27) was received, reads subsequent + * chars to determine if this is a known special key. + * + * Returns SPECIAL_NONE if unrecognised, or -1 if EOF. + * + * If no additional char is received within a short time, + * 27 is returned. 
+ */ +static int check_special(int fd) +{ + int c = fd_read_char(fd, 50); + int c2; + + if (c < 0) { + return 27; + } + + c2 = fd_read_char(fd, 50); + if (c2 < 0) { + return c2; + } + if (c == '[' || c == 'O') { + /* Potential arrow key */ + switch (c2) { + case 'A': + return SPECIAL_UP; + case 'B': + return SPECIAL_DOWN; + case 'C': + return SPECIAL_RIGHT; + case 'D': + return SPECIAL_LEFT; + } + } + if (c == '[' && c2 >= '1' && c2 <= '6') { + /* extended escape */ + int c3 = fd_read_char(fd, 50); + if (c2 == '3' && c3 == '~') { + /* delete char under cursor */ + return SPECIAL_DELETE; + } + while (c3 != -1 && c3 != '~') { + /* .e.g \e[12~ or '\e[11;2~ discard the complete sequence */ + c3 = fd_read_char(fd, 50); + } + } + + return SPECIAL_NONE; +} + +#define ctrl(C) ((C) - '@') static int linenoisePrompt(const char *prompt, struct current *current) { - size_t plen = strlen(prompt); int history_index = 0; /* The latest history entry is always our current buffer, that @@ -424,11 +620,7 @@ static int linenoisePrompt(const char *prompt, struct current *current) { refreshLine(prompt, current); while(1) { - int ext; - int c; - int c2; - - c = fd_read(current->fd); + int c = fd_read(current->fd); #ifndef NO_COMPLETION /* Only autocomplete when the callback is set. 
It returns < 0 when @@ -444,7 +636,7 @@ static int linenoisePrompt(const char *prompt, struct current *current) { #endif process_char: - if (c < 0) return current->len; + if (c == -1) return current->len; switch(c) { case '\r': /* enter */ history_len--; @@ -475,12 +667,12 @@ process_char: /* eat any spaces on the left */ { int pos = current->pos; - while (pos > 0 && current->buf[pos - 1] == ' ') { + while (pos > 0 && get_char(current, pos - 1) == ' ') { pos--; } /* now eat any non-spaces on the left */ - while (pos > 0 && current->buf[pos - 1] != ' ') { + while (pos > 0 && get_char(current, pos - 1) != ' ') { pos--; } @@ -494,44 +686,87 @@ process_char: /* Display the reverse-i-search prompt and process chars */ char rbuf[50]; char rprompt[80]; - int i = 0; + int rchars = 0; + int rlen = 0; + int searchpos = history_len - 1; + rbuf[0] = 0; while (1) { + int n = 0; + const char *p = NULL; + int skipsame = 0; + int searchdir = -1; + snprintf(rprompt, sizeof(rprompt), "(reverse-i-search)'%s': ", rbuf); refreshLine(rprompt, current); c = fd_read(current->fd); if (c == ctrl('H') || c == 127) { - if (i > 0) { - rbuf[--i] = 0; + if (rchars) { + int p = utf8_index(rbuf, --rchars); + rbuf[p] = 0; + rlen = strlen(rbuf); } continue; } - if (c >= ' ' && c <= '~') { - if (i < (int)sizeof(rbuf)) { - int j; - const char *p = NULL; - rbuf[i++] = c; - rbuf[i] = 0; - /* Now search back through the history for a match */ - for (j = history_len - 1; j > 0; j--) { - p = strstr(history[j], rbuf); - if (p) { - /* Found a match. 
Copy it */ - set_current(current,history[j]); - current->pos = p - history[j]; - break; - } - } - if (!p) { - /* No match, so don't add it */ - rbuf[--i] = 0; + if (c == 27) { + c = check_special(current->fd); + } + if (c == ctrl('P') || c == SPECIAL_UP) { + /* Search for the previous (earlier) match */ + if (searchpos > 0) { + searchpos--; + } + skipsame = 1; + } + else if (c == ctrl('N') || c == SPECIAL_DOWN) { + /* Search for the next (later) match */ + if (searchpos < history_len) { + searchpos++; + } + searchdir = 1; + skipsame = 1; + } + else if (c >= ' ') { + if (rlen >= (int)sizeof(rbuf) - 3) { /* keep room for a 3-byte char plus the null terminator */ + continue; + } + + n = utf8_getchars(rbuf + rlen, c); + rlen += n; + rchars++; + rbuf[rlen] = 0; + + /* Adding a new char resets the search location */ + searchpos = history_len - 1; + } + else { + /* Exit from incremental search mode */ + break; + } + + /* Now search through the history for a match */ + for (; searchpos >= 0 && searchpos < history_len; searchpos += searchdir) { + p = strstr(history[searchpos], rbuf); + if (p) { + /* Found a match */ + if (skipsame && strcmp(history[searchpos], current->buf) == 0) { + /* But it is identical, so skip it */ + continue; } + /* Copy the matching line and set the cursor position */ + set_current(current,history[searchpos]); + current->pos = utf8_strlen(history[searchpos], p - history[searchpos]); + break; } - continue; } - break; + if (!p && n) { + /* No match, so don't add it */ + rchars--; + rlen -= n; + rbuf[rlen] = 0; + } } - if (c == ctrl('G')) { + if (c == ctrl('G') || c == ctrl('C')) { /* ctrl-g terminates the search with no effect */ set_current(current, ""); c = 0; @@ -546,114 +781,92 @@ process_char: } break; case ctrl('T'): /* ctrl-t */ - if (current->pos > 0 && current->pos < current->len) { - int aux = current->buf[current->pos-1]; - current->buf[current->pos-1] = current->buf[current->pos]; - current->buf[current->pos] = aux; - if (current->pos != current->len-1) current->pos++; + if (current->pos > 0 && 
current->pos < current->chars) { + c = get_char(current, current->pos); + remove_char(current, current->pos); + insert_char(current, current->pos - 1, c); refreshLine(prompt, current); } break; case ctrl('V'): /* ctrl-v */ - if (has_room(current, 1)) { + if (has_room(current, 3)) { /* Insert the ^V first */ if (insert_char(current, current->pos, c)) { refreshLine(prompt, current); /* Now wait for the next char. Can insert anything except \0 */ c = fd_read(current->fd); - if (c > 0) { - /* Replace the ^V with the actual char */ - current->buf[current->pos - 1] = c; - } - else { - remove_char(current, current->pos); + + /* Remove the ^V first */ + remove_char(current, current->pos - 1); + if (c != -1) { + /* Insert the actual char */ + insert_char(current, current->pos, c); } refreshLine(prompt, current); } } break; case ctrl('B'): /* ctrl-b */ - goto left_arrow; case ctrl('F'): /* ctrl-f */ - goto right_arrow; case ctrl('P'): /* ctrl-p */ - c2 = 65; - goto up_down_arrow; case ctrl('N'): /* ctrl-n */ - c2 = 66; - goto up_down_arrow; - break; - case 27: /* escape sequence */ - c = fd_read(current->fd); - if (c <= 0) { - break; - } - c2 = fd_read(current->fd); - if (c <= 0) { - break; + case 27: { /* escape sequence */ + int dir = -1; + if (c == 27) { + c = check_special(current->fd); } - ext = (c == 91 || c == 79); - if (ext && c2 == 68) { -left_arrow: - /* left arrow */ - if (current->pos > 0) { - current->pos--; - refreshLine(prompt, current); - } - } else if (ext && c2 == 67) { -right_arrow: - /* right arrow */ - if (current->pos < current->len) { - current->pos++; - refreshLine(prompt, current); - } - } else if (ext && (c2 == 65 || c2 == 66)) { -up_down_arrow: - /* up and down arrow: history */ - if (history_len > 1) { - /* Update the current history entry before to - * overwrite it with tne next one. 
*/ - free(history[history_len-1-history_index]); - history[history_len-1-history_index] = strdup(current->buf); - /* Show the new entry */ - history_index += (c2 == 65) ? 1 : -1; - if (history_index < 0) { - history_index = 0; - break; - } else if (history_index >= history_len) { - history_index = history_len-1; - break; + switch (c) { + case ctrl('B'): + case SPECIAL_LEFT: + if (current->pos > 0) { + current->pos--; + refreshLine(prompt, current); } - set_current(current, history[history_len-1-history_index]); - refreshLine(prompt, current); - } - } else if (c == 91 && c2 > 48 && c2 < 55) { - /* extended escape */ - c = fd_read(current->fd); - if (c <= 0) { break; - } - fd_read(current->fd); - if (c2 == 51 && c == 126) { - /* delete char under cursor */ + case ctrl('F'): + case SPECIAL_RIGHT: + if (current->pos < current->chars) { + current->pos++; + refreshLine(prompt, current); + } + break; + case ctrl('P'): + case SPECIAL_UP: + dir = 1; + case ctrl('N'): + case SPECIAL_DOWN: + if (history_len > 1) { + /* Update the current history entry before to + * overwrite it with tne next one. */ + free(history[history_len-1-history_index]); + history[history_len-1-history_index] = strdup(current->buf); + /* Show the new entry */ + history_index += dir; + if (history_index < 0) { + history_index = 0; + break; + } else if (history_index >= history_len) { + history_index = history_len-1; + break; + } + set_current(current, history[history_len-1-history_index]); + refreshLine(prompt, current); + } + break; + + case SPECIAL_DELETE: if (remove_char(current, current->pos)) { refreshLine(prompt, current); } - } + break; + } } break; default: - /* Note that the only control character currently permitted is tab */ - if (c == '\t' || c < 0 || c >= ' ') { + /* Only tab is allowed without ^V */ + if (c == '\t' || c >= ' ') { if (insert_char(current, current->pos, c)) { - /* Avoid a full update of the line in the trivial case. 
*/ - if (current->pos == current->len && c >= ' ' && plen + current->len < current->cols) { - char ch = c; - write(current->fd, &ch, 1); - } - else { - refreshLine(prompt, current); - } + refreshLine(prompt, current); } } break; @@ -663,7 +876,7 @@ up_down_arrow: } break; case ctrl('K'): /* Ctrl+k, delete from current to end of line. */ - if (remove_chars(current, current->pos, current->len - current->pos)) { + if (remove_chars(current, current->pos, current->chars - current->pos)) { refreshLine(prompt, current); } break; @@ -672,7 +885,7 @@ up_down_arrow: refreshLine(prompt, current); break; case ctrl('E'): /* ctrl+e, go to the end of the line */ - current->pos = current->len; + current->pos = current->chars; refreshLine(prompt, current); break; case ctrl('L'): /* Ctrl+L, clear screen */ @@ -709,11 +922,13 @@ static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { current.buf = buf; current.bufmax = buflen; current.len = 0; + current.chars = 0; current.pos = 0; - current.cols = getColumns(); + current.cols = 0; count = linenoisePrompt(prompt, &current); disableRawMode(fd); + printf("\n"); } return count; diff --git a/utf8.c b/utf8.c new file mode 100644 index 0000000..26924b4 --- /dev/null +++ b/utf8.c @@ -0,0 +1,115 @@ +/** + * UTF-8 utility functions + * + * (c) 2010 Steve Bennett + * + * See LICENCE for licence details. 
+ */ + +#include +#include +#include +#include +#include "utf8.h" + +#ifdef USE_UTF8 +int utf8_fromunicode(char *p, unsigned short uc) +{ + if (uc <= 0x7f) { + *p = uc; + return 1; + } + else if (uc <= 0x7ff) { + *p++ = 0xc0 | ((uc & 0x7c0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 2; + } + else { + *p++ = 0xe0 | ((uc & 0xf000) >> 12); + *p++ = 0x80 | ((uc & 0xfc0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 3; + } +} + +int utf8_charlen(int c) +{ + if ((c & 0x80) == 0) { + return 1; + } + if ((c & 0xe0) == 0xc0) { + return 2; + } + if ((c & 0xf0) == 0xe0) { + return 3; + } + if ((c & 0xf8) == 0xf0) { + return 4; + } + /* Invalid sequence */ + return -1; +} + +int utf8_strlen(const char *str, int bytelen) +{ + int charlen = 0; + if (bytelen < 0) { + bytelen = strlen(str); + } + while (bytelen > 0) { /* > 0, not != 0: a multi-byte step past the end must still stop the loop */ + int c; + int l = utf8_tounicode(str, &c); + charlen++; + str += l; + bytelen -= l; + } + return charlen; +} + +int utf8_index(const char *str, int index) +{ + const char *s = str; + while (index--) { + int c; + s += utf8_tounicode(s, &c); + } + return s - str; +} + +int utf8_charequal(const char *s1, const char *s2) +{ + int c1, c2; + + utf8_tounicode(s1, &c1); + utf8_tounicode(s2, &c2); + + return c1 == c2; +} + +int utf8_tounicode(const char *str, int *uc) +{ + unsigned const char *s = (unsigned const char *)str; + + if (s[0] < 0xc0) { + *uc = s[0]; + return 1; + } + if (s[0] < 0xe0) { + if ((s[1] & 0xc0) == 0x80) { + *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); + return 2; + } + } + else if (s[0] < 0xf0) { + if (((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) { + *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80); + return 3; + } + } + + /* Invalid sequence (or an unsupported 4-byte lead), so just return the byte */ + *uc = *s; + return 1; +} + +#endif diff --git a/utf8.h b/utf8.h new file mode 100644 index 0000000..564d64e --- /dev/null +++ b/utf8.h @@ -0,0 +1,79 @@ +#ifndef UTF8_UTIL_H +#define UTF8_UTIL_H +/** + * UTF-8 utility functions + * + * (c) 2010 Steve Bennett + 
* + * See LICENCE for licence details. + */ + +#ifndef USE_UTF8 +#include + +/* No utf-8 support. 1 byte = 1 char. Each macro is fully parenthesized so it is safe inside a larger expression */ +#define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B)) +#define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1) +#define utf8_index(C, I) (I) +#define utf8_charlen(C) 1 + +#else +/** + * Converts the given unicode codepoint (0 - 0xffff) to utf-8 + * and stores the result at 'p'. + * + * Returns the number of bytes written (1-3). + */ +int utf8_fromunicode(char *p, unsigned short uc); + +/** + * Returns the length of the utf-8 sequence starting with 'c'. + * + * Returns 1-4, or -1 if this is not a valid start byte. + * + * Note that charlen=4 is not supported by the rest of the API. + */ +int utf8_charlen(int c); + +/** + * Returns the number of characters in the utf-8 + * string of the given byte length. + * + * Any bytes which are not part of a valid utf-8 + * sequence are treated as individual characters. + * + * The string *must* be null terminated. + * + * Does not support unicode code points > \uffff + */ +int utf8_strlen(const char *str, int bytelen); + +/** + * Returns the byte index of the given character in the utf-8 string. + * + * The string *must* be null terminated. + * + * This will return the byte length of a utf-8 string + * if given the char length. + */ +int utf8_index(const char *str, int charindex); + +/** + * Returns the unicode codepoint corresponding to the + * utf-8 sequence 'str'. + * + * Stores the result in *uc and returns the number of bytes + * consumed. + * + * If 'str' is null terminated, then an invalid utf-8 sequence + * at the end of the string will be returned as individual bytes. + * + * If it is not null terminated, the length *must* be checked first. + * + * Does not support unicode code points > \uffff + */ +int utf8_tounicode(const char *str, int *uc); + +#endif + +#endif