From fd9f520d98ad3e48034aa1942e19e9a4a8e0d5c4 Mon Sep 17 00:00:00 2001 From: rexim Date: Mon, 27 Aug 2018 04:36:40 +0700 Subject: [PATCH] (#299) Simplify parser with tokenizer --- src/script/parser.c | 205 +++++++++++++++++++---------------------- src/script/parser.h | 17 ++-- src/script/tokenizer.c | 9 +- src/script_test.c | 9 +- 4 files changed, 111 insertions(+), 129 deletions(-) diff --git a/src/script/parser.c b/src/script/parser.c index d562f10f..17da8ed2 100644 --- a/src/script/parser.c +++ b/src/script/parser.c @@ -4,160 +4,140 @@ #include "script/parser.h" -static bool is_symbol_char(char x) +static struct ParseResult parse_cdr(struct Token current_token) { - static const char forbidden_symbol_chars[] = { - '(', ')', '"', '\'', ';' - }; - static const size_t n = sizeof(forbidden_symbol_chars) / sizeof(char); - - for (size_t i = 0; i < n; ++i) { - if (x == forbidden_symbol_chars[i] || isspace(x)) { - return false; - } + if (*current_token.begin != '.') { + return parse_failure("Expected .", current_token.begin); } - return true; -} + struct ParseResult cdr = parse_expr(next_token(current_token.end)); + if (cdr.is_error) { + return cdr; + } -static void skip_whitespaces(const char *str, size_t *cursor, size_t n) -{ - assert(str); - assert(cursor); + current_token = next_token(cdr.end); - while (*cursor < n && isspace(str[*cursor])) { - (*cursor)++; + if (*current_token.begin != ')') { + destroy_expr(cdr.expr); + return parse_failure("Expected )", current_token.begin); } + + return parse_success(cdr.expr, current_token.end); } -struct ParseResult parse_expr(const char *str, - size_t *cursor, - size_t n) +static struct ParseResult parse_cons(struct Token current_token) { - assert(str); - assert(cursor); - - /* TODO: parse_expr doesn't parse lists */ - - skip_whitespaces(str, cursor, n); - if (*cursor >= n) { - return parse_failure("EOF", *cursor); + if (*current_token.begin != '(') { + return parse_failure("Expected (", current_token.begin); } - switch (str[*cursor]) { - case '(': { - (*cursor)++; - struct ParseResult car = parse_expr(str, cursor, n); - if (car.is_error) { - return car; - } - - skip_whitespaces(str, cursor, n); - if (*cursor >= n) { - return parse_failure("EOF", *cursor); - } - - if (str[*cursor] != '.') { - return parse_failure("Expected .", *cursor); - } - (*cursor)++; - - skip_whitespaces(str, cursor, n); - if (*cursor >= n) { - return parse_failure("EOF", *cursor); - } + current_token = next_token(current_token.end); - struct ParseResult cdr = parse_expr(str, cursor, n); - if (cdr.is_error) { - return cdr; - } + if (*current_token.begin == ')') { + return parse_success(atom_as_expr(create_symbol_atom("nil", NULL)), current_token.end); + } - skip_whitespaces(str, cursor, n); - if (*cursor >= n) { - return parse_failure("EOF", *cursor); - } + struct ParseResult car = parse_expr(current_token); + if (car.is_error) { + return car; + } - if (str[*cursor] != ')') { - return parse_failure("Expected )", *cursor); - } + struct ParseResult cdr = parse_cdr(next_token(car.end)); + if (cdr.is_error) { + destroy_expr(car.expr); + return cdr; + } - (*cursor)++; + return parse_success(cons_as_expr(create_cons(car.expr, cdr.expr)), cdr.end); +} - return parse_success(cons_as_expr(create_cons(car.expr, cdr.expr))); +static struct ParseResult parse_string(struct Token current_token) +{ + if (*current_token.begin != '"') { + return parse_failure("Expected \"", current_token.begin); } - case '"': { - /* TODO(#292): parser does not support escaped string characters */ - const size_t str_begin = *cursor + 1; - size_t str_end = str_begin; + if (*(current_token.end - 1) != '"') { + return parse_failure("Unclosed string", current_token.begin); + } - while(str_end < n && str[str_end] != '"') { - str_end++; - } + if (current_token.begin + 1 == current_token.end) { + return parse_success(atom_as_expr(create_string_atom("", NULL)), + current_token.end); + } - if (str_end >= n) { - return parse_failure("Unclosed string", str_begin); - } + return parse_success( + atom_as_expr( + create_string_atom(current_token.begin + 1, current_token.end - 1)), + current_token.end); +} - *cursor = str_end + 1; +static struct ParseResult parse_number(struct Token current_token) +{ + char *endptr = 0; + const float x = strtof(current_token.begin, &endptr); - return parse_success( - atom_as_expr( - create_string_atom(str + str_begin, str + str_end))); + if (current_token.begin == endptr || current_token.end != endptr) { + return parse_failure("Expected number", current_token.begin); } - default: { - if (isdigit(str[*cursor])) { - const char *nptr = str + *cursor; - char *endptr = 0; - const float x = strtof(nptr, &endptr); - - if (nptr == endptr) { - return parse_failure("Number expected", *cursor); - } + return parse_success( + atom_as_expr(create_number_atom(x)), + current_token.end); +} - *cursor += (size_t) (endptr - nptr); +static struct ParseResult parse_symbol(struct Token current_token) +{ + if (*current_token.begin == 0) { + return parse_failure("EOF", current_token.begin); + } - return parse_success(atom_as_expr(create_number_atom(x))); - } else if (is_symbol_char(str[*cursor])) { - const size_t sym_begin = *cursor; - size_t sym_end = sym_begin; + return parse_success( + atom_as_expr(create_symbol_atom(current_token.begin, current_token.end)), + current_token.end); +} - while (sym_end < n && is_symbol_char(str[sym_end])) { - sym_end++; - } +struct ParseResult parse_expr(struct Token current_token) +{ + if (*current_token.begin == 0) { + return parse_failure("EOF", current_token.begin); + } - *cursor = sym_end; + /* TODO: parse_expr doesn't parse lists */ - return parse_success( - atom_as_expr( - create_symbol_atom(str + sym_begin, str + sym_end))); - } + switch (*current_token.begin) { + case '(': return parse_cons(current_token); + /* TODO(#292): parser does not support escaped string characters */ + case '"': return parse_string(current_token); + default: {} } + + if (isdigit(*current_token.begin)) { + return parse_number(current_token); } - return parse_failure("Unexpected sequence of characters", *cursor); + return parse_symbol(current_token); } -struct ParseResult parse_success(struct Expr expr) +struct ParseResult parse_success(struct Expr expr, + const char *end) { struct ParseResult result = { .is_error = false, - .expr = expr + .expr = expr, + .end = end }; return result; } struct ParseResult parse_failure(const char *error_message, - size_t error_cursor) + const char *end) { struct ParseResult result = { .is_error = true, - .error = { - .error_message = error_message, - .error_cursor = error_cursor - } + .error_message = error_message, + .end = end }; return result; @@ -165,15 +145,18 @@ struct ParseResult parse_failure(const char *error_message, void print_parse_error(FILE *stream, const char *str, - struct ParseError error) + struct ParseResult result) { /* TODO(#293): print_parse_error doesn't support colors */ /* TODO(#294): print_parse_error doesn't support multiple lines */ + if (!result.is_error) { + return; + } fprintf(stream, "%s\n", str); - for (size_t i = 0; i < error.error_cursor; ++i) { + for (size_t i = 0; i < (size_t) (result.end - str); ++i) { fprintf(stream, " "); } fprintf(stream, "^\n"); - fprintf(stream, "%s\n", error.error_message); + fprintf(stream, "%s\n", result.error_message); } diff --git a/src/script/parser.h b/src/script/parser.h index 95158f58..d7a8ace1 100644 --- a/src/script/parser.h +++ b/src/script/parser.h @@ -4,32 +4,33 @@ #include #include #include "script/expr.h" +#include "script/tokenizer.h" struct ParseError { - const char *error_message; + size_t error_cursor; }; struct ParseResult { bool is_error; + const char *end; union { struct Expr expr; - struct ParseError error; + const char *error_message; }; }; -struct ParseResult parse_success(struct Expr expr); +struct ParseResult parse_success(struct Expr expr, + const char *end); struct ParseResult parse_failure(const char *error, - size_t error_cursor); + const char *end); -struct ParseResult parse_expr(const char *str, - size_t *cursor, - size_t n); +struct ParseResult parse_expr(struct Token token); void print_parse_error(FILE *stream, const char *str, - struct ParseError result); + struct ParseResult result); #endif // PARSER_H_ diff --git a/src/script/tokenizer.c b/src/script/tokenizer.c index 641699b2..02deff58 100644 --- a/src/script/tokenizer.c +++ b/src/script/tokenizer.c @@ -8,7 +8,7 @@ static bool is_symbol_char(char x) { static const char forbidden_symbol_chars[] = { - '(', ')', '"', '\'', ';' + '(', ')', '"', '\'', ';', '.' }; static const size_t n = sizeof(forbidden_symbol_chars) / sizeof(char); @@ -67,13 +67,11 @@ static const char *next_non_symbol(const char *str) struct Token next_token(const char *str) { - if (!str) { - return token(NULL, NULL); - } + assert(str); str = skip_whitespace(str); if (*str == 0) { - return token(NULL, NULL); + return token(str, str); } while (*str != 0 && *str == ';') { @@ -84,6 +82,7 @@ struct Token next_token(const char *str) switch (*str) { case '(': case ')': + case '.': return token(str, str + 1); case '"': { diff --git a/src/script_test.c b/src/script_test.c index 57931051..820217c5 100644 --- a/src/script_test.c +++ b/src/script_test.c @@ -2,20 +2,19 @@ #include #include "script/parser.h" +#include "script/tokenizer.h" int main(int argc, char *argv[]) { (void) argc; (void) argv; - const char *code = "(1 . (\"2\" . (hello-world . nil)))"; - size_t cursor = 0; - const size_t n = strlen(code); + const char *code = "(1 . (\"2\" . (hello-world . (\"\" .nil))))"; - struct ParseResult result = parse_expr(code, &cursor, n); + struct ParseResult result = parse_expr(next_token(code)); if (result.is_error) { - print_parse_error(stderr, code, result.error); + print_parse_error(stderr, code, result); } else { print_expr_as_sexpr(result.expr); } -- 2.44.0