(#299) Simplify parser with tokenizer

author rexim <reximkut@gmail.com>

Sun, 26 Aug 2018 21:36:40 +0000 (04:36 +0700)

committer rexim <reximkut@gmail.com>

Sun, 26 Aug 2018 21:36:40 +0000 (04:36 +0700)
author rexim <reximkut@gmail.com>
Sun, 26 Aug 2018 21:36:40 +0000 (04:36 +0700)
committer rexim <reximkut@gmail.com>
Sun, 26 Aug 2018 21:36:40 +0000 (04:36 +0700)
diff --git a/src/script/parser.c b/src/script/parser.c

index d562f10ffc6711edd911f576ba6e064b3e4db2d8..17da8ed2d0d52bd1a903db0e6f3ef4f3e5f6572a 100644 (file)
--- a/src/script/parser.c
+++ b/src/script/parser.c
@@ -4,160 +4,140 @@
  
  #include "script/parser.h"
  
-static bool is_symbol_char(char x)
+static struct ParseResult parse_cdr(struct Token current_token)
  {
-    static const char forbidden_symbol_chars[] = {
-        '(', ')', '"', '\'', ';'
-    };
-    static const size_t n = sizeof(forbidden_symbol_chars) / sizeof(char);
-
-    for (size_t i = 0; i < n; ++i) {
-        if (x == forbidden_symbol_chars[i] || isspace(x)) {
-            return false;
-        }
+    if (*current_token.begin != '.') {
+        return parse_failure("Expected .", current_token.begin);
      }
  
-    return true;
-}
+    struct ParseResult cdr = parse_expr(next_token(current_token.end));
+    if (cdr.is_error) {
+        return cdr;
+    }
  
-static void skip_whitespaces(const char *str, size_t *cursor, size_t n)
-{
-    assert(str);
-    assert(cursor);
+    current_token = next_token(cdr.end);
  
-    while (*cursor < n && isspace(str[*cursor])) {
-        (*cursor)++;
+    if (*current_token.begin != ')') {
+        destroy_expr(cdr.expr);
+        return parse_failure("Expected )", current_token.begin);
      }
+
+    return parse_success(cdr.expr, current_token.end);
  }
  
-struct ParseResult parse_expr(const char *str,
-                                        size_t *cursor,
-                                        size_t n)
+static struct ParseResult parse_cons(struct Token current_token)
  {
-    assert(str);
-    assert(cursor);
-
-    /* TODO: parse_expr doesn't parse lists */
-
-    skip_whitespaces(str, cursor, n);
-    if (*cursor >= n) {
-        return parse_failure("EOF", *cursor);
+    if (*current_token.begin != '(') {
+        return parse_failure("Expected (", current_token.begin);
      }
  
-    switch (str[*cursor]) {
-    case '(': {
-        (*cursor)++;
-        struct ParseResult car = parse_expr(str, cursor, n);
-        if (car.is_error) {
-            return car;
-        }
-
-        skip_whitespaces(str, cursor, n);
-        if (*cursor >= n) {
-            return parse_failure("EOF", *cursor);
-        }
-
-        if (str[*cursor] != '.') {
-            return parse_failure("Expected .", *cursor);
-        }
-        (*cursor)++;
-
-        skip_whitespaces(str, cursor, n);
-        if (*cursor >= n) {
-            return parse_failure("EOF", *cursor);
-        }
+    current_token = next_token(current_token.end);
  
-        struct ParseResult cdr = parse_expr(str, cursor, n);
-        if (cdr.is_error) {
-            return cdr;
-        }
+    if (*current_token.begin == ')') {
+        return parse_success(atom_as_expr(create_symbol_atom("nil", NULL)), current_token.end);
+    }
  
-        skip_whitespaces(str, cursor, n);
-        if (*cursor >= n) {
-            return parse_failure("EOF", *cursor);
-        }
+    struct ParseResult car = parse_expr(current_token);
+    if (car.is_error) {
+        return car;
+    }
  
-        if (str[*cursor] != ')') {
-            return parse_failure("Expected )", *cursor);
-        }
+    struct ParseResult cdr = parse_cdr(next_token(car.end));
+    if (cdr.is_error) {
+        destroy_expr(car.expr);
+        return cdr;
+    }
  
-        (*cursor)++;
+    return parse_success(cons_as_expr(create_cons(car.expr, cdr.expr)), cdr.end);
+}
  
-        return parse_success(cons_as_expr(create_cons(car.expr, cdr.expr)));
+static struct ParseResult parse_string(struct Token current_token)
+{
+    if (*current_token.begin != '"') {
+        return parse_failure("Expected \"", current_token.begin);
      }
  
-    case '"': {
-        /* TODO(#292): parser does not support escaped string characters */
-        const size_t str_begin = *cursor + 1;
-        size_t str_end = str_begin;
+    if (*(current_token.end - 1) != '"') {
+        return parse_failure("Unclosed string", current_token.begin);
+    }
  
-        while(str_end < n && str[str_end] != '"') {
-            str_end++;
-        }
+    if (current_token.begin + 1 == current_token.end) {
+        return parse_success(atom_as_expr(create_string_atom("", NULL)),
+                             current_token.end);
+    }
  
-        if (str_end >= n) {
-            return parse_failure("Unclosed string", str_begin);
-        }
+    return parse_success(
+        atom_as_expr(
+            create_string_atom(current_token.begin + 1, current_token.end - 1)),
+        current_token.end);
+}
  
-        *cursor = str_end + 1;
+static struct ParseResult parse_number(struct Token current_token)
+{
+    char *endptr = 0;
+    const float x = strtof(current_token.begin, &endptr);
  
-        return parse_success(
-            atom_as_expr(
-                create_string_atom(str + str_begin, str + str_end)));
+    if (current_token.begin == endptr || current_token.end != endptr) {
+        return parse_failure("Expected number", current_token.begin);
      }
  
-    default: {
-        if (isdigit(str[*cursor])) {
-            const char *nptr = str + *cursor;
-            char *endptr = 0;
-            const float x = strtof(nptr, &endptr);
-
-            if (nptr == endptr) {
-                return parse_failure("Number expected", *cursor);
-            }
+    return parse_success(
+        atom_as_expr(create_number_atom(x)),
+        current_token.end);
+}
  
-            *cursor += (size_t) (endptr - nptr);
+static struct ParseResult parse_symbol(struct Token current_token)
+{
+    if (*current_token.begin == 0) {
+        return parse_failure("EOF", current_token.begin);
+    }
  
-            return parse_success(atom_as_expr(create_number_atom(x)));
-        } else if (is_symbol_char(str[*cursor])) {
-            const size_t sym_begin = *cursor;
-            size_t sym_end = sym_begin;
+    return parse_success(
+        atom_as_expr(create_symbol_atom(current_token.begin, current_token.end)),
+        current_token.end);
+}
  
-            while (sym_end < n && is_symbol_char(str[sym_end])) {
-                sym_end++;
-            }
+struct ParseResult parse_expr(struct Token current_token)
+{
+    if (*current_token.begin == 0) {
+        return parse_failure("EOF", current_token.begin);
+    }
  
-            *cursor = sym_end;
+    /* TODO: parse_expr doesn't parse lists */
  
-            return parse_success(
-                atom_as_expr(
-                    create_symbol_atom(str + sym_begin, str + sym_end)));
-        }
+    switch (*current_token.begin) {
+    case '(': return parse_cons(current_token);
+    /* TODO(#292): parser does not support escaped string characters */
+    case '"': return parse_string(current_token);
+    default: {}
      }
+
+    if (isdigit(*current_token.begin)) {
+        return parse_number(current_token);
      }
  
-    return parse_failure("Unexpected sequence of characters", *cursor);
+    return parse_symbol(current_token);
  }
  
-struct ParseResult parse_success(struct Expr expr)
+struct ParseResult parse_success(struct Expr expr,
+                                 const char *end)
  {
      struct ParseResult result = {
          .is_error = false,
-        .expr = expr
+        .expr = expr,
+        .end = end
      };
  
      return result;
  }
  
  struct ParseResult parse_failure(const char *error_message,
-                                 size_t error_cursor)
+                                 const char *end)
  {
      struct ParseResult result = {
          .is_error = true,
-        .error = {
-            .error_message = error_message,
-            .error_cursor = error_cursor
-        }
+        .error_message = error_message,
+        .end = end
      };
  
      return result;
@@ -165,15 +145,18 @@ struct ParseResult parse_failure(const char *error_message,
  
  void print_parse_error(FILE *stream,
                         const char *str,
-                       struct ParseError error)
+                       struct ParseResult result)
  {
      /* TODO(#293): print_parse_error doesn't support colors */
      /* TODO(#294): print_parse_error doesn't support multiple lines */
+    if (!result.is_error) {
+        return;
+    }
  
      fprintf(stream, "%s\n", str);
-    for (size_t i = 0; i < error.error_cursor; ++i) {
+    for (size_t i = 0; i < (size_t) (result.end - str); ++i) {
          fprintf(stream, " ");
      }
      fprintf(stream, "^\n");
-    fprintf(stream, "%s\n", error.error_message);
+    fprintf(stream, "%s\n", result.error_message);
  }
diff --git a/src/script/parser.h b/src/script/parser.h

index 95158f581da75ecbc927063e809841aaf0702f8d..d7a8ace1788fc67ddab7e5c4b89dacf4d9c1589a 100644 (file)
--- a/src/script/parser.h
+++ b/src/script/parser.h
@@ -4,32 +4,33 @@
  #include <stdio.h>
  #include <stdbool.h>
  #include "script/expr.h"
+#include "script/tokenizer.h"
  
  struct ParseError
  {
-    const char *error_message;
+
      size_t error_cursor;
  };
  
  struct ParseResult
  {
      bool is_error;
+    const char *end;
      union {
          struct Expr expr;
-        struct ParseError error;
+        const char *error_message;
      };
  };
  
-struct ParseResult parse_success(struct Expr expr);
+struct ParseResult parse_success(struct Expr expr,
+                                 const char *end);
  struct ParseResult parse_failure(const char *error,
-                                 size_t error_cursor);
+                                 const char *end);
  
-struct ParseResult parse_expr(const char *str,
-                              size_t *cursor,
-                              size_t n);
+struct ParseResult parse_expr(struct Token token);
  
  void print_parse_error(FILE *stream,
                         const char *str,
-                       struct ParseError result);
+                       struct ParseResult result);
  
  #endif  // PARSER_H_
diff --git a/src/script/tokenizer.c b/src/script/tokenizer.c

index 641699b2630b76db9c7a7db0b3e38a725291df75..02deff58a6d4e649154634a0675bc789c1a3c094 100644 (file)
--- a/src/script/tokenizer.c
+++ b/src/script/tokenizer.c
@@ -8,7 +8,7 @@
  static bool is_symbol_char(char x)
  {
      static const char forbidden_symbol_chars[] = {
-        '(', ')', '"', '\'', ';'
+        '(', ')', '"', '\'', ';', '.'
      };
      static const size_t n = sizeof(forbidden_symbol_chars) / sizeof(char);
  
@@ -67,13 +67,11 @@ static const char *next_non_symbol(const char *str)
  
  struct Token next_token(const char *str)
  {
-    if (!str) {
-        return token(NULL, NULL);
-    }
+    assert(str);
  
      str = skip_whitespace(str);
      if (*str == 0) {
-        return token(NULL, NULL);
+        return token(str, str);
      }
  
      while (*str != 0 && *str == ';') {
@@ -84,6 +82,7 @@ struct Token next_token(const char *str)
      switch (*str) {
      case '(':
      case ')':
+    case '.':
          return token(str, str + 1);
  
      case '"': {
diff --git a/src/script_test.c b/src/script_test.c

index 57931051a2416fe1d981d7f77676514204c1be31..820217c59dec4a64d415f2d21a21a4cc10d89945 100644 (file)
--- a/src/script_test.c
+++ b/src/script_test.c
@@ -2,20 +2,19 @@
  #include <string.h>
  
  #include "script/parser.h"
+#include "script/tokenizer.h"
  
  int main(int argc, char *argv[])
  {
      (void) argc;
      (void) argv;
  
-    const char *code = "(1 . (\"2\" . (hello-world . nil)))";
-    size_t cursor = 0;
-    const size_t n = strlen(code);
+    const char *code = "(1 . (\"2\" . (hello-world . (\"\" .nil))))";
  
-    struct ParseResult result = parse_expr(code, &cursor, n);
+    struct ParseResult result = parse_expr(next_token(code));
  
      if (result.is_error) {
-        print_parse_error(stderr, code, result.error);
+        print_parse_error(stderr, code, result);
      } else {
          print_expr_as_sexpr(result.expr);
      }
author	rexim <reximkut@gmail.com>
	Sun, 26 Aug 2018 21:36:40 +0000 (04:36 +0700)
committer	rexim <reximkut@gmail.com>
	Sun, 26 Aug 2018 21:36:40 +0000 (04:36 +0700)
src/script/parser.c		patch \| blob \| history
src/script/parser.h		patch \| blob \| history
src/script/tokenizer.c		patch \| blob \| history
src/script_test.c		patch \| blob \| history