Fix CRLF handling in external scanner

- Consume \r as part of token instead of skipping it
- Break after consuming \n to avoid processing multiple lines
- Consume leading whitespace separately for indent calculation
- Fix ASCII_CONTENT to return false at EOF without asciiend

This fixes ERROR tokens with CRLF line endings, especially with trailing blank lines.
2025-11-27 01:25:06 +01:00
parent 0b78c43138
commit eaf0963459
5 changed files with 12315 additions and 12389 deletions
--- a/src/grammar.json
+++ b/src/grammar.json
@@ -1,5 +1,4 @@
 {
-  "$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/grammar.schema.json",
  "name": "stonescript",
  "word": "identifier",
  "rules": {
@@ -1503,6 +1502,6 @@
    }
  ],
  "inline": [],
-  "supertypes": [],
-  "reserved": {}
-}
+  "supertypes": []
+}
+
--- a/src/node-types.json
+++ b/src/node-types.json
@@ -1543,7 +1543,6 @@
  {
    "type": "source_file",
    "named": true,
-    "root": true,
    "fields": {},
    "children": {
      "multiple": true,
@@ -2054,8 +2053,7 @@
  },
  {
    "type": "block_comment",
-    "named": true,
-    "extra": true
+    "named": true
  },
  {
    "type": "break_statement",
@@ -2067,8 +2065,7 @@
  },
  {
    "type": "comment",
-    "named": true,
-    "extra": true
+    "named": true
  },
  {
    "type": "continue_statement",
--- a/src/parser.c
+++ b/src/parser.c
--- a/src/scanner.c
+++ b/src/scanner.c
@@ -110,11 +110,8 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
      }
      
      // Check if we're at the start of a line with 'asciiend'
-      if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
+      if (lexer->lookahead == '\n') {
        lexer->advance(lexer, false);
-        if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
-          lexer->advance(lexer, false);
-        }
        lexer->mark_end(lexer);
        has_content = true;
        
@@ -156,8 +153,8 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
      }
    }
    
-    lexer->result_symbol = ASCII_CONTENT;
-    return has_content;
+    // If we reached EOF without finding asciiend, this is not valid ASCII content
+    return false;
  }

  if (scanner->queued_tokens_size > 0) {
@@ -180,10 +177,17 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
      found_end_of_line = true;
      indent_length = 0;
      lexer->advance(lexer, false);
+      // After consuming \n, only consume whitespace on the SAME logical line
+      // Don't continue to next line
+      break;
+    } else if (lexer->lookahead == '\r') {
+      // Consume \r as part of line ending (for CRLF), don't skip it
+      lexer->advance(lexer, false);
+      // Continue to potentially consume \n that follows \r
    } else if (lexer->lookahead == ' ') {
      indent_length++;
      lexer->advance(lexer, false);
-    } else if (lexer->lookahead == '\r' || lexer->lookahead == '\f') {
+    } else if (lexer->lookahead == '\f') {
      indent_length = 0;
      lexer->advance(lexer, false);
    } else if (lexer->lookahead == '\t') {
@@ -196,6 +200,18 @@ bool tree_sitter_stonescript_external_scanner_scan(void *payload, TSLexer *lexer
      break;
    }
  }
+  
+  // After breaking from newline, consume leading whitespace/indentation
+  if (found_end_of_line && !lexer->eof(lexer)) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
+      if (lexer->lookahead == ' ') {
+        indent_length++;
+      } else {
+        indent_length += 8;
+      }
+      lexer->advance(lexer, false);
+    }
+  }



--- a/src/tree_sitter/parser.h
+++ b/src/tree_sitter/parser.h
@@ -13,17 +13,12 @@ extern "C" {
 #define ts_builtin_sym_end 0
 #define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024

-#ifndef TREE_SITTER_API_H_
 typedef uint16_t TSStateId;
+
+#ifndef TREE_SITTER_API_H_
 typedef uint16_t TSSymbol;
 typedef uint16_t TSFieldId;
 typedef struct TSLanguage TSLanguage;
-typedef struct TSLanguageMetadata TSLanguageMetadata;
-typedef struct TSLanguageMetadata {
-  uint8_t major_version;
-  uint8_t minor_version;
-  uint8_t patch_version;
-} TSLanguageMetadata;
 #endif

 typedef struct {
@@ -32,11 +27,10 @@ typedef struct {
  bool inherited;
 } TSFieldMapEntry;

-// Used to index the field and supertype maps.
 typedef struct {
  uint16_t index;
  uint16_t length;
-} TSMapSlice;
+} TSFieldMapSlice;

 typedef struct {
  bool visible;
@@ -54,7 +48,6 @@ struct TSLexer {
  uint32_t (*get_column)(TSLexer *);
  bool (*is_at_included_range_start)(const TSLexer *);
  bool (*eof)(const TSLexer *);
-  void (*log)(const TSLexer *, const char *, ...);
 };

 typedef enum {
@@ -86,12 +79,6 @@ typedef struct {
  uint16_t external_lex_state;
 } TSLexMode;

-typedef struct {
-  uint16_t lex_state;
-  uint16_t external_lex_state;
-  uint16_t reserved_word_set_id;
-} TSLexerMode;
-
 typedef union {
  TSParseAction action;
  struct {
@@ -100,13 +87,8 @@ typedef union {
  } entry;
 } TSParseActionEntry;

-typedef struct {
-  int32_t start;
-  int32_t end;
-} TSCharacterRange;
-
 struct TSLanguage {
-  uint32_t abi_version;
+  uint32_t version;
  uint32_t symbol_count;
  uint32_t alias_count;
  uint32_t token_count;
@@ -122,13 +104,13 @@ struct TSLanguage {
  const TSParseActionEntry *parse_actions;
  const char * const *symbol_names;
  const char * const *field_names;
-  const TSMapSlice *field_map_slices;
+  const TSFieldMapSlice *field_map_slices;
  const TSFieldMapEntry *field_map_entries;
  const TSSymbolMetadata *symbol_metadata;
  const TSSymbol *public_symbol_map;
  const uint16_t *alias_map;
  const TSSymbol *alias_sequences;
-  const TSLexerMode *lex_modes;
+  const TSLexMode *lex_modes;
  bool (*lex_fn)(TSLexer *, TSStateId);
  bool (*keyword_lex_fn)(TSLexer *, TSStateId);
  TSSymbol keyword_capture_token;
@@ -142,48 +124,15 @@ struct TSLanguage {
    void (*deserialize)(void *, const char *, unsigned);
  } external_scanner;
  const TSStateId *primary_state_ids;
-  const char *name;
-  const TSSymbol *reserved_words;
-  uint16_t max_reserved_word_set_size;
-  uint32_t supertype_count;
-  const TSSymbol *supertype_symbols;
-  const TSMapSlice *supertype_map_slices;
-  const TSSymbol *supertype_map_entries;
-  TSLanguageMetadata metadata;
 };

-static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
-  uint32_t index = 0;
-  uint32_t size = len - index;
-  while (size > 1) {
-    uint32_t half_size = size / 2;
-    uint32_t mid_index = index + half_size;
-    const TSCharacterRange *range = &ranges[mid_index];
-    if (lookahead >= range->start && lookahead <= range->end) {
-      return true;
-    } else if (lookahead > range->end) {
-      index = mid_index;
-    }
-    size -= half_size;
-  }
-  const TSCharacterRange *range = &ranges[index];
-  return (lookahead >= range->start && lookahead <= range->end);
-}
-
 /*
 *  Lexer Macros
 */

-#ifdef _MSC_VER
-#define UNUSED __pragma(warning(suppress : 4101))
-#else
-#define UNUSED __attribute__((unused))
-#endif
-
 #define START_LEXER()           \
  bool result = false;          \
  bool skip = false;            \
-  UNUSED                        \
  bool eof = false;             \
  int32_t lookahead;            \
  goto start;                   \
@@ -199,17 +148,6 @@ static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, in
    goto next_state;         \
  }

-#define ADVANCE_MAP(...)                                              \
-  {                                                                   \
-    static const uint16_t map[] = { __VA_ARGS__ };                    \
-    for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) {  \
-      if (map[i] == lookahead) {                                      \
-        state = map[i + 1];                                           \
-        goto next_state;                                              \
-      }                                                               \
-    }                                                                 \
-  }
-
 #define SKIP(state_value) \
  {                       \
    skip = true;          \
@@ -228,7 +166,7 @@ static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, in
 *  Parse Table Macros
 */

-#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
+#define SMALL_STATE(id) id - LARGE_STATE_COUNT

 #define STATE(id) id

@@ -238,7 +176,7 @@ static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, in
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
-      .state = (state_value)          \
+      .state = state_value            \
    }                                 \
  }}

@@ -246,7 +184,7 @@ static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, in
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
-      .state = (state_value),         \
+      .state = state_value,           \
      .repetition = true              \
    }                                 \
  }}
@@ -259,15 +197,14 @@ static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, in
    }                                 \
  }}

-#define REDUCE(symbol_name, children, precedence, prod_id) \
-  {{                                                       \
-    .reduce = {                                            \
-      .type = TSParseActionTypeReduce,                     \
-      .symbol = symbol_name,                               \
-      .child_count = children,                             \
-      .dynamic_precedence = precedence,                    \
-      .production_id = prod_id                             \
-    },                                                     \
+#define REDUCE(symbol_val, child_count_val, ...) \
+  {{                                             \
+    .reduce = {                                  \
+      .type = TSParseActionTypeReduce,           \
+      .symbol = symbol_val,                      \
+      .child_count = child_count_val,            \
+      __VA_ARGS__                                \
+    },                                           \
  }}

 #define RECOVER()                    \