Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a crash in the external scanner, convert scanner to use array.h header #91

Merged
merged 3 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 63 additions & 147 deletions src/scanner.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "tag.h"

#include <wctype.h>
#include "tree_sitter/array.h"
#include "tag.h"

enum TokenType {
START_TAG_NAME,
Expand All @@ -18,115 +18,27 @@ enum TokenType {
};

typedef struct {
uint32_t len;
uint32_t cap;
Tag *data;
} tags_vec;

typedef struct {
tags_vec tags;
Array(Tag) tags;
} Scanner;

#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define VEC_RESIZE(vec, _cap) \
if ((_cap) > (vec).cap && (_cap) > 0) { \
void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \
assert(tmp != NULL); \
(vec).data = tmp; \
(vec).cap = (_cap); \
}

#define VEC_GROW(vec, _cap) \
if ((vec).cap < (_cap)) { \
VEC_RESIZE((vec), (_cap)); \
}

#define VEC_PUSH(vec, el) \
if ((vec).cap == (vec).len) { \
VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \
} \
(vec).data[(vec).len++] = (el);

#define VEC_POP(vec) \
{ \
if (VEC_BACK(vec).type == CUSTOM) { \
tag_free(&VEC_BACK(vec)); \
} \
(vec).len--; \
}

#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

#define VEC_FREE(vec) \
{ \
if ((vec).data != NULL) \
free((vec).data); \
(vec).data = NULL; \
}

#define VEC_CLEAR(vec) \
{ \
for (int i = 0; i < (vec).len; i++) { \
tag_free(&(vec).data[i]); \
} \
(vec).len = 0; \
}

#define STRING_RESIZE(vec, _cap) \
void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0])); \
assert(tmp != NULL); \
(vec).data = tmp; \
memset((vec).data + (vec).len, 0, (((_cap) + 1) - (vec).len) * sizeof((vec).data[0])); \
(vec).cap = (_cap);

#define STRING_GROW(vec, _cap) \
if ((vec).cap < (_cap)) { \
STRING_RESIZE((vec), (_cap)); \
}

#define STRING_PUSH(vec, el) \
if ((vec).cap == (vec).len) { \
STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \
} \
(vec).data[(vec).len++] = (el);

#define STRING_INIT(vec) \
{ \
(vec).data = calloc(1, sizeof(char) * 17); \
(vec).len = 0; \
(vec).cap = 16; \
}

#define STRING_FREE(vec) \
{ \
if ((vec).data != NULL) \
free((vec).data); \
(vec).data = NULL; \
}

#define STRING_CLEAR(vec) \
{ \
(vec).len = 0; \
memset((vec).data, 0, (vec).cap * sizeof(char)); \
}

static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }

static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }

static unsigned serialize(Scanner *scanner, char *buffer) {
uint16_t tag_count = scanner->tags.len > UINT16_MAX ? UINT16_MAX : scanner->tags.len;
uint16_t tag_count = scanner->tags.size > UINT16_MAX ? UINT16_MAX : scanner->tags.size;
uint16_t serialized_tag_count = 0;

unsigned size = sizeof(tag_count);
memcpy(&buffer[size], &tag_count, sizeof(tag_count));
size += sizeof(tag_count);

for (; serialized_tag_count < tag_count; serialized_tag_count++) {
Tag tag = scanner->tags.data[serialized_tag_count];
Tag tag = scanner->tags.contents[serialized_tag_count];
if (tag.type == CUSTOM) {
unsigned name_length = tag.custom_tag_name.len;
unsigned name_length = tag.custom_tag_name.size;
if (name_length > UINT8_MAX) {
name_length = UINT8_MAX;
}
Expand All @@ -135,7 +47,7 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
}
buffer[size++] = (char)tag.type;
buffer[size++] = (char)name_length;
strncpy(&buffer[size], tag.custom_tag_name.data, name_length);
strncpy(&buffer[size], tag.custom_tag_name.contents, name_length);
size += name_length;
} else {
if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
Expand All @@ -150,7 +62,11 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
}

static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
VEC_CLEAR(scanner->tags);
for (unsigned i = 0; i < scanner->tags.size; i++) {
tag_free(&scanner->tags.contents[i]);
}
array_clear(&scanner->tags);

if (length > 0) {
unsigned size = 0;
uint16_t tag_count = 0;
Expand All @@ -162,37 +78,34 @@ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
memcpy(&tag_count, &buffer[size], sizeof(tag_count));
size += sizeof(tag_count);

VEC_RESIZE(scanner->tags, tag_count);
array_reserve(&scanner->tags, tag_count);
if (tag_count > 0) {
unsigned iter = 0;
for (iter = 0; iter < serialized_tag_count; iter++) {
Tag tag = scanner->tags.data[iter];
Tag tag = tag_new();
tag.type = (TagType)buffer[size++];
if (tag.type == CUSTOM) {
uint16_t name_length = (uint8_t)buffer[size++];
tag.custom_tag_name.len = name_length;
tag.custom_tag_name.cap = name_length;
tag.custom_tag_name.data = (char *)calloc(1, sizeof(char) * (name_length + 1));
strncpy(tag.custom_tag_name.data, &buffer[size], name_length);
array_reserve(&tag.custom_tag_name, name_length);
tag.custom_tag_name.size = name_length;
memcpy(tag.custom_tag_name.contents, &buffer[size], name_length);
size += name_length;
}
VEC_PUSH(scanner->tags, tag);
array_push(&scanner->tags, tag);
}
// add zero tags if we didn't read enough, this is because the
// buffer had no more room but we held more tags.
for (; iter < tag_count; iter++) {
Tag tag = new_tag();
VEC_PUSH(scanner->tags, tag);
array_push(&scanner->tags, tag_new());
}
}
}
}

static String scan_tag_name(TSLexer *lexer) {
String tag_name;
STRING_INIT(tag_name);
String tag_name = array_new();
while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
STRING_PUSH(tag_name, towupper(lexer->lookahead));
array_push(&tag_name, towupper(lexer->lookahead));
advance(lexer);
}
return tag_name;
Expand Down Expand Up @@ -230,13 +143,13 @@ static bool scan_comment(TSLexer *lexer) {
}

static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
if (scanner->tags.len == 0) {
if (scanner->tags.size == 0) {
return false;
}

lexer->mark_end(lexer);

const char *end_delimiter = VEC_BACK(scanner->tags).type == SCRIPT ? "</SCRIPT" : "</STYLE";
const char *end_delimiter = array_back(&scanner->tags)->type == SCRIPT ? "</SCRIPT" : "</STYLE";

unsigned delimiter_index = 0;
while (lexer->lookahead) {
Expand All @@ -258,70 +171,73 @@ static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
}

static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
Tag *parent = scanner->tags.len == 0 ? NULL : &VEC_BACK(scanner->tags);
Tag *parent = scanner->tags.size == 0 ? NULL : array_back(&scanner->tags);

bool is_closing_tag = false;
if (lexer->lookahead == '/') {
is_closing_tag = true;
advance(lexer);
} else {
if (parent && is_void(parent)) {
VEC_POP(scanner->tags);
if (parent && tag_is_void(parent)) {
array_pop(&scanner->tags);
lexer->result_symbol = IMPLICIT_END_TAG;
return true;
}
}

String tag_name = scan_tag_name(lexer);
if (tag_name.len == 0 && !lexer->eof(lexer)) {
STRING_FREE(tag_name);
if (tag_name.size == 0 && !lexer->eof(lexer)) {
array_delete(&tag_name);
return false;
}

Tag next_tag = for_name(tag_name.data);
Tag next_tag = tag_for_name(tag_name);

if (is_closing_tag) {
// The tag correctly closes the topmost element on the stack
if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &next_tag)) {
STRING_FREE(tag_name);
if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &next_tag)) {
tag_free(&next_tag);
return false;
}

// Otherwise, dig deeper and queue implicit end tags (to be nice in
// the case of malformed HTML)
for (unsigned i = scanner->tags.len; i > 0; i--) {
if (scanner->tags.data[i - 1].type == next_tag.type) {
VEC_POP(scanner->tags);
for (unsigned i = scanner->tags.size; i > 0; i--) {
if (scanner->tags.contents[i - 1].type == next_tag.type) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = IMPLICIT_END_TAG;
STRING_FREE(tag_name);
tag_free(&next_tag);
return true;
}
}
} else if (parent &&
(!can_contain(parent, &next_tag) ||
(parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer))) {
VEC_POP(scanner->tags);
} else if (
parent &&
(
!tag_can_contain(parent, &next_tag) ||
(parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer)
)
) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = IMPLICIT_END_TAG;
STRING_FREE(tag_name);
tag_free(&next_tag);
return true;
}

STRING_FREE(tag_name);
tag_free(&next_tag);
return false;
}

static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
String tag_name = scan_tag_name(lexer);
if (tag_name.len == 0) {
STRING_FREE(tag_name);
if (tag_name.size == 0) {
array_delete(&tag_name);
return false;
}
Tag tag = for_name(tag_name.data);
VEC_PUSH(scanner->tags, tag);

Tag tag = tag_for_name(tag_name);
array_push(&scanner->tags, tag);
switch (tag.type) {
case SCRIPT:
lexer->result_symbol = SCRIPT_START_TAG_NAME;
Expand All @@ -333,34 +249,37 @@ static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
lexer->result_symbol = START_TAG_NAME;
break;
}
STRING_FREE(tag_name);
return true;
}

static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
String tag_name = scan_tag_name(lexer);
if (tag_name.len == 0) {
STRING_FREE(tag_name);

if (tag_name.size == 0) {
array_delete(&tag_name);
return false;
}
Tag tag = for_name(tag_name.data);
if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &tag)) {
VEC_POP(scanner->tags);

Tag tag = tag_for_name(tag_name);
if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &tag)) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = END_TAG_NAME;
} else {
lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
}

tag_free(&tag);
STRING_FREE(tag_name);
return true;
}

static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
if (scanner->tags.len > 0) {
VEC_POP(scanner->tags);
if (scanner->tags.size > 0) {
Tag popped_tag = array_pop(&scanner->tags);
tag_free(&popped_tag);
lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
}
return true;
Expand All @@ -369,9 +288,6 @@ static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
}

static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
if (scanner->tags.len > 0) {
Tag *parent = &VEC_BACK(scanner->tags);
}
if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
return scan_raw_text(scanner, lexer);
}
Expand Down Expand Up @@ -439,9 +355,9 @@ void tree_sitter_html_external_scanner_deserialize(void *payload, const char *bu

void tree_sitter_html_external_scanner_destroy(void *payload) {
Scanner *scanner = (Scanner *)payload;
for (unsigned i = 0; i < scanner->tags.len; i++) {
STRING_FREE(scanner->tags.data[i].custom_tag_name);
for (unsigned i = 0; i < scanner->tags.size; i++) {
tag_free(&scanner->tags.contents[i]);
}
VEC_FREE(scanner->tags);
array_delete(&scanner->tags);
free(scanner);
}
Loading