godot/modules/gdscript/gdscript_tokenizer_buffer.cpp
George Marques b4d0a09f15
GDScript: Reintroduce binary tokenization on export
This adds back a feature available in 3.x: exporting the GDScript
files in a binary form by converting the tokens recognized by the
tokenizer into a data format.

It is enabled by default on export but can be manually disabled. The
format helps with loading times, since the tokens are easily
reconstructed, and with hiding the source code, since recovering it
would require a specialized tool. Code comments are not stored in this
format.

The `--test` command can also take a `--use-binary-tokens` flag, which
runs the GDScript tests with the binary format instead of the regular
source code, converting the scripts in memory before the tests run.
2024-02-08 11:20:05 -03:00

/**************************************************************************/
/* gdscript_tokenizer_buffer.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#include "gdscript_tokenizer_buffer.h"
#include "core/io/marshalls.h"
#define TOKENIZER_VERSION 100
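
// On-disk encoding of a single token: either a compact form (one byte holding
// the token type) or a wide form (a 32-bit value with the type in the low
// TOKEN_BITS bits, an identifier/constant table index in the high bits, and
// TOKEN_BYTE_MASK set so the reader can tell the two forms apart). Either
// form is followed by the 32-bit start line.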
int GDScriptTokenizerBuffer::_token_to_binary(const Token &p_token, Vector<uint8_t> &r_buffer, int p_start, HashMap<StringName, uint32_t> &r_identifiers_map, HashMap<Variant, uint32_t, VariantHasher, VariantComparator> &r_constants_map) {
int pos = p_start;
int token_type = p_token.type & TOKEN_MASK;
switch (p_token.type) {
case GDScriptTokenizer::Token::ANNOTATION:
case GDScriptTokenizer::Token::IDENTIFIER: {
// Add identifier to map.
int identifier_pos;
StringName id = p_token.get_identifier();
if (r_identifiers_map.has(id)) {
identifier_pos = r_identifiers_map[id];
} else {
identifier_pos = r_identifiers_map.size();
r_identifiers_map[id] = identifier_pos;
}
token_type |= identifier_pos << TOKEN_BITS;
} break;
case GDScriptTokenizer::Token::ERROR:
case GDScriptTokenizer::Token::LITERAL: {
// Add literal to map.
int constant_pos;
if (r_constants_map.has(p_token.literal)) {
constant_pos = r_constants_map[p_token.literal];
} else {
constant_pos = r_constants_map.size();
r_constants_map[p_token.literal] = constant_pos;
}
token_type |= constant_pos << TOKEN_BITS;
} break;
default:
break;
}
// Encode token.
int token_len;
if (token_type & TOKEN_MASK) {
token_len = 8;
r_buffer.resize(pos + token_len);
encode_uint32(token_type | TOKEN_BYTE_MASK, &r_buffer.write[pos]);
pos += 4;
} else {
token_len = 5;
r_buffer.resize(pos + token_len);
r_buffer.write[pos] = token_type;
pos++;
}
encode_uint32(p_token.start_line, &r_buffer.write[pos]);
return token_len;
}
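
// Decode a single token written by _token_to_binary(). Identifier and
// constant payloads are resolved against the tables loaded by
// set_code_buffer(); an out-of-bounds index yields an ERROR token.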
GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t *p_buffer) {
Token token;
const uint8_t *b = p_buffer;
uint32_t token_type = decode_uint32(b);
token.type = (Token::Type)(token_type & TOKEN_MASK);
if (token_type & TOKEN_BYTE_MASK) {
b += 4;
} else {
b++;
}
token.start_line = decode_uint32(b);
token.end_line = token.start_line;
token.literal = token.get_name();
if (token.type == Token::CONST_NAN) {
token.literal = String("NAN"); // Special case since name and notation are different.
}
switch (token.type) {
case GDScriptTokenizer::Token::ANNOTATION:
case GDScriptTokenizer::Token::IDENTIFIER: {
// Get name from map.
int identifier_pos = token_type >> TOKEN_BITS;
if (unlikely(identifier_pos >= identifiers.size())) {
Token error;
error.type = Token::ERROR;
error.literal = "Identifier index out of bounds.";
return error;
}
token.literal = identifiers[identifier_pos];
} break;
case GDScriptTokenizer::Token::ERROR:
case GDScriptTokenizer::Token::LITERAL: {
// Get literal from map.
int constant_pos = token_type >> TOKEN_BITS;
if (unlikely(constant_pos >= constants.size())) {
Token error;
error.type = Token::ERROR;
error.literal = "Constant index out of bounds.";
return error;
}
token.literal = constants[constant_pos];
} break;
default:
break;
}
return token;
}
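
// Buffer layout (all multi-byte integers are little-endian):
// 0  - magic "GDSC"
// 4  - format version (TOKENIZER_VERSION)
// 8  - identifier count
// 12 - constant count
// 16 - token line/column pair count
// 20 - token count
// 24 - identifier table, constant table, token->line map, token->column map,
//      and finally the token stream itself.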
Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) {
const uint8_t *buf = p_buffer.ptr();
int total_len = p_buffer.size();
ERR_FAIL_COND_V(p_buffer.size() < 24 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);
int version = decode_uint32(&buf[4]);
ERR_FAIL_COND_V_MSG(version > TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is too recent! Please use a newer engine version.");
uint32_t identifier_count = decode_uint32(&buf[8]);
uint32_t constant_count = decode_uint32(&buf[12]);
uint32_t token_line_count = decode_uint32(&buf[16]);
uint32_t token_count = decode_uint32(&buf[20]);
const uint8_t *b = &buf[24];
total_len -= 24;
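// Read the identifier table.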
identifiers.resize(identifier_count);
for (uint32_t i = 0; i < identifier_count; i++) {
uint32_t len = decode_uint32(b);
total_len -= 4;
ERR_FAIL_COND_V((len * 4u) > (uint32_t)total_len, ERR_INVALID_DATA);
b += 4;
Vector<uint32_t> cs;
cs.resize(len);
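// Identifier characters are stored as 32-bit values, each byte XOR-ed with
// 0xb6 as light obfuscation.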
for (uint32_t j = 0; j < len; j++) {
uint8_t tmp[4];
for (uint32_t k = 0; k < 4; k++) {
tmp[k] = b[j * 4 + k] ^ 0xb6;
}
cs.write[j] = decode_uint32(tmp);
}
String s(reinterpret_cast<const char32_t *>(cs.ptr()), len);
b += len * 4;
total_len -= len * 4;
identifiers.write[i] = s;
}
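// Read the constant table (Variants in the standard encoded format).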
constants.resize(constant_count);
for (uint32_t i = 0; i < constant_count; i++) {
Variant v;
int len;
Error err = decode_variant(v, b, total_len, &len, false);
if (err) {
return err;
}
b += len;
total_len -= len;
constants.write[i] = v;
}
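// Read the token -> line and token -> column maps.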
for (uint32_t i = 0; i < token_line_count; i++) {
ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
uint32_t token_index = decode_uint32(b);
b += 4;
uint32_t line = decode_uint32(b);
b += 4;
total_len -= 8;
token_lines[token_index] = line;
}
for (uint32_t i = 0; i < token_line_count; i++) {
ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
uint32_t token_index = decode_uint32(b);
b += 4;
uint32_t column = decode_uint32(b);
b += 4;
total_len -= 8;
token_columns[token_index] = column;
}
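// Read the token stream.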
tokens.resize(token_count);
for (uint32_t i = 0; i < token_count; i++) {
int token_len = 5;
if ((*b) & TOKEN_BYTE_MASK) {
token_len = 8;
}
ERR_FAIL_COND_V(total_len < token_len, ERR_INVALID_DATA);
Token token = _binary_to_token(b);
b += token_len;
ERR_FAIL_INDEX_V(token.type, Token::TK_MAX, ERR_INVALID_DATA);
tokens.write[i] = token;
total_len -= token_len;
}
ERR_FAIL_COND_V(total_len > 0, ERR_INVALID_DATA);
return OK;
}
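
// Tokenize p_code with the regular text tokenizer and serialize the resulting
// token stream into the binary format described in set_code_buffer(). Only
// tokens that start a new line get an entry in the line/column maps; scan()
// later uses those entries to re-synthesize NEWLINE, INDENT and DEDENT tokens.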
Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code) {
Vector<uint8_t> buf;
HashMap<StringName, uint32_t> identifier_map;
HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map;
Vector<uint8_t> token_buffer;
HashMap<uint32_t, uint32_t> token_lines;
HashMap<uint32_t, uint32_t> token_columns;
GDScriptTokenizerText tokenizer;
tokenizer.set_source_code(p_code);
tokenizer.set_multiline_mode(true); // Ignore whitespace tokens.
Token current = tokenizer.scan();
int token_pos = 0;
int last_token_line = 0;
int token_counter = 0;
while (current.type != Token::TK_EOF) {
int token_len = _token_to_binary(current, token_buffer, token_pos, identifier_map, constant_map);
token_pos += token_len;
if (token_counter > 0 && current.start_line > last_token_line) {
token_lines[token_counter] = current.start_line;
token_columns[token_counter] = current.start_column;
}
last_token_line = current.end_line;
current = tokenizer.scan();
token_counter++;
}
// Reverse maps.
Vector<StringName> rev_identifier_map;
rev_identifier_map.resize(identifier_map.size());
for (const KeyValue<StringName, uint32_t> &E : identifier_map) {
rev_identifier_map.write[E.value] = E.key;
}
Vector<Variant> rev_constant_map;
rev_constant_map.resize(constant_map.size());
for (const KeyValue<Variant, uint32_t> &E : constant_map) {
rev_constant_map.write[E.value] = E.key;
}
HashMap<uint32_t, uint32_t> rev_token_lines;
for (const KeyValue<uint32_t, uint32_t> &E : token_lines) {
rev_token_lines[E.value] = E.key;
}
// Remove continuation lines from map.
for (int line : tokenizer.get_continuation_lines()) {
if (rev_token_lines.has(line + 1)) {
token_lines.erase(rev_token_lines[line + 1]);
token_columns.erase(rev_token_lines[line + 1]);
}
}
// Save header.
buf.resize(24);
buf.write[0] = 'G';
buf.write[1] = 'D';
buf.write[2] = 'S';
buf.write[3] = 'C';
encode_uint32(TOKENIZER_VERSION, &buf.write[4]);
encode_uint32(identifier_map.size(), &buf.write[8]);
encode_uint32(constant_map.size(), &buf.write[12]);
encode_uint32(token_lines.size(), &buf.write[16]);
encode_uint32(token_counter, &buf.write[20]);
int buf_pos = 24;
// Save identifiers.
for (const StringName &id : rev_identifier_map) {
String s = id.operator String();
int len = s.length();
buf.resize(buf_pos + (len + 1) * 4);
encode_uint32(len, &buf.write[buf_pos]);
buf_pos += 4;
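// Store each character as four bytes, XOR-ed with 0xb6 (see set_code_buffer).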
for (int i = 0; i < len; i++) {
uint8_t tmp[4];
encode_uint32(s[i], tmp);
for (int b = 0; b < 4; b++) {
buf.write[buf_pos + b] = tmp[b] ^ 0xb6;
}
buf_pos += 4;
}
}
// Save constants.
for (const Variant &v : rev_constant_map) {
int len;
// Objects cannot be constant, never encode objects.
Error err = encode_variant(v, nullptr, len, false);
ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant.");
buf.resize(buf_pos + len);
encode_variant(v, &buf.write[buf_pos], len, false);
buf_pos += len;
}
// Save lines and columns.
buf.resize(buf_pos + token_lines.size() * 16);
for (const KeyValue<uint32_t, uint32_t> &e : token_lines) {
encode_uint32(e.key, &buf.write[buf_pos]);
buf_pos += 4;
encode_uint32(e.value, &buf.write[buf_pos]);
buf_pos += 4;
}
for (const KeyValue<uint32_t, uint32_t> &e : token_columns) {
encode_uint32(e.key, &buf.write[buf_pos]);
buf_pos += 4;
encode_uint32(e.value, &buf.write[buf_pos]);
buf_pos += 4;
}
// Store tokens.
buf.append_array(token_buffer);
return buf;
}
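
// The buffer tokenizer only replays pre-tokenized scripts, so the cursor
// (code completion) interface below is stubbed out.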
int GDScriptTokenizerBuffer::get_cursor_line() const {
return 0;
}

int GDScriptTokenizerBuffer::get_cursor_column() const {
return 0;
}

void GDScriptTokenizerBuffer::set_cursor_position(int p_line, int p_column) {
}

void GDScriptTokenizerBuffer::set_multiline_mode(bool p_state) {
multiline_mode = p_state;
}

bool GDScriptTokenizerBuffer::is_past_cursor() const {
return false;
}
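
// Save/restore the indent stack around indented blocks that appear inside
// expressions (such as lambdas), so their indentation does not leak out.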
void GDScriptTokenizerBuffer::push_expression_indented_block() {
indent_stack_stack.push_back(indent_stack);
}

void GDScriptTokenizerBuffer::pop_expression_indented_block() {
ERR_FAIL_COND(indent_stack_stack.size() == 0);
indent_stack = indent_stack_stack.back()->get();
indent_stack_stack.pop_back();
}
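
// Replay the stored tokens in order. NEWLINE, INDENT and DEDENT are not
// stored in the binary format; they are re-synthesized here by comparing the
// recorded column of each line's first token against the indent stack.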
GDScriptTokenizer::Token GDScriptTokenizerBuffer::scan() {
// Add final newline.
if (current >= tokens.size() && !last_token_was_newline) {
Token newline;
newline.type = Token::NEWLINE;
newline.start_line = current_line;
newline.end_line = current_line;
last_token_was_newline = true;
return newline;
}
// Resolve pending indentation change.
if (pending_indents > 0) {
pending_indents--;
Token indent;
indent.type = Token::INDENT;
indent.start_line = current_line;
indent.end_line = current_line;
return indent;
} else if (pending_indents < 0) {
pending_indents++;
Token dedent;
dedent.type = Token::DEDENT;
dedent.start_line = current_line;
dedent.end_line = current_line;
return dedent;
}
if (current >= tokens.size()) {
if (!indent_stack.is_empty()) {
pending_indents -= indent_stack.size();
indent_stack.clear();
return scan();
}
Token eof;
eof.type = Token::TK_EOF;
return eof;
}
if (!last_token_was_newline && token_lines.has(current)) {
current_line = token_lines[current];
uint32_t current_column = token_columns[current];
// Check if there's a need to indent/dedent.
if (!multiline_mode) {
uint32_t previous_indent = 0;
if (!indent_stack.is_empty()) {
previous_indent = indent_stack.back()->get();
}
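// Columns are 1-based, so current_column - 1 is the indentation amount.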
if (current_column - 1 > previous_indent) {
pending_indents++;
indent_stack.push_back(current_column - 1);
} else {
while (current_column - 1 < previous_indent) {
pending_indents--;
indent_stack.pop_back();
if (indent_stack.is_empty()) {
break;
}
previous_indent = indent_stack.back()->get();
}
}
Token newline;
newline.type = Token::NEWLINE;
newline.start_line = current_line;
newline.end_line = current_line;
last_token_was_newline = true;
return newline;
}
}
last_token_was_newline = false;
Token token = tokens[current++];
return token;
}