godot/modules/gdscript/gdscript_tokenizer_buffer.cpp
George Marques 72e5f8c31e
GDScript: Enable compression on export
In addition to the regular option to export GDScript as binary tokens, this
adds a compression option on top of it. The binary format needs to encode
extra information (identifier and constant tables, line numbers), which
generally makes it bigger than the source text. This option reduces that
difference by applying Zstandard compression to the buffer.
2024-02-08 11:20:07 -03:00

/**************************************************************************/
/*  gdscript_tokenizer_buffer.cpp                                         */
/**************************************************************************/
/*                         This file is part of:                          */
/*                              GODOT ENGINE                              */
/*                        https://godotengine.org                         */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
/*                                                                        */
/* Permission is hereby granted, free of charge, to any person obtaining  */
/* a copy of this software and associated documentation files (the        */
/* "Software"), to deal in the Software without restriction, including    */
/* without limitation the rights to use, copy, modify, merge, publish,    */
/* distribute, sublicense, and/or sell copies of the Software, and to     */
/* permit persons to whom the Software is furnished to do so, subject to  */
/* the following conditions:                                              */
/*                                                                        */
/* The above copyright notice and this permission notice shall be         */
/* included in all copies or substantial portions of the Software.        */
/*                                                                        */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
/**************************************************************************/
#include "gdscript_tokenizer_buffer.h"
#include "core/io/compression.h"
#include "core/io/marshalls.h"
#define TOKENIZER_VERSION 100
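
// For reference, the container written by parse_code_string() below (and read
// back by set_code_buffer()) is laid out as follows. This summary is derived
// from the code in this file; the TOKEN_* constants are defined in the header.
//
//   Outer buffer:
//     bytes 0-3    magic "GDSC"
//     bytes 4-7    TOKENIZER_VERSION (uint32)
//     bytes 8-11   decompressed payload size (uint32); 0 means "not compressed"
//     bytes 12-    payload, Zstandard-compressed when the size field is nonzero
//
//   Payload:
//     bytes 0-3    identifier count
//     bytes 4-7    constant count
//     bytes 8-11   token line/column pair count
//     bytes 16-19  token count (bytes 12-15 are unused)
//     bytes 20-    identifier table, constant table, line map, column map,
//                  then the token stream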
int GDScriptTokenizerBuffer::_token_to_binary(const Token &p_token, Vector<uint8_t> &r_buffer, int p_start, HashMap<StringName, uint32_t> &r_identifiers_map, HashMap<Variant, uint32_t, VariantHasher, VariantComparator> &r_constants_map) {
	int pos = p_start;

	int token_type = p_token.type & TOKEN_MASK;

	switch (p_token.type) {
		case GDScriptTokenizer::Token::ANNOTATION:
		case GDScriptTokenizer::Token::IDENTIFIER: {
			// Add identifier to map.
			int identifier_pos;
			StringName id = p_token.get_identifier();
			if (r_identifiers_map.has(id)) {
				identifier_pos = r_identifiers_map[id];
			} else {
				identifier_pos = r_identifiers_map.size();
				r_identifiers_map[id] = identifier_pos;
			}
			token_type |= identifier_pos << TOKEN_BITS;
		} break;
		case GDScriptTokenizer::Token::ERROR:
		case GDScriptTokenizer::Token::LITERAL: {
			// Add literal to map.
			int constant_pos;
			if (r_constants_map.has(p_token.literal)) {
				constant_pos = r_constants_map[p_token.literal];
			} else {
				constant_pos = r_constants_map.size();
				r_constants_map[p_token.literal] = constant_pos;
			}
			token_type |= constant_pos << TOKEN_BITS;
		} break;
		default:
			break;
	}

	// Encode token.
	int token_len;
	if (token_type & TOKEN_MASK) {
		token_len = 8;
		r_buffer.resize(pos + token_len);
		encode_uint32(token_type | TOKEN_BYTE_MASK, &r_buffer.write[pos]);
		pos += 4;
	} else {
		token_len = 5;
		r_buffer.resize(pos + token_len);
		r_buffer.write[pos] = token_type;
		pos++;
	}
	encode_uint32(p_token.start_line, &r_buffer.write[pos]);
	return token_len;
}
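
// Record layout sketch (TOKEN_BITS, TOKEN_MASK and TOKEN_BYTE_MASK come from
// the header; this assumes TOKEN_BITS == 8 and TOKEN_BYTE_MASK == 0x80):
//
//   compact record: 1 byte  [type]                              + uint32 start line
//   full record:    uint32  [type | index << TOKEN_BITS | TOKEN_BYTE_MASK]
//                                                               + uint32 start line
//
// TOKEN_BYTE_MASK in the first byte is what tells the reader which of the two
// record sizes (5 or 8 bytes) it is looking at. For example, an IDENTIFIER
// referring to identifier table entry #3 is written as the full record
// (3 << TOKEN_BITS) | Token::IDENTIFIER | TOKEN_BYTE_MASK.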
GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t *p_buffer) {
	Token token;
	const uint8_t *b = p_buffer;

	uint32_t token_type = decode_uint32(b);

	token.type = (Token::Type)(token_type & TOKEN_MASK);
	if (token_type & TOKEN_BYTE_MASK) {
		b += 4;
	} else {
		b++;
	}

	token.start_line = decode_uint32(b);
	token.end_line = token.start_line;
	token.literal = token.get_name();
	if (token.type == Token::CONST_NAN) {
		token.literal = String("NAN"); // Special case since name and notation are different.
	}

	switch (token.type) {
		case GDScriptTokenizer::Token::ANNOTATION:
		case GDScriptTokenizer::Token::IDENTIFIER: {
			// Get name from map.
			int identifier_pos = token_type >> TOKEN_BITS;
			if (unlikely(identifier_pos >= identifiers.size())) {
				Token error;
				error.type = Token::ERROR;
				error.literal = "Identifier index out of bounds.";
				return error;
			}
			token.literal = identifiers[identifier_pos];
		} break;
		case GDScriptTokenizer::Token::ERROR:
		case GDScriptTokenizer::Token::LITERAL: {
			// Get literal from map.
			int constant_pos = token_type >> TOKEN_BITS;
			if (unlikely(constant_pos >= constants.size())) {
				Token error;
				error.type = Token::ERROR;
				error.literal = "Constant index out of bounds.";
				return error;
			}
			token.literal = constants[constant_pos];
		} break;
		default:
			break;
	}

	return token;
}
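
// Note that _binary_to_token() calls decode_uint32() even for the compact
// 5-byte record. Only the low byte of that read belongs to the token word
// (the following bytes are the start line), but masking with TOKEN_MASK
// discards the extra bits, so the over-read is harmless.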
Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) {
	const uint8_t *buf = p_buffer.ptr();
	ERR_FAIL_COND_V(p_buffer.size() < 12 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);

	int version = decode_uint32(&buf[4]);
	ERR_FAIL_COND_V_MSG(version > TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is too recent! Please use a newer engine version.");

	int decompressed_size = decode_uint32(&buf[8]);

	Vector<uint8_t> contents;
	if (decompressed_size == 0) {
		// A stored size of 0 means the payload is not compressed.
		contents = p_buffer.slice(12);
	} else {
		contents.resize(decompressed_size);
		int result = Compression::decompress(contents.ptrw(), contents.size(), &buf[12], p_buffer.size() - 12, Compression::MODE_ZSTD);
		ERR_FAIL_COND_V_MSG(result != decompressed_size, ERR_INVALID_DATA, "Error decompressing GDScript tokenizer buffer.");
	}

	int total_len = contents.size();
	buf = contents.ptr();
	uint32_t identifier_count = decode_uint32(&buf[0]);
	uint32_t constant_count = decode_uint32(&buf[4]);
	uint32_t token_line_count = decode_uint32(&buf[8]);
	uint32_t token_count = decode_uint32(&buf[16]); // Bytes 12-15 are unused; the writer below skips them as well.

	const uint8_t *b = &buf[20];
	total_len -= 20;

	identifiers.resize(identifier_count);
	for (uint32_t i = 0; i < identifier_count; i++) {
		uint32_t len = decode_uint32(b);
		total_len -= 4;
		ERR_FAIL_COND_V((len * 4u) > (uint32_t)total_len, ERR_INVALID_DATA);
		b += 4;
		Vector<uint32_t> cs;
		cs.resize(len);
		for (uint32_t j = 0; j < len; j++) {
			uint8_t tmp[4];
			for (uint32_t k = 0; k < 4; k++) {
				tmp[k] = b[j * 4 + k] ^ 0xb6; // Identifier characters are stored XOR-masked (see the writer below).
			}
			cs.write[j] = decode_uint32(tmp);
		}
		String s(reinterpret_cast<const char32_t *>(cs.ptr()), len);
		b += len * 4;
		total_len -= len * 4;
		identifiers.write[i] = s;
	}

	constants.resize(constant_count);
	for (uint32_t i = 0; i < constant_count; i++) {
		Variant v;
		int len;
		Error err = decode_variant(v, b, total_len, &len, false);
		if (err) {
			return err;
		}
		b += len;
		total_len -= len;
		constants.write[i] = v;
	}

	for (uint32_t i = 0; i < token_line_count; i++) {
		ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
		uint32_t token_index = decode_uint32(b);
		b += 4;
		uint32_t line = decode_uint32(b);
		b += 4;
		total_len -= 8;
		token_lines[token_index] = line;
	}
	for (uint32_t i = 0; i < token_line_count; i++) {
		ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
		uint32_t token_index = decode_uint32(b);
		b += 4;
		uint32_t column = decode_uint32(b);
		b += 4;
		total_len -= 8;
		token_columns[token_index] = column;
	}

	tokens.resize(token_count);
	for (uint32_t i = 0; i < token_count; i++) {
		int token_len = 5;
		if ((*b) & TOKEN_BYTE_MASK) {
			token_len = 8;
		}
		ERR_FAIL_COND_V(total_len < token_len, ERR_INVALID_DATA);
		Token token = _binary_to_token(b);
		b += token_len;
		ERR_FAIL_INDEX_V(token.type, Token::TK_MAX, ERR_INVALID_DATA);
		tokens.write[i] = token;
		total_len -= token_len;
	}

	ERR_FAIL_COND_V(total_len > 0, ERR_INVALID_DATA);

	return OK;
}
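
// Storing the decompressed size in the outer header serves two purposes: a
// value of 0 doubles as the "uncompressed" marker for COMPRESS_NONE buffers,
// and for compressed buffers it lets the reader allocate the destination up
// front and treat any size mismatch from Compression::decompress() as
// corrupted data.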
Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code, CompressMode p_compress_mode) {
	HashMap<StringName, uint32_t> identifier_map;
	HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map;
	Vector<uint8_t> token_buffer;
	HashMap<uint32_t, uint32_t> token_lines;
	HashMap<uint32_t, uint32_t> token_columns;

	GDScriptTokenizerText tokenizer;
	tokenizer.set_source_code(p_code);
	tokenizer.set_multiline_mode(true); // Ignore whitespace tokens.

	Token current = tokenizer.scan();
	int token_pos = 0;
	int last_token_line = 0;
	int token_counter = 0;

	while (current.type != Token::TK_EOF) {
		int token_len = _token_to_binary(current, token_buffer, token_pos, identifier_map, constant_map);
		token_pos += token_len;
		if (token_counter > 0 && current.start_line > last_token_line) {
			token_lines[token_counter] = current.start_line;
			token_columns[token_counter] = current.start_column;
		}
		last_token_line = current.end_line;

		current = tokenizer.scan();
		token_counter++;
	}

	// Reverse maps.
	Vector<StringName> rev_identifier_map;
	rev_identifier_map.resize(identifier_map.size());
	for (const KeyValue<StringName, uint32_t> &E : identifier_map) {
		rev_identifier_map.write[E.value] = E.key;
	}
	Vector<Variant> rev_constant_map;
	rev_constant_map.resize(constant_map.size());
	for (const KeyValue<Variant, uint32_t> &E : constant_map) {
		rev_constant_map.write[E.value] = E.key;
	}
	HashMap<uint32_t, uint32_t> rev_token_lines;
	for (const KeyValue<uint32_t, uint32_t> &E : token_lines) {
		rev_token_lines[E.value] = E.key;
	}

	// Remove continuation lines from map.
	for (int line : tokenizer.get_continuation_lines()) {
		if (rev_token_lines.has(line + 1)) {
			token_lines.erase(rev_token_lines[line + 1]);
			token_columns.erase(rev_token_lines[line + 1]);
		}
	}

	Vector<uint8_t> contents;
	contents.resize(20);
	encode_uint32(identifier_map.size(), &contents.write[0]);
	encode_uint32(constant_map.size(), &contents.write[4]);
	encode_uint32(token_lines.size(), &contents.write[8]);
	encode_uint32(token_counter, &contents.write[16]);
	int buf_pos = 20;

	// Save identifiers.
	for (const StringName &id : rev_identifier_map) {
		String s = id.operator String();
		int len = s.length();

		contents.resize(buf_pos + (len + 1) * 4);

		encode_uint32(len, &contents.write[buf_pos]);
		buf_pos += 4;

		for (int i = 0; i < len; i++) {
			uint8_t tmp[4];
			encode_uint32(s[i], tmp);
			for (int b = 0; b < 4; b++) {
				contents.write[buf_pos + b] = tmp[b] ^ 0xb6;
			}
			buf_pos += 4;
		}
	}

	// Save constants.
	for (const Variant &v : rev_constant_map) {
		int len;
		// Objects cannot be constant, never encode objects.
		Error err = encode_variant(v, nullptr, len, false);
		ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant.");
		contents.resize(buf_pos + len);
		encode_variant(v, &contents.write[buf_pos], len, false);
		buf_pos += len;
	}

	// Save lines and columns.
	// 8 bytes per line pair plus 8 per column pair; the two maps always have the same size.
	contents.resize(buf_pos + token_lines.size() * 16);
	for (const KeyValue<uint32_t, uint32_t> &e : token_lines) {
		encode_uint32(e.key, &contents.write[buf_pos]);
		buf_pos += 4;
		encode_uint32(e.value, &contents.write[buf_pos]);
		buf_pos += 4;
	}
	for (const KeyValue<uint32_t, uint32_t> &e : token_columns) {
		encode_uint32(e.key, &contents.write[buf_pos]);
		buf_pos += 4;
		encode_uint32(e.value, &contents.write[buf_pos]);
		buf_pos += 4;
	}

	// Store tokens.
	contents.append_array(token_buffer);

	Vector<uint8_t> buf;

	// Save header.
	buf.resize(12);
	buf.write[0] = 'G';
	buf.write[1] = 'D';
	buf.write[2] = 'S';
	buf.write[3] = 'C';
	encode_uint32(TOKENIZER_VERSION, &buf.write[4]);

	switch (p_compress_mode) {
		case COMPRESS_NONE:
			encode_uint32(0u, &buf.write[8]);
			buf.append_array(contents);
			break;

		case COMPRESS_ZSTD: {
			encode_uint32(contents.size(), &buf.write[8]);
			Vector<uint8_t> compressed;
			int max_size = Compression::get_max_compressed_buffer_size(contents.size(), Compression::MODE_ZSTD);
			compressed.resize(max_size);

			int compressed_size = Compression::compress(compressed.ptrw(), contents.ptr(), contents.size(), Compression::MODE_ZSTD);
			ERR_FAIL_COND_V_MSG(compressed_size < 0, Vector<uint8_t>(), "Error compressing GDScript tokenizer buffer.");
			compressed.resize(compressed_size);

			buf.append_array(compressed);
		} break;
	}

	return buf;
}
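
// A minimal round trip through the two entry points above, as a sketch (the
// real callers are the export and script-loading paths; `source` is a
// placeholder GDScript string):
//
//   Vector<uint8_t> binary = GDScriptTokenizerBuffer::parse_code_string(source, GDScriptTokenizerBuffer::COMPRESS_ZSTD);
//
//   GDScriptTokenizerBuffer buffer_tokenizer;
//   if (buffer_tokenizer.set_code_buffer(binary) == OK) {
//       // buffer_tokenizer.scan() now replays the same token stream that
//       // GDScriptTokenizerText produces for `source`.
//   }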
// Cursor tracking is not supported by the buffer tokenizer, so the
// cursor-related overrides below are deliberate no-ops.

int GDScriptTokenizerBuffer::get_cursor_line() const {
	return 0;
}

int GDScriptTokenizerBuffer::get_cursor_column() const {
	return 0;
}

void GDScriptTokenizerBuffer::set_cursor_position(int p_line, int p_column) {
}

void GDScriptTokenizerBuffer::set_multiline_mode(bool p_state) {
	multiline_mode = p_state;
}

bool GDScriptTokenizerBuffer::is_past_cursor() const {
	return false;
}

void GDScriptTokenizerBuffer::push_expression_indented_block() {
	indent_stack_stack.push_back(indent_stack);
}

void GDScriptTokenizerBuffer::pop_expression_indented_block() {
	ERR_FAIL_COND(indent_stack_stack.is_empty());
	indent_stack = indent_stack_stack.back()->get();
	indent_stack_stack.pop_back();
}
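
// scan() below reconstructs the whitespace-derived tokens that were never
// stored in the buffer: when the line map says a new line starts at the
// current token, the stored column is compared against the indent stack and
// the missing NEWLINE/INDENT/DEDENT tokens are synthesized before the real
// token is handed out. Columns are 1-based, hence the `current_column - 1`
// in the comparisons.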
GDScriptTokenizer::Token GDScriptTokenizerBuffer::scan() {
	// Add final newline.
	if (current >= tokens.size() && !last_token_was_newline) {
		Token newline;
		newline.type = Token::NEWLINE;
		newline.start_line = current_line;
		newline.end_line = current_line;
		last_token_was_newline = true;
		return newline;
	}

	// Resolve pending indentation change.
	if (pending_indents > 0) {
		pending_indents--;
		Token indent;
		indent.type = Token::INDENT;
		indent.start_line = current_line;
		indent.end_line = current_line;
		return indent;
	} else if (pending_indents < 0) {
		pending_indents++;
		Token dedent;
		dedent.type = Token::DEDENT;
		dedent.start_line = current_line;
		dedent.end_line = current_line;
		return dedent;
	}

	if (current >= tokens.size()) {
		if (!indent_stack.is_empty()) {
			// Pop any remaining indentation levels so the matching dedents are emitted before EOF.
			pending_indents -= indent_stack.size();
			indent_stack.clear();
			return scan();
		}
		Token eof;
		eof.type = Token::TK_EOF;
		return eof;
	}

	if (!last_token_was_newline && token_lines.has(current)) {
		current_line = token_lines[current];
		uint32_t current_column = token_columns[current];

		// Check if there's a need to indent/dedent.
		if (!multiline_mode) {
			uint32_t previous_indent = 0;
			if (!indent_stack.is_empty()) {
				previous_indent = indent_stack.back()->get();
			}
			if (current_column - 1 > previous_indent) {
				pending_indents++;
				indent_stack.push_back(current_column - 1);
			} else {
				while (current_column - 1 < previous_indent) {
					pending_indents--;
					indent_stack.pop_back();
					if (indent_stack.is_empty()) {
						break;
					}
					previous_indent = indent_stack.back()->get();
				}
			}

			Token newline;
			newline.type = Token::NEWLINE;
			newline.start_line = current_line;
			newline.end_line = current_line;
			last_token_was_newline = true;

			return newline;
		}
	}

	last_token_was_newline = false;

	Token token = tokens[current++];
	return token;
}