LibSQL: Redesign heap storage to support arbitrary amounts of data

Previously, `Heap` stored serialized data in blocks of 1024 bytes
regardless of the actual length. Data longer than 1024 bytes was
silently truncated, causing database corruption.

This changes the heap storage to prefix every block with two new
fields: the size of the data stored in the block, and the index of
the next block to retrieve if the data is longer than what fits in a
single block. By chaining blocks together, we can store arbitrary
amounts of data without having to change any of the logic in the
rest of LibSQL.
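
To make the chaining concrete, here is a minimal sketch in plain C++ (not
the actual LibSQL code: `Block`, `write_chain` and `read_chain` are
illustrative stand-ins, the heap is modeled as an in-memory vector seeded
with a zero block at index 0, and the size field is assumed to record the
bytes used in that particular block):

    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Every 1024-byte block starts with a small header: the number of payload
    // bytes stored in this block and the index of the next block in the chain.
    // Index 0 is reserved for the zero/super block, so 0 doubles as "no next".
    struct Block {
        static constexpr uint32_t SIZE = 1024;
        static constexpr uint32_t HEADER_SIZE = 2 * sizeof(uint32_t);
        static constexpr uint32_t DATA_SIZE = SIZE - HEADER_SIZE;

        uint32_t size_in_bytes { 0 }; // payload bytes used in this block
        uint32_t next_block { 0 };    // index of the next block, or 0
        uint8_t data[DATA_SIZE] {};
    };

    // Split a value across as many blocks as needed, linking each block to the
    // one holding the next chunk. Returns the index of the first block.
    // Assumes `value` is non-empty and `heap` already holds the zero block.
    uint32_t write_chain(std::vector<Block>& heap, std::vector<uint8_t> const& value)
    {
        uint32_t first_index = static_cast<uint32_t>(heap.size());
        size_t offset = 0;
        while (offset < value.size()) {
            Block block;
            block.size_in_bytes = static_cast<uint32_t>(
                std::min<size_t>(Block::DATA_SIZE, value.size() - offset));
            std::memcpy(block.data, value.data() + offset, block.size_in_bytes);
            offset += block.size_in_bytes;
            block.next_block = offset < value.size()
                ? static_cast<uint32_t>(heap.size()) + 1
                : 0;
            heap.push_back(block);
        }
        return first_index;
    }

    // Reassemble a value by reading blocks until the next block index is 0.
    std::vector<uint8_t> read_chain(std::vector<Block> const& heap, uint32_t index)
    {
        std::vector<uint8_t> result;
        while (index != 0) {
            Block const& block = heap[index];
            result.insert(result.end(), block.data, block.data + block.size_in_bytes);
            index = block.next_block;
        }
        return result;
    }

In LibSQL itself the equivalent entry points are `Heap::write_storage()`
and `Heap::read_storage()` in the diff below, with blocks going through
the write-ahead log and the heap file instead of an in-memory vector.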

As part of these changes, the "free list" is also removed from the
heap, pending an actual implementation: it was never used.

Note that this bumps the database version from 3 to 4; as a result,
any database opened with LibSQL that is not version 4 is invalidated
(deleted).
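
As a hedged illustration only (the names below are made up, not the LibSQL
API), the effect on open is roughly the following: a heap whose on-disk
version is not 4 is deleted and re-created, matching the existing "we drop
the database" FIXME in `Heap::open()`:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    constexpr uint32_t HEAP_VERSION = 4;

    struct HeapFile {
        std::string path;
        uint32_t version_on_disk { 0 }; // as read from the zero block

        bool open()
        {
            // ... read the zero block, which yields version_on_disk ...
            if (version_on_disk != HEAP_VERSION) {
                std::remove(path.c_str());      // drop the incompatible database
                version_on_disk = HEAP_VERSION; // a fresh zero block gets version 4
                return open();                  // re-open the now-empty heap
            }
            return true;
        }
    };
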
Jelle Raaijmakers 2023-04-23 12:38:57 +02:00 committed by Tim Flynn
parent 194f846f12
commit 6601ff9d65
13 changed files with 246 additions and 180 deletions


@ -6,6 +6,7 @@
<br /><input type=button onclick="setCookie(this.value)" value="cookie4=value4;SameSite=Lax" />
<br /><input type=button onclick="setCookie(this.value)" value="cookie5=value5;SameSite=Strict" />
<br /><input type=button onclick="setCookie(this.value)" value="cookie6=value6;SameSite=None" />
<br /><input type=button onclick="setPrettyLargeCookie()" value="cookie7=xxxxx..[2048 x's]" />
<br />
<h3>Invalid cookies (the browser should reject these):</h3>
@ -36,6 +37,10 @@
document.getElementById('cookies').innerHTML = document.cookie;
}
function setPrettyLargeCookie() {
setCookie('cookie7=' + 'x'.repeat(2048));
}
function setTooLargeCookie() {
const cookie = 'name=' + 'x'.repeat(4 << 10);
setCookie(cookie);


@ -131,7 +131,7 @@ NonnullRefPtr<SQL::BTree> setup_btree(SQL::Serializer& serializer)
auto root_pointer = serializer.heap().user_value(0);
if (!root_pointer) {
root_pointer = serializer.heap().new_record_pointer();
root_pointer = serializer.heap().request_new_block_index();
serializer.heap().set_user_value(0, root_pointer);
}
auto btree = SQL::BTree::construct(serializer, tuple_descriptor, true, root_pointer);


@ -129,7 +129,7 @@ NonnullRefPtr<SQL::HashIndex> setup_hash_index(SQL::Serializer& serializer)
auto directory_pointer = serializer.heap().user_value(0);
if (!directory_pointer) {
directory_pointer = serializer.heap().new_record_pointer();
directory_pointer = serializer.heap().request_new_block_index();
serializer.heap().set_user_value(0, directory_pointer);
}
auto hash_index = SQL::HashIndex::construct(serializer, tuple_descriptor, directory_pointer);


@ -234,6 +234,24 @@ TEST_CASE(insert_with_placeholders)
}
}
TEST_CASE(insert_and_retrieve_long_text_value)
{
ScopeGuard guard([]() { unlink(db_name); });
auto database = SQL::Database::construct(db_name);
EXPECT(!database->open().is_error());
create_table(database);
StringBuilder sb;
MUST(sb.try_append_repeated('x', 8192));
auto long_string = sb.string_view();
auto result = execute(database, DeprecatedString::formatted("INSERT INTO TestSchema.TestTable VALUES ('{}', 0);", long_string));
EXPECT(result.size() == 1);
result = execute(database, "SELECT TextColumn FROM TestSchema.TestTable;");
EXPECT_EQ(result.size(), 1u);
EXPECT_EQ(result[0].row[0], long_string);
}
TEST_CASE(select_from_empty_table)
{
ScopeGuard guard([]() { unlink(db_name); });


@ -37,13 +37,13 @@ void BTree::initialize_root()
{
if (pointer()) {
if (serializer().has_block(pointer())) {
serializer().get_block(pointer());
serializer().read_storage(pointer());
m_root = serializer().make_and_deserialize<TreeNode>(*this, pointer());
} else {
m_root = make<TreeNode>(*this, nullptr, pointer());
}
} else {
set_pointer(new_record_pointer());
set_pointer(request_new_block_index());
m_root = make<TreeNode>(*this, nullptr, pointer());
if (on_new_root)
on_new_root();
@ -53,7 +53,7 @@ void BTree::initialize_root()
TreeNode* BTree::new_root()
{
set_pointer(new_record_pointer());
set_pointer(request_new_block_index());
m_root = make<TreeNode>(*this, nullptr, m_root.leak_ptr(), pointer());
serializer().serialize_and_write(*m_root.ptr());
if (on_new_root)


@ -197,9 +197,9 @@ ErrorOr<Vector<Row>> Database::match(TableDef& table, Key const& key)
ErrorOr<void> Database::insert(Row& row)
{
VERIFY(m_table_cache.get(row.table().key().hash()).has_value());
// TODO Check constraints
// TODO: implement table constraints such as unique, foreign key, etc.
row.set_pointer(m_heap->new_record_pointer());
row.set_pointer(m_heap->request_new_block_index());
row.set_next_pointer(row.table().pointer());
TRY(update(row));
@ -244,7 +244,8 @@ ErrorOr<void> Database::remove(Row& row)
ErrorOr<void> Database::update(Row& tuple)
{
VERIFY(m_table_cache.get(tuple.table().key().hash()).has_value());
// TODO Check constraints
// TODO: implement table constraints such as unique, foreign key, etc.
m_serializer.reset();
m_serializer.serialize_and_write<Tuple>(tuple);


@ -132,7 +132,7 @@ bool HashBucket::insert(Key const& key)
m_hash_index.serializer().deserialize_block_to(pointer(), *this);
if (find_key_in_bucket(key).has_value())
return false;
if ((length() + key.length()) > Heap::BLOCK_SIZE) {
if (length() + key.length() > Block::DATA_SIZE) {
dbgln_if(SQL_DEBUG, "Adding key {} would make length exceed block size", key.to_deprecated_string());
return false;
}
@ -205,7 +205,7 @@ HashIndex::HashIndex(Serializer& serializer, NonnullRefPtr<TupleDescriptor> cons
, m_buckets()
{
if (!first_node)
set_pointer(new_record_pointer());
set_pointer(request_new_block_index());
if (serializer.has_block(first_node)) {
u32 pointer = first_node;
do {
@ -216,14 +216,14 @@ HashIndex::HashIndex(Serializer& serializer, NonnullRefPtr<TupleDescriptor> cons
pointer = m_nodes.last(); // FIXME Ugly
} while (pointer);
} else {
auto bucket = append_bucket(0u, 1u, new_record_pointer());
auto bucket = append_bucket(0u, 1u, request_new_block_index());
bucket->m_inflated = true;
serializer.serialize_and_write(*bucket);
bucket = append_bucket(1u, 1u, new_record_pointer());
bucket = append_bucket(1u, 1u, request_new_block_index());
bucket->m_inflated = true;
serializer.serialize_and_write(*bucket);
m_nodes.append(first_node);
write_directory_to_write_ahead_log();
write_directory();
}
}
@ -247,7 +247,7 @@ HashBucket* HashIndex::get_bucket_for_insert(Key const& key)
do {
dbgln_if(SQL_DEBUG, "HashIndex::get_bucket_for_insert({}) bucket {} of {}", key.to_deprecated_string(), key_hash % size(), size());
auto bucket = get_bucket(key_hash % size());
if (bucket->length() + key.length() < Heap::BLOCK_SIZE)
if (bucket->length() + key.length() < Block::DATA_SIZE)
return bucket;
dbgln_if(SQL_DEBUG, "Bucket is full (bucket size {}/length {} key length {}). Expanding directory", bucket->size(), bucket->length(), key.length());
@ -266,7 +266,7 @@ HashBucket* HashIndex::get_bucket_for_insert(Key const& key)
for (auto entry_index = (int)bucket->m_entries.size() - 1; entry_index >= 0; entry_index--) {
if (bucket->m_entries[entry_index].hash() % size() == ix) {
if (!sub_bucket->pointer())
sub_bucket->set_pointer(new_record_pointer());
sub_bucket->set_pointer(request_new_block_index());
sub_bucket->insert(bucket->m_entries.take(entry_index));
moved++;
}
@ -283,10 +283,10 @@ HashBucket* HashIndex::get_bucket_for_insert(Key const& key)
dbgln_if(SQL_DEBUG, "Nothing redistributed from bucket #{}", base_index);
bucket->set_local_depth(bucket->local_depth() + 1);
serializer().serialize_and_write(*bucket);
write_directory_to_write_ahead_log();
write_directory();
auto bucket_after_redistribution = get_bucket(key_hash % size());
if (bucket_after_redistribution->length() + key.length() < Heap::BLOCK_SIZE)
if (bucket_after_redistribution->length() + key.length() < Block::DATA_SIZE)
return bucket_after_redistribution;
}
expand();
@ -304,14 +304,14 @@ void HashIndex::expand()
bucket->m_inflated = true;
}
m_global_depth++;
write_directory_to_write_ahead_log();
write_directory();
}
void HashIndex::write_directory_to_write_ahead_log()
void HashIndex::write_directory()
{
auto num_nodes_required = (size() / HashDirectoryNode::max_pointers_in_node()) + 1;
while (m_nodes.size() < num_nodes_required)
m_nodes.append(new_record_pointer());
m_nodes.append(request_new_block_index());
size_t offset = 0u;
size_t num_node = 0u;


@ -82,7 +82,7 @@ private:
HashIndex(Serializer&, NonnullRefPtr<TupleDescriptor> const&, u32);
void expand();
void write_directory_to_write_ahead_log();
void write_directory();
HashBucket* append_bucket(u32 index, u32 local_depth, u32 pointer);
HashBucket* get_bucket_for_insert(Key const&);
[[nodiscard]] HashBucket* get_bucket_by_index(u32 index);
@ -104,7 +104,7 @@ public:
void serialize(Serializer&) const;
[[nodiscard]] u32 number_of_pointers() const { return min(max_pointers_in_node(), m_hash_index.size() - m_offset); }
[[nodiscard]] bool is_last() const { return m_is_last; }
static constexpr size_t max_pointers_in_node() { return (Heap::BLOCK_SIZE - 3 * sizeof(u32)) / (2 * sizeof(u32)); }
static constexpr size_t max_pointers_in_node() { return (Block::DATA_SIZE - 3 * sizeof(u32)) / (2 * sizeof(u32)); }
private:
HashIndex& m_hash_index;


@ -1,5 +1,6 @@
/*
* Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
* Copyright (c) 2023, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -42,8 +43,10 @@ ErrorOr<void> Heap::open()
} else {
file_size = stat_buffer.st_size;
}
if (file_size > 0)
m_next_block = m_end_of_file = file_size / BLOCK_SIZE;
if (file_size > 0) {
m_next_block = file_size / Block::SIZE;
m_highest_block_written = m_next_block - 1;
}
auto file = TRY(Core::File::open(name(), Core::File::OpenMode::ReadWrite));
m_file = TRY(Core::BufferedFile::create(move(file)));
@ -54,7 +57,7 @@ ErrorOr<void> Heap::open()
return error_maybe.release_error();
}
} else {
initialize_zero_block();
TRY(initialize_zero_block());
}
// FIXME: We should more gracefully handle version incompatibilities. For now, we drop the database.
@ -66,118 +69,137 @@ ErrorOr<void> Heap::open()
return open();
}
dbgln_if(SQL_DEBUG, "Heap file {} opened. Size = {}", name(), size());
dbgln_if(SQL_DEBUG, "Heap file {} opened; number of blocks = {}", name(), m_highest_block_written);
return {};
}
ErrorOr<ByteBuffer> Heap::read_block(u32 block)
bool Heap::has_block(Block::Index index) const
{
if (!m_file) {
warnln("Heap({})::read_block({}): Heap file not opened"sv, name(), block);
return Error::from_string_literal("Heap()::read_block(): Heap file not opened");
return index <= m_highest_block_written || m_write_ahead_log.contains(index);
}
ErrorOr<ByteBuffer> Heap::read_storage(Block::Index index)
{
dbgln_if(SQL_DEBUG, "{}({})", __FUNCTION__, index);
// Reconstruct the data storage from a potential chain of blocks
ByteBuffer data;
while (index > 0) {
auto block = TRY(read_block(index));
dbgln_if(SQL_DEBUG, " -> {} bytes", block.size_in_bytes());
TRY(data.try_append(block.data().bytes().slice(0, block.size_in_bytes())));
index = block.next_block();
}
return data;
}
if (auto buffer = m_write_ahead_log.get(block); buffer.has_value())
return TRY(ByteBuffer::copy(*buffer));
ErrorOr<void> Heap::write_storage(Block::Index index, ReadonlyBytes data)
{
dbgln_if(SQL_DEBUG, "{}({}, {} bytes)", __FUNCTION__, index, data.size());
VERIFY(data.size() > 0);
if (block >= m_next_block) {
warnln("Heap({})::read_block({}): block # out of range (>= {})"sv, name(), block, m_next_block);
return Error::from_string_literal("Heap()::read_block(): block # out of range");
// Split up the storage across multiple blocks if necessary, creating a chain
u32 remaining_size = static_cast<u32>(data.size());
u32 offset_in_data = 0;
while (remaining_size > 0) {
auto block_data_size = AK::min(remaining_size, Block::DATA_SIZE);
remaining_size -= block_data_size;
auto next_block_index = (remaining_size > 0) ? request_new_block_index() : 0;
auto block_data = TRY(ByteBuffer::create_uninitialized(block_data_size));
block_data.bytes().overwrite(0, data.offset(offset_in_data), block_data_size);
TRY(write_block({ index, block_data_size, next_block_index, move(block_data) }));
index = next_block_index;
offset_in_data += block_data_size;
}
return {};
}
dbgln_if(SQL_DEBUG, "Read heap block {}", block);
TRY(seek_block(block));
ErrorOr<ByteBuffer> Heap::read_raw_block(Block::Index index)
{
VERIFY(m_file);
VERIFY(index < m_next_block);
auto buffer = TRY(ByteBuffer::create_uninitialized(BLOCK_SIZE));
if (auto data = m_write_ahead_log.get(index); data.has_value())
return data.value();
TRY(m_file->seek(index * Block::SIZE, SeekMode::SetPosition));
auto buffer = TRY(ByteBuffer::create_uninitialized(Block::SIZE));
TRY(m_file->read_until_filled(buffer));
dbgln_if(SQL_DEBUG, "{:hex-dump}", buffer.bytes().trim(8));
return buffer;
}
ErrorOr<void> Heap::write_block(u32 block, ByteBuffer& buffer)
ErrorOr<Block> Heap::read_block(Block::Index index)
{
if (!m_file) {
warnln("Heap({})::write_block({}): Heap file not opened"sv, name(), block);
return Error::from_string_literal("Heap()::write_block(): Heap file not opened");
}
if (block > m_next_block) {
warnln("Heap({})::write_block({}): block # out of range (> {})"sv, name(), block, m_next_block);
return Error::from_string_literal("Heap()::write_block(): block # out of range");
}
if (buffer.size() > BLOCK_SIZE) {
warnln("Heap({})::write_block({}): Oversized block ({} > {})"sv, name(), block, buffer.size(), BLOCK_SIZE);
return Error::from_string_literal("Heap()::write_block(): Oversized block");
}
dbgln_if(SQL_DEBUG, "Read heap block {}", index);
dbgln_if(SQL_DEBUG, "Write heap block {} size {}", block, buffer.size());
TRY(seek_block(block));
auto buffer = TRY(read_raw_block(index));
auto size_in_bytes = *reinterpret_cast<u32*>(buffer.offset_pointer(0));
auto next_block = *reinterpret_cast<Block::Index*>(buffer.offset_pointer(sizeof(u32)));
auto data = TRY(buffer.slice(Block::HEADER_SIZE, Block::DATA_SIZE));
if (auto current_size = buffer.size(); current_size < BLOCK_SIZE) {
TRY(buffer.try_resize(BLOCK_SIZE));
memset(buffer.offset_pointer(current_size), 0, BLOCK_SIZE - current_size);
}
dbgln_if(SQL_DEBUG, "{:hex-dump}", buffer.bytes().trim(8));
TRY(m_file->write_until_depleted(buffer));
if (block == m_end_of_file)
m_end_of_file++;
return {};
return Block { index, size_in_bytes, next_block, move(data) };
}
ErrorOr<void> Heap::seek_block(u32 block)
ErrorOr<void> Heap::write_raw_block(Block::Index index, ReadonlyBytes data)
{
if (!m_file) {
warnln("Heap({})::seek_block({}): Heap file not opened"sv, name(), block);
return Error::from_string_literal("Heap()::seek_block(): Heap file not opened");
}
if (block > m_end_of_file) {
warnln("Heap({})::seek_block({}): Cannot seek beyond end of file at block {}"sv, name(), block, m_end_of_file);
return Error::from_string_literal("Heap()::seek_block(): Cannot seek beyond end of file");
}
dbgln_if(SQL_DEBUG, "Write raw block {}", index);
if (block == m_end_of_file)
TRY(m_file->seek(0, SeekMode::FromEndPosition));
else
TRY(m_file->seek(block * BLOCK_SIZE, SeekMode::SetPosition));
return {};
}
u32 Heap::new_record_pointer()
{
VERIFY(m_file);
if (m_free_list) {
auto block_or_error = read_block(m_free_list);
if (block_or_error.is_error()) {
warnln("FREE LIST CORRUPTION");
VERIFY_NOT_REACHED();
}
auto new_pointer = m_free_list;
memcpy(&m_free_list, block_or_error.value().offset_pointer(0), sizeof(u32));
update_zero_block();
return new_pointer;
}
return m_next_block++;
VERIFY(data.size() == Block::SIZE);
TRY(m_file->seek(index * Block::SIZE, SeekMode::SetPosition));
TRY(m_file->write_until_depleted(data));
if (index > m_highest_block_written)
m_highest_block_written = index;
return {};
}
ErrorOr<void> Heap::write_raw_block_to_wal(Block::Index index, ByteBuffer&& data)
{
dbgln_if(SQL_DEBUG, "{}(): adding raw block {} to WAL", __FUNCTION__, index);
VERIFY(index < m_next_block);
VERIFY(data.size() == Block::SIZE);
TRY(m_write_ahead_log.try_set(index, move(data)));
return {};
}
ErrorOr<void> Heap::write_block(Block const& block)
{
VERIFY(block.index() < m_next_block);
VERIFY(block.next_block() < m_next_block);
VERIFY(block.data().size() <= Block::DATA_SIZE);
auto size_in_bytes = block.size_in_bytes();
auto next_block = block.next_block();
auto heap_data = TRY(ByteBuffer::create_zeroed(Block::SIZE));
heap_data.overwrite(0, &size_in_bytes, sizeof(size_in_bytes));
heap_data.overwrite(sizeof(size_in_bytes), &next_block, sizeof(next_block));
block.data().bytes().copy_to(heap_data.bytes().slice(Block::HEADER_SIZE));
return write_raw_block_to_wal(block.index(), move(heap_data));
}
ErrorOr<void> Heap::flush()
{
VERIFY(m_file);
Vector<u32> blocks;
for (auto& wal_entry : m_write_ahead_log)
blocks.append(wal_entry.key);
quick_sort(blocks);
for (auto& block : blocks) {
auto buffer_it = m_write_ahead_log.find(block);
VERIFY(buffer_it != m_write_ahead_log.end());
dbgln_if(SQL_DEBUG, "Flushing block {} to {}", block, name());
TRY(write_block(block, buffer_it->value));
auto indices = m_write_ahead_log.keys();
quick_sort(indices);
for (auto index : indices) {
dbgln_if(SQL_DEBUG, "Flushing block {} to {}", index, name());
auto& data = m_write_ahead_log.get(index).value();
TRY(write_raw_block(index, data));
}
m_write_ahead_log.clear();
dbgln_if(SQL_DEBUG, "WAL flushed. Heap size = {}", size());
dbgln_if(SQL_DEBUG, "WAL flushed; new number of blocks = {}", m_highest_block_written);
return {};
}
@ -186,37 +208,33 @@ constexpr static auto VERSION_OFFSET = FILE_ID.length();
constexpr static auto SCHEMAS_ROOT_OFFSET = VERSION_OFFSET + sizeof(u32);
constexpr static auto TABLES_ROOT_OFFSET = SCHEMAS_ROOT_OFFSET + sizeof(u32);
constexpr static auto TABLE_COLUMNS_ROOT_OFFSET = TABLES_ROOT_OFFSET + sizeof(u32);
constexpr static auto FREE_LIST_OFFSET = TABLE_COLUMNS_ROOT_OFFSET + sizeof(u32);
constexpr static auto USER_VALUES_OFFSET = FREE_LIST_OFFSET + sizeof(u32);
constexpr static auto USER_VALUES_OFFSET = TABLE_COLUMNS_ROOT_OFFSET + sizeof(u32);
ErrorOr<void> Heap::read_zero_block()
{
auto buffer = TRY(read_block(0));
auto file_id_buffer = TRY(buffer.slice(0, FILE_ID.length()));
dbgln_if(SQL_DEBUG, "Read zero block from {}", name());
auto block = TRY(read_raw_block(0));
auto file_id_buffer = TRY(block.slice(0, FILE_ID.length()));
auto file_id = StringView(file_id_buffer);
if (file_id != FILE_ID) {
warnln("{}: Zero page corrupt. This is probably not a {} heap file"sv, name(), FILE_ID);
return Error::from_string_literal("Heap()::read_zero_block(): Zero page corrupt. This is probably not a SerenitySQL heap file");
}
dbgln_if(SQL_DEBUG, "Read zero block from {}", name());
memcpy(&m_version, buffer.offset_pointer(VERSION_OFFSET), sizeof(u32));
memcpy(&m_version, block.offset_pointer(VERSION_OFFSET), sizeof(u32));
dbgln_if(SQL_DEBUG, "Version: {}.{}", (m_version & 0xFFFF0000) >> 16, (m_version & 0x0000FFFF));
memcpy(&m_schemas_root, buffer.offset_pointer(SCHEMAS_ROOT_OFFSET), sizeof(u32));
memcpy(&m_schemas_root, block.offset_pointer(SCHEMAS_ROOT_OFFSET), sizeof(u32));
dbgln_if(SQL_DEBUG, "Schemas root node: {}", m_schemas_root);
memcpy(&m_tables_root, buffer.offset_pointer(TABLES_ROOT_OFFSET), sizeof(u32));
memcpy(&m_tables_root, block.offset_pointer(TABLES_ROOT_OFFSET), sizeof(u32));
dbgln_if(SQL_DEBUG, "Tables root node: {}", m_tables_root);
memcpy(&m_table_columns_root, buffer.offset_pointer(TABLE_COLUMNS_ROOT_OFFSET), sizeof(u32));
memcpy(&m_table_columns_root, block.offset_pointer(TABLE_COLUMNS_ROOT_OFFSET), sizeof(u32));
dbgln_if(SQL_DEBUG, "Table columns root node: {}", m_table_columns_root);
memcpy(&m_free_list, buffer.offset_pointer(FREE_LIST_OFFSET), sizeof(u32));
dbgln_if(SQL_DEBUG, "Free list: {}", m_free_list);
memcpy(m_user_values.data(), buffer.offset_pointer(USER_VALUES_OFFSET), m_user_values.size() * sizeof(u32));
memcpy(m_user_values.data(), block.offset_pointer(USER_VALUES_OFFSET), m_user_values.size() * sizeof(u32));
for (auto ix = 0u; ix < m_user_values.size(); ix++) {
if (m_user_values[ix])
dbgln_if(SQL_DEBUG, "User value {}: {}", ix, m_user_values[ix]);
@ -224,43 +242,40 @@ ErrorOr<void> Heap::read_zero_block()
return {};
}
void Heap::update_zero_block()
ErrorOr<void> Heap::update_zero_block()
{
dbgln_if(SQL_DEBUG, "Write zero block to {}", name());
dbgln_if(SQL_DEBUG, "Version: {}.{}", (m_version & 0xFFFF0000) >> 16, (m_version & 0x0000FFFF));
dbgln_if(SQL_DEBUG, "Schemas root node: {}", m_schemas_root);
dbgln_if(SQL_DEBUG, "Tables root node: {}", m_tables_root);
dbgln_if(SQL_DEBUG, "Table Columns root node: {}", m_table_columns_root);
dbgln_if(SQL_DEBUG, "Free list: {}", m_free_list);
for (auto ix = 0u; ix < m_user_values.size(); ix++) {
if (m_user_values[ix])
if (m_user_values[ix] > 0)
dbgln_if(SQL_DEBUG, "User value {}: {}", ix, m_user_values[ix]);
}
// FIXME: Handle an OOM failure here.
auto buffer = ByteBuffer::create_zeroed(BLOCK_SIZE).release_value_but_fixme_should_propagate_errors();
buffer.overwrite(0, FILE_ID.characters_without_null_termination(), FILE_ID.length());
buffer.overwrite(VERSION_OFFSET, &m_version, sizeof(u32));
buffer.overwrite(SCHEMAS_ROOT_OFFSET, &m_schemas_root, sizeof(u32));
buffer.overwrite(TABLES_ROOT_OFFSET, &m_tables_root, sizeof(u32));
buffer.overwrite(TABLE_COLUMNS_ROOT_OFFSET, &m_table_columns_root, sizeof(u32));
buffer.overwrite(FREE_LIST_OFFSET, &m_free_list, sizeof(u32));
buffer.overwrite(USER_VALUES_OFFSET, m_user_values.data(), m_user_values.size() * sizeof(u32));
auto buffer = TRY(ByteBuffer::create_zeroed(Block::SIZE));
auto buffer_bytes = buffer.bytes();
buffer_bytes.overwrite(0, FILE_ID.characters_without_null_termination(), FILE_ID.length());
buffer_bytes.overwrite(VERSION_OFFSET, &m_version, sizeof(u32));
buffer_bytes.overwrite(SCHEMAS_ROOT_OFFSET, &m_schemas_root, sizeof(u32));
buffer_bytes.overwrite(TABLES_ROOT_OFFSET, &m_tables_root, sizeof(u32));
buffer_bytes.overwrite(TABLE_COLUMNS_ROOT_OFFSET, &m_table_columns_root, sizeof(u32));
buffer_bytes.overwrite(USER_VALUES_OFFSET, m_user_values.data(), m_user_values.size() * sizeof(u32));
add_to_wal(0, buffer);
return write_raw_block_to_wal(0, move(buffer));
}
void Heap::initialize_zero_block()
ErrorOr<void> Heap::initialize_zero_block()
{
m_version = VERSION;
m_schemas_root = 0;
m_tables_root = 0;
m_table_columns_root = 0;
m_next_block = 1;
m_free_list = 0;
for (auto& user : m_user_values)
user = 0u;
update_zero_block();
return update_zero_block();
}
}


@ -1,5 +1,6 @@
/*
* Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
* Copyright (c) 2023, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -15,6 +16,43 @@
namespace SQL {
/**
* A Block represents a single discrete chunk of 1024 bytes inside the Heap, and
* acts as the container format for the actual data we are storing. This structure
* is used for everything except block 0, the zero / super block.
*
* If data needs to be stored that is larger than 1016 bytes, Blocks are chained
* together by setting the next block index and the data is reconstructed by
* repeatedly reading blocks until the next block index is 0.
*/
class Block {
public:
typedef u32 Index;
static constexpr u32 SIZE = 1024;
static constexpr u32 HEADER_SIZE = sizeof(u32) + sizeof(Index);
static constexpr u32 DATA_SIZE = SIZE - HEADER_SIZE;
Block(Index index, u32 size_in_bytes, Index next_block, ByteBuffer data)
: m_index(index)
, m_size_in_bytes(size_in_bytes)
, m_next_block(next_block)
, m_data(move(data))
{
}
Index index() const { return m_index; }
u32 size_in_bytes() const { return m_size_in_bytes; }
Index next_block() const { return m_next_block; }
ByteBuffer const& data() const { return m_data; }
private:
Index m_index;
u32 m_size_in_bytes;
Index m_next_block;
ByteBuffer m_data;
};
/**
* A Heap is a logical container for database (SQL) data. Conceptually a
* Heap can be a database file, or a memory block, or another storage medium.
@ -23,30 +61,25 @@ namespace SQL {
*
* A Heap can be thought of the backing storage of a single database. It's
* assumed that a single SQL database is backed by a single Heap.
*
* Currently only B-Trees and tuple stores are implemented.
*/
class Heap : public Core::Object {
C_OBJECT(Heap);
public:
static constexpr u32 VERSION = 3;
static constexpr u32 BLOCK_SIZE = 1024;
static constexpr u32 VERSION = 4;
virtual ~Heap() override;
ErrorOr<void> open();
u32 size() const { return m_end_of_file; }
ErrorOr<ByteBuffer> read_block(u32);
[[nodiscard]] u32 new_record_pointer();
[[nodiscard]] bool valid() const { return static_cast<bool>(m_file); }
bool has_block(Block::Index) const;
[[nodiscard]] Block::Index request_new_block_index() { return m_next_block++; }
u32 schemas_root() const { return m_schemas_root; }
void set_schemas_root(u32 root)
{
m_schemas_root = root;
update_zero_block();
update_zero_block().release_value_but_fixme_should_propagate_errors();
}
u32 tables_root() const { return m_tables_root; }
@ -54,7 +87,7 @@ public:
void set_tables_root(u32 root)
{
m_tables_root = root;
update_zero_block();
update_zero_block().release_value_but_fixme_should_propagate_errors();
}
u32 table_columns_root() const { return m_table_columns_root; }
@ -62,7 +95,7 @@ public:
void set_table_columns_root(u32 root)
{
m_table_columns_root = root;
update_zero_block();
update_zero_block().release_value_but_fixme_should_propagate_errors();
}
u32 version() const { return m_version; }
@ -74,31 +107,30 @@ public:
void set_user_value(size_t index, u32 value)
{
m_user_values[index] = value;
update_zero_block();
update_zero_block().release_value_but_fixme_should_propagate_errors();
}
void add_to_wal(u32 block, ByteBuffer& buffer)
{
dbgln_if(SQL_DEBUG, "Adding to WAL: block #{}, size {}", block, buffer.size());
dbgln_if(SQL_DEBUG, "{:hex-dump}", buffer.bytes().trim(8));
m_write_ahead_log.set(block, buffer);
}
ErrorOr<ByteBuffer> read_storage(Block::Index);
ErrorOr<void> write_storage(Block::Index, ReadonlyBytes);
ErrorOr<void> flush();
private:
explicit Heap(DeprecatedString);
ErrorOr<void> write_block(u32, ByteBuffer&);
ErrorOr<void> seek_block(u32);
ErrorOr<ByteBuffer> read_raw_block(Block::Index);
ErrorOr<void> write_raw_block(Block::Index, ReadonlyBytes);
ErrorOr<void> write_raw_block_to_wal(Block::Index, ByteBuffer&&);
ErrorOr<Block> read_block(Block::Index);
ErrorOr<void> write_block(Block const&);
ErrorOr<void> read_zero_block();
void initialize_zero_block();
void update_zero_block();
ErrorOr<void> initialize_zero_block();
ErrorOr<void> update_zero_block();
OwnPtr<Core::BufferedFile> m_file;
u32 m_free_list { 0 };
Block::Index m_highest_block_written { 0 };
u32 m_next_block { 1 };
u32 m_end_of_file { 1 };
u32 m_schemas_root { 0 };
u32 m_tables_root { 0 };
u32 m_table_columns_root { 0 };


@ -48,8 +48,7 @@ protected:
[[nodiscard]] Serializer& serializer() { return m_serializer; }
void set_pointer(u32 pointer) { m_pointer = pointer; }
u32 new_record_pointer() { return m_serializer.new_record_pointer(); }
// ByteBuffer read_block(u32);
u32 request_new_block_index() { return m_serializer.request_new_block_index(); }
private:
Serializer m_serializer;


@ -12,7 +12,6 @@
#include <AK/Format.h>
#include <LibSQL/Forward.h>
#include <LibSQL/Heap.h>
#include <string.h>
namespace SQL {
@ -25,12 +24,9 @@ public:
{
}
void get_block(u32 pointer)
void read_storage(Block::Index block_index)
{
auto buffer_or_error = m_heap->read_block(pointer);
if (buffer_or_error.is_error())
VERIFY_NOT_REACHED();
m_buffer = buffer_or_error.value();
m_buffer = m_heap->read_storage(block_index).release_value_but_fixme_should_propagate_errors();
m_current_offset = 0;
}
@ -48,14 +44,14 @@ public:
template<typename T, typename... Args>
T deserialize_block(u32 pointer, Args&&... args)
{
get_block(pointer);
read_storage(pointer);
return deserialize<T>(forward<Args>(args)...);
}
template<typename T>
void deserialize_block_to(u32 pointer, T& t)
{
get_block(pointer);
read_storage(pointer);
return deserialize_to<T>(t);
}
@ -111,19 +107,19 @@ public:
VERIFY(!m_heap.is_null());
reset();
serialize<T>(t);
m_heap->add_to_wal(t.pointer(), m_buffer);
m_heap->write_storage(t.pointer(), m_buffer).release_value_but_fixme_should_propagate_errors();
return true;
}
[[nodiscard]] size_t offset() const { return m_current_offset; }
u32 new_record_pointer()
u32 request_new_block_index()
{
return m_heap->new_record_pointer();
return m_heap->request_new_block_index();
}
bool has_block(u32 pointer) const
{
return pointer < m_heap->size();
return m_heap->has_block(pointer);
}
Heap& heap()


@ -51,7 +51,7 @@ void DownPointer::deserialize(Serializer& serializer)
{
if (m_node || !m_pointer)
return;
serializer.get_block(m_pointer);
serializer.read_storage(m_pointer);
m_node = serializer.make_and_deserialize<TreeNode>(m_owner->tree(), m_owner, m_pointer);
}
@ -87,7 +87,7 @@ TreeNode::TreeNode(BTree& tree, TreeNode* up, DownPointer& left, u32 pointer)
m_down.append(DownPointer(this, left));
m_is_leaf = left.pointer() == 0;
if (!pointer)
set_pointer(m_tree.new_record_pointer());
set_pointer(m_tree.request_new_block_index());
}
TreeNode::TreeNode(BTree& tree, TreeNode* up, TreeNode* left, u32 pointer)
@ -271,7 +271,7 @@ void TreeNode::just_insert(Key const& key, TreeNode* right)
m_entries.insert(ix, key);
VERIFY(is_leaf() == (right == nullptr));
m_down.insert(ix + 1, DownPointer(this, right));
if (length() > Heap::BLOCK_SIZE) {
if (length() > Block::DATA_SIZE) {
split();
} else {
dump_if(SQL_DEBUG, "To WAL");
@ -283,7 +283,7 @@ void TreeNode::just_insert(Key const& key, TreeNode* right)
m_entries.append(key);
m_down.empend(this, right);
if (length() > Heap::BLOCK_SIZE) {
if (length() > Block::DATA_SIZE) {
split();
} else {
dump_if(SQL_DEBUG, "To WAL");