From 4e50079f3654a95f7296f8f632aa6aa92784678f Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Thu, 15 Oct 2020 23:34:07 +0200 Subject: [PATCH] AK: Redesign HashTable to use closed hashing Instead of each hash bucket being a SinglyLinkedList, switch to using closed hashing (open addressing). Buckets are chained together via double hashing (hashing the hash until we find an unused bucket.) This greatly reduces malloc traffic, since each added element no longer allocates a new linked list node. Appears performance neutral on test-js. Can definitely be tuned and could use proper management of load factor, etc. --- AK/HashFunctions.h | 10 + AK/HashTable.h | 505 +++++++++++++++++++++------------------------ 2 files changed, 248 insertions(+), 267 deletions(-) diff --git a/AK/HashFunctions.h b/AK/HashFunctions.h index 757c43653e..8395f7a109 100644 --- a/AK/HashFunctions.h +++ b/AK/HashFunctions.h @@ -39,6 +39,16 @@ inline unsigned int_hash(u32 key) return key; } +inline unsigned double_hash(u32 key) +{ + key = ~key + (key >> 23); + key ^= (key << 12); + key ^= (key >> 7); + key ^= (key << 2); + key ^= (key >> 20); + return key; +} + inline unsigned pair_int_hash(u32 key1, u32 key2) { return int_hash((int_hash(key1) * 209) ^ (int_hash(key2 * 413))); diff --git a/AK/HashTable.h b/AK/HashTable.h index f74dfa5487..8d3bf7cd2a 100644 --- a/AK/HashTable.h +++ b/AK/HashTable.h @@ -26,11 +26,11 @@ #pragma once -#include -#include +#include +#include #include -#include -#include +#include +#include namespace AK { @@ -39,137 +39,102 @@ enum class HashSetResult { ReplacedExistingEntry }; -template -class HashTable; - -template +template class HashTableIterator { -public: - bool operator!=(const HashTableIterator& other) const - { - if (m_is_end && other.m_is_end) - return false; - return &m_table != &other.m_table - || m_is_end != other.m_is_end - || m_bucket_index != other.m_bucket_index - || m_bucket_iterator != other.m_bucket_iterator; - } - bool operator==(const HashTableIterator& other) const { return !(*this != other); } - ElementType& operator*() { return *m_bucket_iterator; } - ElementType* operator->() { return m_bucket_iterator.operator->(); } - HashTableIterator& operator++() - { - skip_to_next(); - return *this; - } - - void skip_to_next() - { - while (!m_is_end) { - if (m_bucket_iterator.is_end()) { - ++m_bucket_index; - if (m_bucket_index >= m_table.capacity()) { - m_is_end = true; - return; - } - m_bucket_iterator = m_table.bucket(m_bucket_index).begin(); - } else { - ++m_bucket_iterator; - } - if (!m_bucket_iterator.is_end()) - return; - } - } - -private: friend HashTableType; - explicit HashTableIterator(HashTableType& table, bool is_end, BucketIteratorType bucket_iterator = {}, size_t bucket_index = 0) - : m_table(table) - , m_bucket_index(bucket_index) - , m_is_end(is_end) - , m_bucket_iterator(bucket_iterator) +public: + bool operator==(const HashTableIterator& other) const { return m_bucket == other.m_bucket; } + bool operator!=(const HashTableIterator& other) const { return m_bucket != other.m_bucket; } + T& operator*() { return *m_bucket->slot(); } + T* operator->() { return m_bucket->slot(); } + void operator++() { skip_to_next(); } + +private: + void skip_to_next() { - ASSERT(!table.m_clearing); - ASSERT(!table.m_rehashing); - if (!is_end && !m_table.is_empty() && m_bucket_iterator.is_end()) { - m_bucket_iterator = m_table.bucket(0).begin(); - if (m_bucket_iterator.is_end()) - skip_to_next(); - } + if (!m_bucket) + return; + do { + ++m_bucket; + if (m_bucket->used) + return; + } while (!m_bucket->end); + if (m_bucket->end) + m_bucket = nullptr; } - HashTableType& m_table; - size_t m_bucket_index { 0 }; - bool m_is_end { false }; - BucketIteratorType m_bucket_iterator; + explicit HashTableIterator(BucketType* bucket) + : m_bucket(bucket) + { + } + + BucketType* m_bucket { nullptr }; }; template class HashTable { -private: - using Bucket = SinglyLinkedList; + struct Bucket { + bool used; + bool deleted; + bool end; + alignas(T) u8 storage[sizeof(T)]; + + T* slot() { return reinterpret_cast(storage); } + const T* slot() const { return reinterpret_cast(storage); } + }; public: HashTable() { } - HashTable(size_t capacity) - : m_buckets(new Bucket[capacity]) - , m_capacity(capacity) - { - } + HashTable(size_t capacity) { rehash(capacity); } + ~HashTable() { clear(); } + HashTable(const HashTable& other) { - ensure_capacity(other.size()); + rehash(other.capacity()); for (auto& it : other) set(it); } + HashTable& operator=(const HashTable& other) { if (this != &other) { clear(); - ensure_capacity(other.size()); + rehash(other.capacity()); for (auto& it : other) set(it); } return *this; } + HashTable(HashTable&& other) : m_buckets(other.m_buckets) , m_size(other.m_size) , m_capacity(other.m_capacity) + , m_deleted_count(other.m_deleted_count) { other.m_size = 0; other.m_capacity = 0; + other.m_deleted_count = 0; other.m_buckets = nullptr; } + HashTable& operator=(HashTable&& other) { if (this != &other) { clear(); - m_buckets = other.m_buckets; - m_size = other.m_size; - m_capacity = other.m_capacity; - other.m_size = 0; - other.m_capacity = 0; - other.m_buckets = nullptr; + m_buckets = exchange(other.m_buckets, nullptr); + m_size = exchange(other.m_size, 0); + m_capacity = exchange(other.m_capacity, 0); + m_deleted_count = exchange(other.m_deleted_count, 0); } return *this; } - ~HashTable() { clear(); } bool is_empty() const { return !m_size; } size_t size() const { return m_size; } size_t capacity() const { return m_capacity; } - void ensure_capacity(size_t capacity) - { - ASSERT(capacity >= size()); - rehash(capacity); - } - - HashSetResult set(const T&); - HashSetResult set(T&&); - template void set_from(U (&from_array)[N]) { @@ -178,43 +143,93 @@ public: } } - bool contains(const T&) const; - void clear(); + void ensure_capacity(size_t capacity) + { + ASSERT(capacity >= size()); + rehash(capacity * 2); + } - using Iterator = HashTableIterator; - friend Iterator; - Iterator begin() { return Iterator(*this, is_empty()); } - Iterator end() { return Iterator(*this, true); } + bool contains(const T& value) const + { + return find(value) != end(); + } - using ConstIterator = HashTableIterator; - friend ConstIterator; - ConstIterator begin() const { return ConstIterator(*this, is_empty()); } - ConstIterator end() const { return ConstIterator(*this, true); } + using Iterator = HashTableIterator; + + Iterator begin() + { + for (size_t i = 0; i < m_capacity; ++i) { + if (m_buckets[i].used) + return Iterator(&m_buckets[i]); + } + return end(); + } + + Iterator end() + { + return Iterator(nullptr); + } + + using ConstIterator = HashTableIterator; + + ConstIterator begin() const + { + for (size_t i = 0; i < m_capacity; ++i) { + if (m_buckets[i].used) + return ConstIterator(&m_buckets[i]); + } + return end(); + } + + ConstIterator end() const + { + return ConstIterator(nullptr); + } + + void clear() + { + if (!m_buckets) + return; + + for (size_t i = 0; i < m_capacity; ++i) { + if (m_buckets[i].used) + m_buckets[i].slot()->~T(); + } + + kfree(m_buckets); + m_buckets = nullptr; + m_capacity = 0; + m_size = 0; + m_deleted_count = 0; + } + + HashSetResult set(T&& value) + { + auto& bucket = lookup_for_writing(value); + if (bucket.used) { + (*bucket.slot()) = move(value); + return HashSetResult::ReplacedExistingEntry; + } + + new (bucket.slot()) T(move(value)); + bucket.used = true; + if (bucket.deleted) { + bucket.deleted = false; + --m_deleted_count; + } + ++m_size; + return HashSetResult::InsertedNewEntry; + } + + HashSetResult set(const T& value) + { + return set(T(value)); + } template Iterator find(unsigned hash, Finder finder) { - if (is_empty()) - return end(); - size_t bucket_index; - auto& bucket = lookup_with_hash(hash, &bucket_index); - auto bucket_iterator = bucket.find(finder); - if (bucket_iterator != bucket.end()) - return Iterator(*this, false, bucket_iterator, bucket_index); - return end(); - } - - template - ConstIterator find(unsigned hash, Finder finder) const - { - if (is_empty()) - return end(); - size_t bucket_index; - auto& bucket = lookup_with_hash(hash, &bucket_index); - auto bucket_iterator = bucket.find(finder); - if (bucket_iterator != bucket.end()) - return ConstIterator(*this, false, bucket_iterator, bucket_index); - return end(); + return Iterator(lookup_with_hash(hash, move(finder))); } Iterator find(const T& value) @@ -222,6 +237,12 @@ public: return find(TraitsForT::hash(value), [&](auto& other) { return TraitsForT::equals(value, other); }); } + template + ConstIterator find(unsigned hash, Finder finder) const + { + return ConstIterator(lookup_with_hash(hash, move(finder))); + } + ConstIterator find(const T& value) const { return find(TraitsForT::hash(value), [&](auto& other) { return TraitsForT::equals(value, other); }); @@ -237,170 +258,120 @@ public: return false; } - void remove(Iterator); + void remove(Iterator iterator) + { + ASSERT(iterator.m_bucket); + auto& bucket = *iterator.m_bucket; + ASSERT(bucket.used); + ASSERT(!bucket.end); + ASSERT(!bucket.deleted); + bucket.slot()->~T(); + bucket.used = false; + bucket.deleted = true; + --m_size; + ++m_deleted_count; + } private: - Bucket& lookup(const T&, size_t* bucket_index = nullptr); - const Bucket& lookup(const T&, size_t* bucket_index = nullptr) const; - - Bucket& lookup_with_hash(unsigned hash, size_t* bucket_index) + void insert_during_rehash(T&& value) { - if (bucket_index) - *bucket_index = hash % m_capacity; - return m_buckets[hash % m_capacity]; + auto& bucket = lookup_for_writing(value); + new (bucket.slot()) T(move(value)); + bucket.used = true; } - const Bucket& lookup_with_hash(unsigned hash, size_t* bucket_index) const + void rehash(size_t new_capacity) { - if (bucket_index) - *bucket_index = hash % m_capacity; - return m_buckets[hash % m_capacity]; + new_capacity = max(new_capacity, static_cast(4)); + + auto* old_buckets = m_buckets; + auto old_capacity = m_capacity; + + m_buckets = (Bucket*)kmalloc(sizeof(Bucket) * (new_capacity + 1)); + __builtin_memset(m_buckets, 0, sizeof(Bucket) * (new_capacity + 1)); + m_capacity = new_capacity; + m_deleted_count = 0; + + m_buckets[m_capacity].end = true; + + if (!old_buckets) + return; + + for (size_t i = 0; i < old_capacity; ++i) { + auto& old_bucket = old_buckets[i]; + if (old_bucket.used) { + insert_during_rehash(move(*old_bucket.slot())); + old_bucket.slot()->~T(); + } + } + + kfree(old_buckets); } - void rehash(size_t capacity); - void insert(const T&); - void insert(T&&); + template + Bucket* lookup_with_hash(unsigned hash, Finder finder, Bucket** usable_bucket_for_writing = nullptr) const + { + if (is_empty()) + return nullptr; + size_t bucket_index = hash % m_capacity; + for (;;) { + auto& bucket = m_buckets[bucket_index]; - Bucket& bucket(size_t index) { return m_buckets[index]; } - const Bucket& bucket(size_t index) const { return m_buckets[index]; } + if (usable_bucket_for_writing && !*usable_bucket_for_writing && !bucket.used) { + *usable_bucket_for_writing = &bucket; + } + + if (bucket.used && finder(*bucket.slot())) + return &bucket; + + if (!bucket.used && !bucket.deleted) + return nullptr; + + hash = double_hash(hash); + bucket_index = hash % m_capacity; + } + } + + const Bucket* lookup_for_reading(const T& value) const + { + return lookup_with_hash(TraitsForT::hash(value), [&value](auto& entry) { return TraitsForT::equals(entry, value); }); + } + + Bucket& lookup_for_writing(const T& value) + { + auto hash = TraitsForT::hash(value); + Bucket* usable_bucket_for_writing = nullptr; + if (auto* bucket_for_reading = lookup_with_hash( + hash, + [&value](auto& entry) { return TraitsForT::equals(entry, value); }, + &usable_bucket_for_writing)) { + return *const_cast(bucket_for_reading); + } + + if ((used_bucket_count() + 1) >= m_capacity) + rehash((size() + 1) * 2); + else if (usable_bucket_for_writing) + return *usable_bucket_for_writing; + + size_t bucket_index = hash % m_capacity; + + for (;;) { + auto& bucket = m_buckets[bucket_index]; + if (!bucket.used) + return bucket; + hash = double_hash(hash); + bucket_index = hash % m_capacity; + } + } + + size_t used_bucket_count() const { return m_size + m_deleted_count; } Bucket* m_buckets { nullptr }; - size_t m_size { 0 }; size_t m_capacity { 0 }; - bool m_clearing { false }; - bool m_rehashing { false }; + size_t m_deleted_count { 0 }; }; -template -HashSetResult HashTable::set(T&& value) -{ - if (!m_capacity) - rehash(1); - auto& bucket = lookup(value); - for (auto& e : bucket) { - if (TraitsForT::equals(e, value)) { - e = move(value); - return HashSetResult::ReplacedExistingEntry; - } - } - if (size() >= capacity()) { - rehash(size() + 1); - insert(move(value)); - } else { - bucket.append(move(value)); - } - m_size++; - return HashSetResult::InsertedNewEntry; -} - -template -HashSetResult HashTable::set(const T& value) -{ - if (!m_capacity) - rehash(1); - auto& bucket = lookup(value); - for (auto& e : bucket) { - if (TraitsForT::equals(e, value)) { - e = value; - return HashSetResult::ReplacedExistingEntry; - } - } - if (size() >= capacity()) { - rehash(size() + 1); - insert(value); - } else { - bucket.append(value); - } - m_size++; - return HashSetResult::InsertedNewEntry; -} - -template -void HashTable::rehash(size_t new_capacity) -{ - TemporaryChange change(m_rehashing, true); - new_capacity *= 2; - auto* new_buckets = new Bucket[new_capacity]; - auto* old_buckets = m_buckets; - size_t old_capacity = m_capacity; - m_buckets = new_buckets; - m_capacity = new_capacity; - - for (size_t i = 0; i < old_capacity; ++i) { - for (auto& value : old_buckets[i]) { - insert(move(value)); - } - } - - delete[] old_buckets; -} - -template -void HashTable::clear() -{ - TemporaryChange change(m_clearing, true); - if (m_buckets) { - delete[] m_buckets; - m_buckets = nullptr; - } - m_capacity = 0; - m_size = 0; -} - -template -void HashTable::insert(T&& value) -{ - auto& bucket = lookup(value); - bucket.append(move(value)); -} - -template -void HashTable::insert(const T& value) -{ - auto& bucket = lookup(value); - bucket.append(value); -} - -template -bool HashTable::contains(const T& value) const -{ - if (is_empty()) - return false; - auto& bucket = lookup(value); - for (auto& e : bucket) { - if (TraitsForT::equals(e, value)) - return true; - } - return false; -} - -template -void HashTable::remove(Iterator it) -{ - ASSERT(!is_empty()); - m_buckets[it.m_bucket_index].remove(it.m_bucket_iterator); - --m_size; -} - -template -auto HashTable::lookup(const T& value, size_t* bucket_index) -> Bucket& -{ - unsigned hash = TraitsForT::hash(value); - if (bucket_index) - *bucket_index = hash % m_capacity; - return m_buckets[hash % m_capacity]; -} - -template -auto HashTable::lookup(const T& value, size_t* bucket_index) const -> const Bucket& -{ - unsigned hash = TraitsForT::hash(value); - if (bucket_index) - *bucket_index = hash % m_capacity; - return m_buckets[hash % m_capacity]; -} - } using AK::HashTable;