/*
 * Copyright (c) 2022, Lucas Chollet
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/CircularBuffer.h>
#include <AK/MemMem.h>
#include <AK/Stream.h>

namespace AK {

CircularBuffer::CircularBuffer(ByteBuffer buffer)
    : m_buffer(move(buffer))
{
}

ErrorOr<CircularBuffer> CircularBuffer::create_empty(size_t size)
{
    auto temporary_buffer = TRY(ByteBuffer::create_uninitialized(size));

    CircularBuffer circular_buffer { move(temporary_buffer) };

    return circular_buffer;
}

ErrorOr<CircularBuffer> CircularBuffer::create_initialized(ByteBuffer buffer)
{
    CircularBuffer circular_buffer { move(buffer) };

    circular_buffer.m_used_space = circular_buffer.m_buffer.size();

    return circular_buffer;
}

size_t CircularBuffer::empty_space() const
{
    return capacity() - m_used_space;
}

size_t CircularBuffer::used_space() const
{
    return m_used_space;
}

size_t CircularBuffer::capacity() const
{
    return m_buffer.size();
}

size_t CircularBuffer::seekback_limit() const
{
    return m_seekback_limit;
}

size_t SearchableCircularBuffer::search_limit() const
{
    return m_seekback_limit - m_used_space;
}

bool CircularBuffer::is_wrapping_around() const
{
    return capacity() <= m_reading_head + m_used_space;
}

Optional<size_t> CircularBuffer::offset_of(StringView needle, Optional<size_t> from, Optional<size_t> until) const
{
    auto const read_from = from.value_or(0);
    auto const read_until = until.value_or(m_used_space);
    VERIFY(read_from <= read_until);

    Array<ReadonlyBytes, 2> spans {};
    spans[0] = next_read_span();
    auto const original_span_0_size = spans[0].size();

    if (read_from > 0)
        spans[0] = spans[0].slice(min(spans[0].size(), read_from));

    if (spans[0].size() + read_from > read_until)
        spans[0] = spans[0].trim(read_until - read_from);
    else if (is_wrapping_around())
        spans[1] = m_buffer.span().slice(max(original_span_0_size, read_from) - original_span_0_size, min(read_until, m_used_space) - original_span_0_size);

    auto maybe_found = AK::memmem(spans.begin(), spans.end(), needle.bytes());
    if (maybe_found.has_value())
        *maybe_found += read_from;

    return maybe_found;
}

void CircularBuffer::clear()
{
    m_reading_head = 0;
    m_used_space = 0;
    m_seekback_limit = 0;
}

Bytes CircularBuffer::next_write_span()
{
    if (is_wrapping_around())
        return m_buffer.span().slice(m_reading_head + m_used_space - capacity(), capacity() - m_used_space);
    return m_buffer.span().slice(m_reading_head + m_used_space, capacity() - (m_reading_head + m_used_space));
}

ReadonlyBytes CircularBuffer::next_read_span(size_t offset) const
{
    auto reading_head = m_reading_head;
    auto used_space = m_used_space;

    if (offset > 0) {
        if (offset >= used_space)
            return Bytes {};

        reading_head = (reading_head + offset) % capacity();
        used_space -= offset;
    }

    return m_buffer.span().slice(reading_head, min(capacity() - reading_head, used_space));
}

ReadonlyBytes CircularBuffer::next_seekback_span(size_t distance) const
{
    VERIFY(m_seekback_limit <= capacity());
    VERIFY(distance <= m_seekback_limit);

    // Note: We are adding the capacity once here to ensure that we can wrap around the negative space by using modulo.
    auto read_offset = (capacity() + m_reading_head + m_used_space - distance) % capacity();

    return m_buffer.span().slice(read_offset, min(capacity() - read_offset, distance));
}

ReadonlyBytes SearchableCircularBuffer::next_search_span(size_t distance) const
{
    VERIFY(search_limit() <= capacity());
    VERIFY(distance <= search_limit());

    // Note: We are adding the capacity once here to ensure that we can wrap around the negative space by using modulo.
    auto read_offset = (capacity() + m_reading_head - distance) % capacity();

    return m_buffer.span().slice(read_offset, min(capacity() - read_offset, distance));
}
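// Worked example for the wrap-around arithmetic above (the numbers are invented
// for illustration): with capacity() == 8, m_reading_head == 6 and
// m_used_space == 4, the write head sits at index (6 + 4) % 8 == 2. A seekback
// distance of 3 then gives
//
//     read_offset = (8 + 6 + 4 - 3) % 8 == 7
//
// so next_seekback_span(3) returns only the single byte at index 7; the two
// remaining bytes of the requested seekback area wrap around to indices 0 and 1
// and are picked up by a follow-up call with the reduced distance, which is why
// the callers below loop until the returned span is empty.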
size_t CircularBuffer::write(ReadonlyBytes bytes)
{
    auto remaining = bytes.size();

    while (remaining > 0) {
        auto const next_span = next_write_span();
        if (next_span.size() == 0)
            break;

        auto const written_bytes = bytes.slice(bytes.size() - remaining).copy_trimmed_to(next_span);

        m_used_space += written_bytes;

        m_seekback_limit += written_bytes;
        if (m_seekback_limit > capacity())
            m_seekback_limit = capacity();

        remaining -= written_bytes;
    }

    return bytes.size() - remaining;
}

Bytes CircularBuffer::read(Bytes bytes)
{
    auto remaining = bytes.size();

    while (remaining > 0) {
        auto const next_span = next_read_span();
        if (next_span.size() == 0)
            break;

        auto written_bytes = next_span.copy_trimmed_to(bytes.slice(bytes.size() - remaining));

        m_used_space -= written_bytes;
        m_reading_head += written_bytes;
        if (m_reading_head >= capacity())
            m_reading_head -= capacity();

        remaining -= written_bytes;
    }

    return bytes.trim(bytes.size() - remaining);
}

ErrorOr<Bytes> CircularBuffer::read_with_seekback(Bytes bytes, size_t distance) const
{
    if (distance > m_seekback_limit)
        return Error::from_string_literal("Tried a seekback read beyond the seekback limit");

    auto remaining = bytes.size();

    while (remaining > 0) {
        auto const next_span = next_seekback_span(distance);
        if (next_span.size() == 0)
            break;

        auto written_bytes = next_span.copy_trimmed_to(bytes.slice(bytes.size() - remaining));

        distance -= written_bytes;
        remaining -= written_bytes;
    }

    return bytes.trim(bytes.size() - remaining);
}

ErrorOr<void> CircularBuffer::discard(size_t discarding_size)
{
    if (m_used_space < discarding_size)
        return Error::from_string_literal("Can not discard more data than what the buffer contains");
    m_used_space -= discarding_size;
    m_reading_head = (m_reading_head + discarding_size) % capacity();

    return {};
}

ErrorOr<size_t> CircularBuffer::fill_from_stream(Stream& stream)
{
    auto next_span = next_write_span();
    if (next_span.size() == 0)
        return 0;

    auto bytes = TRY(stream.read_some(next_span));
    m_used_space += bytes.size();

    m_seekback_limit += bytes.size();
    if (m_seekback_limit > capacity())
        m_seekback_limit = capacity();

    return bytes.size();
}

ErrorOr<size_t> CircularBuffer::flush_to_stream(Stream& stream)
{
    auto next_span = next_read_span();
    if (next_span.size() == 0)
        return 0;

    auto written_bytes = TRY(stream.write_some(next_span));

    m_used_space -= written_bytes;
    m_reading_head += written_bytes;
    if (m_reading_head >= capacity())
        m_reading_head -= capacity();

    return written_bytes;
}

ErrorOr<size_t> CircularBuffer::copy_from_seekback(size_t distance, size_t length)
{
    if (distance > m_seekback_limit)
        return Error::from_string_literal("Tried a seekback copy beyond the seekback limit");

    auto remaining_length = length;

    while (remaining_length > 0) {
        if (empty_space() == 0)
            break;

        auto next_span = next_seekback_span(distance);
        if (next_span.size() == 0)
            break;

        auto length_written = write(next_span.trim(remaining_length));
        remaining_length -= length_written;

        // If we copied right from the end of the seekback area (i.e. our length is larger than the distance)
        // and the last copy was one complete "chunk", we can now double the distance to copy twice as much data in one go.
        if (remaining_length > distance && length_written == distance)
            distance *= 2;
    }

    return length - remaining_length;
}
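// Worked example for the distance-doubling above (values invented, assuming the
// copies do not straddle the end of the backing buffer): copy_from_seekback(1, 8)
// repeats the byte just before the write head eight times. The first iteration
// can only copy a single byte (the seekback span for distance 1 is one byte
// long), but because that copy was one complete "chunk", the distance doubles to
// 2 and then to 4, so the whole run is produced in four write() calls copying
// 1 + 2 + 4 + 1 bytes instead of eight single-byte copies. This is the usual
// LZ77 overlapping-copy case, where a back-reference with distance < length
// behaves like run-length encoding.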
SearchableCircularBuffer::SearchableCircularBuffer(ByteBuffer buffer)
    : CircularBuffer(move(buffer))
{
}

ErrorOr<SearchableCircularBuffer> SearchableCircularBuffer::create_empty(size_t size)
{
    auto temporary_buffer = TRY(ByteBuffer::create_uninitialized(size));

    SearchableCircularBuffer circular_buffer { move(temporary_buffer) };

    return circular_buffer;
}

ErrorOr<SearchableCircularBuffer> SearchableCircularBuffer::create_initialized(ByteBuffer buffer)
{
    SearchableCircularBuffer circular_buffer { move(buffer) };

    circular_buffer.m_used_space = circular_buffer.m_buffer.size();

    for (size_t i = 0; i + HASH_CHUNK_SIZE <= circular_buffer.m_buffer.size(); i++)
        TRY(circular_buffer.insert_location_hash(circular_buffer.m_buffer.span().slice(i, HASH_CHUNK_SIZE), i));

    return circular_buffer;
}

ErrorOr<Bytes> SearchableCircularBuffer::read(Bytes bytes)
{
    auto read_bytes_span = CircularBuffer::read(bytes);
    TRY(hash_last_bytes(read_bytes_span.size()));
    return read_bytes_span;
}

ErrorOr<void> SearchableCircularBuffer::discard(size_t discarded_bytes)
{
    TRY(CircularBuffer::discard(discarded_bytes));
    TRY(hash_last_bytes(discarded_bytes));
    return {};
}

ErrorOr<size_t> SearchableCircularBuffer::flush_to_stream(Stream& stream)
{
    auto flushed_byte_count = TRY(CircularBuffer::flush_to_stream(stream));
    TRY(hash_last_bytes(flushed_byte_count));
    return flushed_byte_count;
}
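// Illustrative sketch of how the search API below might be driven by an
// LZ-style encoder. Everything here apart from the SearchableCircularBuffer
// calls (pending_input, max_match_length, min_match_length, emit_back_reference,
// emit_literal) is hypothetical and only exists for the example:
//
//     auto buffer = TRY(SearchableCircularBuffer::create_empty(64 * KiB));
//     buffer.write(pending_input);
//
//     while (buffer.used_space() > 0) {
//         if (auto match = buffer.find_copy_in_seekback(max_match_length, min_match_length); match.has_value()) {
//             TRY(emit_back_reference(match->distance, match->length));
//             TRY(buffer.discard(match->length)); // The discarded bytes move into the searchable area and get hashed.
//         } else {
//             Array<u8, 1> literal;
//             TRY(buffer.read(literal)); // read() also re-hashes the byte it hands out.
//             TRY(emit_literal(literal[0]));
//         }
//     }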
Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length)
{
    VERIFY(minimum_length > 0);

    // Clip the maximum length to the amount of data that we actually store.
    if (maximum_length > m_used_space)
        maximum_length = m_used_space;

    if (maximum_length < minimum_length)
        return {};

    Optional<Match> best_match;

    Array<u8, HASH_CHUNK_SIZE> needle_storage;
    auto needle = needle_storage.span().trim(min(HASH_CHUNK_SIZE, maximum_length));

    {
        auto needle_read_bytes = MUST(read_with_seekback(needle, used_space()));
        VERIFY(needle_read_bytes.size() == needle.size());
    }

    // Try an efficient hash-based search first.
    if (needle.size() >= HASH_CHUNK_SIZE) {
        auto needle_hash = StringView { needle }.hash();

        auto maybe_starting_offset = m_hash_location_map.get(needle_hash);

        if (maybe_starting_offset.has_value()) {
            Optional<size_t> previous_buffer_offset;
            auto current_buffer_offset = maybe_starting_offset.value();

            while (true) {
                auto current_search_offset = (capacity() + m_reading_head - current_buffer_offset) % capacity();

                // Validate the hash. In case it is invalid, we can discard the rest of the chain, as the data (and everything older) got updated.
                Array<u8, HASH_CHUNK_SIZE> hash_chunk_at_offset;
                auto hash_chunk_at_offset_span = MUST(read_with_seekback(hash_chunk_at_offset, current_search_offset + used_space()));
                VERIFY(hash_chunk_at_offset_span.size() == HASH_CHUNK_SIZE);
                auto found_chunk_hash = StringView { hash_chunk_at_offset }.hash();

                if (needle_hash != found_chunk_hash) {
                    if (!previous_buffer_offset.has_value())
                        m_hash_location_map.remove(needle_hash);
                    else
                        m_location_chain_map.remove(*previous_buffer_offset);
                    break;
                }

                // Validate the match through the set-distance-based implementation.
                auto maybe_new_match = find_copy_in_seekback(Array { current_search_offset }, maximum_length, HASH_CHUNK_SIZE);

                // If we found a match, record it.
                // If we haven't found a match, we simply got a hash collision, so skip.
                if (maybe_new_match.has_value()) {
                    auto new_match = maybe_new_match.release_value();

                    if (!best_match.has_value() || best_match->length < new_match.length) {
                        best_match = new_match;

                        // If we already found a result with the best possible length, then stop searching.
                        if (best_match->length >= maximum_length)
                            break;
                    }
                }

                // Get the next location with the same hash from the location chain.
                auto maybe_next_buffer_offset = m_location_chain_map.get(current_buffer_offset);

                // End of the chain, nothing more to check.
                if (!maybe_next_buffer_offset.has_value())
                    break;

                previous_buffer_offset = current_buffer_offset;
                current_buffer_offset = maybe_next_buffer_offset.release_value();
            }

            // If we found a match, return it now.
            if (best_match.has_value())
                return best_match;
        }
    }

    // Try a plain memory search for smaller values.
    // Note: This overlaps with the hash search for chunks of size HASH_CHUNK_SIZE for the purpose of validation.
    if (minimum_length <= HASH_CHUNK_SIZE) {
        size_t haystack_offset_from_start = 0;
        Vector<ReadonlyBytes> haystack;
        haystack.append(next_search_span(search_limit()));
        if (haystack[0].size() < search_limit())
            haystack.append(next_search_span(search_limit() - haystack[0].size()));

        // TODO: `memmem` searches the memory in "natural" order, which means that it finds matches with a greater distance first.
        //       Hash-based searching finds the shortest distances first, which is most likely better for encoding and memory efficiency.
        //       Look into creating a `memmem_reverse`, which starts searching from the end.
        auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
        while (memmem_match.has_value()) {
            auto match_offset = memmem_match.release_value();
            auto corrected_match_distance = search_limit() - haystack_offset_from_start - match_offset;

            // Validate the match through the set-distance-based implementation and extend it to the largest size possible.
            auto maybe_new_match = find_copy_in_seekback(Array { corrected_match_distance }, min(maximum_length, HASH_CHUNK_SIZE), minimum_length);

            // If we weren't able to validate the match at all, either our memmem search returned garbage or our validation function is incorrect. Investigate.
            VERIFY(maybe_new_match.has_value());

            auto new_match = maybe_new_match.release_value();

            if (!best_match.has_value() || best_match->length < new_match.length) {
                best_match = new_match;

                // If we already found a result with the best possible length, then stop searching.
                if (best_match->length >= maximum_length)
                    break;
            }

            auto size_to_discard = match_offset + 1;

            // Trim away the already processed bytes from the haystack.
            haystack_offset_from_start += size_to_discard;
            while (size_to_discard > 0) {
                if (haystack[0].size() < size_to_discard) {
                    size_to_discard -= haystack[0].size();
                    haystack.remove(0);
                } else {
                    haystack[0] = haystack[0].slice(size_to_discard);
                    break;
                }
            }

            if (haystack.size() == 0)
                break;

            // Try and find the next match.
            memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
        }

        // If we found a match of size HASH_CHUNK_SIZE, we should have already found that using the hash search. Investigate.
        VERIFY(!best_match.has_value() || best_match->length < HASH_CHUNK_SIZE);
    }

    return best_match;
}
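// The overload below validates an explicit list of candidate distances instead
// of searching the whole seekback area. A hypothetical sketch of how a caller
// that tracks recently used distances (as repeat-offset-aware formats do) might
// use it; the distance values and the length limits are invented:
//
//     Array<size_t, 3> recent_distances { 1, 4, 128 };
//     auto repeat_match = buffer.find_copy_in_seekback(recent_distances, maximum_length, minimum_length);
//
// A repeat match found this way can be preferred over a fresh search result,
// since an already-used distance is usually cheaper to encode.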
Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(ReadonlySpan<size_t> distances, size_t maximum_length, size_t minimum_length) const
{
    VERIFY(minimum_length > 0);

    // Clip the maximum length to the amount of data that we actually store.
    if (maximum_length > m_used_space)
        maximum_length = m_used_space;

    if (maximum_length < minimum_length)
        return Optional<Match> {};

    Optional<Match> best_match;

    for (auto distance : distances) {
        // Discard distances outside the valid range.
        if (distance > search_limit() || distance <= 0)
            continue;

        // TODO: This does not yet support looping repetitions.
        if (distance < minimum_length)
            continue;

        auto current_match_length = 0ul;

        while (current_match_length < maximum_length) {
            auto haystack = next_search_span(distance - current_match_length).trim(maximum_length - current_match_length);
            auto needle = next_read_span(current_match_length).trim(maximum_length - current_match_length);

            auto submatch_length = haystack.matching_prefix_length(needle);

            if (submatch_length == 0)
                break;

            current_match_length += submatch_length;
        }

        // Discard matches that don't reach the minimum length.
        if (current_match_length < minimum_length)
            continue;

        if (!best_match.has_value() || best_match->length < current_match_length)
            best_match = Match { distance, current_match_length };
    }

    return best_match;
}

ErrorOr<void> SearchableCircularBuffer::insert_location_hash(ReadonlyBytes value, size_t raw_offset)
{
    VERIFY(value.size() == HASH_CHUNK_SIZE);

    auto value_hash = StringView { value }.hash();

    // Discard any old entries for this offset first. This should eliminate accidental loops by breaking the chain.
    // The actual cleanup is done on access, since we can only remove invalid references when actually walking the chain.
    m_location_chain_map.remove(raw_offset);

    // Check if we have any existing entries for this hash.
    // If so, we need to add it to the location chain map instead, as we will soon replace the entry in the hash location map.
    auto existing_entry = m_hash_location_map.get(value_hash);

    if (existing_entry.has_value())
        TRY(m_location_chain_map.try_set(raw_offset, existing_entry.value()));

    TRY(m_hash_location_map.try_set(value_hash, raw_offset));

    return {};
}
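// Illustrative example of the chain layout produced above (the offsets are
// invented): if the chunks at raw offsets 10, 25 and 40 share the hash H and
// were inserted in that order, then m_hash_location_map[H] == 40 (the newest
// location), while m_location_chain_map[40] == 25 and
// m_location_chain_map[25] == 10 link back to the older occurrences. The
// hash-based search in find_copy_in_seekback() walks exactly this chain from
// newest to oldest and prunes the stale tail once a location no longer hashes
// to H.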
ErrorOr<void> SearchableCircularBuffer::hash_last_bytes(size_t count)
{
    // Stop early if we don't have enough data overall to hash a full chunk.
    if (search_limit() < HASH_CHUNK_SIZE)
        return {};

    auto remaining_recalculations = count;
    while (remaining_recalculations > 0) {
        // Note: We offset everything by HASH_CHUNK_SIZE because we have up to HASH_CHUNK_SIZE - 1 bytes that we couldn't hash before (as we had missing data).
        //       The number of recalculations stays the same, since we now have up to HASH_CHUNK_SIZE - 1 bytes that we can't hash now.
        auto recalculation_span = next_search_span(min(remaining_recalculations + HASH_CHUNK_SIZE - 1, search_limit()));

        // If the span is smaller than a hash chunk, we need to manually craft some consecutive data to do the hashing.
        if (recalculation_span.size() < HASH_CHUNK_SIZE) {
            auto auxiliary_span = next_seekback_span(remaining_recalculations);

            // Ensure that our math is correct and that both spans are "adjacent".
            VERIFY(recalculation_span.data() + recalculation_span.size() == m_buffer.data() + m_buffer.size());
            VERIFY(auxiliary_span.data() == m_buffer.data());

            while (recalculation_span.size() > 0 && recalculation_span.size() + auxiliary_span.size() >= HASH_CHUNK_SIZE) {
                Array<u8, HASH_CHUNK_SIZE> temporary_hash_chunk;

                auto copied_from_recalculation_span = recalculation_span.copy_to(temporary_hash_chunk);
                VERIFY(copied_from_recalculation_span == recalculation_span.size());

                auto copied_from_auxiliary_span = auxiliary_span.copy_to(temporary_hash_chunk.span().slice(copied_from_recalculation_span));
                VERIFY(copied_from_recalculation_span + copied_from_auxiliary_span == HASH_CHUNK_SIZE);

                TRY(insert_location_hash(temporary_hash_chunk, recalculation_span.data() - m_buffer.data()));

                recalculation_span = recalculation_span.slice(1);
                remaining_recalculations--;
            }

            continue;
        }

        for (size_t i = 0; i + HASH_CHUNK_SIZE <= recalculation_span.size(); i++) {
            auto value = recalculation_span.slice(i, HASH_CHUNK_SIZE);
            auto raw_offset = value.data() - m_buffer.data();
            TRY(insert_location_hash(value, raw_offset));
            remaining_recalculations--;
        }
    }

    return {};
}

}