Kernel: Extend the lifetime of Regions during page fault handling

Previously we had a race condition in the page fault handling: we were
relying on the affected Region staying alive while handling the page
fault, but this was not actually guaranteed, as an munmap() from another
thread could remove the region concurrently.

This commit closes that hole by extending the lifetime of the region
affected by the page fault until the handling of the page fault is
complete. This is achieved by maintaining a pseudo-reference count on
the region which counts the number of in-progress page faults being
handled on this region, and extending the lifetime of the region while
this counter is non-zero.
Since both the increment of the counter by the page fault handler and
the spin loop waiting for it to reach 0 during Region destruction are
serialized using the appropriate AddressSpace spinlock, eventual
progress is guaranteed: As soon as the region is removed from the tree
no more page faults on the region can start.
And similarly correctness is ensured: The counter is incremented under
the same lock, so any page faults that are being handled will have
already incremented the counter before the region is deallocated.
Idan Horowitz 2023-04-06 01:11:12 +03:00
parent 003989e1b0
commit 1c2dbed38a
4 changed files with 51 additions and 32 deletions
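In code terms, the scheme is "look up and pin under the address space lock; spin in the destructor until all pins are gone". Below is a minimal toy model of that pattern in portable C++; every name in it is illustrative (std::mutex stands in for the AddressSpace spinlock, std::this_thread::yield() for Processor::wait_check(), and a std::map for the region tree), so treat it as a sketch of the idea rather than the kernel's actual code.

#include <atomic>
#include <cstdint>
#include <map>
#include <mutex>
#include <thread>

struct ToyRegion {
    std::atomic<uint32_t> in_progress_page_faults { 0 };

    ~ToyRegion()
    {
        // By the time the destructor runs, the region has been unlinked from
        // the tree (under the lock), so no new fault can pin it; the counter
        // can only decrease from here, and this loop terminates.
        while (in_progress_page_faults.load() != 0)
            std::this_thread::yield(); // stand-in for Processor::wait_check()
    }
};

struct ToyAddressSpace {
    std::mutex lock;                         // stand-in for the spinlock
    std::map<uintptr_t, ToyRegion*> regions; // stand-in for the region tree

    // Fault path: look up and pin the region in one critical section, so a
    // concurrent unmap() can never free it between lookup and pin.
    ToyRegion* pin_region_for_fault(uintptr_t vaddr)
    {
        std::lock_guard guard(lock);
        auto it = regions.find(vaddr); // exact-match lookup, for simplicity
        if (it == regions.end())
            return nullptr;
        it->second->in_progress_page_faults++;
        return it->second;
    }

    // munmap path: unlink under the lock, then delete. The destructor waits
    // out any faults that pinned the region before it was unlinked.
    void unmap(uintptr_t vaddr)
    {
        ToyRegion* region = nullptr;
        {
            std::lock_guard guard(lock);
            auto it = regions.find(vaddr);
            if (it == regions.end())
                return;
            region = it->second;
            regions.erase(it);
        }
        delete region; // may spin in ~ToyRegion() until faults finish
    }
};

void handle_fault(ToyAddressSpace& space, uintptr_t vaddr)
{
    auto* region = space.pin_region_for_fault(vaddr);
    if (!region)
        return; // the kernel would return PageFaultResponse::ShouldCrash
    // ... the actual fault handling would touch the region here ...
    region->in_progress_page_faults--; // unpin; the destructor may proceed
}

The crucial detail is that the increment happens inside the same critical section as the lookup: by the time unmap() unlinks the region, every in-flight fault has already pinned it, and no new fault can find it.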

Kernel/Memory/MemoryManager.cpp

@@ -658,16 +658,6 @@ UNMAP_AFTER_INIT void MemoryManager::initialize(u32 cpu)
     }
 }
 
-Region* MemoryManager::kernel_region_from_vaddr(VirtualAddress address)
-{
-    if (is_user_address(address))
-        return nullptr;
-
-    return MM.m_global_data.with([&](auto& global_data) {
-        return global_data.region_tree.find_region_containing(address);
-    });
-}
-
 Region* MemoryManager::find_user_region_from_vaddr(AddressSpace& space, VirtualAddress vaddr)
 {
     return space.find_region_containing({ vaddr, 1 });
@@ -715,20 +705,6 @@ void MemoryManager::validate_syscall_preconditions(Process& process, RegisterSta
     }
 }
 
-Region* MemoryManager::find_region_from_vaddr(VirtualAddress vaddr)
-{
-    if (auto* region = kernel_region_from_vaddr(vaddr))
-        return region;
-    auto page_directory = PageDirectory::find_current();
-    if (!page_directory)
-        return nullptr;
-    auto* process = page_directory->process();
-    VERIFY(process);
-    return process->address_space().with([&](auto& space) {
-        return find_user_region_from_vaddr(*space, vaddr);
-    });
-}
-
 PageFaultResponse MemoryManager::handle_page_fault(PageFault const& fault)
 {
     auto faulted_in_range = [&fault](auto const* start, auto const* end) {
@@ -753,11 +729,44 @@ PageFaultResponse MemoryManager::handle_page_fault(PageFault const& fault)
         return PageFaultResponse::ShouldCrash;
     }
 
     dbgln_if(PAGE_FAULT_DEBUG, "MM: CPU[{}] handle_page_fault({:#04x}) at {}", Processor::current_id(), fault.code(), fault.vaddr());
 
-    auto* region = find_region_from_vaddr(fault.vaddr());
-    if (!region) {
-        return PageFaultResponse::ShouldCrash;
-    }
-
-    return region->handle_fault(fault);
+    // The faulting region may be unmapped concurrently with the handling of this page fault,
+    // and since regions are singly-owned, that would usually result in the region being
+    // immediately de-allocated. To ensure the region is not de-allocated while we're still
+    // handling the fault, we increase a page fault counter on the region, and the region
+    // will refrain from de-allocating itself until the counter reaches zero. (Since unmapping
+    // the region also includes removing it from the region tree while holding the address
+    // space spinlock, and since we increment the counter while still holding the spinlock,
+    // it is guaranteed that we always increment the counter before the region can be deleted.)
+    Region* region = nullptr;
+    if (is_user_address(fault.vaddr())) {
+        auto page_directory = PageDirectory::find_current();
+        if (!page_directory)
+            return PageFaultResponse::ShouldCrash;
+
+        auto* process = page_directory->process();
+        VERIFY(process);
+        region = process->address_space().with([&](auto& space) -> Region* {
+            auto* region = find_user_region_from_vaddr(*space, fault.vaddr());
+            if (!region)
+                return nullptr;
+            region->start_handling_page_fault({});
+            return region;
+        });
+    } else {
+        region = MM.m_global_data.with([&](auto& global_data) -> Region* {
+            auto* region = global_data.region_tree.find_region_containing(fault.vaddr());
+            if (!region)
+                return nullptr;
+            region->start_handling_page_fault({});
+            return region;
+        });
+    }
+
+    if (!region)
+        return PageFaultResponse::ShouldCrash;
+
+    auto response = region->handle_fault(fault);
+    region->finish_handling_page_fault({});
+    return response;
 }
 
 ErrorOr<NonnullOwnPtr<Region>> MemoryManager::allocate_contiguous_kernel_region(size_t size, StringView name, Region::Access access, Region::Cacheable cacheable)
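A side note on the `{}` arguments above: start_handling_page_fault() and finish_handling_page_fault() take a Badge<MemoryManager>, SerenityOS's capability idiom for restricting who may call an otherwise-public function. Only the befriended type can construct the badge, so only MemoryManager can move the counter. A simplified sketch of the idiom (the real template lives in AK/Badge.h; assume C++17 or later):

// Simplified Badge: only T can construct Badge<T>. (AK's real Badge also
// deletes copy and move, so a badge cannot be stashed and reused.)
template<typename T>
class Badge {
    friend T;
    Badge() { } // user-provided on purpose, so {} cannot aggregate-initialize
};

class MemoryManager;

class Region {
public:
    void start_handling_page_fault(Badge<MemoryManager>) { ++m_in_progress_page_faults; }
    void finish_handling_page_fault(Badge<MemoryManager>) { --m_in_progress_page_faults; }

private:
    unsigned m_in_progress_page_faults { 0 };
};

class MemoryManager {
public:
    void handle_fault(Region& region)
    {
        region.start_handling_page_fault({}); // OK: MemoryManager can mint the badge
        // ... fault handling ...
        region.finish_handling_page_fault({});
    }
};

// Any other caller writing region.start_handling_page_fault({}) fails to
// compile, because Badge<MemoryManager>'s constructor is inaccessible to it.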

Kernel/Memory/MemoryManager.h

@@ -249,10 +249,6 @@ private:
     static void flush_tlb_local(VirtualAddress, size_t page_count = 1);
     static void flush_tlb(PageDirectory const*, VirtualAddress, size_t page_count = 1);
 
-    static Region* kernel_region_from_vaddr(VirtualAddress);
-    static Region* find_region_from_vaddr(VirtualAddress);
-
     RefPtr<PhysicalPage> find_free_physical_page(bool);
 
     ALWAYS_INLINE u8* quickmap_page(PhysicalPage& page)

Kernel/Memory/Region.cpp

@@ -75,6 +75,16 @@ Region::~Region()
     if (is_kernel())
         MM.unregister_kernel_region(*this);
 
+    // Extend the lifetime of the region if there are any page faults in progress for this region's pages.
+    // Both the removal of regions from the region trees and the fetching of the regions from the tree
+    // during the start of page fault handling are serialized under the address space spinlock. This means
+    // that once the region is removed no more page faults on this region can start, so this counter will
+    // eventually reach 0. And similarly since we can only reach the region destructor once the region was
+    // removed from the appropriate region tree, it is guaranteed that any page faults that are still being
+    // handled have already increased this counter, and will be allowed to finish before deallocation.
+    while (m_in_progress_page_faults)
+        Processor::wait_check();
 }
 
 ErrorOr<NonnullOwnPtr<Region>> Region::create_unbacked()
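One closing detail on this destructor: AK's Atomic defaults to sequentially consistent ordering (like std::atomic), so the decrement done by finish_handling_page_fault() is guaranteed to become visible to this loop, and Processor::wait_check() relaxes the CPU between checks. A portable sketch of the same wait, with illustrative names:

#include <atomic>
#include <cstdint>
#include <thread>

// Portable equivalent of the wait loop in Region::~Region() above. Once the
// region is out of the tree no new fault can increment the counter, so the
// value is monotonically decreasing and the loop must terminate.
void wait_for_in_progress_page_faults(std::atomic<uint32_t>& counter)
{
    while (counter.load() != 0)
        std::this_thread::yield(); // stand-in for Processor::wait_check()
}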

Kernel/Memory/Region.h

@@ -207,6 +207,9 @@ public:
     [[nodiscard]] bool mmapped_from_readable() const { return m_mmapped_from_readable; }
     [[nodiscard]] bool mmapped_from_writable() const { return m_mmapped_from_writable; }
 
+    void start_handling_page_fault(Badge<MemoryManager>) { m_in_progress_page_faults++; };
+    void finish_handling_page_fault(Badge<MemoryManager>) { m_in_progress_page_faults--; };
+
 private:
     Region();
     Region(NonnullLockRefPtr<VMObject>, size_t offset_in_vmobject, OwnPtr<KString>, Region::Access access, Cacheable, bool shared);
@@ -234,6 +237,7 @@ private:
     size_t m_offset_in_vmobject { 0 };
     LockRefPtr<VMObject> m_vmobject;
    OwnPtr<KString> m_name;
+    Atomic<u32> m_in_progress_page_faults;
     u8 m_access { Region::None };
     bool m_shared : 1 { false };
     bool m_cacheable : 1 { false };