linux/mm/ptdump.c
Ryan Roberts 426931e7e5 mm: ptdump should use ptep_get_lockless()
Patch series "Encapsulate PTE contents from non-arch code", v3.

A series to improve the encapsulation of pte entries by disallowing
non-arch code from directly dereferencing pte_t pointers.

This means that by default, the accesses change from a C dereference to a
READ_ONCE().  This is technically the correct thing to do since where
pgtables are modified by HW (for access/dirty) they are volatile and
therefore we should always ensure READ_ONCE() semantics.

But more importantly, by always using the helper, it can be overridden by
the architecture to fully encapsulate the contents of the pte.  Arch code
is deliberately not converted, as the arch code knows best.  It is
intended that arch code (arm64) will override the default with its own
implementation that can (e.g.) hide certain bits from the core code, or
determine young/dirty status by mixing in state from another source.


This patch (of 3):

The page table dumper uses walk_page_range_novma() to walk the page
tables, which does not lock the PTL before calling the pte_entry()
callback.  Therefore, the page table dumper's callback must use
ptep_get_lockless() rather than ptep_get() to ensure that the pte it reads
is not torn or otherwise corrupt when racing with writers.

Link: https://lkml.kernel.org/r/20230612151545.3317766-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20230612151545.3317766-2-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Dimitri Sivanich <dimitri.sivanich@hpe.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-06-19 16:19:24 -07:00

166 lines
4.2 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
* This is an optimization for KASAN=y case. Since all kasan page tables
* eventually point to the kasan_early_shadow_page we could call note_page()
* right away without walking through lower level page tables. This saves
* us dozens of seconds (minutes for 5-level config) while checking for
* W+X mapping or reading kernel_page_tables debugfs file.
*/
static inline int note_kasan_page_table(struct mm_walk *walk,
unsigned long addr)
{
struct ptdump_state *st = walk->private;
st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
walk->action = ACTION_CONTINUE;
return 0;
}
#endif
static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
pgd_t val = READ_ONCE(*pgd);
#if CONFIG_PGTABLE_LEVELS > 4 && \
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
return note_kasan_page_table(walk, addr);
#endif
if (st->effective_prot)
st->effective_prot(st, 0, pgd_val(val));
if (pgd_leaf(val)) {
st->note_page(st, addr, 0, pgd_val(val));
walk->action = ACTION_CONTINUE;
}
return 0;
}
static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
p4d_t val = READ_ONCE(*p4d);
#if CONFIG_PGTABLE_LEVELS > 3 && \
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
return note_kasan_page_table(walk, addr);
#endif
if (st->effective_prot)
st->effective_prot(st, 1, p4d_val(val));
if (p4d_leaf(val)) {
st->note_page(st, addr, 1, p4d_val(val));
walk->action = ACTION_CONTINUE;
}
return 0;
}
static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
pud_t val = READ_ONCE(*pud);
#if CONFIG_PGTABLE_LEVELS > 2 && \
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
return note_kasan_page_table(walk, addr);
#endif
if (st->effective_prot)
st->effective_prot(st, 2, pud_val(val));
if (pud_leaf(val)) {
st->note_page(st, addr, 2, pud_val(val));
walk->action = ACTION_CONTINUE;
}
return 0;
}
static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
pmd_t val = READ_ONCE(*pmd);
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
return note_kasan_page_table(walk, addr);
#endif
if (st->effective_prot)
st->effective_prot(st, 3, pmd_val(val));
if (pmd_leaf(val)) {
st->note_page(st, addr, 3, pmd_val(val));
walk->action = ACTION_CONTINUE;
}
return 0;
}
static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
pte_t val = ptep_get_lockless(pte);
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
st->note_page(st, addr, 4, pte_val(val));
return 0;
}
static int ptdump_hole(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
st->note_page(st, addr, depth, 0);
return 0;
}
static const struct mm_walk_ops ptdump_ops = {
.pgd_entry = ptdump_pgd_entry,
.p4d_entry = ptdump_p4d_entry,
.pud_entry = ptdump_pud_entry,
.pmd_entry = ptdump_pmd_entry,
.pte_entry = ptdump_pte_entry,
.pte_hole = ptdump_hole,
};
void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
mmap_write_lock(mm);
while (range->start != range->end) {
walk_page_range_novma(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
mmap_write_unlock(mm);
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
}