mirror of
https://github.com/torvalds/linux
synced 2024-07-08 20:29:39 +00:00
22 hotfixes. 11 are cc:stable and the remainder address post-6.7 issues
or aren't considered appropriate for backporting.
-----BEGIN PGP SIGNATURE-----

iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZbdSnwAKCRDdBJ7gKXxA
jv49AQCY8eLOgE0L+25HZm99HleBwapbKJozcmsXgMPlgeFZHgEA8saExeL+Nzae
6ktxmGXoVw2t3FJ67Zr66VE3EyHVKAY=
=HWuo
-----END PGP SIGNATURE-----

Merge tag 'mm-hotfixes-stable-2024-01-28-23-21' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "22 hotfixes. 11 are cc:stable and the remainder address post-6.7 issues
  or aren't considered appropriate for backporting"

* tag 'mm-hotfixes-stable-2024-01-28-23-21' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (22 commits)
  mm: thp_get_unmapped_area must honour topdown preference
  mm: huge_memory: don't force huge page alignment on 32 bit
  userfaultfd: fix mmap_changing checking in mfill_atomic_hugetlb
  selftests/mm: ksm_tests should only MADV_HUGEPAGE valid memory
  scs: add CONFIG_MMU dependency for vfree_atomic()
  mm/memory: fix folio_set_dirty() vs. folio_mark_dirty() in zap_pte_range()
  mm/huge_memory: fix folio_set_dirty() vs. folio_mark_dirty()
  selftests/mm: Update va_high_addr_switch.sh to check CPU for la57 flag
  selftests: mm: fix map_hugetlb failure on 64K page size systems
  MAINTAINERS: supplement of zswap maintainers update
  stackdepot: make fast paths lock-less again
  stackdepot: add stats counters exported via debugfs
  mm, kmsan: fix infinite recursion due to RCU critical section
  mm/writeback: fix possible divide-by-zero in wb_dirty_limits(), again
  selftests/mm: switch to bash from sh
  MAINTAINERS: add man-pages git trees
  mm: memcontrol: don't throttle dying tasks on memory.high
  mm: mmap: map MAP_STACK to VM_NOHUGEPAGE
  uprobes: use pagesize-aligned virtual address when replacing pages
  selftests/mm: mremap_test: fix build warning
  ...
This commit is contained in:
commit
6f3d7d5ced
13
CREDITS
13
CREDITS
|
@ -2161,6 +2161,19 @@ N: Mike Kravetz
|
||||||
E: mike.kravetz@oracle.com
|
E: mike.kravetz@oracle.com
|
||||||
D: Maintenance and development of the hugetlb subsystem
|
D: Maintenance and development of the hugetlb subsystem
|
||||||
|
|
||||||
|
N: Seth Jennings
|
||||||
|
E: sjenning@redhat.com
|
||||||
|
D: Creation and maintenance of zswap
|
||||||
|
|
||||||
|
N: Dan Streetman
|
||||||
|
E: ddstreet@ieee.org
|
||||||
|
D: Maintenance and development of zswap
|
||||||
|
D: Creation and maintenance of the zpool API
|
||||||
|
|
||||||
|
N: Vitaly Wool
|
||||||
|
E: vitaly.wool@konsulko.com
|
||||||
|
D: Maintenance and development of zswap
|
||||||
|
|
||||||
N: Andreas S. Krebs
|
N: Andreas S. Krebs
|
||||||
E: akrebs@altavista.net
|
E: akrebs@altavista.net
|
||||||
D: CYPRESS CY82C693 chipset IDE, Digital's PC-Alpha 164SX boards
|
D: CYPRESS CY82C693 chipset IDE, Digital's PC-Alpha 164SX boards
|
||||||
|
|
11
MAINTAINERS
11
MAINTAINERS
|
@ -12903,6 +12903,8 @@ M: Alejandro Colomar <alx@kernel.org>
|
||||||
L: linux-man@vger.kernel.org
|
L: linux-man@vger.kernel.org
|
||||||
S: Maintained
|
S: Maintained
|
||||||
W: http://www.kernel.org/doc/man-pages
|
W: http://www.kernel.org/doc/man-pages
|
||||||
|
T: git git://git.kernel.org/pub/scm/docs/man-pages/man-pages.git
|
||||||
|
T: git git://www.alejandro-colomar.es/src/alx/linux/man-pages/man-pages.git
|
||||||
|
|
||||||
MANAGEMENT COMPONENT TRANSPORT PROTOCOL (MCTP)
|
MANAGEMENT COMPONENT TRANSPORT PROTOCOL (MCTP)
|
||||||
M: Jeremy Kerr <jk@codeconstruct.com.au>
|
M: Jeremy Kerr <jk@codeconstruct.com.au>
|
||||||
|
@ -24341,13 +24343,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git
|
||||||
F: Documentation/filesystems/zonefs.rst
|
F: Documentation/filesystems/zonefs.rst
|
||||||
F: fs/zonefs/
|
F: fs/zonefs/
|
||||||
|
|
||||||
ZPOOL COMPRESSED PAGE STORAGE API
|
|
||||||
M: Dan Streetman <ddstreet@ieee.org>
|
|
||||||
L: linux-mm@kvack.org
|
|
||||||
S: Maintained
|
|
||||||
F: include/linux/zpool.h
|
|
||||||
F: mm/zpool.c
|
|
||||||
|
|
||||||
ZR36067 VIDEO FOR LINUX DRIVER
|
ZR36067 VIDEO FOR LINUX DRIVER
|
||||||
M: Corentin Labbe <clabbe@baylibre.com>
|
M: Corentin Labbe <clabbe@baylibre.com>
|
||||||
L: mjpeg-users@lists.sourceforge.net
|
L: mjpeg-users@lists.sourceforge.net
|
||||||
|
@ -24399,7 +24394,9 @@ M: Nhat Pham <nphamcs@gmail.com>
|
||||||
L: linux-mm@kvack.org
|
L: linux-mm@kvack.org
|
||||||
S: Maintained
|
S: Maintained
|
||||||
F: Documentation/admin-guide/mm/zswap.rst
|
F: Documentation/admin-guide/mm/zswap.rst
|
||||||
|
F: include/linux/zpool.h
|
||||||
F: include/linux/zswap.h
|
F: include/linux/zswap.h
|
||||||
|
F: mm/zpool.c
|
||||||
F: mm/zswap.c
|
F: mm/zswap.c
|
||||||
|
|
||||||
THE REST
|
THE REST
|
||||||
|
|
|
@ -673,6 +673,7 @@ config SHADOW_CALL_STACK
|
||||||
bool "Shadow Call Stack"
|
bool "Shadow Call Stack"
|
||||||
depends on ARCH_SUPPORTS_SHADOW_CALL_STACK
|
depends on ARCH_SUPPORTS_SHADOW_CALL_STACK
|
||||||
depends on DYNAMIC_FTRACE_WITH_ARGS || DYNAMIC_FTRACE_WITH_REGS || !FUNCTION_GRAPH_TRACER
|
depends on DYNAMIC_FTRACE_WITH_ARGS || DYNAMIC_FTRACE_WITH_REGS || !FUNCTION_GRAPH_TRACER
|
||||||
|
depends on MMU
|
||||||
help
|
help
|
||||||
This option enables the compiler's Shadow Call Stack, which
|
This option enables the compiler's Shadow Call Stack, which
|
||||||
uses a shadow stack to protect function return addresses from
|
uses a shadow stack to protect function return addresses from
|
||||||
|
|
|
@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr)
|
||||||
{
|
{
|
||||||
unsigned long x = (unsigned long)addr;
|
unsigned long x = (unsigned long)addr;
|
||||||
unsigned long y = x - __START_KERNEL_map;
|
unsigned long y = x - __START_KERNEL_map;
|
||||||
|
bool ret;
|
||||||
|
|
||||||
/* use the carry flag to determine if x was < __START_KERNEL_map */
|
/* use the carry flag to determine if x was < __START_KERNEL_map */
|
||||||
if (unlikely(x > y)) {
|
if (unlikely(x > y)) {
|
||||||
|
@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return pfn_valid(x >> PAGE_SHIFT);
|
/*
|
||||||
|
* pfn_valid() relies on RCU, and may call into the scheduler on exiting
|
||||||
|
* the critical section. However, this would result in recursion with
|
||||||
|
* KMSAN. Therefore, disable preemption here, and re-enable preemption
|
||||||
|
* below while suppressing reschedules to avoid recursion.
|
||||||
|
*
|
||||||
|
* Note, this sacrifices occasionally breaking scheduling guarantees.
|
||||||
|
* Although, a kernel compiled with KMSAN has already given up on any
|
||||||
|
* performance guarantees due to being heavily instrumented.
|
||||||
|
*/
|
||||||
|
preempt_disable();
|
||||||
|
ret = pfn_valid(x >> PAGE_SHIFT);
|
||||||
|
preempt_enable_no_resched();
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* !MODULE */
|
#endif /* !MODULE */
|
||||||
|
|
|
@ -340,7 +340,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||||
} else {
|
} else {
|
||||||
folio_unlock(folio);
|
folio_unlock(folio);
|
||||||
|
|
||||||
if (!folio_test_has_hwpoisoned(folio))
|
if (!folio_test_hwpoison(folio))
|
||||||
want = nr;
|
want = nr;
|
||||||
else {
|
else {
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -156,6 +156,7 @@ calc_vm_flag_bits(unsigned long flags)
|
||||||
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
|
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
|
||||||
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
|
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
|
||||||
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
|
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
|
||||||
|
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
|
||||||
arch_calc_vm_flag_bits(flags);
|
arch_calc_vm_flag_bits(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2013,9 +2013,9 @@ static inline int pfn_valid(unsigned long pfn)
|
||||||
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
|
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
|
||||||
return 0;
|
return 0;
|
||||||
ms = __pfn_to_section(pfn);
|
ms = __pfn_to_section(pfn);
|
||||||
rcu_read_lock();
|
rcu_read_lock_sched();
|
||||||
if (!valid_section(ms)) {
|
if (!valid_section(ms)) {
|
||||||
rcu_read_unlock();
|
rcu_read_unlock_sched();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
@ -2023,7 +2023,7 @@ static inline int pfn_valid(unsigned long pfn)
|
||||||
* the entire section-sized span.
|
* the entire section-sized span.
|
||||||
*/
|
*/
|
||||||
ret = early_section(ms) || pfn_section_valid(ms, pfn);
|
ret = early_section(ms) || pfn_section_valid(ms, pfn);
|
||||||
rcu_read_unlock();
|
rcu_read_unlock_sched();
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
|
@ -537,7 +537,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = __replace_page(vma, vaddr, old_page, new_page);
|
ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
|
||||||
if (new_page)
|
if (new_page)
|
||||||
put_page(new_page);
|
put_page(new_page);
|
||||||
put_old:
|
put_old:
|
||||||
|
|
373
lib/stackdepot.c
373
lib/stackdepot.c
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
#define pr_fmt(fmt) "stackdepot: " fmt
|
#define pr_fmt(fmt) "stackdepot: " fmt
|
||||||
|
|
||||||
|
#include <linux/debugfs.h>
|
||||||
#include <linux/gfp.h>
|
#include <linux/gfp.h>
|
||||||
#include <linux/jhash.h>
|
#include <linux/jhash.h>
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
|
@ -21,8 +22,9 @@
|
||||||
#include <linux/list.h>
|
#include <linux/list.h>
|
||||||
#include <linux/mm.h>
|
#include <linux/mm.h>
|
||||||
#include <linux/mutex.h>
|
#include <linux/mutex.h>
|
||||||
#include <linux/percpu.h>
|
|
||||||
#include <linux/printk.h>
|
#include <linux/printk.h>
|
||||||
|
#include <linux/rculist.h>
|
||||||
|
#include <linux/rcupdate.h>
|
||||||
#include <linux/refcount.h>
|
#include <linux/refcount.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/spinlock.h>
|
#include <linux/spinlock.h>
|
||||||
|
@ -67,12 +69,28 @@ union handle_parts {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct stack_record {
|
struct stack_record {
|
||||||
struct list_head list; /* Links in hash table or freelist */
|
struct list_head hash_list; /* Links in the hash table */
|
||||||
u32 hash; /* Hash in hash table */
|
u32 hash; /* Hash in hash table */
|
||||||
u32 size; /* Number of stored frames */
|
u32 size; /* Number of stored frames */
|
||||||
union handle_parts handle;
|
union handle_parts handle; /* Constant after initialization */
|
||||||
refcount_t count;
|
refcount_t count;
|
||||||
unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */
|
union {
|
||||||
|
unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */
|
||||||
|
struct {
|
||||||
|
/*
|
||||||
|
* An important invariant of the implementation is to
|
||||||
|
* only place a stack record onto the freelist iff its
|
||||||
|
* refcount is zero. Because stack records with a zero
|
||||||
|
* refcount are never considered as valid, it is safe to
|
||||||
|
* union @entries and freelist management state below.
|
||||||
|
* Conversely, as soon as an entry is off the freelist
|
||||||
|
* and its refcount becomes non-zero, the below must not
|
||||||
|
* be accessed until being placed back on the freelist.
|
||||||
|
*/
|
||||||
|
struct list_head free_list; /* Links in the freelist */
|
||||||
|
unsigned long rcu_state; /* RCU cookie */
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
#define DEPOT_STACK_RECORD_SIZE \
|
#define DEPOT_STACK_RECORD_SIZE \
|
||||||
|
@ -112,8 +130,25 @@ static LIST_HEAD(free_stacks);
|
||||||
* yet allocated or if the limit on the number of pools is reached.
|
* yet allocated or if the limit on the number of pools is reached.
|
||||||
*/
|
*/
|
||||||
static bool new_pool_required = true;
|
static bool new_pool_required = true;
|
||||||
/* Lock that protects the variables above. */
|
/* The lock must be held when performing pool or freelist modifications. */
|
||||||
static DEFINE_RWLOCK(pool_rwlock);
|
static DEFINE_RAW_SPINLOCK(pool_lock);
|
||||||
|
|
||||||
|
/* Statistics counters for debugfs. */
|
||||||
|
enum depot_counter_id {
|
||||||
|
DEPOT_COUNTER_ALLOCS,
|
||||||
|
DEPOT_COUNTER_FREES,
|
||||||
|
DEPOT_COUNTER_INUSE,
|
||||||
|
DEPOT_COUNTER_FREELIST_SIZE,
|
||||||
|
DEPOT_COUNTER_COUNT,
|
||||||
|
};
|
||||||
|
static long counters[DEPOT_COUNTER_COUNT];
|
||||||
|
static const char *const counter_names[] = {
|
||||||
|
[DEPOT_COUNTER_ALLOCS] = "allocations",
|
||||||
|
[DEPOT_COUNTER_FREES] = "frees",
|
||||||
|
[DEPOT_COUNTER_INUSE] = "in_use",
|
||||||
|
[DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size",
|
||||||
|
};
|
||||||
|
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
|
||||||
|
|
||||||
static int __init disable_stack_depot(char *str)
|
static int __init disable_stack_depot(char *str)
|
||||||
{
|
{
|
||||||
|
@ -258,14 +293,15 @@ int stack_depot_init(void)
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(stack_depot_init);
|
EXPORT_SYMBOL_GPL(stack_depot_init);
|
||||||
|
|
||||||
/* Initializes a stack depol pool. */
|
/*
|
||||||
|
* Initializes new stack depot @pool, release all its entries to the freelist,
|
||||||
|
* and update the list of pools.
|
||||||
|
*/
|
||||||
static void depot_init_pool(void *pool)
|
static void depot_init_pool(void *pool)
|
||||||
{
|
{
|
||||||
int offset;
|
int offset;
|
||||||
|
|
||||||
lockdep_assert_held_write(&pool_rwlock);
|
lockdep_assert_held(&pool_lock);
|
||||||
|
|
||||||
WARN_ON(!list_empty(&free_stacks));
|
|
||||||
|
|
||||||
/* Initialize handles and link stack records into the freelist. */
|
/* Initialize handles and link stack records into the freelist. */
|
||||||
for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
|
for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
|
||||||
|
@ -276,18 +312,36 @@ static void depot_init_pool(void *pool)
|
||||||
stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
|
stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
|
||||||
stack->handle.extra = 0;
|
stack->handle.extra = 0;
|
||||||
|
|
||||||
list_add(&stack->list, &free_stacks);
|
/*
|
||||||
|
* Stack traces of size 0 are never saved, and we can simply use
|
||||||
|
* the size field as an indicator if this is a new unused stack
|
||||||
|
* record in the freelist.
|
||||||
|
*/
|
||||||
|
stack->size = 0;
|
||||||
|
|
||||||
|
INIT_LIST_HEAD(&stack->hash_list);
|
||||||
|
/*
|
||||||
|
* Add to the freelist front to prioritize never-used entries:
|
||||||
|
* required in case there are entries in the freelist, but their
|
||||||
|
* RCU cookie still belongs to the current RCU grace period
|
||||||
|
* (there can still be concurrent readers).
|
||||||
|
*/
|
||||||
|
list_add(&stack->free_list, &free_stacks);
|
||||||
|
counters[DEPOT_COUNTER_FREELIST_SIZE]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Save reference to the pool to be used by depot_fetch_stack(). */
|
/* Save reference to the pool to be used by depot_fetch_stack(). */
|
||||||
stack_pools[pools_num] = pool;
|
stack_pools[pools_num] = pool;
|
||||||
pools_num++;
|
|
||||||
|
/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
|
||||||
|
WRITE_ONCE(pools_num, pools_num + 1);
|
||||||
|
ASSERT_EXCLUSIVE_WRITER(pools_num);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Keeps the preallocated memory to be used for a new stack depot pool. */
|
/* Keeps the preallocated memory to be used for a new stack depot pool. */
|
||||||
static void depot_keep_new_pool(void **prealloc)
|
static void depot_keep_new_pool(void **prealloc)
|
||||||
{
|
{
|
||||||
lockdep_assert_held_write(&pool_rwlock);
|
lockdep_assert_held(&pool_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If a new pool is already saved or the maximum number of
|
* If a new pool is already saved or the maximum number of
|
||||||
|
@ -310,17 +364,16 @@ static void depot_keep_new_pool(void **prealloc)
|
||||||
* number of pools is reached. In either case, take note that
|
* number of pools is reached. In either case, take note that
|
||||||
* keeping another pool is not required.
|
* keeping another pool is not required.
|
||||||
*/
|
*/
|
||||||
new_pool_required = false;
|
WRITE_ONCE(new_pool_required, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Updates references to the current and the next stack depot pools. */
|
/*
|
||||||
static bool depot_update_pools(void **prealloc)
|
* Try to initialize a new stack depot pool from either a previous or the
|
||||||
|
* current pre-allocation, and release all its entries to the freelist.
|
||||||
|
*/
|
||||||
|
static bool depot_try_init_pool(void **prealloc)
|
||||||
{
|
{
|
||||||
lockdep_assert_held_write(&pool_rwlock);
|
lockdep_assert_held(&pool_lock);
|
||||||
|
|
||||||
/* Check if we still have objects in the freelist. */
|
|
||||||
if (!list_empty(&free_stacks))
|
|
||||||
goto out_keep_prealloc;
|
|
||||||
|
|
||||||
/* Check if we have a new pool saved and use it. */
|
/* Check if we have a new pool saved and use it. */
|
||||||
if (new_pool) {
|
if (new_pool) {
|
||||||
|
@ -329,10 +382,9 @@ static bool depot_update_pools(void **prealloc)
|
||||||
|
|
||||||
/* Take note that we might need a new new_pool. */
|
/* Take note that we might need a new new_pool. */
|
||||||
if (pools_num < DEPOT_MAX_POOLS)
|
if (pools_num < DEPOT_MAX_POOLS)
|
||||||
new_pool_required = true;
|
WRITE_ONCE(new_pool_required, true);
|
||||||
|
|
||||||
/* Try keeping the preallocated memory for new_pool. */
|
return true;
|
||||||
goto out_keep_prealloc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Bail out if we reached the pool limit. */
|
/* Bail out if we reached the pool limit. */
|
||||||
|
@ -349,12 +401,32 @@ static bool depot_update_pools(void **prealloc)
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
out_keep_prealloc:
|
/* Try to find next free usable entry. */
|
||||||
/* Keep the preallocated memory for a new pool if required. */
|
static struct stack_record *depot_pop_free(void)
|
||||||
if (*prealloc)
|
{
|
||||||
depot_keep_new_pool(prealloc);
|
struct stack_record *stack;
|
||||||
return true;
|
|
||||||
|
lockdep_assert_held(&pool_lock);
|
||||||
|
|
||||||
|
if (list_empty(&free_stacks))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We maintain the invariant that the elements in front are least
|
||||||
|
* recently used, and are therefore more likely to be associated with an
|
||||||
|
* RCU grace period in the past. Consequently it is sufficient to only
|
||||||
|
* check the first entry.
|
||||||
|
*/
|
||||||
|
stack = list_first_entry(&free_stacks, struct stack_record, free_list);
|
||||||
|
if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
list_del(&stack->free_list);
|
||||||
|
counters[DEPOT_COUNTER_FREELIST_SIZE]--;
|
||||||
|
|
||||||
|
return stack;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Allocates a new stack in a stack depot pool. */
|
/* Allocates a new stack in a stack depot pool. */
|
||||||
|
@ -363,19 +435,22 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
|
||||||
{
|
{
|
||||||
struct stack_record *stack;
|
struct stack_record *stack;
|
||||||
|
|
||||||
lockdep_assert_held_write(&pool_rwlock);
|
lockdep_assert_held(&pool_lock);
|
||||||
|
|
||||||
/* Update current and new pools if required and possible. */
|
/* This should already be checked by public API entry points. */
|
||||||
if (!depot_update_pools(prealloc))
|
if (WARN_ON_ONCE(!size))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
/* Check if we have a stack record to save the stack trace. */
|
/* Check if we have a stack record to save the stack trace. */
|
||||||
if (list_empty(&free_stacks))
|
stack = depot_pop_free();
|
||||||
return NULL;
|
if (!stack) {
|
||||||
|
/* No usable entries on the freelist - try to refill the freelist. */
|
||||||
/* Get and unlink the first entry from the freelist. */
|
if (!depot_try_init_pool(prealloc))
|
||||||
stack = list_first_entry(&free_stacks, struct stack_record, list);
|
return NULL;
|
||||||
list_del(&stack->list);
|
stack = depot_pop_free();
|
||||||
|
if (WARN_ON(!stack))
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
|
/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
|
||||||
if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
|
if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
|
||||||
|
@ -394,38 +469,80 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
|
||||||
*/
|
*/
|
||||||
kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
|
kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
|
||||||
|
|
||||||
|
counters[DEPOT_COUNTER_ALLOCS]++;
|
||||||
|
counters[DEPOT_COUNTER_INUSE]++;
|
||||||
return stack;
|
return stack;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
|
static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
|
||||||
{
|
{
|
||||||
|
const int pools_num_cached = READ_ONCE(pools_num);
|
||||||
union handle_parts parts = { .handle = handle };
|
union handle_parts parts = { .handle = handle };
|
||||||
void *pool;
|
void *pool;
|
||||||
size_t offset = parts.offset << DEPOT_STACK_ALIGN;
|
size_t offset = parts.offset << DEPOT_STACK_ALIGN;
|
||||||
struct stack_record *stack;
|
struct stack_record *stack;
|
||||||
|
|
||||||
lockdep_assert_held(&pool_rwlock);
|
lockdep_assert_not_held(&pool_lock);
|
||||||
|
|
||||||
if (parts.pool_index > pools_num) {
|
if (parts.pool_index > pools_num_cached) {
|
||||||
WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
|
WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
|
||||||
parts.pool_index, pools_num, handle);
|
parts.pool_index, pools_num_cached, handle);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
pool = stack_pools[parts.pool_index];
|
pool = stack_pools[parts.pool_index];
|
||||||
if (!pool)
|
if (WARN_ON(!pool))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
stack = pool + offset;
|
stack = pool + offset;
|
||||||
|
if (WARN_ON(!refcount_read(&stack->count)))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
return stack;
|
return stack;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Links stack into the freelist. */
|
/* Links stack into the freelist. */
|
||||||
static void depot_free_stack(struct stack_record *stack)
|
static void depot_free_stack(struct stack_record *stack)
|
||||||
{
|
{
|
||||||
lockdep_assert_held_write(&pool_rwlock);
|
unsigned long flags;
|
||||||
|
|
||||||
list_add(&stack->list, &free_stacks);
|
lockdep_assert_not_held(&pool_lock);
|
||||||
|
|
||||||
|
raw_spin_lock_irqsave(&pool_lock, flags);
|
||||||
|
printk_deferred_enter();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Remove the entry from the hash list. Concurrent list traversal may
|
||||||
|
* still observe the entry, but since the refcount is zero, this entry
|
||||||
|
* will no longer be considered as valid.
|
||||||
|
*/
|
||||||
|
list_del_rcu(&stack->hash_list);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Due to being used from constrained contexts such as the allocators,
|
||||||
|
* NMI, or even RCU itself, stack depot cannot rely on primitives that
|
||||||
|
* would sleep (such as synchronize_rcu()) or recursively call into
|
||||||
|
* stack depot again (such as call_rcu()).
|
||||||
|
*
|
||||||
|
* Instead, get an RCU cookie, so that we can ensure this entry isn't
|
||||||
|
* moved onto another list until the next grace period, and concurrent
|
||||||
|
* RCU list traversal remains safe.
|
||||||
|
*/
|
||||||
|
stack->rcu_state = get_state_synchronize_rcu();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the entry to the freelist tail, so that older entries are
|
||||||
|
* considered first - their RCU cookie is more likely to no longer be
|
||||||
|
* associated with the current grace period.
|
||||||
|
*/
|
||||||
|
list_add_tail(&stack->free_list, &free_stacks);
|
||||||
|
|
||||||
|
counters[DEPOT_COUNTER_FREELIST_SIZE]++;
|
||||||
|
counters[DEPOT_COUNTER_FREES]++;
|
||||||
|
counters[DEPOT_COUNTER_INUSE]--;
|
||||||
|
|
||||||
|
printk_deferred_exit();
|
||||||
|
raw_spin_unlock_irqrestore(&pool_lock, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Calculates the hash for a stack. */
|
/* Calculates the hash for a stack. */
|
||||||
|
@ -453,22 +570,52 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
|
||||||
|
|
||||||
/* Finds a stack in a bucket of the hash table. */
|
/* Finds a stack in a bucket of the hash table. */
|
||||||
static inline struct stack_record *find_stack(struct list_head *bucket,
|
static inline struct stack_record *find_stack(struct list_head *bucket,
|
||||||
unsigned long *entries, int size,
|
unsigned long *entries, int size,
|
||||||
u32 hash)
|
u32 hash, depot_flags_t flags)
|
||||||
{
|
{
|
||||||
struct list_head *pos;
|
struct stack_record *stack, *ret = NULL;
|
||||||
struct stack_record *found;
|
|
||||||
|
|
||||||
lockdep_assert_held(&pool_rwlock);
|
/*
|
||||||
|
* Stack depot may be used from instrumentation that instruments RCU or
|
||||||
|
* tracing itself; use variant that does not call into RCU and cannot be
|
||||||
|
* traced.
|
||||||
|
*
|
||||||
|
* Note: Such use cases must take care when using refcounting to evict
|
||||||
|
* unused entries, because the stack record free-then-reuse code paths
|
||||||
|
* do call into RCU.
|
||||||
|
*/
|
||||||
|
rcu_read_lock_sched_notrace();
|
||||||
|
|
||||||
list_for_each(pos, bucket) {
|
list_for_each_entry_rcu(stack, bucket, hash_list) {
|
||||||
found = list_entry(pos, struct stack_record, list);
|
if (stack->hash != hash || stack->size != size)
|
||||||
if (found->hash == hash &&
|
continue;
|
||||||
found->size == size &&
|
|
||||||
!stackdepot_memcmp(entries, found->entries, size))
|
/*
|
||||||
return found;
|
* This may race with depot_free_stack() accessing the freelist
|
||||||
|
* management state unioned with @entries. The refcount is zero
|
||||||
|
* in that case and the below refcount_inc_not_zero() will fail.
|
||||||
|
*/
|
||||||
|
if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Try to increment refcount. If this succeeds, the stack record
|
||||||
|
* is valid and has not yet been freed.
|
||||||
|
*
|
||||||
|
* If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
|
||||||
|
* to then call stack_depot_put() later, and we can assume that
|
||||||
|
* a stack record is never placed back on the freelist.
|
||||||
|
*/
|
||||||
|
if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
ret = stack;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
return NULL;
|
|
||||||
|
rcu_read_unlock_sched_notrace();
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
|
depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
|
||||||
|
@ -482,7 +629,6 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
|
||||||
struct page *page = NULL;
|
struct page *page = NULL;
|
||||||
void *prealloc = NULL;
|
void *prealloc = NULL;
|
||||||
bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
|
bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
|
||||||
bool need_alloc = false;
|
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
u32 hash;
|
u32 hash;
|
||||||
|
|
||||||
|
@ -505,31 +651,16 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
|
||||||
hash = hash_stack(entries, nr_entries);
|
hash = hash_stack(entries, nr_entries);
|
||||||
bucket = &stack_table[hash & stack_hash_mask];
|
bucket = &stack_table[hash & stack_hash_mask];
|
||||||
|
|
||||||
read_lock_irqsave(&pool_rwlock, flags);
|
/* Fast path: look the stack trace up without locking. */
|
||||||
printk_deferred_enter();
|
found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
|
||||||
|
if (found)
|
||||||
/* Fast path: look the stack trace up without full locking. */
|
|
||||||
found = find_stack(bucket, entries, nr_entries, hash);
|
|
||||||
if (found) {
|
|
||||||
if (depot_flags & STACK_DEPOT_FLAG_GET)
|
|
||||||
refcount_inc(&found->count);
|
|
||||||
printk_deferred_exit();
|
|
||||||
read_unlock_irqrestore(&pool_rwlock, flags);
|
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
|
||||||
|
|
||||||
/* Take note if another stack pool needs to be allocated. */
|
|
||||||
if (new_pool_required)
|
|
||||||
need_alloc = true;
|
|
||||||
|
|
||||||
printk_deferred_exit();
|
|
||||||
read_unlock_irqrestore(&pool_rwlock, flags);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Allocate memory for a new pool if required now:
|
* Allocate memory for a new pool if required now:
|
||||||
* we won't be able to do that under the lock.
|
* we won't be able to do that under the lock.
|
||||||
*/
|
*/
|
||||||
if (unlikely(can_alloc && need_alloc)) {
|
if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
|
||||||
/*
|
/*
|
||||||
* Zero out zone modifiers, as we don't have specific zone
|
* Zero out zone modifiers, as we don't have specific zone
|
||||||
* requirements. Keep the flags related to allocation in atomic
|
* requirements. Keep the flags related to allocation in atomic
|
||||||
|
@ -543,31 +674,36 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
|
||||||
prealloc = page_address(page);
|
prealloc = page_address(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
write_lock_irqsave(&pool_rwlock, flags);
|
raw_spin_lock_irqsave(&pool_lock, flags);
|
||||||
printk_deferred_enter();
|
printk_deferred_enter();
|
||||||
|
|
||||||
found = find_stack(bucket, entries, nr_entries, hash);
|
/* Try to find again, to avoid concurrently inserting duplicates. */
|
||||||
|
found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
|
||||||
if (!found) {
|
if (!found) {
|
||||||
struct stack_record *new =
|
struct stack_record *new =
|
||||||
depot_alloc_stack(entries, nr_entries, hash, &prealloc);
|
depot_alloc_stack(entries, nr_entries, hash, &prealloc);
|
||||||
|
|
||||||
if (new) {
|
if (new) {
|
||||||
list_add(&new->list, bucket);
|
/*
|
||||||
|
* This releases the stack record into the bucket and
|
||||||
|
* makes it visible to readers in find_stack().
|
||||||
|
*/
|
||||||
|
list_add_rcu(&new->hash_list, bucket);
|
||||||
found = new;
|
found = new;
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
if (depot_flags & STACK_DEPOT_FLAG_GET)
|
|
||||||
refcount_inc(&found->count);
|
if (prealloc) {
|
||||||
/*
|
/*
|
||||||
* Stack depot already contains this stack trace, but let's
|
* Either stack depot already contains this stack trace, or
|
||||||
* keep the preallocated memory for future.
|
* depot_alloc_stack() did not consume the preallocated memory.
|
||||||
|
* Try to keep the preallocated memory for future.
|
||||||
*/
|
*/
|
||||||
if (prealloc)
|
depot_keep_new_pool(&prealloc);
|
||||||
depot_keep_new_pool(&prealloc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
printk_deferred_exit();
|
printk_deferred_exit();
|
||||||
write_unlock_irqrestore(&pool_rwlock, flags);
|
raw_spin_unlock_irqrestore(&pool_lock, flags);
|
||||||
exit:
|
exit:
|
||||||
if (prealloc) {
|
if (prealloc) {
|
||||||
/* Stack depot didn't use this memory, free it. */
|
/* Stack depot didn't use this memory, free it. */
|
||||||
|
@ -592,7 +728,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
|
||||||
unsigned long **entries)
|
unsigned long **entries)
|
||||||
{
|
{
|
||||||
struct stack_record *stack;
|
struct stack_record *stack;
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
*entries = NULL;
|
*entries = NULL;
|
||||||
/*
|
/*
|
||||||
|
@ -604,13 +739,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
|
||||||
if (!handle || stack_depot_disabled)
|
if (!handle || stack_depot_disabled)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
read_lock_irqsave(&pool_rwlock, flags);
|
|
||||||
printk_deferred_enter();
|
|
||||||
|
|
||||||
stack = depot_fetch_stack(handle);
|
stack = depot_fetch_stack(handle);
|
||||||
|
/*
|
||||||
printk_deferred_exit();
|
* Should never be NULL, otherwise this is a use-after-put (or just a
|
||||||
read_unlock_irqrestore(&pool_rwlock, flags);
|
* corrupt handle).
|
||||||
|
*/
|
||||||
|
if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
|
||||||
|
return 0;
|
||||||
|
|
||||||
*entries = stack->entries;
|
*entries = stack->entries;
|
||||||
return stack->size;
|
return stack->size;
|
||||||
|
@ -620,29 +755,20 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch);
|
||||||
void stack_depot_put(depot_stack_handle_t handle)
|
void stack_depot_put(depot_stack_handle_t handle)
|
||||||
{
|
{
|
||||||
struct stack_record *stack;
|
struct stack_record *stack;
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
if (!handle || stack_depot_disabled)
|
if (!handle || stack_depot_disabled)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
write_lock_irqsave(&pool_rwlock, flags);
|
|
||||||
printk_deferred_enter();
|
|
||||||
|
|
||||||
stack = depot_fetch_stack(handle);
|
stack = depot_fetch_stack(handle);
|
||||||
if (WARN_ON(!stack))
|
/*
|
||||||
goto out;
|
* Should always be able to find the stack record, otherwise this is an
|
||||||
|
* unbalanced put attempt (or corrupt handle).
|
||||||
|
*/
|
||||||
|
if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
|
||||||
|
return;
|
||||||
|
|
||||||
if (refcount_dec_and_test(&stack->count)) {
|
if (refcount_dec_and_test(&stack->count))
|
||||||
/* Unlink stack from the hash table. */
|
|
||||||
list_del(&stack->list);
|
|
||||||
|
|
||||||
/* Free stack. */
|
|
||||||
depot_free_stack(stack);
|
depot_free_stack(stack);
|
||||||
}
|
|
||||||
|
|
||||||
out:
|
|
||||||
printk_deferred_exit();
|
|
||||||
write_unlock_irqrestore(&pool_rwlock, flags);
|
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(stack_depot_put);
|
EXPORT_SYMBOL_GPL(stack_depot_put);
|
||||||
|
|
||||||
|
@ -690,3 +816,30 @@ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
|
||||||
return parts.extra;
|
return parts.extra;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(stack_depot_get_extra_bits);
|
EXPORT_SYMBOL(stack_depot_get_extra_bits);
|
||||||
|
|
||||||
|
static int stats_show(struct seq_file *seq, void *v)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* data race ok: These are just statistics counters, and approximate
|
||||||
|
* statistics are ok for debugging.
|
||||||
|
*/
|
||||||
|
seq_printf(seq, "pools: %d\n", data_race(pools_num));
|
||||||
|
for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
|
||||||
|
seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
DEFINE_SHOW_ATTRIBUTE(stats);
|
||||||
|
|
||||||
|
static int depot_debugfs_init(void)
|
||||||
|
{
|
||||||
|
struct dentry *dir;
|
||||||
|
|
||||||
|
if (stack_depot_disabled)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
dir = debugfs_create_dir("stackdepot", NULL);
|
||||||
|
debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
late_initcall(depot_debugfs_init);
|
||||||
|
|
|
@ -37,6 +37,7 @@
|
||||||
#include <linux/page_owner.h>
|
#include <linux/page_owner.h>
|
||||||
#include <linux/sched/sysctl.h>
|
#include <linux/sched/sysctl.h>
|
||||||
#include <linux/memory-tiers.h>
|
#include <linux/memory-tiers.h>
|
||||||
|
#include <linux/compat.h>
|
||||||
|
|
||||||
#include <asm/tlb.h>
|
#include <asm/tlb.h>
|
||||||
#include <asm/pgalloc.h>
|
#include <asm/pgalloc.h>
|
||||||
|
@ -809,7 +810,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
|
||||||
{
|
{
|
||||||
loff_t off_end = off + len;
|
loff_t off_end = off + len;
|
||||||
loff_t off_align = round_up(off, size);
|
loff_t off_align = round_up(off, size);
|
||||||
unsigned long len_pad, ret;
|
unsigned long len_pad, ret, off_sub;
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
|
||||||
|
return 0;
|
||||||
|
|
||||||
if (off_end <= off_align || (off_end - off_align) < size)
|
if (off_end <= off_align || (off_end - off_align) < size)
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -835,7 +839,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
|
||||||
if (ret == addr)
|
if (ret == addr)
|
||||||
return addr;
|
return addr;
|
||||||
|
|
||||||
ret += (off - ret) & (size - 1);
|
off_sub = (off - ret) & (size - 1);
|
||||||
|
|
||||||
|
if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
|
||||||
|
!off_sub)
|
||||||
|
return ret + size;
|
||||||
|
|
||||||
|
ret += off_sub;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2437,7 +2447,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
|
||||||
page = pmd_page(old_pmd);
|
page = pmd_page(old_pmd);
|
||||||
folio = page_folio(page);
|
folio = page_folio(page);
|
||||||
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
|
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
|
||||||
folio_set_dirty(folio);
|
folio_mark_dirty(folio);
|
||||||
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
|
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
|
||||||
folio_set_referenced(folio);
|
folio_set_referenced(folio);
|
||||||
folio_remove_rmap_pmd(folio, page, vma);
|
folio_remove_rmap_pmd(folio, page, vma);
|
||||||
|
@ -3563,7 +3573,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pmd_dirty(pmdval))
|
if (pmd_dirty(pmdval))
|
||||||
folio_set_dirty(folio);
|
folio_mark_dirty(folio);
|
||||||
if (pmd_write(pmdval))
|
if (pmd_write(pmdval))
|
||||||
entry = make_writable_migration_entry(page_to_pfn(page));
|
entry = make_writable_migration_entry(page_to_pfn(page));
|
||||||
else if (anon_exclusive)
|
else if (anon_exclusive)
|
||||||
|
|
|
@ -2623,8 +2623,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Scheduled by try_charge() to be executed from the userland return path
|
* Reclaims memory over the high limit. Called directly from
|
||||||
* and reclaims memory over the high limit.
|
* try_charge() (context permitting), as well as from the userland
|
||||||
|
* return path where reclaim is always able to block.
|
||||||
*/
|
*/
|
||||||
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
|
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
|
||||||
{
|
{
|
||||||
|
@ -2643,6 +2644,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
|
||||||
current->memcg_nr_pages_over_high = 0;
|
current->memcg_nr_pages_over_high = 0;
|
||||||
|
|
||||||
retry_reclaim:
|
retry_reclaim:
|
||||||
|
/*
|
||||||
|
* Bail if the task is already exiting. Unlike memory.max,
|
||||||
|
* memory.high enforcement isn't as strict, and there is no
|
||||||
|
* OOM killer involved, which means the excess could already
|
||||||
|
* be much bigger (and still growing) than it could for
|
||||||
|
* memory.max; the dying task could get stuck in fruitless
|
||||||
|
* reclaim for a long time, which isn't desirable.
|
||||||
|
*/
|
||||||
|
if (task_is_dying())
|
||||||
|
goto out;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The allocating task should reclaim at least the batch size, but for
|
* The allocating task should reclaim at least the batch size, but for
|
||||||
* subsequent retries we only want to do what's necessary to prevent oom
|
* subsequent retries we only want to do what's necessary to prevent oom
|
||||||
|
@ -2693,6 +2705,9 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* Reclaim didn't manage to push usage below the limit, slow
|
||||||
|
* this allocating task down.
|
||||||
|
*
|
||||||
* If we exit early, we're guaranteed to die (since
|
* If we exit early, we're guaranteed to die (since
|
||||||
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
|
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
|
||||||
* need to account for any ill-begotten jiffies to pay them off later.
|
* need to account for any ill-begotten jiffies to pay them off later.
|
||||||
|
@ -2887,11 +2902,17 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||||
}
|
}
|
||||||
} while ((memcg = parent_mem_cgroup(memcg)));
|
} while ((memcg = parent_mem_cgroup(memcg)));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reclaim is set up above to be called from the userland
|
||||||
|
* return path. But also attempt synchronous reclaim to avoid
|
||||||
|
* excessive overrun while the task is still inside the
|
||||||
|
* kernel. If this is successful, the return path will see it
|
||||||
|
* when it rechecks the overage and simply bail out.
|
||||||
|
*/
|
||||||
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
|
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
|
||||||
!(current->flags & PF_MEMALLOC) &&
|
!(current->flags & PF_MEMALLOC) &&
|
||||||
gfpflags_allow_blocking(gfp_mask)) {
|
gfpflags_allow_blocking(gfp_mask))
|
||||||
mem_cgroup_handle_over_high(gfp_mask);
|
mem_cgroup_handle_over_high(gfp_mask);
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -982,7 +982,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
|
||||||
int count = page_count(p) - 1;
|
int count = page_count(p) - 1;
|
||||||
|
|
||||||
if (extra_pins)
|
if (extra_pins)
|
||||||
count -= 1;
|
count -= folio_nr_pages(page_folio(p));
|
||||||
|
|
||||||
if (count > 0) {
|
if (count > 0) {
|
||||||
pr_err("%#lx: %s still referenced by %d users\n",
|
pr_err("%#lx: %s still referenced by %d users\n",
|
||||||
|
|
|
@ -1464,7 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||||
delay_rmap = 0;
|
delay_rmap = 0;
|
||||||
if (!folio_test_anon(folio)) {
|
if (!folio_test_anon(folio)) {
|
||||||
if (pte_dirty(ptent)) {
|
if (pte_dirty(ptent)) {
|
||||||
folio_set_dirty(folio);
|
folio_mark_dirty(folio);
|
||||||
if (tlb_delay_rmap(tlb)) {
|
if (tlb_delay_rmap(tlb)) {
|
||||||
delay_rmap = 1;
|
delay_rmap = 1;
|
||||||
force_flush = 1;
|
force_flush = 1;
|
||||||
|
|
|
@ -1825,15 +1825,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
|
||||||
/*
|
/*
|
||||||
* mmap_region() will call shmem_zero_setup() to create a file,
|
* mmap_region() will call shmem_zero_setup() to create a file,
|
||||||
* so use shmem's get_unmapped_area in case it can be huge.
|
* so use shmem's get_unmapped_area in case it can be huge.
|
||||||
* do_mmap() will clear pgoff, so match alignment.
|
|
||||||
*/
|
*/
|
||||||
pgoff = 0;
|
|
||||||
get_area = shmem_get_unmapped_area;
|
get_area = shmem_get_unmapped_area;
|
||||||
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||||
/* Ensures that larger anonymous mappings are THP aligned. */
|
/* Ensures that larger anonymous mappings are THP aligned. */
|
||||||
get_area = thp_get_unmapped_area;
|
get_area = thp_get_unmapped_area;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Always treat pgoff as zero for anonymous memory. */
|
||||||
|
if (!file)
|
||||||
|
pgoff = 0;
|
||||||
|
|
||||||
addr = get_area(file, addr, len, pgoff, flags);
|
addr = get_area(file, addr, len, pgoff, flags);
|
||||||
if (IS_ERR_VALUE(addr))
|
if (IS_ERR_VALUE(addr))
|
||||||
return addr;
|
return addr;
|
||||||
|
|
|
@ -1638,7 +1638,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
|
||||||
*/
|
*/
|
||||||
dtc->wb_thresh = __wb_calc_thresh(dtc);
|
dtc->wb_thresh = __wb_calc_thresh(dtc);
|
||||||
dtc->wb_bg_thresh = dtc->thresh ?
|
dtc->wb_bg_thresh = dtc->thresh ?
|
||||||
div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
|
div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In order to avoid the stacked BDI deadlock we need
|
* In order to avoid the stacked BDI deadlock we need
|
||||||
|
|
|
@ -469,7 +469,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
|
||||||
|
|
||||||
if (!folio)
|
if (!folio)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
mark = round_up(mark, 1UL << order);
|
mark = round_down(mark, 1UL << order);
|
||||||
if (index == mark)
|
if (index == mark)
|
||||||
folio_set_readahead(folio);
|
folio_set_readahead(folio);
|
||||||
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
|
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
|
||||||
|
@ -575,7 +575,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
|
||||||
* It's the expected callback index, assume sequential access.
|
* It's the expected callback index, assume sequential access.
|
||||||
* Ramp up sizes, and push forward the readahead window.
|
* Ramp up sizes, and push forward the readahead window.
|
||||||
*/
|
*/
|
||||||
expected = round_up(ra->start + ra->size - ra->async_size,
|
expected = round_down(ra->start + ra->size - ra->async_size,
|
||||||
1UL << order);
|
1UL << order);
|
||||||
if (index == expected || index == (ra->start + ra->size)) {
|
if (index == expected || index == (ra->start + ra->size)) {
|
||||||
ra->start += ra->size;
|
ra->start += ra->size;
|
||||||
|
|
|
@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
|
||||||
unsigned long dst_start,
|
unsigned long dst_start,
|
||||||
unsigned long src_start,
|
unsigned long src_start,
|
||||||
unsigned long len,
|
unsigned long len,
|
||||||
|
atomic_t *mmap_changing,
|
||||||
uffd_flags_t flags)
|
uffd_flags_t flags)
|
||||||
{
|
{
|
||||||
struct mm_struct *dst_mm = dst_vma->vm_mm;
|
struct mm_struct *dst_mm = dst_vma->vm_mm;
|
||||||
|
@ -472,6 +473,15 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
mmap_read_lock(dst_mm);
|
mmap_read_lock(dst_mm);
|
||||||
|
/*
|
||||||
|
* If memory mappings are changing because of non-cooperative
|
||||||
|
* operation (e.g. mremap) running in parallel, bail out and
|
||||||
|
* request the user to retry later
|
||||||
|
*/
|
||||||
|
if (mmap_changing && atomic_read(mmap_changing)) {
|
||||||
|
err = -EAGAIN;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
dst_vma = NULL;
|
dst_vma = NULL;
|
||||||
goto retry;
|
goto retry;
|
||||||
|
@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
|
||||||
unsigned long dst_start,
|
unsigned long dst_start,
|
||||||
unsigned long src_start,
|
unsigned long src_start,
|
||||||
unsigned long len,
|
unsigned long len,
|
||||||
|
atomic_t *mmap_changing,
|
||||||
uffd_flags_t flags);
|
uffd_flags_t flags);
|
||||||
#endif /* CONFIG_HUGETLB_PAGE */
|
#endif /* CONFIG_HUGETLB_PAGE */
|
||||||
|
|
||||||
|
@ -622,8 +633,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
|
||||||
* If this is a HUGETLB vma, pass off to appropriate routine
|
* If this is a HUGETLB vma, pass off to appropriate routine
|
||||||
*/
|
*/
|
||||||
if (is_vm_hugetlb_page(dst_vma))
|
if (is_vm_hugetlb_page(dst_vma))
|
||||||
return mfill_atomic_hugetlb(dst_vma, dst_start,
|
return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
|
||||||
src_start, len, flags);
|
len, mmap_changing, flags);
|
||||||
|
|
||||||
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
|
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/bin/sh
|
#!/bin/bash
|
||||||
# SPDX-License-Identifier: GPL-2.0
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
# Kselftest framework requirement - SKIP code is 4.
|
# Kselftest framework requirement - SKIP code is 4.
|
||||||
|
|
|
@ -566,7 +566,7 @@ static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot,
|
||||||
if (map_ptr_orig == MAP_FAILED)
|
if (map_ptr_orig == MAP_FAILED)
|
||||||
err(2, "initial mmap");
|
err(2, "initial mmap");
|
||||||
|
|
||||||
if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
|
if (madvise(map_ptr, len, MADV_HUGEPAGE))
|
||||||
err(2, "MADV_HUGEPAGE");
|
err(2, "MADV_HUGEPAGE");
|
||||||
|
|
||||||
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
|
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
|
#include "vm_util.h"
|
||||||
|
|
||||||
#define LENGTH (256UL*1024*1024)
|
#define LENGTH (256UL*1024*1024)
|
||||||
#define PROTECTION (PROT_READ | PROT_WRITE)
|
#define PROTECTION (PROT_READ | PROT_WRITE)
|
||||||
|
@ -58,10 +59,16 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
void *addr;
|
void *addr;
|
||||||
int ret;
|
int ret;
|
||||||
|
size_t hugepage_size;
|
||||||
size_t length = LENGTH;
|
size_t length = LENGTH;
|
||||||
int flags = FLAGS;
|
int flags = FLAGS;
|
||||||
int shift = 0;
|
int shift = 0;
|
||||||
|
|
||||||
|
hugepage_size = default_huge_page_size();
|
||||||
|
/* munmap with fail if the length is not page aligned */
|
||||||
|
if (hugepage_size > length)
|
||||||
|
length = hugepage_size;
|
||||||
|
|
||||||
if (argc > 1)
|
if (argc > 1)
|
||||||
length = atol(argv[1]) << 20;
|
length = atol(argv[1]) << 20;
|
||||||
if (argc > 2) {
|
if (argc > 2) {
|
||||||
|
|
|
@ -360,7 +360,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
|
||||||
char pattern_seed)
|
char pattern_seed)
|
||||||
{
|
{
|
||||||
void *addr, *src_addr, *dest_addr, *dest_preamble_addr;
|
void *addr, *src_addr, *dest_addr, *dest_preamble_addr;
|
||||||
unsigned long long i;
|
int d;
|
||||||
|
unsigned long long t;
|
||||||
struct timespec t_start = {0, 0}, t_end = {0, 0};
|
struct timespec t_start = {0, 0}, t_end = {0, 0};
|
||||||
long long start_ns, end_ns, align_mask, ret, offset;
|
long long start_ns, end_ns, align_mask, ret, offset;
|
||||||
unsigned long long threshold;
|
unsigned long long threshold;
|
||||||
|
@ -378,8 +379,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
|
||||||
|
|
||||||
/* Set byte pattern for source block. */
|
/* Set byte pattern for source block. */
|
||||||
srand(pattern_seed);
|
srand(pattern_seed);
|
||||||
for (i = 0; i < threshold; i++)
|
for (t = 0; t < threshold; t++)
|
||||||
memset((char *) src_addr + i, (char) rand(), 1);
|
memset((char *) src_addr + t, (char) rand(), 1);
|
||||||
|
|
||||||
/* Mask to zero out lower bits of address for alignment */
|
/* Mask to zero out lower bits of address for alignment */
|
||||||
align_mask = ~(c.dest_alignment - 1);
|
align_mask = ~(c.dest_alignment - 1);
|
||||||
|
@ -420,8 +421,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
|
||||||
|
|
||||||
/* Set byte pattern for the dest preamble block. */
|
/* Set byte pattern for the dest preamble block. */
|
||||||
srand(pattern_seed);
|
srand(pattern_seed);
|
||||||
for (i = 0; i < c.dest_preamble_size; i++)
|
for (d = 0; d < c.dest_preamble_size; d++)
|
||||||
memset((char *) dest_preamble_addr + i, (char) rand(), 1);
|
memset((char *) dest_preamble_addr + d, (char) rand(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
clock_gettime(CLOCK_MONOTONIC, &t_start);
|
clock_gettime(CLOCK_MONOTONIC, &t_start);
|
||||||
|
@ -437,14 +438,14 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
|
||||||
|
|
||||||
/* Verify byte pattern after remapping */
|
/* Verify byte pattern after remapping */
|
||||||
srand(pattern_seed);
|
srand(pattern_seed);
|
||||||
for (i = 0; i < threshold; i++) {
|
for (t = 0; t < threshold; t++) {
|
||||||
char c = (char) rand();
|
char c = (char) rand();
|
||||||
|
|
||||||
if (((char *) dest_addr)[i] != c) {
|
if (((char *) dest_addr)[t] != c) {
|
||||||
ksft_print_msg("Data after remap doesn't match at offset %llu\n",
|
ksft_print_msg("Data after remap doesn't match at offset %llu\n",
|
||||||
i);
|
t);
|
||||||
ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
|
ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
|
||||||
((char *) dest_addr)[i] & 0xff);
|
((char *) dest_addr)[t] & 0xff);
|
||||||
ret = -1;
|
ret = -1;
|
||||||
goto clean_up_dest;
|
goto clean_up_dest;
|
||||||
}
|
}
|
||||||
|
@ -453,14 +454,14 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
|
||||||
/* Verify the dest preamble byte pattern after remapping */
|
/* Verify the dest preamble byte pattern after remapping */
|
||||||
if (c.dest_preamble_size) {
|
if (c.dest_preamble_size) {
|
||||||
srand(pattern_seed);
|
srand(pattern_seed);
|
||||||
for (i = 0; i < c.dest_preamble_size; i++) {
|
for (d = 0; d < c.dest_preamble_size; d++) {
|
||||||
char c = (char) rand();
|
char c = (char) rand();
|
||||||
|
|
||||||
if (((char *) dest_preamble_addr)[i] != c) {
|
if (((char *) dest_preamble_addr)[d] != c) {
|
||||||
ksft_print_msg("Preamble data after remap doesn't match at offset %d\n",
|
ksft_print_msg("Preamble data after remap doesn't match at offset %d\n",
|
||||||
i);
|
d);
|
||||||
ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
|
ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
|
||||||
((char *) dest_preamble_addr)[i] & 0xff);
|
((char *) dest_preamble_addr)[d] & 0xff);
|
||||||
ret = -1;
|
ret = -1;
|
||||||
goto clean_up_dest;
|
goto clean_up_dest;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,9 +29,15 @@ check_supported_x86_64()
|
||||||
# See man 1 gzip under '-f'.
|
# See man 1 gzip under '-f'.
|
||||||
local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
|
local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
|
||||||
|
|
||||||
|
local cpu_supports_pl5=$(awk '/^flags/ {if (/la57/) {print 0;}
|
||||||
|
else {print 1}; exit}' /proc/cpuinfo 2>/dev/null)
|
||||||
|
|
||||||
if [[ "${pg_table_levels}" -lt 5 ]]; then
|
if [[ "${pg_table_levels}" -lt 5 ]]; then
|
||||||
echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
|
echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
|
||||||
exit $ksft_skip
|
exit $ksft_skip
|
||||||
|
elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then
|
||||||
|
echo "$0: CPU does not have the necessary la57 flag to support page table level 5"
|
||||||
|
exit $ksft_skip
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/bin/sh
|
#!/bin/bash
|
||||||
# SPDX-License-Identifier: GPL-2.0
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
Loading…
Reference in New Issue
Block a user