Preempt RT cleanups:

Introduce preempt_[dis|en]able_nested() and use it to clean up
  various places which have open coded PREEMPT_RT conditionals.
 
  On PREEMPT_RT enabled kernels, spinlocks and rwlocks are neither disabling
  preemption nor interrupts. Though there are a few places which depend on
  the implicit preemption/interrupt disable of those locks, e.g. seqcount
  write sections, per CPU statistics updates etc.
 
  PREEMPT_RT added open coded CONFIG_PREEMPT_RT conditionals to
  disable/enable preemption in the related code parts all over the
  place. That's hard to read and does not really explain why this is
  necessary.
 
  Linus suggested to use helper functions (preempt_disable_nested() and
  preempt_enable_nested()) and use those in the affected places. On !RT
  enabled kernels these functions are NOPs, but contain a lockdep assert to
  validate that preemption is actually disabled to catch call sites which
  do not have preemption disabled.
 
  Clean up the affected code paths in mm, dentry and lib.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmM9c8MTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYobrrEADHkvkCUHxRlarfinQY2rxEpC4nbnAg
 ibg+LWpDpqqZwkjADExu6+lsbb0mCdvlFyvSPwY2YcQAkj/bkTAXvdf3KjejTl++
 B1J5/Cr5lyyKjajjl1efxdORgATBvwuEjR2moJiU868ZR3K4vgflN9n51A0U+NAn
 3kOj/TYotFlyDNJeoK/8edqZwKaueXs3fsYGC1aq2X8mQLI4QDeaHUR6R8CU4w+X
 bVSIdKNluIYxyc3Eav5sDwzyF6gOSL+9DtZcVyXxJ6+PrkDdkptO23derVHk19WE
 ymdAwVX6S37L6HNhJgqeScs+s3xD8KDmvu5ktEAtqC0unBP8JwOFZKCZaaYj91j3
 iMjMC4UFcXI5sERWhDXTSja2g0pYV6q3myfYfojxe6xXHlrVs42gCzDpOI4LZncM
 lvPfmhb7JR7zEmBEvVyEOX8B16ecWnUqgihU17a3ogGdKW1PRNWcWj3RmNXDmpGD
 YZsZSfsawMSJsDIrNRCydXrsiFBNIoVStN7K7c+blnNV8ER5rt24dqCJyUhrl4fB
 K8hNvDp+T8N0f6nlIUWk42vjhskEo2ijCnpvHSXQc1UL7WmLfaJf3/T9zlufPwqJ
 7yVuWd9vZIb3iVAKz+LqOzLlHcgeJmYlbSBsj+Ay1UHPsNgYulDEKcuNniVoG39u
 zFgHu3OmIRueHA==
 =3M58
 -----END PGP SIGNATURE-----

Merge tag 'sched-rt-2022-10-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull preempt RT updates from Thomas Gleixner:
 "Introduce preempt_[dis|en]able_nested() and use it to clean up various
  places which have open coded PREEMPT_RT conditionals.

  On PREEMPT_RT enabled kernels, spinlocks and rwlocks are neither
  disabling preemption nor interrupts. Though there are a few places
  which depend on the implicit preemption/interrupt disable of those
  locks, e.g. seqcount write sections, per CPU statistics updates etc.

  PREEMPT_RT added open coded CONFIG_PREEMPT_RT conditionals to
  disable/enable preemption in the related code parts all over the
  place. That's hard to read and does not really explain why this is
  necessary.

  Linus suggested to use helper functions (preempt_disable_nested() and
  preempt_enable_nested()) and use those in the affected places. On !RT
  enabled kernels these functions are NOPs, but contain a lockdep assert
  to validate that preemption is actually disabled to catch call sites
  which do not have preemption disabled.

  Clean up the affected code paths in mm, dentry and lib"

* tag 'sched-rt-2022-10-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  u64_stats: Streamline the implementation
  flex_proportions: Disable preemption entering the write section.
  mm/compaction: Get rid of RT ifdeffery
  mm/memcontrol: Replace the PREEMPT_RT conditionals
  mm/debug: Provide VM_WARN_ON_IRQS_ENABLED()
  mm/vmstat: Use preempt_[dis|en]able_nested()
  dentry: Use preempt_[dis|en]able_nested()
  preempt: Provide preempt_[dis|en]able_nested()
This commit is contained in:
Linus Torvalds 2022-10-10 10:03:24 -07:00
commit 7f6dcffb44
10 changed files with 145 additions and 135 deletions

View file

@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
/*
* The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT
* kernels spin_lock() implicitly disables preemption, but not on
* PREEMPT_RT. So for RT it has to be done explicitly to protect
* the sequence count write side critical section against a reader
* or another writer preempting, which would result in a live lock.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
for (;;) {
unsigned n = dir->i_dir_seq;
if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
wait_queue_head_t *d_wait)
{
smp_store_release(&dir->i_dir_seq, n + 2);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
wake_up_all(d_wait);
}

View file

@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm);
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif
#ifdef CONFIG_DEBUG_VM_IRQSOFF
#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled())
#else
#define VM_WARN_ON_IRQS_ENABLED() do { } while (0)
#endif
#ifdef CONFIG_DEBUG_VIRTUAL
#define VIRTUAL_BUG_ON(cond) BUG_ON(cond)
#else

View file

@ -421,4 +421,46 @@ static inline void migrate_enable(void) { }
#endif /* CONFIG_SMP */
/**
* preempt_disable_nested - Disable preemption inside a normally preempt disabled section
*
* Use for code which requires preemption protection inside a critical
* section which has preemption disabled implicitly on non-PREEMPT_RT
* enabled kernels, by e.g.:
* - holding a spinlock/rwlock
* - soft interrupt context
* - regular interrupt handlers
*
* On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft
* interrupt context and regular interrupt handlers are preemptible and
* only prevent migration. preempt_disable_nested() ensures that preemption
* is disabled for cases which require CPU local serialization even on
* PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP.
*
* The use cases are code sequences which are not serialized by a
* particular lock instance, e.g.:
* - seqcount write side critical sections where the seqcount is not
* associated to a particular lock and therefore the automatic
* protection mechanism does not work. This prevents a live lock
* against a preempting high priority reader.
* - RMW per CPU variable updates like vmstat.
*/
/* Macro to avoid header recursion hell vs. lockdep */
#define preempt_disable_nested()					\
do {									\
	/* Non-RT: nothing to disable, only verify the caller's context. */ \
	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {				\
		lockdep_assert_preemption_disabled();			\
	} else {							\
		/* RT: the enclosing section is preemptible, disable for real. */ \
		preempt_disable();					\
	}								\
} while (0)
/**
* preempt_enable_nested - Undo the effect of preempt_disable_nested()
*/
static __always_inline void preempt_enable_nested(void)
{
	/*
	 * Without PREEMPT_RT, preempt_disable_nested() only asserts and does
	 * not touch the preemption state, so there is nothing to undo.
	 */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	preempt_enable();
}
#endif /* __LINUX_PREEMPT_H */

View file

@ -8,7 +8,7 @@
*
* Key points :
*
* - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP.
* - Use a seqcount on 32-bit
* - The whole thing is a no-op on 64-bit architectures.
*
* Usage constraints:
@ -20,7 +20,8 @@
* writer and also spin forever.
*
* 3) Write side must use the _irqsave() variant if other writers, or a reader,
* can be invoked from an IRQ context.
* can be invoked from an IRQ context. On 64bit systems this variant does not
* disable interrupts.
*
* 4) If reader fetches several counters, there is no guarantee the whole values
* are consistent w.r.t. each other (remember point #2: seqcounts are not
@ -29,11 +30,6 @@
* 5) Readers are allowed to sleep or be preempted/interrupted: they perform
* pure reads.
*
* 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats
* might be updated from a hardirq or softirq context (remember point #1:
* seqcounts are not used for UP kernels). 32-bit UP stat readers could read
* corrupted 64-bit values otherwise.
*
* Usage :
*
* Stats producer (writer) should use following template granted it already got
@ -66,7 +62,7 @@
#include <linux/seqlock.h>
struct u64_stats_sync {
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
#if BITS_PER_LONG == 32
seqcount_t seq;
#endif
};
@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p)
local64_inc(&p->v);
}
#else
/*
 * No-op variants of the u64_stats synchronisation helpers: nothing is
 * initialised, no update section is opened or closed, no interrupt state is
 * saved or restored, and a reader is never asked to retry.
 * NOTE(review): presumably this is the 64-bit branch of the header, where
 * the counters need no sequence count -- confirm against the full file.
 */
static inline void u64_stats_init(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { }
/* Interrupts are left untouched here; the returned 0 is only a placeholder. */
static inline unsigned long __u64_stats_irqsave(void) { return 0; }
static inline void __u64_stats_irqrestore(unsigned long flags) { }
/* No sequence counter is sampled in this variant; the start value is always 0. */
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
return 0;
}
/* Always false: both arguments are ignored and no retry is ever requested. */
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
return false;
}
#else /* 64 bit */
typedef struct {
u64 v;
@ -123,123 +134,95 @@ static inline void u64_stats_inc(u64_stats_t *p)
{
p->v++;
}
#endif
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq)
#else
/*
 * Initialise the sequence counter embedded in @syncp via seqcount_init().
 * This is the variant used when struct u64_stats_sync carries a seqcount.
 */
static inline void u64_stats_init(struct u64_stats_sync *syncp)
{
seqcount_init(&syncp->seq);
}
#endif
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
write_seqcount_begin(&syncp->seq);
#endif
}
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
write_seqcount_end(&syncp->seq);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
#endif
preempt_enable_nested();
}
static inline unsigned long
u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
static inline unsigned long __u64_stats_irqsave(void)
{
unsigned long flags = 0;
unsigned long flags;
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
else
local_irq_save(flags);
write_seqcount_begin(&syncp->seq);
#endif
local_irq_save(flags);
return flags;
}
static inline void
u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
unsigned long flags)
static inline void __u64_stats_irqrestore(unsigned long flags)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
write_seqcount_end(&syncp->seq);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
else
local_irq_restore(flags);
#endif
local_irq_restore(flags);
}
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
return read_seqcount_begin(&syncp->seq);
#else
return 0;
#endif
}
/*
 * Reader-side check for the seqcount variant.
 * @syncp: sync object whose sequence counter is examined
 * @start: sequence value to compare against (presumably the value returned
 *         by the matching __u64_stats_fetch_begin() -- confirm at call sites)
 *
 * Returns whatever read_seqcount_retry() reports for @syncp->seq and @start.
 */
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
return read_seqcount_retry(&syncp->seq, start);
}
#endif /* !64 bit */
/*
 * Writer-side entry point: open an update section on @syncp.
 * Simply forwards to __u64_stats_update_begin(), so the behaviour is that of
 * whichever variant of the helper is compiled in.
 */
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{
__u64_stats_update_begin(syncp);
}
/*
 * Writer-side exit point: close the update section on @syncp.
 * Simply forwards to __u64_stats_update_end().
 */
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
{
__u64_stats_update_end(syncp);
}
static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
{
	unsigned long irq_flags;

	/* Capture the interrupt state first, then open the update section. */
	irq_flags = __u64_stats_irqsave();
	__u64_stats_update_begin(syncp);

	/* Handed back so the caller can pass it to the _irqrestore() side. */
	return irq_flags;
}
/*
 * Counterpart of u64_stats_update_begin_irqsave().
 * @syncp: sync object whose update section is being closed
 * @flags: value previously returned by u64_stats_update_begin_irqsave()
 *
 * The steps run in the reverse order of the begin side: the update section
 * is closed first, then the saved interrupt state is restored.
 */
static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
unsigned long flags)
{
__u64_stats_update_end(syncp);
__u64_stats_irqrestore(flags);
}
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
preempt_disable();
#endif
return __u64_stats_fetch_begin(syncp);
}
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
return read_seqcount_retry(&syncp->seq, start);
#else
return false;
#endif
}
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
preempt_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
}
/*
* In case irq handlers can update u64 counters, readers can use following helpers
* - SMP 32bit arches use seqcount protection, irq safe.
* - UP 32bit must disable irqs.
* - 64bit have no problem atomically reading u64 values, irq safe.
*/
/* Obsolete interfaces */
static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
preempt_disable();
#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
local_irq_disable();
#endif
return __u64_stats_fetch_begin(syncp);
return u64_stats_fetch_begin(syncp);
}
static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
preempt_enable();
#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
local_irq_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
return u64_stats_fetch_retry(syncp, start);
}
#endif /* _LINUX_U64_STATS_SYNC_H */

View file

@ -805,6 +805,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE
An architecture should select this when it can successfully
build and run DEBUG_VM_PGTABLE.
config DEBUG_VM_IRQSOFF
def_bool DEBUG_VM && !PREEMPT_RT
config DEBUG_VM
bool "Debug VM"
depends on DEBUG_KERNEL

View file

@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods)
*/
if (events <= 1)
return false;
preempt_disable_nested();
write_seqcount_begin(&p->sequence);
if (periods < 64)
events -= events >> periods;
@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods)
percpu_counter_add(&p->events, -events);
p->period += periods;
write_seqcount_end(&p->sequence);
preempt_enable_nested();
return true;
}

View file

@ -579,6 +579,12 @@ config COMPACTION
it and then we would be really interested to hear about that at
linux-mm@kvack.org.
config COMPACT_UNEVICTABLE_DEFAULT
int
depends on COMPACTION
default 0 if PREEMPT_RT
default 1
#
# support for free page reporting
config PAGE_REPORTING

View file

@ -1727,11 +1727,7 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
#ifdef CONFIG_PREEMPT_RT
int sysctl_compact_unevictable_allowed __read_mostly = 0;
#else
int sysctl_compact_unevictable_allowed __read_mostly = 1;
#endif
int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)

View file

@ -597,25 +597,18 @@ static u64 flush_next_time;
*/
static void memcg_stats_lock(void)
{
#ifdef CONFIG_PREEMPT_RT
preempt_disable();
#else
VM_BUG_ON(!irqs_disabled());
#endif
preempt_disable_nested();
VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
#ifdef CONFIG_PREEMPT_RT
preempt_disable();
#endif
preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
#ifdef CONFIG_PREEMPT_RT
preempt_enable();
#endif
preempt_enable_nested();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
* interrupt context while other caller need to have disabled interrupt.
*/
__memcg_stats_lock();
if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
if (IS_ENABLED(CONFIG_DEBUG_VM)) {
switch (idx) {
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
WARN_ON_ONCE(!in_task());
break;
default:
WARN_ON_ONCE(!irqs_disabled());
VM_WARN_ON_IRQS_ENABLED();
}
}

View file

@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
* CPU migrations and preemption potentially corrupts a counter so
* disable preemption.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
__this_cpu_write(*p, x);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
__this_cpu_write(*p, x);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, -overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, -overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)