Several core optimizations:

* threadgroup_rwsem write locking is skipped when configuring controllers in
   empty subtrees. Combined with CLONE_INTO_CGROUP, this allows the common
   static usage pattern to not grab threadgroup_rwsem at all (glibc still
   doesn't seem ready for CLONE_INTO_CGROUP unfortunately).
 
 * threadgroup_rwsem used to be put into non-percpu mode by default due to
   latency concerns in specific use cases. There's no reason for everyone
   else to pay for it. Make the behavior optional.
 
 * psi no longer allocates memory when disabled.
 
 along with some code cleanups.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYIACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCYugHIQ4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGd+oAP9lfD3fTRdNo4qWV2VsZsYzoOxzNIuJSwN/dnYx
 IEbQOwD/cd2YMfeo6zcb427U/VfTFqjJjFK04OeljYtJU8fFywo=
 =sucy
 -----END PGP SIGNATURE-----

Merge tag 'cgroup-for-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "Several core optimizations:

   - threadgroup_rwsem write locking is skipped when configuring
     controllers in empty subtrees.

     Combined with CLONE_INTO_CGROUP, this allows the common static
     usage pattern to not grab threadgroup_rwsem at all (glibc still
     doesn't seem ready for CLONE_INTO_CGROUP unfortunately).

   - threadgroup_rwsem used to be put into non-percpu mode by default
     due to latency concerns in specific use cases. There's no reason
     for everyone else to pay for it. Make the behavior optional.

   - psi no longer allocates memory when disabled.

  ... along with some code cleanups"

* tag 'cgroup-for-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: Skip subtree root in cgroup_update_dfl_csses()
  cgroup: remove "no" prefixed mount options
  cgroup: Make !percpu threadgroup_rwsem operations optional
  cgroup: Add "no" prefixed mount options
  cgroup: Elide write-locking threadgroup_rwsem when updating csses on an empty subtree
  cgroup.c: remove redundant check for mixable cgroup in cgroup_migrate_vet_dst
  cgroup.c: add helper __cset_cgroup_from_root to cleanup duplicated codes
  psi: dont alloc memory for psi by default
This commit is contained in:
Linus Torvalds 2022-08-03 09:45:08 -07:00
commit b6bb70f9ab
8 changed files with 151 additions and 63 deletions

View file

@ -184,6 +184,14 @@ cgroup v2 currently supports the following mount options.
ignored on non-init namespace mounts. Please refer to the
Delegation section for details.
favordynmods
Reduce the latencies of dynamic cgroup modifications such as
task migrations and controller on/offs at the cost of making
hot path operations such as forks and exits more expensive.
The static usage pattern of creating a cgroup, enabling
controllers, and then seeding it with CLONE_INTO_CGROUP is
not affected by this option.
memory_localevents
Only populate memory.events with data for the current cgroup,
and not any subtrees. This is legacy behaviour, the default

View file

@ -88,20 +88,33 @@ enum {
*/
CGRP_ROOT_NS_DELEGATE = (1 << 3),
/*
* Reduce latencies on dynamic cgroup modifications such as task
* migrations and controller on/offs by disabling percpu operation on
* cgroup_threadgroup_rwsem. This makes hot path operations such as
* forks and exits into the slow path and more expensive.
*
* The static usage pattern of creating a cgroup, enabling controllers,
* and then seeding it with CLONE_INTO_CGROUP doesn't require write
* locking cgroup_threadgroup_rwsem and thus doesn't benefit from
* favordynmod.
*/
CGRP_ROOT_FAVOR_DYNMODS = (1 << 4),
/*
* Enable cpuset controller in v1 cgroup to use v2 behavior.
*/
CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
CGRP_ROOT_CPUSET_V2_MODE = (1 << 16),
/*
* Enable legacy local memory.events.
*/
CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 5),
CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17),
/*
* Enable recursive subtree protection
*/
CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 6),
CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
};
/* cftype->flags */
@ -480,7 +493,7 @@ struct cgroup {
struct work_struct release_agent_work;
/* used to track pressure stalls */
struct psi_group psi;
struct psi_group *psi;
/* used to store eBPF programs */
struct cgroup_bpf bpf;

View file

@ -674,7 +674,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return &cgrp->psi;
return cgrp->psi;
}
bool cgroup_psi_enabled(void);

View file

@ -945,6 +945,16 @@ if CGROUPS
config PAGE_COUNTER
bool
config CGROUP_FAVOR_DYNMODS
bool "Favor dynamic modification latency reduction by default"
help
This option enables the "favordynmods" mount option by default
which reduces the latencies of dynamic cgroup modifications such
as task migrations and controller on/offs at the cost of making
hot path operations such as forks and exits more expensive.
Say N if unsure.
config MEMCG
bool "Memory controller"
select PAGE_COUNTER

View file

@ -233,6 +233,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);

View file

@ -875,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@ -898,6 +900,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
Opt_favordynmods,
Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@ -909,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
fsparam_flag ("favordynmods", Opt_favordynmods),
fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@ -960,6 +966,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
case Opt_favordynmods:
ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
break;
case Opt_nofavordynmods:
ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
@ -1211,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
if (ret)
if (!ret)
cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
else
cgroup_free_root(root);
return ret;
}

View file

@ -279,8 +279,6 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
* - Remount is disallowed.
*
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
@ -1309,6 +1307,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
return root_cgrp->root;
}
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
/* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
if (favor && !favoring) {
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
}
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@ -1369,6 +1381,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_root_count--;
}
cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
@ -1378,6 +1391,31 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_free_root(root);
}
static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
struct cgroup *res_cgroup = NULL;
if (cset == &init_css_set) {
res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res_cgroup = c;
break;
}
}
}
return res_cgroup;
}
/*
* look up cgroup associated with current task's cgroup namespace on the
* specified hierarchy
@ -1393,22 +1431,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_lock();
cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
res = __cset_cgroup_from_root(cset, root);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
rcu_read_unlock();
BUG_ON(!res);
@ -1424,22 +1448,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
if (cset == &init_css_set) {
res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
res = __cset_cgroup_from_root(cset, root);
BUG_ON(!res);
return res;
@ -1866,6 +1875,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
nr__cgroup2_params
@ -1873,6 +1883,7 @@ enum cgroup2_param {
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
@ -1892,6 +1903,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
case Opt_favordynmods:
ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
@ -1910,6 +1924,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
cgroup_favor_dynmods(&cgrp_dfl_root,
root_flags & CGRP_ROOT_FAVOR_DYNMODS);
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
@ -1926,6 +1943,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
@ -1976,7 +1995,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
root->flags = ctx->flags;
/* DYNMODS must be modified through cgroup_favor_dynmods() */
root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@ -2198,6 +2218,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
#endif
return 0;
}
@ -2572,10 +2596,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
/* mixables don't care */
if (cgroup_is_mixable(dst_cgrp))
return 0;
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@ -2949,22 +2969,40 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
/*
* As cgroup_update_dfl_csses() is only called by
* cgroup_apply_control(). The csses associated with the
* given cgrp will not be affected by changes made to
* its subtree_control file. We can skip them.
*/
if (dsct == cgrp)
continue;
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
/*
* We need to write-lock threadgroup_rwsem while migrating tasks.
* However, if there are no source csets for @cgrp, changing its
* controllers isn't gonna produce any task migrations and the
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
if (has_tasks)
percpu_down_write(&cgroup_threadgroup_rwsem);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
@ -2984,7 +3022,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
if (has_tasks)
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@ -3618,21 +3657,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
return psi_show(seq, psi, PSI_CPU);
}
@ -3658,7 +3697,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
@ -5851,12 +5890,6 @@ int __init cgroup_init(void)
cgroup_rstat_boot();
/*
* The latency of the synchronize_rcu() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
*/
rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@ -6768,6 +6801,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
"favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n");
}

View file

@ -957,10 +957,16 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return 0;
cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
if (!cgroup->psi.pcpu)
cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL);
if (!cgroup->psi)
return -ENOMEM;
group_init(&cgroup->psi);
cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
if (!cgroup->psi->pcpu) {
kfree(cgroup->psi);
return -ENOMEM;
}
group_init(cgroup->psi);
return 0;
}
@ -969,10 +975,11 @@ void psi_cgroup_free(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return;
cancel_delayed_work_sync(&cgroup->psi.avgs_work);
free_percpu(cgroup->psi.pcpu);
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
free_percpu(cgroup->psi->pcpu);
/* All triggers must be removed by now */
WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
kfree(cgroup->psi);
}
/**