diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index af8a4d2d1fd1..3e03c0d13253 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE  64
 
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+	int cpu;
+
+	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+	if (!blkcg->lhead)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+	return 0;
+}
+
 /**
  * blkcg_css - find the current css
  *
@@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg->blkcg = blkcg;
 
 	u64_stats_init(&blkg->iostat.sync);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+	}
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
@@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
-	struct blkcg_gq *blkg;
+	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+	struct llist_node *lnode;
+	struct blkg_iostat_set *bisc, *next_bisc;
 
 	/* Root-level stats are sourced from system-wide IO stats */
 	if (!cgroup_parent(css->cgroup))
@@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+	lnode = llist_del_all(lhead);
+	if (!lnode)
+		goto out;
+
+	/*
+	 * Iterate only the iostat_cpu's queued in the lockless list.
+	 */
+	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+		struct blkcg_gq *blkg = bisc->blkg;
 		struct blkcg_gq *parent = blkg->parent;
-		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
 		struct blkg_iostat cur;
 		unsigned int seq;
 
+		WRITE_ONCE(bisc->lqueued, false);
+
 		/* fetch the current per-cpu values */
 		do {
 			seq = u64_stats_fetch_begin(&bisc->sync);
@@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
+		percpu_ref_put(&blkg->refcnt);
 	}
 
+out:
 	rcu_read_unlock();
 }
 
@@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 
 	mutex_unlock(&blkcg_pol_mutex);
 
+	free_percpu(blkcg->lhead);
 	kfree(blkcg);
 }
 
@@ -1151,6 +1198,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto unlock;
 	}
 
+	if (init_blkcg_llists(blkcg))
+		goto free_blkcg;
+
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -1191,7 +1241,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	for (i--; i >= 0; i--)
 		if (blkcg->cpd[i])
 			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+	free_percpu(blkcg->lhead);
+free_blkcg:
 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 unlock:
@@ -1939,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio)
 
 void blk_cgroup_bio_start(struct bio *bio)
 {
+	struct blkcg *blkcg = bio->bi_blkg->blkcg;
 	int rwd = blk_cgroup_io_type(bio), cpu;
 	struct blkg_iostat_set *bis;
 	unsigned long flags;
@@ -1957,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio)
 	}
 	bis->cur.ios[rwd]++;
 
+	/*
+	 * If the iostat_cpu isn't in a lockless list, put it into the
+	 * list to indicate that a stat update is pending.
+	 */
+	if (!READ_ONCE(bis->lqueued)) {
+		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+		llist_add(&bis->lnode, lhead);
+		WRITE_ONCE(bis->lqueued, true);
+		percpu_ref_get(&bis->blkg->refcnt);
+	}
+
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+		cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index aa2b286bc825..1e94e404eaa8 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@
 #include <linux/cgroup.h>
 #include <linux/kthread.h>
 #include <linux/blk-mq.h>
+#include <linux/llist.h>
 
 struct blkcg_gq;
 struct blkg_policy_data;
@@ -43,6 +44,9 @@ struct blkg_iostat {
 
 struct blkg_iostat_set {
 	struct u64_stats_sync		sync;
+	struct blkcg_gq			*blkg;
+	struct llist_node		lnode;
+	int				lqueued;	/* queued in llist */
 	struct blkg_iostat		cur;
 	struct blkg_iostat		last;
 };
@@ -97,6 +101,12 @@ struct blkcg {
 	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
+
+	/*
+	 * List of updated percpu blkg_iostat_set's since the last flush.
+	 */
+	struct llist_head __percpu	*lhead;
+
 #ifdef CONFIG_BLK_CGROUP_FC_APPID
 	char                            fc_app_id[FC_APPID_LEN];
 #endif
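
For readers unfamiliar with the <linux/llist.h> API, the sketch below distills the dirty-tracking pattern the patch builds around it: each per-cpu stat object carries an llist_node plus an lqueued flag, the update path queues the object onto a per-cpu lockless list only the first time it is dirtied, and the flush path detaches the whole list atomically with llist_del_all() and walks just those entries. This is a standalone illustration, not part of the patch; the demo_* names are hypothetical, and the reference counting the patch does on the owning blkg is reduced to comments.

/*
 * Standalone sketch (not part of the patch) of the per-cpu lockless-list
 * dirty-tracking pattern, using hypothetical names (demo_*).  Assumes
 * kernel context; protection of the counters themselves is omitted.
 */
#include <linux/errno.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct demo_stat {
	struct llist_node	lnode;
	int			lqueued;	/* already on a per-cpu list? */
	u64			count;
};

static struct llist_head __percpu *demo_lhead;

/* Mirrors init_blkcg_llists(): one empty llist_head per possible CPU. */
static int demo_init(void)
{
	int cpu;

	demo_lhead = alloc_percpu(struct llist_head);
	if (!demo_lhead)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		init_llist_head(per_cpu_ptr(demo_lhead, cpu));
	return 0;
}

/* Update side: queue the stat object only the first time it is dirtied. */
static void demo_update(struct demo_stat *stat)
{
	stat->count++;
	if (!READ_ONCE(stat->lqueued)) {
		llist_add(&stat->lnode, this_cpu_ptr(demo_lhead));
		WRITE_ONCE(stat->lqueued, true);
		/* the patch also takes a percpu_ref on the owning blkg here */
	}
}

/* Flush side: detach the per-cpu list atomically, walk only dirty entries. */
static void demo_flush(int cpu)
{
	struct llist_node *lnode = llist_del_all(per_cpu_ptr(demo_lhead, cpu));
	struct demo_stat *stat, *next;

	llist_for_each_entry_safe(stat, next, lnode, lnode) {
		WRITE_ONCE(stat->lqueued, false);
		/* fold stat->count into the aggregate; drop the reference */
	}
}

The ordering is the interesting design point: the flusher clears lqueued only after llist_del_all() has already detached the node, so an updater that later observes lqueued == false can safely llist_add() the node again without corrupting the list, and an updater that still observes lqueued == true simply skips the add. In the patch, the percpu_ref get/put on the blkg additionally keeps the blkg alive while its iostat_cpu sits on a lockless list.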