From b5a9adcbd5dc95d34d1f5fc84eff9af6fc60d284 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:00 -0400 Subject: blk-cgroup: Return -ENOMEM directly in blkcg_css_alloc() error path For blkcg_css_alloc(), the only error that will be returned is -ENOMEM. Simplify error handling code by returning this error directly instead of setting an intermediate "ret" variable. Signed-off-by: Waiman Long Reviewed-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-2-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6a5c849ee061..af8a4d2d1fd1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1139,7 +1139,6 @@ static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; - struct cgroup_subsys_state *ret; int i; mutex_lock(&blkcg_pol_mutex); @@ -1148,10 +1147,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); + if (!blkcg) goto unlock; - } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -1168,10 +1165,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - ret = ERR_PTR(-ENOMEM); + if (!cpd) goto free_pd_blkcg; - } + blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; @@ -1200,7 +1196,7 @@ free_pd_blkcg: kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); - return ret; + return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 3b8cc6298724021da845f2f9fd7dd4b6829a6817 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:01 -0400 Subject: blk-cgroup: Optimize blkcg_rstat_flush() For a system with many CPUs and block devices, the 
time to do blkcg_rstat_flush() from cgroup_rstat_flush() can be rather long. It can be especially problematic as interrupts are disabled during the flush. It was reported that it might take seconds to complete in some extreme cases, leading to hard lockup messages. As it is likely that not all the percpu blkg_iostat_set's have been updated since the last flush, those stale blkg_iostat_set's don't need to be flushed in this case. This patch optimizes blkcg_rstat_flush() by keeping a lockless list of recently updated blkg_iostat_set's in a newly added percpu blkcg->lhead pointer. The blkg_iostat_set is added to a lockless list on the update side in blk_cgroup_bio_start(). It is removed from the lockless list when flushed in blkcg_rstat_flush(). Due to racing, it is possible that blkg_iostat_set's in the lockless list may have no new IO stats to be flushed, but that is OK. To protect against destruction of blkg, a percpu reference is taken when a blkg_iostat_set is put onto the lockless list and dropped when it is removed. When booting up an instrumented test kernel with this patch on a 2-socket 96-thread system with cgroup v2, out of the 2051 calls to cgroup_rstat_flush() after bootup, 1788 of the calls exited immediately because the lockless list was empty. After an all-cpu kernel build, the ratio became 6295424/6340513. That was more than 99%. 
Signed-off-by: Waiman Long Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-3-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++----- block/blk-cgroup.h | 10 +++++++ 2 files changed, 80 insertions(+), 6 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index af8a4d2d1fd1..3e03c0d13253 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 +/* + * Lockless lists for tracking IO stats update + * + * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). + * There are multiple blkg's (one for each block device) attached to each + * blkcg. The rstat code keeps track of which cpu has IO stats updated, + * but it doesn't know which blkg has the updated stats. If there are many + * block devices in a system, the cost of iterating all the blkg's to flush + * out the IO stats can be high. To reduce such overhead, a set of percpu + * lockless lists (lhead) per blkcg are used to track the set of recently + * updated iostat_cpu's since the last flush. An iostat_cpu will be put + * onto the lockless list on the update side [blk_cgroup_bio_start()] if + * not there yet and then removed when being flushed [blkcg_rstat_flush()]. + * References to blkg are gotten and then put back in the process to + * protect against blkg removal. + * + * Return: 0 if successful or -ENOMEM if allocation fails. 
+ */ +static int init_blkcg_llists(struct blkcg *blkcg) +{ + int cpu; + + blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); + if (!blkcg->lhead) + return -ENOMEM; + + for_each_possible_cpu(cpu) + init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); + return 0; +} + /** * blkcg_css - find the current css * @@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg->blkcg = blkcg; u64_stats_init(&blkg->iostat.sync); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; + } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + struct llist_node *lnode; + struct blkg_iostat_set *bisc, *next_bisc; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(css->cgroup)) @@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + lnode = llist_del_all(lhead); + if (!lnode) + goto out; + + /* + * Iterate only the iostat_cpu's queued in the lockless list. 
+ */ + llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { + struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur; unsigned int seq; + WRITE_ONCE(bisc->lqueued, false); + /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); @@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent && parent->parent) blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); + percpu_ref_put(&blkg->refcnt); } +out: rcu_read_unlock(); } @@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); + free_percpu(blkcg->lhead); kfree(blkcg); } @@ -1151,6 +1198,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) goto unlock; } + if (init_blkcg_llists(blkcg)) + goto free_blkcg; + for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -1191,7 +1241,8 @@ free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - + free_percpu(blkcg->lhead); +free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: @@ -1939,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio) void blk_cgroup_bio_start(struct bio *bio) { + struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; @@ -1957,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio) } bis->cur.ios[rwd]++; + /* + * If the iostat_cpu isn't in a lockless list, put it into the + * list to indicate that a stat update is pending. 
+ */ + if (!READ_ONCE(bis->lqueued)) { + struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); + + llist_add(&bis->lnode, lhead); + WRITE_ONCE(bis->lqueued, true); + percpu_ref_get(&bis->blkg->refcnt); + } + u64_stats_update_end_irqrestore(&bis->sync, flags); if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index aa2b286bc825..1e94e404eaa8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -18,6 +18,7 @@ #include #include #include +#include struct blkcg_gq; struct blkg_policy_data; @@ -43,6 +44,9 @@ struct blkg_iostat { struct blkg_iostat_set { struct u64_stats_sync sync; + struct blkcg_gq *blkg; + struct llist_node lnode; + int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; @@ -97,6 +101,12 @@ struct blkcg { struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; + + /* + * List of updated percpu blkg_iostat_set's since the last flush. + */ + struct llist_head __percpu *lhead; + #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif -- cgit v1.2.3 From dae590a6c96c799434e0ff8156ef29b88c257e60 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:02 -0400 Subject: blk-cgroup: Flush stats at blkgs destruction path As noted by Michal, the blkg_iostat_set's in the lockless list hold reference to blkg's to protect against their removal. Those blkg's hold reference to blkcg. When a cgroup is being destroyed, cgroup_rstat_flush() is only called at css_release_work_fn() which is called when the blkcg reference count reaches 0. This circular dependency will prevent blkcg from being freed until some other events cause cgroup_rstat_flush() to be called to flush out the pending blkcg stats. 
To prevent this delayed blkcg removal, add a new cgroup_rstat_css_cpu_flush() function to flush stats for a given css and cpu and call it at the blkgs destruction path, blkcg_destroy_blkgs(), whenever there are still some pending stats to be flushed. This will ensure that blkcg reference count can reach 0 ASAP. Signed-off-by: Waiman Long Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-4-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 ++++++++++++++- include/linux/cgroup.h | 1 + kernel/cgroup/rstat.c | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3e03c0d13253..57941d2a8ba3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1084,10 +1084,12 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { + int cpu; + might_sleep(); + css_get(&blkcg->css); spin_lock_irq(&blkcg->lock); - while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); @@ -1110,6 +1112,17 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg) } spin_unlock_irq(&blkcg->lock); + + /* + * Flush all the non-empty percpu lockless lists. 
+ */ + for_each_possible_cpu(cpu) { + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + + if (!llist_empty(lhead)) + cgroup_rstat_css_cpu_flush(&blkcg->css, cpu); + } + css_put(&blkcg->css); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..6c4e66b3fa84 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,6 +766,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 793ecff29038..910e633869b0 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -281,6 +281,26 @@ void cgroup_rstat_flush_release(void) spin_unlock_irq(&cgroup_rstat_lock); } +/** + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu + * @css: target css to be flush + * @cpu: the cpu that holds the stats to be flush + * + * A lightweight rstat flush operation for a given css and cpu. + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock + * isn't used. + */ +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + + raw_spin_lock_irq(cpu_lock); + rcu_read_lock(); + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + raw_spin_unlock_irq(cpu_lock); +} + int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; -- cgit v1.2.3 From c62256dda37133a48d56cecc15e4a4d527d4cc46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Nov 2022 08:25:46 -0700 Subject: Revert "blk-cgroup: Flush stats at blkgs destruction path" This reverts commit dae590a6c96c799434e0ff8156ef29b88c257e60. We've had a few reports on this causing a crash at boot time, because of a reference issue. 
While this problem seemingly did exist before the patch and needs solving separately, this patch makes it a lot easier to trigger. Link: https://lore.kernel.org/linux-block/CA+QYu4oxiRKC6hJ7F27whXy-PRBx=Tvb+-7TQTONN8qTtV3aDA@mail.gmail.com/ Link: https://lore.kernel.org/linux-block/69af7ccb-6901-c84c-0e95-5682ccfb750c@acm.org/ Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 +-------------- include/linux/cgroup.h | 1 - kernel/cgroup/rstat.c | 20 -------------------- 3 files changed, 1 insertion(+), 35 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 57941d2a8ba3..3e03c0d13253 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1084,12 +1084,10 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { - int cpu; - might_sleep(); - css_get(&blkcg->css); spin_lock_irq(&blkcg->lock); + while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); @@ -1112,17 +1110,6 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg) } spin_unlock_irq(&blkcg->lock); - - /* - * Flush all the non-empty percpu lockless lists. - */ - for_each_possible_cpu(cpu) { - struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); - - if (!llist_empty(lhead)) - cgroup_rstat_css_cpu_flush(&blkcg->css, cpu); - } - css_put(&blkcg->css); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6c4e66b3fa84..528bd44b59e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,7 +766,6 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); -void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. 
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 910e633869b0..793ecff29038 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -281,26 +281,6 @@ void cgroup_rstat_flush_release(void) spin_unlock_irq(&cgroup_rstat_lock); } -/** - * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu - * @css: target css to be flush - * @cpu: the cpu that holds the stats to be flush - * - * A lightweight rstat flush operation for a given css and cpu. - * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock - * isn't used. - */ -void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - - raw_spin_lock_irq(cpu_lock); - rcu_read_lock(); - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); - raw_spin_unlock_irq(cpu_lock); -} - int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; -- cgit v1.2.3 From 1d6df9d352bb2a3c2ddb32851dfcafb417c47762 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Dec 2022 09:17:13 +0800 Subject: blk-cgroup: Fix some kernel-doc comments Make the description of @gendisk to @disk in blkcg_schedule_throttle() to clear the below warnings: block/blk-cgroup.c:1850: warning: Function parameter or member 'disk' not described in 'blkcg_schedule_throttle' block/blk-cgroup.c:1850: warning: Excess function parameter 'gendisk' description in 'blkcg_schedule_throttle' Fixes: de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle") Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3338 Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/20221202011713.14834-1-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3e03c0d13253..0e225ca7bd9a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ 
-1831,7 +1831,7 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @gendisk: disk to throttle + * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated -- cgit v1.2.3 From 37754595e94779db869e6ef803f038fa956d08ff Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 6 Dec 2022 17:33:07 +0800 Subject: blk-cgroup: Fix typo in comment Replace assocaited with associated. Replace intiailized with initialized. Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Reviewed-by: Mukesh Ojha Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221206093307.378249-1-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-cgroup.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0e225ca7bd9a..ce09e56c35a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -610,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs); * @pd: policy private data of interest * @v: value to print * - * Print @v to @sf for the device assocaited with @pd. + * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() * * Finish up after per-blkg config update. This function must be paired * with blkg_conf_prep(). -- cgit v1.2.3