From fee468fdf41cdf36ba6b5a780e2474d0a3e066ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:06 -0700 Subject: writeback: reliably update bandwidth estimation Currently we trigger writeback bandwidth estimation from balance_dirty_pages() and from wb_writeback(). However neither of these need to trigger when the system is relatively idle and writeback is triggered e.g. from fsync(2). Make sure writeback estimates happen reliably by triggering them from do_writepages(). Link: https://lkml.kernel.org/r/20210713104716.22868-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..2480322c06a7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,7 +379,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); -- cgit v1.2.3 From 45a2966fd64147518dc5bca25f447bd0fb5359ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:09 -0700 Subject: writeback: fix bandwidth estimate for spiky workload Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment. Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads. [jack@suse.cz: avoid division by 0 in wb_update_dirty_ratelimit()] Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@google.com Link: https://lkml.kernel.org/r/20210713104716.22868-3-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Michael Stapelberg Tested-by: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + mm/backing-dev.c | 10 ++++++++++ mm/page-writeback.c | 39 +++++++++++++++++++++++++-------------- 4 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 06fb8e13f6bc..33207004cfde 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -143,6 +143,7 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ unsigned long dirty_sleep; /* last wait */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2480322c06a7..cbaef099645e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b4c707ddedb1..6122c78ce914 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void wb_update_bandwidth_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, bw_dwork); + + wb_update_bandwidth(wb); +} + /* * Initial write bandwidth: 100 MB/s */ @@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); @@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e4a381b8944b..156f5888c09d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1336,18 +1336,19 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - wb->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; - lockdep_assert_held(&wb->list_lock); + spin_lock(&wb->list_lock); /* - * rate-limit, only update once every 200ms. + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. */ - if (elapsed < BANDWIDTH_INTERVAL) - return; - + elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); @@ -1369,15 +1370,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); } -static void wb_update_bandwidth(struct bdi_writeback *wb) +void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - spin_lock(&wb->list_lock); __wb_update_bandwidth(&gdtc, NULL, false); - spin_unlock(&wb->list_lock); } /* Interval after which we consider wb idle and don't estimate bandwidth */ @@ -1722,11 +1722,8 @@ free_running: wb->dirty_exceeded = 1; if (time_is_before_jiffies(wb->bw_time_stamp + - BANDWIDTH_INTERVAL)) { - spin_lock(&wb->list_lock); + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); - spin_unlock(&wb->list_lock); - } /* throttle according to the chosen dtc */ dirty_ratelimit = wb->dirty_ratelimit; @@ -2374,7 +2371,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } - wb_update_bandwidth(wb); + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } @@ -2754,6 +2757,14 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb) static void wb_inode_writeback_end(struct bdi_writeback *wb) { atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. + */ + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); } int test_clear_page_writeback(struct page *page) -- cgit v1.2.3 From 7490a2d248145d8694e1e9828801b496250fd697 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:53:27 -0700 Subject: writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 20 +++++++++----------- include/linux/memcontrol.h | 15 +++++++++++++++ include/linux/writeback.h | 2 +- mm/memcontrol.c | 13 +------------ 4 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 867984e778c3..35894a2dba75 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1039,20 +1039,20 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; + unsigned long dirty; int ret; /* lookup bdi and memcg */ @@ -1081,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, } /* - * If @nr is zero, the caller is attempting to write out most of + * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. + * + * BTW the memcg stats are flushed periodically and this is best-effort + * estimation, so some potential error is ok. */ - if (!nr) { - unsigned long filepages, headroom, dirty, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, - &writeback); - nr = dirty * 10 / 8; - } + dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); + dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { - work->nr_pages = nr; + work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24797929d8a1..3403ec77528a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -955,6 +955,16 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1391,6 +1401,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cbaef099645e..aeda2c0c9986 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); bool cleanup_offline_cgwb(struct bdi_writeback *wb); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 702a81dfe72d..1047f0271ff8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -645,17 +645,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } -/* idx can be of type enum memcg_stat_item or node_stat_item. */ -static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -4668,7 +4657,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) atomic_read(&frn->done.cnt) == 1) { frn->at = 0; trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, WB_REASON_FOREIGN_FLUSH, &frn->done); } -- cgit v1.2.3