From 176042404ee6a96ba7e9054e1bda6220360a26ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:56 +0100 Subject: mm: add PSI accounting around ->read_folio and ->readahead calls PSI tries to account for the cost of bringing back in pages discarded by the MM LRU management. Currently the prime place for that is hooked into the bio submission path, which is a rather bad place: - it does not actually account I/O for non-block file systems, of which we have many - it adds overhead and a layering violation to the block layer Add the accounting into the two places in the core MM code that read pages into an address space by calling into ->read_folio and ->readahead so that the entire file system operations are covered, to broaden the coverage and allow removing the accounting in the block layer going forward. As psi_memstall_enter can deal with nested calls this will not lead to double accounting even while the bio annotations are still present. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-2-hch@lst.de Signed-off-by: Jens Axboe --- mm/filemap.c | 7 +++++++ mm/readahead.c | 22 ++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 15800334147b..c943d1b90cc2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2382,6 +2382,8 @@ retry: static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { + bool workingset = folio_test_workingset(folio); + unsigned long pflags; int error; /* @@ -2390,8 +2392,13 @@ static int filemap_read_folio(struct file *file, filler_t filler, * fails. */ folio_clear_error(folio); + /* Start the actual read. The read will unlock the page. */ + if (unlikely(workingset)) + psi_memstall_enter(&pflags); error = filler(file, folio); + if (unlikely(workingset)) + psi_memstall_leave(&pflags); if (error) return error; diff --git a/mm/readahead.c b/mm/readahead.c index fdcd28cbd92d..b10f0cf81d80 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { @@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac) } blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; BUG_ON(readahead_count(rac)); } @@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); + ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } @@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); - if (err) + if (err) { folio_put(folio); - else - ractl->_nr_pages += 1UL << order; - return err; + return err; + } + + ractl->_nr_pages += 1UL << order; + ractl->_workingset |= folio_test_workingset(folio); + return 0; } void page_cache_ra_order(struct readahead_control *ractl, @@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl, put_page(page); return; } + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; if (ra) { ra->size++; -- cgit v1.2.3 From de185b56e8a62822d4e1cdb3e068b38ca709aa47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:05:00 +0200 Subject: blk-cgroup: pass a gendisk to blkcg_schedule_throttle Pass the gendisk to blkcg_schedule_throttle as part of moving the blk-cgroup infrastructure to be gendisk based. Remove the unused !BLK_CGROUP stub while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-17-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 +++++--- block/blk-iocost.c | 4 ++-- block/blk-iolatency.c | 2 +- include/linux/blk-cgroup.h | 5 ++--- mm/swapfile.c | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c2d5ca2eb92e..fc82057db962 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1792,13 +1792,13 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q: the request queue IO was submitted on + * @gendisk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for - * instance will only have a request_queue at that point. This set's the + * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * @@ -1807,8 +1807,10 @@ out: * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { + struct request_queue *q = disk->queue; + if (unlikely(current->flags & PF_KTHREAD)) return; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c0f69bc99db9..495396425bad 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2636,7 +2636,7 @@ retry_lock: if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; @@ -2737,7 +2737,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c6f61fe88b87..571fa95aafe9 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q, use_memdelay); + blkcg_schedule_throttle(rqos->q->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9f40dbc65f82..dd5841a42c33 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -18,14 +18,14 @@ struct bio; struct cgroup_subsys_state; -struct request_queue; +struct gendisk; #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay); void blkcg_maybe_throttle_current(void); bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); @@ -39,7 +39,6 @@ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } -static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { return NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1fdccd2f1422..82e62007881d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3655,7 +3655,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); + blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } -- cgit v1.2.3