From 714fbf2647b1a33d914edd695d4da92029c7e7c0 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 22 Mar 2022 14:38:39 -0700 Subject: ntfs: add sanity check on allocation size ntfs_read_inode_mount invokes ntfs_malloc_nofs with zero allocation size. It triggers one BUG in the __ntfs_malloc function. Fix this by adding sanity check on ni->attr_list_size. Link: https://lkml.kernel.org/r/20220120094914.47736-1-dzm91@hust.edu.cn Reported-by: syzbot+3c765c5248797356edaa@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4474adb393ca..517b71c73aa9 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " -- cgit v1.2.3 From 38c9d2d3f38454d12e317dfbcad45e08e826a250 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 22 Mar 2022 14:38:42 -0700 Subject: ocfs2: cleanup some return variables Simply return directly instead of assign the return value to another variable. Link: https://lkml.kernel.org/r/20220114021641.13927-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reported-by: Zeal Robot Cc: Minghao Chi Cc: CGEL ZTE Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 9 +++------ fs/ocfs2/stack_user.c | 18 ++++++------------ 2 files changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fc5f780fa235..24321c44cd42 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int ret; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); - ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, - clusters_to_add, mark_unwritten, - data_ac, meta_ac, reason_ret); - - return ret; + return ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); } static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 85a47621e0c0..a75e2b7d67f5 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - int ret; - if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + sizeof(struct dlm_lksb); - ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, - flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, lksb, - fsdlm_blocking_ast_wrapper); - return ret; + return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - int ret; - - ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, lksb); - return ret; + return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); } static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) -- cgit v1.2.3 From 137cebf9432eae024d0334953ed92a2a78619b52 Mon Sep 17 00:00:00 2001 From: hongnanli Date: Tue, 22 Mar 2022 14:38:45 -0700 Subject: fs/ocfs2: fix comments mentioning i_mutex inode->i_mutex has been replaced with inode->i_rwsem long ago. Fix comments still mentioning i_mutex. Link: https://lkml.kernel.org/r/20220214031314.100094-1-hongnan.li@linux.alibaba.com Signed-off-by: hongnanli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/ocfs2/cluster/nodemanager.c | 2 +- fs/ocfs2/dir.c | 4 ++-- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/inode.c | 2 +- fs/ocfs2/localalloc.c | 6 +++--- fs/ocfs2/namei.c | 2 +- fs/ocfs2/ocfs2.h | 4 ++-- fs/ocfs2/quota_global.c | 2 +- fs/ocfs2/xattr.c | 2 +- 11 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bf9357123bc5..49f41074baad 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5981,7 +5981,7 @@ bail: return status; } -/* Expects you to already be holding tl_inode->i_mutex */ +/* Expects you to already be holding tl_inode->i_rwsem */ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 498da317580a..c23e1c243cc6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2311,7 +2311,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, down_write(&oi->ip_alloc_sem); - /* Delete orphan before acquire i_mutex. */ + /* Delete orphan before acquire i_rwsem. */ if (dwc->dw_orphaned) { BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 625c92521416..27fee68f860a 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -689,7 +689,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - /* this runs under the parent dir's i_mutex; there can be only + /* this runs under the parent dir's i_rwsem; there can be only * one caller in here at a time */ if (o2nm_single_cluster) return ERR_PTR(-ENOSPC); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f2cc1ff29e6d..81c3d65d68fe 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1957,7 +1957,7 @@ bail_nolock: } /* - * NOTE: this should always be called with parent dir i_mutex taken. + * NOTE: this should always be called with parent dir i_rwsem taken. */ int ocfs2_find_files_on_disk(const char *name, int namelen, @@ -2003,7 +2003,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name * - * Callers should have i_mutex + a cluster lock on dir + * Callers should have i_rwsem + a cluster lock on dir */ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 24321c44cd42..01b7407a8893 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -270,7 +270,7 @@ int ocfs2_update_inode_atime(struct inode *inode, /* * Don't use ocfs2_mark_inode_dirty() here as we don't always - * have i_mutex to guard against concurrent changes to other + * have i_rwsem to guard against concurrent changes to other * inode fields. */ inode->i_atime = current_time(inode); @@ -1065,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode, /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on - * i_mutex to block other extend/truncate calls while we're + * i_rwsem to block other extend/truncate calls while we're * here. We even have to hold it for sparse files because there * might be some tail zeroing. */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 6c2411c2afcf..5739dc301569 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -713,7 +713,7 @@ bail: /* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir - * i_mutex held, we'll deadlock here. Instead we detect this + * i_rwsem held, we'll deadlock here. Instead we detect this * and exit early - recovery will wipe this inode for us. */ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5f6bacbeef6b..c4426d12a2ad 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -606,7 +606,7 @@ out: /* * make sure we've got at least bits_wanted contiguous bits in the - * local alloc. You lose them when you drop i_mutex. + * local alloc. You lose them when you drop i_rwsem. * * We will add ourselves to the transaction passed in, but may start * our own in order to shift windows. @@ -636,7 +636,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, /* * We must double check state and allocator bits because - * another process may have changed them while holding i_mutex. + * another process may have changed them while holding i_rwsem. */ spin_lock(&osb->osb_lock); if (!ocfs2_la_state_enabled(osb) || @@ -1029,7 +1029,7 @@ enum ocfs2_la_event { /* * Given an event, calculate the size of our next local alloc window. * - * This should always be called under i_mutex of the local alloc inode + * This should always be called under i_rwsem of the local alloc inode * so that local alloc disabling doesn't race with processes trying to * use the allocator. * diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2c46ff6ba4ea..c75fd54b9185 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -476,7 +476,7 @@ leave: ocfs2_free_alloc_context(meta_ac); /* - * We should call iput after the i_mutex of the bitmap been + * We should call iput after the i_rwsem of the bitmap been * unlocked in ocfs2_free_alloc_context, or the * ocfs2_delete_inode will mutex_lock again. */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index bb62cc2e0211..337527571461 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -355,7 +355,7 @@ struct ocfs2_super struct delayed_work la_enable_wq; /* - * Must hold local alloc i_mutex and osb->osb_lock to change + * Must hold local alloc i_rwsem and osb->osb_lock to change * local_alloc_bits. Reads can be done under either lock. */ unsigned int local_alloc_bits; @@ -430,7 +430,7 @@ struct ocfs2_super atomic_t osb_tl_disable; /* * How many clusters in our truncate log. - * It must be protected by osb_tl_inode->i_mutex. + * It must be protected by osb_tl_inode->i_rwsem. */ unsigned int truncated_clusters; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index f033de733adb..273f65e0aaba 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -36,7 +36,7 @@ * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected * by dqio_sem or dquot->dq_lock. - * - any modification of global quota file holds inode cluster lock, i_mutex, + * - any modification of global quota file holds inode cluster lock, i_rwsem, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. * - an allocation of new blocks for local quota file is protected by diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd784eb0cd7c..95d0611c5fc7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7205,7 +7205,7 @@ out: * Used for reflink a non-preserve-security file. * * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. + * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, -- cgit v1.2.3 From 670d21c6e17f67535fcf16e14c772209220da9ae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:38:58 -0700 Subject: fuse: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. Fuse is one of a small number of filesystems that uses it, setting both the sync (read) and async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the sync flag is to cause read-ahead to be skipped. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flags, change: - .readahead to stop when it has submitted all non-async pages for read. - .writepages to do nothing if WB_SYNC_NONE and the flag would be set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag would be set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) will further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983737.9187.2627117501000365074.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/control.c | 17 ----------------- fs/fuse/dev.c | 8 -------- fs/fuse/file.c | 17 +++++++++++++++++ 3 files changed, 17 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 000d2e5627e9..7cede9a3bc96 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; - struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - - /* - * Get any fuse_mount belonging to this fuse_conn; s_bdi is - * shared between all of them - */ - - if (!list_empty(&fc->mounts)) { - fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); - if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } else { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } - } spin_unlock(&fc->bg_lock); up_read(&fc->killsb); fuse_conn_put(fc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 592730fd6e42..0e537e580dc1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fm->sb) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } fc->num_background--; fc->active_background--; flush_bg_queue(fc); @@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fm->sb) { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0fc150c1c50b..61b1e04eb497 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -966,6 +966,14 @@ static void fuse_readahead(struct readahead_control *rac) struct fuse_io_args *ia; struct fuse_args_pages *ap; + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + break; + nr_pages = readahead_count(rac) - nr_pages; if (nr_pages > max_pages) nr_pages = max_pages; @@ -1959,6 +1967,7 @@ err: static int fuse_writepage(struct page *page, struct writeback_control *wbc) { + struct fuse_conn *fc = get_fuse_conn(page->mapping->host); int err; if (fuse_page_is_writeback(page->mapping->host, page->index)) { @@ -1974,6 +1983,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) return 0; } + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return AOP_WRITEPAGE_ACTIVATE; + err = fuse_writepage_locked(page); unlock_page(page); @@ -2227,6 +2240,10 @@ static int fuse_writepages(struct address_space *mapping, if (fuse_is_bad(inode)) goto out; + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return 0; + data.inode = inode; data.wpa = NULL; data.ff = NULL; -- cgit v1.2.3 From 6df25e58532be7a4cd6fb15bcd85805947402d91 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:01 -0700 Subject: nfs: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. NFS is one of a small number of filesystems that uses it, setting just the async (write) congestion flag at what it determines are appropriate times. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flag, set an internal flag and change: - .writepages to do nothing if WB_SYNC_NONE and the flag is set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag is set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) wil further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983738.9187.3972219847989393182.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/write.c | 14 +++++++++++--- include/linux/nfs_fs_sb.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 987a187bd39a..7c986164018e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -417,7 +417,7 @@ static void nfs_set_page_writeback(struct page *page) if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + nfss->write_congested = 1; } static void nfs_end_page_writeback(struct nfs_page *req) @@ -433,7 +433,7 @@ static void nfs_end_page_writeback(struct nfs_page *req) end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + nfss->write_congested = 0; } /* @@ -672,6 +672,10 @@ static int nfs_writepage_locked(struct page *page, struct inode *inode = page_file_mapping(page)->host; int err; + if (wbc->sync_mode == WB_SYNC_NONE && + NFS_SERVER(inode)->write_congested) + return AOP_WRITEPAGE_ACTIVATE; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); @@ -719,6 +723,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int priority = 0; int err; + if (wbc->sync_mode == WB_SYNC_NONE && + NFS_SERVER(inode)->write_congested) + return 0; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || @@ -1893,7 +1901,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); + nfss->write_congested = 0; nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ca0959e51e81..6aa2a200676a 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -138,6 +138,7 @@ struct nfs_server { struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ atomic_long_t writeback; /* number of writeback pages */ + unsigned int write_congested;/* flag set when writeback gets too high */ unsigned int flags; /* various flags */ /* The following are for internal use only. Also see uapi/linux/nfs_mount.h */ -- cgit v1.2.3 From 503d4fa6ee28e8d4d201c92ac3922d1b3526f844 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:04 -0700 Subject: ceph: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. CEPHfs is one of a small number of filesystems that uses it, setting just the async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flag, set an internal flag and change: - .writepages to do nothing if WB_SYNC_NONE and the flag is set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag is set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) wil further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983739.9187.14895675781408171186.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 22 +++++++++++++--------- fs/ceph/super.c | 1 + fs/ceph/super.h | 1 + 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c98e5238a1b6..dc7af34640dd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -563,7 +563,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + fsc->write_congested = true; req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, @@ -623,7 +623,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + fsc->write_congested = false; return err; } @@ -635,6 +635,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); + if (wbc->sync_mode == WB_SYNC_NONE && + ceph_inode_to_client(inode)->write_congested) + return AOP_WRITEPAGE_ACTIVATE; + wait_on_page_fscache(page); err = writepage_nounlock(page, wbc); @@ -707,8 +711,7 @@ static void writepages_finish(struct ceph_osd_request *req) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); + fsc->write_congested = false; ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); @@ -760,6 +763,10 @@ static int ceph_writepages_start(struct address_space *mapping, bool done = false; bool caching = ceph_is_cache_enabled(inode); + if (wbc->sync_mode == WB_SYNC_NONE && + fsc->write_congested) + return 0; + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); @@ -954,11 +961,8 @@ get_more_pages: if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) { - set_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - } - + fsc->mount_options->congestion_kb)) + fsc->write_congested = true; pages[locked_pages++] = page; pvec.pages[i] = NULL; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf79f369aec6..4a3b77d049c7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -802,6 +802,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); + fsc->write_congested = false; err = -ENOMEM; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 67f145e1ae7a..0bd97aea2319 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -121,6 +121,7 @@ struct ceph_fs_client { struct ceph_mds_client *mdsc; atomic_long_t writeback_count; + bool write_congested; struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; -- cgit v1.2.3 From fe55d563d4174f13839a9b7ef7309da5031b5d93 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:07 -0700 Subject: remove inode_congested() inode_congested() reports if the backing-device for the inode is congested. No bdi reports congestion any more, so this always returns 'false'. So remove inode_congested() and related functions, and remove the call sites, assuming that inode_congested() always returns 'false'. Link: https://lkml.kernel.org/r/164549983741.9187.2174285592262191311.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 37 ------------------------------------- include/linux/backing-dev.h | 22 ---------------------- mm/fadvise.c | 5 ++--- mm/readahead.c | 6 ------ mm/vmscan.c | 17 +---------------- 5 files changed, 3 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f8d7fe6db989..42a3dfad40b8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -893,43 +893,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); -/** - * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion (may be NULL) - * @cong_bits: mask of WB_[a]sync_congested bits to test - * - * Tests whether @inode is congested. @cong_bits is the mask of congestion - * bits to test and the return value is the mask of set bits. - * - * If cgroup writeback is enabled for @inode, the congestion state is - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg - * associated with @inode is congested; otherwise, the root wb's congestion - * state is used. - * - * @inode is allowed to be NULL as this function is often called on - * mapping->host which is NULL for the swapper space. - */ -int inode_congested(struct inode *inode, int cong_bits) -{ - /* - * Once set, ->i_wb never becomes NULL while the inode is alive. - * Start transaction iff ->i_wb is visible. - */ - if (inode && inode_to_wb_is_valid(inode)) { - struct bdi_writeback *wb; - struct wb_lock_cookie lock_cookie = {}; - bool congested; - - wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); - congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, &lock_cookie); - return congested; - } - - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} -EXPORT_SYMBOL_GPL(inode_congested); - /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 483979c1b9f4..860b675c2929 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -162,7 +162,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); -int inode_congested(struct inode *inode, int cong_bits); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -390,29 +389,8 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline int inode_congested(struct inode *inode, int cong_bits) -{ - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} - #endif /* CONFIG_CGROUP_WRITEBACK */ -static inline int inode_read_congested(struct inode *inode) -{ - return inode_congested(inode, 1 << WB_sync_congested); -} - -static inline int inode_write_congested(struct inode *inode) -{ - return inode_congested(inode, 1 << WB_async_congested); -} - -static inline int inode_rw_congested(struct inode *inode) -{ - return inode_congested(inode, (1 << WB_sync_congested) | - (1 << WB_async_congested)); -} - static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) { return wb_congested(&bdi->wb, cong_bits); diff --git a/mm/fadvise.c b/mm/fadvise.c index d6baa4f451c5..338f16022012 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -109,9 +109,8 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!inode_write_congested(mapping->host)) - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/readahead.c b/mm/readahead.c index 8a97bd408cf6..f61943fd1741 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -709,12 +709,6 @@ void page_cache_async_ra(struct readahead_control *ractl, folio_clear_readahead(folio); - /* - * Defer asynchronous read-ahead on IO congestion. - */ - if (inode_read_congested(ractl->mapping->host)) - return; - if (blk_cgroup_congested()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 59b14e0d696c..e38de6456cdc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -989,17 +989,6 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } -static int may_write_to_inode(struct inode *inode) -{ - if (current->flags & PF_SWAPWRITE) - return 1; - if (!inode_write_congested(inode)) - return 1; - if (inode_to_bdi(inode) == current->backing_dev_info) - return 1; - return 0; -} - /* * We detected a synchronous write error writing a page out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent @@ -1201,8 +1190,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host)) - return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { int res; @@ -1578,9 +1565,7 @@ retry: * end of the LRU a second time. */ mapping = page_mapping(page); - if (((dirty || writeback) && mapping && - inode_write_congested(mapping->host)) || - (writeback && PageReclaim(page))) + if (writeback && PageReclaim(page)) stat->nr_congested++; /* -- cgit v1.2.3 From b9b1335e640308acc1b8f26c739b804c80a6c147 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:10 -0700 Subject: remove bdi_congested() and wb_congested() and related functions These functions are no longer useful as no BDIs report congestions any more. Removing the test on bdi_write_contested() in current_may_throttle() could cause a small change in behaviour, but only when PF_LOCAL_THROTTLE is set. So replace the calls by 'false' and simplify the code - and remove the functions. [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/164549983742.9187.2570198746005819592.stgit@noble.brown Signed-off-by: NeilBrown Acked-by: Ryusuke Konishi [nilfs] Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/drbd/drbd_int.h | 3 --- drivers/block/drbd/drbd_req.c | 3 +-- fs/ext2/ialloc.c | 5 ----- fs/nilfs2/segbuf.c | 16 ---------------- fs/xfs/xfs_buf.c | 3 --- include/linux/backing-dev.h | 26 -------------------------- mm/vmscan.c | 4 +--- 7 files changed, 2 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f27d5b0f9a0b..f804b1bfb3e6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -638,9 +638,6 @@ enum { STATE_SENT, /* Do not change state/UUIDs while this is set */ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) * pending, from drbd worker context. - * If set, bdi_write_congested() returns true, - * so shrink_page_list() would not recurse into, - * and potentially deadlock on, this drbd worker. */ DISCONNECT_SENT, diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3235532ae077..2e5fb7e442e3 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -909,8 +909,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - return bdi_read_congested( - device->ldev->backing_bdev->bd_disk->bdi); + return 0; case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index df14e750e9fe..998dd2ac8008 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode) unsigned long offset; unsigned long block; struct ext2_group_desc * gdp; - struct backing_dev_info *bdi; - - bdi = inode_to_bdi(inode); - if (bdi_rw_congested(bdi)) - return; block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 43287b0d3e9b..f4b57bc0c586 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -341,18 +341,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, int mode_flags) { struct bio *bio = wi->bio; - int err; - - if (segbuf->sb_nbio > 0 && - bdi_write_congested(segbuf->sb_super->s_bdi)) { - wait_for_completion(&segbuf->sb_bio_event); - segbuf->sb_nbio--; - if (unlikely(atomic_read(&segbuf->sb_err))) { - bio_put(bio); - err = -EIO; - goto failed; - } - } bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; @@ -365,10 +353,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end; return 0; - - failed: - wi->bio = NULL; - return err; } /** diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b45e0d50a405..b7ebcfe6b8d3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -843,9 +843,6 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) - return; - xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 860b675c2929..2d764566280c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -135,11 +135,6 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb) struct backing_dev_info *inode_to_bdi(struct inode *inode); -static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) -{ - return wb->congested & cong_bits; -} - long congestion_wait(int sync, long timeout); static inline bool mapping_can_writeback(struct address_space *mapping) @@ -391,27 +386,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) #endif /* CONFIG_CGROUP_WRITEBACK */ -static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) -{ - return wb_congested(&bdi->wb, cong_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} - -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); -} - const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e38de6456cdc..5e1469887afa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2364,9 +2364,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, */ static int current_may_throttle(void) { - return !(current->flags & PF_LOCAL_THROTTLE) || - current->backing_dev_info == NULL || - bdi_write_congested(current->backing_dev_info); + return !(current->flags & PF_LOCAL_THROTTLE); } /* -- cgit v1.2.3 From a64239d0ef345208d8c15d7841a028a43a34c068 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:13 -0700 Subject: f2fs: replace congestion_wait() calls with io_schedule_timeout() As congestion is no longer tracked, congestion_wait() is effectively equivalent to io_schedule_timeout(). So introduce f2fs_io_schedule_timeout() which sets TASK_UNINTERRUPTIBLE and call that instead. Link: https://lkml.kernel.org/r/164549983744.9187.6425865370954230902.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/compress.c | 4 +--- fs/f2fs/data.c | 3 +-- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/segment.c | 8 +++----- fs/f2fs/super.c | 6 ++---- 5 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d0c3aeba5945..2f95559025ad 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1505,9 +1505,7 @@ continue_unlock: if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c417864c66a..d428ddfd42ee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3047,8 +3047,7 @@ result: } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, + f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68b44015514f..467f5dbdc7d1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4426,6 +4426,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1dabc8244083..6ff20da44ad7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -313,8 +313,7 @@ next: skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); if (gc_failure) { if (++looped >= count) return; @@ -803,8 +802,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { @@ -3133,7 +3131,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index baefd398ec1a..ebd32daf052c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2135,8 +2135,7 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) @@ -2504,8 +2503,7 @@ retry: &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); -- cgit v1.2.3 From a128b054ce029554a4a52fc3abb8c1df8bafcaef Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Tue, 22 Mar 2022 14:39:22 -0700 Subject: mount: warn only once about timestamp range expiration Commit f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry") introduced a mount warning regarding filesystem timestamp limits, that is printed upon each writable mount or remount. This can result in a lot of unnecessary messages in the kernel log in setups where filesystems are being frequently remounted (or mounted multiple times). Avoid this by setting a superblock flag which indicates that the warning has been emitted at least once for any particular mount, as suggested in [1]. Link: https://lore.kernel.org/CAHk-=wim6VGnxQmjfK_tDg6fbHYKL4EFkmnTjVr9QnRqjDBAeA@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20220119202934.26495-1-ailiop@suse.com Signed-off-by: Anthony Iliopoulos Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Darrick J. Wong Cc: Alexander Viro Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 ++ include/linux/fs.h | 1 + 2 files changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index de6fae84f1a1..0044feef59d0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2597,6 +2597,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2611,6 +2612,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b5c486bd4a2..ca9445f6cf3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1440,6 +1440,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3 From c0226eb8bde854e016a594a16f5c0d98aca426fa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2022 14:39:34 -0700 Subject: mm: fs: fix lru_cache_disabled race in bh_lru Check lru_cache_disabled under bh_lru_lock. Otherwise, it could introduce race below and it fails to migrate pages containing buffer_head. CPU 0 CPU 1 bh_lru_install lru_cache_disable lru_cache_disabled = false atomic_inc(&lru_disable_count); invalidate_bh_lrus_cpu of CPU 0 bh_lru_lock __invalidate_bh_lrus bh_lru_unlock bh_lru_lock install the bh bh_lru_unlock WHen this race happens a CMA allocation fails, which is critical for the workload which depends on CMA. Link: https://lkml.kernel.org/r/20220308180709.2017638-1-minchan@kernel.org Fixes: 8cc621d2f45d ("mm: fs: invalidate BH LRU during page migration") Signed-off-by: Minchan Kim Cc: Chris Goldsworthy Cc: Marcelo Tosatti Cc: John Dias Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd371..c76a8ef60a75 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1235,16 +1235,18 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + bh_lru_lock(); + /* * the refcount of buffer_head in bh_lru prevents dropping the * attached page(i.e., try_to_free_buffers) so it could cause * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) + if (lru_cache_disabled()) { + bh_lru_unlock(); return; - - bh_lru_lock(); + } b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { -- cgit v1.2.3 From 8b9f3ac5b01db85c6cf74c2c3a71280cc3045c9c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:00 -0700 Subject: fs: introduce alloc_inode_sb() to allocate filesystems specific inode The allocated inode cache is supposed to be added to its memcg list_lru which should be allocated as well in advance. That can be done by kmem_cache_alloc_lru() which allocates object and list_lru. The file systems is main user of it. So introduce alloc_inode_sb() to allocate file system specific inodes and set up the inode reclaim context properly. The file system is supposed to use alloc_inode_sb() to allocate inodes. In later patches, we will convert all users to the new API. Link: https://lkml.kernel.org/r/20220228122126.37293-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/porting.rst | 6 ++++++ fs/inode.c | 2 +- include/linux/fs.h | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index bf19fd6b86e7..7c1583dbeb59 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -45,6 +45,12 @@ typically between calling iget_locked() and unlocking the inode. At some point that will become mandatory. +**mandatory** + +The foo_inode_info should always be allocated through alloc_inode_sb() rather +than kmem_cache_alloc() or kmalloc() related to set up the inode reclaim context +correctly. + --- **mandatory** diff --git a/fs/inode.c b/fs/inode.c index 63324df6fa27..9d9b422504d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb) if (ops->alloc_inode) inode = ops->alloc_inode(sb); else - inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index ca9445f6cf3d..58a73e59e4c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -3114,6 +3115,16 @@ extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); +/* + * This must be used for allocating filesystems specific inodes to set + * up the inode reclaim context correctly. + */ +static inline void * +alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) +{ + return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); +} + extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { -- cgit v1.2.3 From fd60b28842df833477c42da6a6d63d0d114a5fcc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:03 -0700 Subject: fs: allocate inode by using alloc_inode_sb() The inode allocation is supposed to use alloc_inode_sb(), so convert kmem_cache_alloc() of all filesystems to alloc_inode_sb(). Link: https://lkml.kernel.org/r/20220228122126.37293-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Theodore Ts'o [ext4] Acked-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/bdev.c | 2 +- drivers/dax/super.c | 2 +- fs/9p/vfs_inode.c | 2 +- fs/adfs/super.c | 2 +- fs/affs/super.c | 2 +- fs/afs/super.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/bfs/inode.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/inode.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/coda/inode.c | 2 +- fs/ecryptfs/super.c | 2 +- fs/efs/super.c | 2 +- fs/erofs/super.c | 2 +- fs/exfat/super.c | 2 +- fs/ext2/super.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/inode.c | 2 +- fs/freevxfs/vxfs_super.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/super.c | 2 +- fs/hfs/super.c | 2 +- fs/hfsplus/super.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/super.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/isofs/inode.c | 2 +- fs/jffs2/super.c | 2 +- fs/jfs/super.c | 2 +- fs/minix/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/nilfs2/super.c | 2 +- fs/ntfs/inode.c | 2 +- fs/ntfs3/super.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/ocfs2/super.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/orangefs/super.c | 2 +- fs/overlayfs/super.c | 2 +- fs/proc/inode.c | 2 +- fs/qnx4/inode.c | 2 +- fs/qnx6/inode.c | 2 +- fs/reiserfs/super.c | 2 +- fs/romfs/super.c | 2 +- fs/squashfs/super.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/udf/super.c | 2 +- fs/ufs/super.c | 2 +- fs/vboxsf/super.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/zonefs/super.c | 2 +- ipc/mqueue.c | 2 +- mm/shmem.c | 2 +- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 57 files changed, 57 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/block/bdev.c b/block/bdev.c index 102837a37051..ac05343d410b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -385,7 +385,7 @@ static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); if (!ei) return NULL; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index e3029389d809..aedc15eabeac 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -282,7 +282,7 @@ static struct inode *dax_alloc_inode(struct super_block *sb) struct dax_device *dax_dev; struct inode *inode; - dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + dax_dev = alloc_inode_sb(sb, dax_cache, GFP_KERNEL); if (!dax_dev) return NULL; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2a10242c79c7..84c3cf7dffa5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -228,7 +228,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) { struct v9fs_inode *v9inode; - v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL); + v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; #ifdef CONFIG_9P_FSCACHE diff --git a/fs/adfs/super.c b/fs/adfs/super.c index bdbd26e571ed..e8bfc38239cd 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/super.c b/fs/affs/super.c index c609005a9eaa..4c5f30a83336 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb) { struct affs_inode_info *i; - i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL); if (!i) return NULL; diff --git a/fs/afs/super.c b/fs/afs/super.c index 5ec9fd97eccc..7592c0f469f1 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -679,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb) { struct afs_vnode *vnode; - vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL); + vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL); if (!vnode) return NULL; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c1ba13d19024..b4b3567ac655 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb) { struct befs_inode_info *bi; - bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); + bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fd691e4815c5..1926bec2c850 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; - bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); + bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5bbea5ec31fc..dcc2102cb546 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8787,7 +8787,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ef4a980a7bf3..9cfa6c730519 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -447,7 +447,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) struct ceph_inode_info *ci; int i; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS); if (!ci) return NULL; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 082c21478686..74ddece5b533 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -354,7 +354,7 @@ static struct inode * cifs_alloc_inode(struct super_block *sb) { struct cifsInodeInfo *cifs_inode; - cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); + cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ diff --git a/fs/coda/inode.c b/fs/coda/inode.c index d9f1bd7153df..2185328b65c7 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -43,7 +43,7 @@ static struct kmem_cache * coda_inode_cachep; static struct inode *coda_alloc_inode(struct super_block *sb) { struct coda_inode_info *ei; - ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, coda_inode_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 39116af0390f..0b1c878317ab 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -38,7 +38,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) struct ecryptfs_inode_info *inode_info; struct inode *inode = NULL; - inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL); + inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { diff --git a/fs/efs/super.c b/fs/efs/super.c index 62b155b9366b..b287f47c165b 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -69,7 +69,7 @@ static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 915eefe0d7e2..a9ffffc62c25 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -84,7 +84,7 @@ static void erofs_inode_init_once(void *ptr) static struct inode *erofs_alloc_inode(struct super_block *sb) { struct erofs_inode *vi = - kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8c9fb7dcec16..9f892903419a 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -183,7 +183,7 @@ static struct inode *exfat_alloc_inode(struct super_block *sb) { struct exfat_inode_info *ei; - ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, exfat_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 94f1fbd7d3ac..e48aaf52ce93 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -180,7 +180,7 @@ static struct kmem_cache * ext2_inode_cachep; static struct inode *ext2_alloc_inode(struct super_block *sb) { struct ext2_inode_info *ei; - ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->i_block_alloc_info = NULL; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c5021ca0a28a..774eef9f7913 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1316,7 +1316,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; - ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a6f1c6d426d1..38796681cc86 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -745,7 +745,7 @@ static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; - ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 578a5062706e..22eed5a73ac2 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -124,7 +124,7 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb) { struct vxfs_inode_info *vi; - vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL); + vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; inode_init_once(&vi->vfs_inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9ee36aa73251..8c0665c5dff8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -72,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) { struct fuse_inode *fi; - fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL); if (!fi) return NULL; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 64c67090f503..cf9cf66522b3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1425,7 +1425,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) { struct gfs2_inode *ip; - ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL); if (!ip) return NULL; ip->i_flags = 0; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 12d9bae39363..6764afa98a6f 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -162,7 +162,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) { struct hfs_inode_info *i; - i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b9e3db3f855f..8479add998b5 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -624,7 +624,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) { struct hfsplus_inode_info *i; - i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ef481c3d9019..458300390c51 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -222,7 +222,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT); + hi = alloc_inode_sb(sb, hostfs_inode_cache, GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a7dbfc892022..1cb89595b875 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -232,7 +232,7 @@ static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; - ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7c6c7498be0..171212bdaae6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1110,7 +1110,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; - p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); + p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 0c6eacfcbeef..d7491692aea3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -70,7 +70,7 @@ static struct kmem_cache *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; - ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 81ca58c10b72..7ea37f49f1e1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -39,7 +39,7 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) { struct jffs2_inode_info *f; - f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); + f = alloc_inode_sb(sb, jffs2_inode_cachep, GFP_KERNEL); if (!f) return NULL; return &f->vfs_inode; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 24cbc9946e01..f1a13a74cddf 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -102,7 +102,7 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) { struct jfs_inode_info *jfs_inode; - jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS); if (!jfs_inode) return NULL; #ifdef CONFIG_QUOTA diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a71f1cf894b9..8a0af80741b5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -63,7 +63,7 @@ static struct kmem_cache * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d96baa4450e3..3351c2de3e08 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2238,7 +2238,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 63e5fa74016c..3e05c98631ec 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -151,7 +151,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) { struct nilfs_inode_info *ii; - ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS); if (!ii) return NULL; ii->i_bh = NULL; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 517b71c73aa9..efe0602b4e51 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -310,7 +310,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) ntfs_inode *ni; ntfs_debug("Entering."); - ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS); + ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS); if (likely(ni != NULL)) { ni->state = 0; return VFS_I(ni); diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 29813200c7af..278dcf502410 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -399,7 +399,7 @@ static struct kmem_cache *ntfs_inode_cachep; static struct inode *ntfs_alloc_inode(struct super_block *sb) { - struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS); + struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS); if (!ni) return NULL; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index fa0a14f199eb..e360543ad7e7 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -280,7 +280,7 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) { struct dlmfs_inode_private *ip; - ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS); if (!ip) return NULL; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8bde30fa5387..477cdf94122e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -548,7 +548,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) { struct ocfs2_inode_info *oi; - oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); + oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS); if (!oi) return NULL; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index f825176ff4ed..f0b7f4d51a17 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -335,7 +335,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) { struct op_inode_info *oi; - oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL); + oi = alloc_inode_sb(sb, op_inode_cachep, GFP_KERNEL); if (!oi) return NULL; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index d90d8addbfc2..5254256a224d 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -107,7 +107,7 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) { struct orangefs_inode_s *orangefs_inode; - orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL); + orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL); if (!orangefs_inode) return NULL; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7bb0a47cb615..001cdbb8f015 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -174,7 +174,7 @@ static struct kmem_cache *ovl_inode_cachep; static struct inode *ovl_alloc_inode(struct super_block *sb) { - struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL); + struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL); if (!oi) return NULL; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f84355c5a36d..73aeb4e6d32e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -66,7 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; - ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3fb7fc819b4f..a635bb6615e9 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -338,7 +338,7 @@ static struct kmem_cache *qnx4_inode_cachep; static struct inode *qnx4_alloc_inode(struct super_block *sb) { struct qnx4_inode_info *ei; - ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, qnx4_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 61191f7bdf62..9d8e7e9788a1 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -597,7 +597,7 @@ static struct kmem_cache *qnx6_inode_cachep; static struct inode *qnx6_alloc_inode(struct super_block *sb) { struct qnx6_inode_info *ei; - ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 82e09901462e..756c4916c7ae 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -639,7 +639,7 @@ static struct kmem_cache *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; atomic_set(&ei->openers, 0); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 259f684d9236..9e6bbb4219de 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -375,7 +375,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; - inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL); return inode ? &inode->vfs_inode : NULL; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index b1b556dbce12..4f74abbc1a54 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -584,7 +584,7 @@ static void __exit exit_squashfs_fs(void) static struct inode *squashfs_alloc_inode(struct super_block *sb) { struct squashfs_inode_info *ei = - kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL); return ei ? &ei->vfs_inode : NULL; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index be47263b8605..9e8d4a6fb2f3 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -306,7 +306,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) { struct sysv_inode_info *si; - si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL); + si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL); if (!si) return NULL; return &si->vfs_inode; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index aa7a1381c457..bad67455215f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -268,7 +268,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) { struct ubifs_inode *ui; - ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS); if (!ui) return NULL; diff --git a/fs/udf/super.c b/fs/udf/super.c index f26b5e0b84b6..48871615e489 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -136,7 +136,7 @@ static struct kmem_cache *udf_inode_cachep; static struct inode *udf_alloc_inode(struct super_block *sb) { struct udf_inode_info *ei; - ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL); if (!ei) return NULL; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 00a01471ea05..23377c1baed9 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1443,7 +1443,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; - ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); + ei = alloc_inode_sb(sb, ufs_inode_cachep, GFP_NOFS); if (!ei) return NULL; diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 37dd3fe5b1e9..d2f6df69f611 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -241,7 +241,7 @@ static struct inode *vboxsf_alloc_inode(struct super_block *sb) { struct vboxsf_inode *sf_i; - sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS); + sf_i = alloc_inode_sb(sb, vboxsf_inode_cachep, GFP_NOFS); if (!sf_i) return NULL; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9644f938990c..9e434cb41e48 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -77,7 +77,7 @@ xfs_inode_alloc( * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL * and return NULL here on ENOMEM. */ - ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); + ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); if (inode_init_always(mp->m_super, VFS_I(ip))) { kmem_cache_free(xfs_inode_cache, ip); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b76dfb310ab6..11682cd7045d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1137,7 +1137,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) { struct zonefs_inode_info *zi; - zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL); + zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL); if (!zi) return NULL; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5becca9be867..7c08eb3c258d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -486,7 +486,7 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb) { struct mqueue_inode_info *ei; - ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/mm/shmem.c b/mm/shmem.c index 49447c5410d8..f21eb0ef8ae0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3708,7 +3708,7 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; - info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; diff --git a/net/socket.c b/net/socket.c index 982eecad464c..6887840682bb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -301,7 +301,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; - ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; init_waitqueue_head(&ei->socket.wq.wait); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 35588f0afa86..0b6034fab9ab 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -197,7 +197,7 @@ static struct inode * rpc_alloc_inode(struct super_block *sb) { struct rpc_inode *rpci; - rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); + rpci = alloc_inode_sb(sb, rpc_inode_cachep, GFP_KERNEL); if (!rpci) return NULL; return &rpci->vfs_inode; -- cgit v1.2.3 From 65d3af647b4016f134145591914102ee762350b1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:06 -0700 Subject: f2fs: allocate inode by using alloc_inode_sb() The inode allocation is supposed to use alloc_inode_sb(), so convert kmem_cache_alloc() to alloc_inode_sb(). Link: https://lkml.kernel.org/r/20220228122126.37293-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ebd32daf052c..d75dfa23e6e7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1345,8 +1345,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep, - GFP_F2FS_ZERO, false, F2FS_SB(sb)); + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC); + return NULL; + } + + fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; -- cgit v1.2.3 From f53bf711d4d8e07de2caa3f13f6082c6e24145a4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:09 -0700 Subject: mm: dcache: use kmem_cache_alloc_lru() to allocate dentry Like inode cache, the dentry will also be added to its memcg list_lru. So replace kmem_cache_alloc() with kmem_cache_alloc_lru() to allocate dentry. Link: https://lkml.kernel.org/r/20220228122126.37293-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Johannes Weiner Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index c84269c6e8bf..93f4f5ee07bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1766,7 +1766,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) char *dname; int err; - dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, + GFP_KERNEL); if (!dentry) return NULL; -- cgit v1.2.3 From 824ddc601adc2cc48efb7f58b57997986c1c1276 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 22 Mar 2022 14:45:32 -0700 Subject: userfaultfd: provide unmasked address on page-fault Userfaultfd is supposed to provide the full address (i.e., unmasked) of the faulting access back to userspace. However, that is not the case for quite some time. Even running "userfaultfd_demo" from the userfaultfd man page provides the wrong output (and contradicts the man page). Notice that "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) and not the first read address (0x7fc5e30b300f). Address returned by mmap() = 0x7fc5e30b3000 fault_handler_thread(): poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 (uffdio_copy.copy returned 4096) Read address 0x7fc5e30b300f in main(): A Read address 0x7fc5e30b340f in main(): A Read address 0x7fc5e30b380f in main(): A Read address 0x7fc5e30b3c0f in main(): A The exact address is useful for various reasons and specifically for prefetching decisions. If it is known that the memory is populated by certain objects whose size is not page-aligned, then based on the faulting address, the uffd-monitor can decide whether to prefetch and prefault the adjacent page. This bug has been for quite some time in the kernel: since commit 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") vmf->virtual_address"), which dates back to 2016. A concern has been raised that existing userspace application might rely on the old/wrong behavior in which the address is masked. Therefore, it was suggested to provide the masked address unless the user explicitly asks for the exact address. Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct userfaultfd to provide the exact address. Add a new "real_address" field to vmf to hold the unmasked address. Provide the address to userspace accordingly. Initialize real_address in various code-paths to be consistent with address, even when it is not used, to be on the safe side. [namit@vmware.com: initialize real_address on all code paths, per Jan] Link: https://lkml.kernel.org/r/20220226022655.350562-1-namit@vmware.com [akpm@linux-foundation.org: fix typo in comment, per Jan] Link: https://lkml.kernel.org/r/20220218041003.3508-1-namit@vmware.com Signed-off-by: Nadav Amit Acked-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Jan Kara Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 5 ++++- include/linux/mm.h | 3 ++- include/uapi/linux/userfaultfd.h | 8 +++++++- mm/hugetlb.c | 6 ++++-- mm/memory.c | 1 + mm/swapfile.c | 1 + 6 files changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8e03b3d3f5fa..aa0c47cb0d16 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; + + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) + address &= PAGE_MASK; msg.arg.pagefault.address = address; /* * These flags indicate why the userfault occurred: @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d58321386cc..0e4fd101616e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -478,7 +478,8 @@ struct vm_fault { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + unsigned long address; /* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 05b31d60acf6..ef739054cb1c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -32,7 +32,8 @@ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ - UFFD_FEATURE_MINOR_SHMEM) + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -189,6 +190,10 @@ struct uffdio_api { * * UFFD_FEATURE_MINOR_SHMEM indicates the same support as * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -201,6 +206,7 @@ struct uffdio_api { #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f425147e4ce3..75b41879e9e9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5341,6 +5341,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, pgoff_t idx, unsigned int flags, unsigned long haddr, + unsigned long addr, unsigned long reason) { vm_fault_t ret; @@ -5348,6 +5349,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = haddr, + .real_address = addr, .flags = flags, /* @@ -5416,7 +5418,7 @@ retry: /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MISSING); goto out; } @@ -5480,7 +5482,7 @@ retry: unlock_page(page); put_page(page); ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MINOR); goto out; } diff --git a/mm/memory.c b/mm/memory.c index 1a55b4c5b5db..e0f3410fa70c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4633,6 +4633,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), diff --git a/mm/swapfile.c b/mm/swapfile.c index bf0df7aa7158..33c7abb16610 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct vm_fault vmf = { .vma = vma, .address = addr, + .real_address = addr, .pmd = pmd, }; -- cgit v1.2.3 From b698f0a1773f7df73f2bb4bfe0e597ea1bb3881f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Mar 2022 14:45:38 -0700 Subject: mm/fs: delete PF_SWAPWRITE PF_SWAPWRITE has been redundant since v3.2 commit ee72886d8ed5 ("mm: vmscan: do not writeback filesystem pages in direct reclaim"). Coincidentally, NeilBrown's current patch "remove inode_congested()" deletes may_write_to_inode(), which appeared to be the one function which took notice of PF_SWAPWRITE. But if you study the old logic, and the conditions under which may_write_to_inode() was called, you discover that flag and function have been pointless for a decade. Link: https://lkml.kernel.org/r/75e80e7-742d-e3bd-531-614db8961e4@google.com Signed-off-by: Hugh Dickins Cc: NeilBrown Cc: Jan Kara Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 3 --- fs/xfs/libxfs/xfs_btree.c | 2 +- include/linux/sched.h | 1 - mm/migrate.c | 7 ------- mm/vmscan.c | 8 ++------ 5 files changed, 3 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 42a3dfad40b8..3d8dacfff150 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2197,7 +2197,6 @@ void wb_workfn(struct work_struct *work) long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); - current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { @@ -2226,8 +2225,6 @@ void wb_workfn(struct work_struct *work) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); - - current->flags &= ~PF_SWAPWRITE; } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f18a875f51c6..c1500b238520 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2818,7 +2818,7 @@ xfs_btree_split_worker( * in any way. */ if (args->kswapd) - new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + new_pflags |= PF_MEMALLOC | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); xfs_trans_set_context(args->cur->bc_tp); diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248..c3c841a02a00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1689,7 +1689,6 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ diff --git a/mm/migrate.c b/mm/migrate.c index 4359d49b508a..fc75c409482e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1350,7 +1350,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, bool is_thp = false; struct page *page; struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); LIST_HEAD(thp_split_pages); @@ -1359,9 +1358,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, trace_mm_migrate_pages_start(mode, reason); - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; @@ -1516,9 +1512,6 @@ out: trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason); - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - if (ret_succeeded) *ret_succeeded = nr_succeeded; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e1469887afa..d4168966311f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4457,7 +4457,7 @@ static int kswapd(void *p) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ - tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); @@ -4508,7 +4508,7 @@ kswapd_try_sleep: goto kswapd_try_sleep; } - tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); return 0; } @@ -4749,11 +4749,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP - * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_UNMAP. */ noreclaim_flag = memalloc_noreclaim_save(); - p->flags |= PF_SWAPWRITE; set_task_reclaim_state(p, &sc.reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { @@ -4767,7 +4764,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } set_task_reclaim_state(p, NULL); - current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); psi_memstall_leave(&pflags); -- cgit v1.2.3