From 36518b6b4da7e8d4387bc19ad21e772f1060e9d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Jun 2022 16:04:03 -0400 Subject: teach iomap_dio_rw() to suppress dsync New flag, equivalent to removal of IOCB_DSYNC from iocb flags. This mimics what btrfs is doing (and that's what btrfs will switch to). However, I'm not at all sure that we want to suppress REQ_FUA for those - all btrfs hack really cares about is suppression of generic_write_sync(). For now let's keep the existing behaviour, but I really want to hear more detailed arguments pro or contra. [folded brain fix from willy] Suggested-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- fs/iomap/direct-io.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 370c3241618a..c10c69e2de24 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -548,17 +548,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb->ki_flags & IOCB_DSYNC && + !(dio_flags & IOMAP_DIO_NOSYNC)) { dio->flags |= IOMAP_DIO_NEED_SYNC; - /* - * For datasync only writes, we optimistically try using FUA for - * this IO. Any non-FUA write that occurs will clear this flag, - * hence we know before completion whether a cache flush is - * necessary. - */ - if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) - dio->flags |= IOMAP_DIO_WRITE_FUA; + /* + * For datasync only writes, we optimistically try + * using FUA for this IO. Any non-FUA write that + * occurs will clear this flag, hence we know before + * completion whether a cache flush is necessary. + */ + if (!(iocb->ki_flags & IOCB_SYNC)) + dio->flags |= IOMAP_DIO_WRITE_FUA; + } } if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { -- cgit v1.2.3 From eacdf4eaca632438c8453294727b94d7c5745c62 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Jun 2022 16:10:01 -0400 Subject: btrfs: use IOMAP_DIO_NOSYNC ... instead of messing with iocb flags Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- fs/btrfs/file.c | 17 ----------------- fs/btrfs/inode.c | 3 ++- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fd827b99c1b..98f81e304eb1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1848,7 +1848,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { - const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1901,15 +1900,6 @@ relock: goto buffered; } - /* - * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() - * calls generic_write_sync() (through iomap_dio_complete()), because - * that results in calling fsync (btrfs_sync_file()) which will try to - * lock the inode in exclusive/write mode. - */ - if (is_sync_write) - iocb->ki_flags &= ~IOCB_DSYNC; - /* * The iov_iter can be mapped to the same file range we are writing to. * If that's the case, then we will deadlock in the iomap code, because @@ -1964,13 +1954,6 @@ again: btrfs_inode_unlock(inode, ilock_flags); - /* - * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do - * the fsync (call generic_write_sync()). - */ - if (is_sync_write) - iocb->ki_flags |= IOCB_DSYNC; - /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3..fbf0aee7d66a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8152,7 +8152,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, + &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v1.2.3 From e87f2c26c8085dac59978dee1beeb05cef31a9dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 09:28:12 -0400 Subject: struct file: use anonymous union member for rcuhead and llist Once upon a time we couldn't afford anon unions; these days minimal gcc version had been raised enough to take care of that. Reviewed-by: Jan Kara Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- fs/file_table.c | 16 ++++++++-------- include/linux/fs.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 5424e3a8df5f..b989e33aacda 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp; static void file_free_rcu(struct rcu_head *head) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + struct file *f = container_of(head, struct file, f_rcuhead); put_cred(f->f_cred); kmem_cache_free(filp_cachep, f); @@ -56,7 +56,7 @@ static inline void file_free(struct file *f) security_file_free(f); if (!(f->f_mode & FMODE_NOACCOUNT)) percpu_counter_dec(&nr_files); - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + call_rcu(&f->f_rcuhead, file_free_rcu); } /* @@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free_rcu(&f->f_u.fu_rcuhead); + file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } @@ -341,13 +341,13 @@ static void delayed_fput(struct work_struct *unused) struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; - llist_for_each_entry_safe(f, t, node, f_u.fu_llist) + llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_u.fu_rcuhead)); + __fput(container_of(work, struct file, f_rcuhead)); } /* @@ -374,8 +374,8 @@ void fput(struct file *file) struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_u.fu_rcuhead, ____fput); - if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) + init_task_work(&file->f_rcuhead, ____fput); + if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), @@ -384,7 +384,7 @@ void fput(struct file *file) */ } - if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) + if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..6a2a4906041f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -924,9 +924,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) struct file { union { - struct llist_node fu_llist; - struct rcu_head fu_rcuhead; - } f_u; + struct llist_node f_llist; + struct rcu_head f_rcuhead; + }; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; -- cgit v1.2.3 From 91b94c5d6ae55d1161633047ffeea644b110b35f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 09:39:27 -0400 Subject: iocb: delay evaluation of IS_SYNC(...) until we want to check IOCB_DSYNC New helper to be used instead of direct checks for IOCB_DSYNC: iocb_is_dsync(iocb). Checks converted, which allows to avoid the IS_SYNC(iocb->ki_filp->f_mapping->host) part (4 cache lines) from iocb_flags() - it's checked in iocb_is_dsync() instead Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- block/fops.c | 2 +- fs/btrfs/file.c | 2 +- fs/direct-io.c | 2 +- fs/fuse/file.c | 2 +- fs/iomap/direct-io.c | 3 +-- fs/zonefs/super.c | 2 +- include/linux/fs.h | 10 ++++++++-- 7 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/block/fops.c b/block/fops.c index d6b3276a6c68..6e86931ab847 100644 --- a/block/fops.c +++ b/block/fops.c @@ -37,7 +37,7 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) op |= REQ_FUA; return op; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 98f81e304eb1..54358a5c9d56 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2021,7 +2021,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb->ki_flags & IOCB_DSYNC; + const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we diff --git a/fs/direct-io.c b/fs/direct-io.c index 840752006f60..39647eb56904 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1210,7 +1210,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 05caa2b9272e..00fa861aeead 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1042,7 +1042,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) { unsigned int flags = iocb->ki_filp->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) flags |= O_DSYNC; if (iocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c10c69e2de24..31c7f1035b20 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -548,8 +548,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb->ki_flags & IOCB_DSYNC && - !(dio_flags & IOMAP_DIO_NOSYNC)) { + if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) { dio->flags |= IOMAP_DIO_NEED_SYNC; /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bcb21aea990a..04a98b4cd7ee 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -746,7 +746,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; bio->bi_ioprio = iocb->ki_ioprio; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) bio->bi_opf |= REQ_FUA; ret = bio_iov_iter_get_pages(bio, from); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a2a4906041f..380a1292f4f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2720,6 +2720,12 @@ extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); +static inline bool iocb_is_dsync(const struct kiocb *iocb) +{ + return (iocb->ki_flags & IOCB_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host); +} + /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount @@ -2727,7 +2733,7 @@ extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { - if (iocb->ki_flags & IOCB_DSYNC) { + if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); @@ -3262,7 +3268,7 @@ static inline int iocb_flags(struct file *file) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; - if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) + if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; -- cgit v1.2.3 From 164f4064ca81eefcea29f7f5dcf394f92be1d0c0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 May 2022 11:38:11 -0400 Subject: keep iocb_flags() result cached in struct file * calculate at the time we set FMODE_OPENED (do_dentry_open() for normal opens, alloc_file() for pipe()/socket()/etc.) * update when handling F_SETFL * keep in a new field - file->f_iocb_flags; since that thing is needed only before the refcount reaches zero, we can put it into the same anon union where ->f_rcuhead and ->f_llist live - those are used only after refcount reaches zero. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- drivers/nvme/target/io-cmd-file.c | 2 +- fs/aio.c | 2 +- fs/fcntl.c | 1 + fs/file_table.c | 1 + fs/io_uring.c | 2 +- fs/open.c | 1 + include/linux/fs.h | 5 ++--- 7 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index f3d58abf11e0..64b47e2a4633 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -112,7 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_pos = pos; iocb->ki_filp = req->ns->file; - iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); + iocb->ki_flags = ki_flags | iocb->ki_filp->f_iocb_flags; return call_iter(iocb, &iter); } diff --git a/fs/aio.c b/fs/aio.c index 3c249b938632..2bdd444d408b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = iocb_flags(req->ki_filp); + req->ki_flags = req->ki_filp->f_iocb_flags; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 34a3faa4886d..146c9ab0cd4b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: diff --git a/fs/file_table.c b/fs/file_table.c index b989e33aacda..905792b0521c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -241,6 +241,7 @@ static struct file *alloc_file(const struct path *path, int flags, if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; + file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3aab4182fd89..53424b1f019f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4330,7 +4330,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (!io_req_ffs_set(req)) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - kiocb->ki_flags = iocb_flags(file); + kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, req->rw.flags); if (unlikely(ret)) return ret; diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..d80441a0bf17 100644 --- a/fs/open.c +++ b/fs/open.c @@ -862,6 +862,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/include/linux/fs.h b/include/linux/fs.h index 380a1292f4f9..c82b9d442f56 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -926,6 +926,7 @@ struct file { union { struct llist_node f_llist; struct rcu_head f_rcuhead; + unsigned int f_iocb_flags; }; struct path f_path; struct inode *f_inode; /* cached value */ @@ -2199,13 +2200,11 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns, !gid_valid(i_gid_into_mnt(mnt_userns, inode)); } -static inline int iocb_flags(struct file *file); - static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, - .ki_flags = iocb_flags(filp), + .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } -- cgit v1.2.3