From 2eb7509a05443048fb4df60b782de3f03c6c298b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 13 May 2021 16:13:54 +0200 Subject: gfs2: Add wrapper for iomap_file_buffered_write Add a wrapper around iomap_file_buffered_write. We'll add code for when the operation needs to be retried here later. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c559827cb6f9..da742b470f23 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -876,6 +876,20 @@ out_uninit: return written ? written : ret; } +static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + + current->backing_dev_info = inode_to_bdi(inode); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + current->backing_dev_info = NULL; + if (ret > 0) + iocb->ki_pos += ret; + return ret; +} + /** * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context @@ -927,9 +941,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out_unlock; iocb->ki_flags |= IOCB_DSYNC; - current->backing_dev_info = inode_to_bdi(inode); - buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); - current->backing_dev_info = NULL; + buffered = gfs2_file_buffered_write(iocb, from); if (unlikely(buffered <= 0)) { if (!ret) ret = buffered; @@ -943,7 +955,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * the direct I/O range as we don't know if the buffered pages * made it to disk. */ - iocb->ki_pos += buffered; ret2 = generic_write_sync(iocb, buffered); invalidate_mapping_pages(mapping, (iocb->ki_pos - buffered) >> PAGE_SHIFT, @@ -951,13 +962,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!ret || ret2 > 0) ret += ret2; } else { - current->backing_dev_info = inode_to_bdi(inode); - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); - current->backing_dev_info = NULL; - if (likely(ret > 0)) { - iocb->ki_pos += ret; + ret = gfs2_file_buffered_write(iocb, from); + if (likely(ret > 0)) ret = generic_write_sync(iocb, ret); - } } out_unlock: -- cgit v1.2.3 From 6144464937fe1e6135b13a30502a339d549bf093 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 10 Aug 2021 00:43:55 +0200 Subject: gfs2: Clean up function may_grant Pass the first current glock holder into function may_grant and deobfuscate the logic there. While at it, switch from BUG_ON to GLOCK_BUG_ON in may_grant. To make that build cleanly, de-constify the may_grant arguments. We're now using function find_first_holder in do_promote, so move the function's definition above do_promote. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 119 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 50 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e0eaa9cf9fb6..bffd9c20f2de 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -301,46 +301,59 @@ void gfs2_glock_put(struct gfs2_glock *gl) } /** - * may_grant - check if its ok to grant a new lock + * may_grant - check if it's ok to grant a new lock * @gl: The glock + * @current_gh: One of the current holders of @gl * @gh: The lock request which we wish to grant * - * Returns: true if its ok to grant the lock + * With our current compatibility rules, if a glock has one or more active + * holders (HIF_HOLDER flag set), any of those holders can be passed in as + * @current_gh; they are all the same as far as compatibility with the new @gh + * goes. + * + * Returns true if it's ok to grant the lock. */ -static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) -{ - const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list); +static inline bool may_grant(struct gfs2_glock *gl, + struct gfs2_holder *current_gh, + struct gfs2_holder *gh) +{ + if (current_gh) { + GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, ¤t_gh->gh_iflags)); + + switch(current_gh->gh_state) { + case LM_ST_EXCLUSIVE: + /* + * Here we make a special exception to grant holders + * who agree to share the EX lock with other holders + * who also have the bit set. If the original holder + * has the LM_FLAG_NODE_SCOPE bit set, we grant more + * holders with the bit set. + */ + return gh->gh_state == LM_ST_EXCLUSIVE && + (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && + (gh->gh_flags & LM_FLAG_NODE_SCOPE); - if (gh != gh_head) { - /** - * Here we make a special exception to grant holders who agree - * to share the EX lock with other holders who also have the - * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit - * is set, we grant more holders with the bit set. - */ - if (gh_head->gh_state == LM_ST_EXCLUSIVE && - (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) && - gh->gh_state == LM_ST_EXCLUSIVE && - (gh->gh_flags & LM_FLAG_NODE_SCOPE)) - return 1; - if ((gh->gh_state == LM_ST_EXCLUSIVE || - gh_head->gh_state == LM_ST_EXCLUSIVE)) - return 0; + case LM_ST_SHARED: + case LM_ST_DEFERRED: + return gh->gh_state == current_gh->gh_state; + + default: + return false; + } } + if (gl->gl_state == gh->gh_state) - return 1; + return true; if (gh->gh_flags & GL_EXACT) - return 0; + return false; if (gl->gl_state == LM_ST_EXCLUSIVE) { - if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) - return 1; - if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) - return 1; + return gh->gh_state == LM_ST_SHARED || + gh->gh_state == LM_ST_DEFERRED; } - if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) - return 1; - return 0; + if (gh->gh_flags & LM_FLAG_ANY) + return gl->gl_state != LM_ST_UNLOCKED; + return false; } static void gfs2_holder_wake(struct gfs2_holder *gh) @@ -380,6 +393,24 @@ static void do_error(struct gfs2_glock *gl, const int ret) } } +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, + gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock @@ -393,14 +424,15 @@ __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh, *tmp; + struct gfs2_holder *gh, *tmp, *first_gh; int ret; restart: + first_gh = find_first_holder(gl); list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; - if (may_grant(gl, gh)) { + if (may_grant(gl, first_gh, gh)) { if (gh->gh_list.prev == &gl->gl_holders && glops->go_lock) { spin_unlock(&gl->gl_lockref.lock); @@ -722,23 +754,6 @@ out: spin_lock(&gl->gl_lockref.lock); } -/** - * find_first_holder - find the first "holder" gh - * @gl: the glock - */ - -static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - - if (!list_empty(&gl->gl_holders)) { - gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - return gh; - } - return NULL; -} - /** * run_queue - do all outstanding tasks related to a glock * @gl: The glock in question @@ -1354,8 +1369,12 @@ __acquires(&gl->gl_lockref.lock) GLOCK_BUG_ON(gl, true); if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) - try_futile = !may_grant(gl, gh); + if (test_bit(GLF_LOCK, &gl->gl_flags)) { + struct gfs2_holder *first_gh; + + first_gh = find_first_holder(gl); + try_futile = !may_grant(gl, first_gh, gh); + } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; } -- cgit v1.2.3 From dc732906c2450939c319fec6e258aa89ecb5a632 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 19 Aug 2021 20:51:23 +0200 Subject: gfs2: Introduce flag for glock holder auto-demotion This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure that will allow glocks to be demoted automatically on locking conflicts. When a locking request comes in that isn't compatible with the locking state of an active holder and that holder has the HIF_MAY_DEMOTE flag set, the holder will be demoted before the incoming locking request is granted. Note that this mechanism demotes active holders (with the HIF_HOLDER flag set), while before we were only demoting glocks without any active holders. This allows processes to keep hold of locks that may form a cyclic locking dependency; the core glock logic will then break those dependencies in case a conflicting locking request occurs. We'll use this to avoid giving up the inode glock proactively before faulting in pages. Processes that allow a glock holder to be taken away indicate this by calling gfs2_holder_allow_demote(), which sets the HIF_MAY_DEMOTE flag. Later, they call gfs2_holder_disallow_demote() to clear the flag again, and then they check if their holder is still queued: if it is, they are still holding the glock; if it isn't, they can re-acquire the glock (or abort). Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 215 +++++++++++++++++++++++++++++++++++++++++++++---------- fs/gfs2/glock.h | 20 ++++++ fs/gfs2/incore.h | 1 + 3 files changed, 200 insertions(+), 36 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index bffd9c20f2de..f686083d0250 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -58,6 +58,7 @@ struct gfs2_glock_iter { typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); +static void __gfs2_glock_dq(struct gfs2_holder *gh); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; @@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl) if (gl->gl_state == LM_ST_UNLOCKED) return 0; + /* + * Note that demote_ok is used for the lru process of disposing of + * glocks. For this purpose, we don't care if the glock's holders + * have the HIF_MAY_DEMOTE flag set or not. If someone is using + * them, don't demote. + */ if (!list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) @@ -379,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret) struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + if (!test_bit(HIF_WAIT, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; @@ -393,6 +400,40 @@ static void do_error(struct gfs2_glock *gl, const int ret) } } +/** + * demote_incompat_holders - demote incompatible demoteable holders + * @gl: the glock we want to promote + * @new_gh: the new holder to be promoted + */ +static void demote_incompat_holders(struct gfs2_glock *gl, + struct gfs2_holder *new_gh) +{ + struct gfs2_holder *gh; + + /* + * Demote incompatible holders before we make ourselves eligible. + * (This holder may or may not allow auto-demoting, but we don't want + * to demote the new holder before it's even granted.) + */ + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + /* + * Since holders are at the front of the list, we stop when we + * find the first non-holder. + */ + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return; + if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && + !may_grant(gl, new_gh, gh)) { + /* + * We should not recurse into do_promote because + * __gfs2_glock_dq only calls handle_callback, + * gfs2_glock_add_to_lru and __gfs2_glock_queue_work. + */ + __gfs2_glock_dq(gh); + } + } +} + /** * find_first_holder - find the first "holder" gh * @gl: the glock @@ -411,6 +452,26 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) return NULL; } +/** + * find_first_strong_holder - find the first non-demoteable holder + * @gl: the glock + * + * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set. + */ +static inline struct gfs2_holder * +find_first_strong_holder(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return NULL; + if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) + return gh; + } + return NULL; +} + /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock @@ -425,14 +486,20 @@ __acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh, *tmp, *first_gh; + bool incompat_holders_demoted = false; int ret; restart: - first_gh = find_first_holder(gl); + first_gh = find_first_strong_holder(gl); list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + if (!test_bit(HIF_WAIT, &gh->gh_iflags)) continue; if (may_grant(gl, first_gh, gh)) { + if (!incompat_holders_demoted) { + demote_incompat_holders(gl, first_gh); + incompat_holders_demoted = true; + first_gh = gh; + } if (gh->gh_list.prev == &gl->gl_holders && glops->go_lock) { spin_unlock(&gl->gl_lockref.lock); @@ -458,6 +525,11 @@ restart: gfs2_holder_wake(gh); continue; } + /* + * If we get here, it means we may not grant this holder for + * some reason. If this holder is the head of the list, it + * means we have a blocked holder at the head, so return 1. + */ if (gh->gh_list.prev == &gl->gl_holders) return 1; do_error(gl, 0); @@ -1372,7 +1444,7 @@ __acquires(&gl->gl_lockref.lock) if (test_bit(GLF_LOCK, &gl->gl_flags)) { struct gfs2_holder *first_gh; - first_gh = find_first_holder(gl); + first_gh = find_first_strong_holder(gl); try_futile = !may_grant(gl, first_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) @@ -1381,7 +1453,8 @@ __acquires(&gl->gl_lockref.lock) list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && - (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && + !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) goto trap_recursive; if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { @@ -1477,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh) return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } -/** - * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) - * @gh: the glock holder - * - */ +static inline bool needs_demote(struct gfs2_glock *gl) +{ + return (test_bit(GLF_DEMOTE, &gl->gl_flags) || + test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); +} -void gfs2_glock_dq(struct gfs2_holder *gh) +static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned delay = 0; int fast_path = 0; - spin_lock(&gl->gl_lockref.lock); /* - * If we're in the process of file system withdraw, we cannot just - * dequeue any glocks until our journal is recovered, lest we - * introduce file system corruption. We need two exceptions to this - * rule: We need to allow unlocking of nondisk glocks and the glock - * for our own journal that needs recovery. + * This while loop is similar to function demote_incompat_holders: + * If the glock is due to be demoted (which may be from another node + * or even if this holder is GL_NOCACHE), the weak holders are + * demoted as well, allowing the glock to be demoted. */ - if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && - glock_blocked_by_withdraw(gl) && - gh->gh_gl != sdp->sd_jinode_gl) { - sdp->sd_glock_dqs_held++; - spin_unlock(&gl->gl_lockref.lock); - might_sleep(); - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, - TASK_UNINTERRUPTIBLE); - spin_lock(&gl->gl_lockref.lock); - } - if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, false); + while (gh) { + /* + * If we're in the process of file system withdraw, we cannot + * just dequeue any glocks until our journal is recovered, lest + * we introduce file system corruption. We need two exceptions + * to this rule: We need to allow unlocking of nondisk glocks + * and the glock for our own journal that needs recovery. + */ + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + + /* + * This holder should not be cached, so mark it for demote. + * Note: this should be done before the check for needs_demote + * below. + */ + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + + list_del_init(&gh->gh_list); + clear_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_glock_queue(gh, 0); + + /* + * If there hasn't been a demote request we are done. + * (Let the remaining holders, if any, keep holding it.) + */ + if (!needs_demote(gl)) { + if (list_empty(&gl->gl_holders)) + fast_path = 1; + break; + } + /* + * If we have another strong holder (we cannot auto-demote) + * we are done. It keeps holding it until it is done. + */ + if (find_first_strong_holder(gl)) + break; - list_del_init(&gh->gh_list); - clear_bit(HIF_HOLDER, &gh->gh_iflags); - if (list_empty(&gl->gl_holders) && - !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - fast_path = 1; + /* + * If we have a weak holder at the head of the list, it + * (and all others like it) must be auto-demoted. If there + * are no more weak holders, we exit the while loop. + */ + gh = find_first_holder(gl); + } if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) gfs2_glock_add_to_lru(gl); - trace_gfs2_glock_queue(gh, 0); if (unlikely(!fast_path)) { gl->gl_lockref.count++; if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -1530,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) delay = gl->gl_hold_time; __gfs2_glock_queue_work(gl, delay); } +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_lockref.lock); + __gfs2_glock_dq(gh); spin_unlock(&gl->gl_lockref.lock); } @@ -1692,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { + struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, }; unsigned long delay = 0; unsigned long holdtime; unsigned long now = jiffies; @@ -1706,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } + /* + * Note 1: We cannot call demote_incompat_holders from handle_callback + * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq -> + * handle_callback -> demote_incompat_holders -> gfs2_glock_dq + * Plus, we only want to demote the holders if the request comes from + * a remote cluster node because local holder conflicts are resolved + * elsewhere. + * + * Note 2: if a remote node wants this glock in EX mode, lock_dlm will + * request that we set our state to UNLOCKED. Here we mock up a holder + * to make it look like someone wants the lock EX locally. Any SH + * and DF requests should be able to share the lock without demoting. + * + * Note 3: We only want to demote the demoteable holders when there + * are no more strong holders. The demoteable holders might as well + * keep the glock until the last strong holder is done with it. + */ + if (!find_first_strong_holder(gl)) { + if (state == LM_ST_UNLOCKED) + mock_gh.gh_state = LM_ST_EXCLUSIVE; + demote_incompat_holders(gl, &mock_gh); + } handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -2095,6 +2236,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; + if (test_bit(HIF_MAY_DEMOTE, &iflags)) + *p++ = 'D'; *p = 0; return buf; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 31a8f2f649b5..9012487da4c6 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) break; + if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) + continue; if (gh->gh_owner_pid == pid) goto out; } @@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(&gl->gl_lockref.lock); } +static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_lockref.lock); + set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); + spin_unlock(&gl->gl_lockref.lock); +} + +static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_lockref.lock); + clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); + spin_unlock(&gl->gl_lockref.lock); +} + extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0fe49770166e..58b7bac501e4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -252,6 +252,7 @@ struct gfs2_lkstats { enum { /* States */ + HIF_MAY_DEMOTE = 1, HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_WAIT = 10, }; -- cgit v1.2.3 From b924bdab7445946e2ed364a0e6e249d36f1f1158 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 11 Aug 2021 21:50:14 +0200 Subject: gfs2: Move the inode glock locking to gfs2_file_buffered_write So far, for buffered writes, we were taking the inode glock in gfs2_iomap_begin and dropping it in gfs2_iomap_end with the intention of not holding the inode glock while iomap_write_actor faults in user pages. It turns out that iomap_write_actor is called inside iomap_begin ... iomap_end, so the user pages were still faulted in while holding the inode glock and the locking code in iomap_begin / iomap_end was completely pointless. Move the locking into gfs2_file_buffered_write instead. We'll take care of the potential deadlocks due to faulting in user pages while holding a glock in a subsequent patch. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 60 +--------------------------------------------------------- fs/gfs2/file.c | 27 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 59 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5414c2c33580..7235d539e969 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -961,46 +961,6 @@ hole_found: goto out; } -static int gfs2_write_lock(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (error) - goto out_uninit; - if (&ip->i_inode == sdp->sd_rindex) { - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, &m_ip->i_gh); - if (error) - goto out_unlock; - } - return 0; - -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - -static void gfs2_write_unlock(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - - if (&ip->i_inode == sdp->sd_rindex) { - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - - gfs2_glock_dq_uninit(&m_ip->i_gh); - } - gfs2_glock_dq_uninit(&ip->i_gh); -} - static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, unsigned len) { @@ -1118,11 +1078,6 @@ out_qunlock: return ret; } -static inline bool gfs2_iomap_need_write_lock(unsigned flags) -{ - return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT); -} - static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) @@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->flags |= IOMAP_F_BUFFER_HEAD; trace_gfs2_iomap_start(ip, pos, length, flags); - if (gfs2_iomap_need_write_lock(flags)) { - ret = gfs2_write_lock(inode); - if (ret) - goto out; - } - ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); if (ret) goto out_unlock; @@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); out_unlock: - if (ret && gfs2_iomap_need_write_lock(flags)) - gfs2_write_unlock(inode); release_metapath(&mp); -out: trace_gfs2_iomap_end(ip, iomap, ret); return ret; } @@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, } if (unlikely(!written)) - goto out_unlock; + return 0; if (iomap->flags & IOMAP_F_SIZE_CHANGED) mark_inode_dirty(inode); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); - -out_unlock: - if (gfs2_iomap_need_write_lock(flags)) - gfs2_write_unlock(inode); return 0; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index da742b470f23..13282f57da37 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -880,13 +880,40 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *fro { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); ssize_t ret; + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + ret = gfs2_glock_nq(&ip->i_gh); + if (ret) + goto out_uninit; + + if (inode == sdp->sd_rindex) { + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (ret) + goto out_unlock; + } + current->backing_dev_info = inode_to_bdi(inode); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); current->backing_dev_info = NULL; if (ret > 0) iocb->ki_pos += ret; + + if (inode == sdp->sd_rindex) { + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + gfs2_glock_dq_uninit(&m_ip->i_gh); + } + +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); return ret; } -- cgit v1.2.3 From 1b223f7065bc7d89c4677c27381817cc95b117a8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 25 Aug 2021 21:54:18 +0200 Subject: gfs2: Eliminate ip->i_gh Now that gfs2_file_buffered_write is the only remaining user of ip->i_gh, we can move the glock holder to the stack (or rather, use the one we already have on the stack); there is no need for keeping the holder in the inode anymore. This is slightly complicated by the fact that we're using ip->i_gh for the statfs inode in gfs2_file_buffered_write as well. Writing to the statfs inode isn't very common, so allocate the statfs holder dynamically when needed. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 34 +++++++++++++++++++++------------- fs/gfs2/incore.h | 3 +-- 2 files changed, 22 insertions(+), 15 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 13282f57da37..8f37e4bab995 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -876,16 +876,25 @@ out_uninit: return written ? written : ret; } -static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *from) +static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from, + struct gfs2_holder *gh) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder *statfs_gh = NULL; ssize_t ret; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - ret = gfs2_glock_nq(&ip->i_gh); + if (inode == sdp->sd_rindex) { + statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS); + if (!statfs_gh) + return -ENOMEM; + } + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); + ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; @@ -893,7 +902,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *fro struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, &m_ip->i_gh); + GL_NOCACHE, statfs_gh); if (ret) goto out_unlock; } @@ -904,16 +913,15 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *fro if (ret > 0) iocb->ki_pos += ret; - if (inode == sdp->sd_rindex) { - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - - gfs2_glock_dq_uninit(&m_ip->i_gh); - } + if (inode == sdp->sd_rindex) + gfs2_glock_dq_uninit(statfs_gh); out_unlock: - gfs2_glock_dq(&ip->i_gh); + gfs2_glock_dq(gh); out_uninit: - gfs2_holder_uninit(&ip->i_gh); + gfs2_holder_uninit(gh); + if (statfs_gh) + kfree(statfs_gh); return ret; } @@ -968,7 +976,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out_unlock; iocb->ki_flags |= IOCB_DSYNC; - buffered = gfs2_file_buffered_write(iocb, from); + buffered = gfs2_file_buffered_write(iocb, from, &gh); if (unlikely(buffered <= 0)) { if (!ret) ret = buffered; @@ -989,7 +997,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!ret || ret2 > 0) ret += ret2; } else { - ret = gfs2_file_buffered_write(iocb, from); + ret = gfs2_file_buffered_write(iocb, from, &gh); if (likely(ret > 0)) ret = generic_write_sync(iocb, ret); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 58b7bac501e4..ca42d310fd4d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -387,9 +387,8 @@ struct gfs2_inode { u64 i_generation; u64 i_eattr; unsigned long i_flags; /* GIF_... */ - struct gfs2_glock *i_gl; /* Move into i_gh? */ + struct gfs2_glock *i_gl; struct gfs2_holder i_iopen_gh; - struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_qadata *i_qadata; /* quota allocation data */ struct gfs2_holder i_rgd_gh; struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ -- cgit v1.2.3 From 00bfe02f479688a67a29019d1228f1470e26f014 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 18 Oct 2021 19:21:44 +0200 Subject: gfs2: Fix mmap + page fault deadlocks for buffered I/O In the .read_iter and .write_iter file operations, we're accessing user-space memory while holding the inode glock. There is a possibility that the memory is mapped to the same file, in which case we'd recurse on the same glock. We could detect and work around this simple case of recursive locking, but more complex scenarios exist that involve multiple glocks, processes, and cluster nodes, and working around all of those cases isn't practical or even possible. Avoid these kinds of problems by disabling page faults while holding the inode glock. If a page fault would occur, we either end up with a partial read or write or with -EFAULT if nothing could be read or written. In either case, we know that we're not done with the operation, so we indicate that we're willing to give up the inode glock and then we fault in the missing pages. If that made us lose the inode glock, we return a partial read or write. Otherwise, we resume the operation. This locking problem was originally reported by Jan Kara. Linus came up with the idea of disabling page faults. Many thanks to Al Viro and Matthew Wilcox for their feedback. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 5 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 8f37e4bab995..d9126e3e6dd6 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -776,6 +776,36 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } +static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, + size_t *prev_count, + size_t *window_size) +{ + char __user *p = i->iov[0].iov_base + i->iov_offset; + size_t count = iov_iter_count(i); + int pages = 1; + + if (likely(!count)) + return false; + if (ret <= 0 && ret != -EFAULT) + return false; + if (!iter_is_iovec(i)) + return false; + + if (*prev_count != count || !*window_size) { + int pages, nr_dirtied; + + pages = min_t(int, BIO_MAX_VECS, + DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE)); + nr_dirtied = max(current->nr_dirtied_pause - + current->nr_dirtied, 1); + pages = min(pages, nr_dirtied); + } + + *prev_count = count; + *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); + return true; +} + static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct gfs2_holder *gh) { @@ -840,9 +870,17 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct gfs2_inode *ip; struct gfs2_holder gh; + size_t prev_count = 0, window_size = 0; size_t written = 0; ssize_t ret; + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + */ + if (iocb->ki_flags & IOCB_DIRECT) { ret = gfs2_file_direct_read(iocb, to, &gh); if (likely(ret != -ENOTBLK)) @@ -864,13 +902,34 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } ip = GFS2_I(iocb->ki_filp->f_mapping->host); gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); +retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; +retry_under_glock: + pagefault_disable(); ret = generic_file_read_iter(iocb, to); + pagefault_enable(); if (ret > 0) written += ret; - gfs2_glock_dq(&gh); + + if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(&gh); + leftover = fault_in_iov_iter_writeable(to, window_size); + gfs2_holder_disallow_demote(&gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(&gh)) { + if (written) + goto out_uninit; + goto retry; + } + goto retry_under_glock; + } + } + if (gfs2_holder_queued(&gh)) + gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); return written ? written : ret; @@ -885,8 +944,17 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder *statfs_gh = NULL; + size_t prev_count = 0, window_size = 0; + size_t read = 0; ssize_t ret; + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + */ + if (inode == sdp->sd_rindex) { statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS); if (!statfs_gh) @@ -894,10 +962,11 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, } gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); +retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; - +retry_under_glock: if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -908,21 +977,41 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, } current->backing_dev_info = inode_to_bdi(inode); + pagefault_disable(); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + pagefault_enable(); current->backing_dev_info = NULL; - if (ret > 0) + if (ret > 0) { iocb->ki_pos += ret; + read += ret; + } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); + if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(gh); + leftover = fault_in_iov_iter_readable(from, window_size); + gfs2_holder_disallow_demote(gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(gh)) { + if (read) + goto out_uninit; + goto retry; + } + goto retry_under_glock; + } + } out_unlock: - gfs2_glock_dq(gh); + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); - return ret; + return read ? read : ret; } /** -- cgit v1.2.3 From 4fdccaa0d184c202f98d73b24e3ec8eeee88ab8d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 24 Jul 2021 12:26:41 +0200 Subject: iomap: Add done_before argument to iomap_dio_rw Add a done_before argument to iomap_dio_rw that indicates how much of the request has already been transferred. When the request succeeds, we report that done_before additional bytes were tranferred. This is useful for finishing a request asynchronously when part of the request has already been completed synchronously. We'll use that to allow iomap_dio_rw to be used with page faults disabled: when a page fault occurs while submitting a request, we synchronously complete the part of the request that has already been submitted. The caller can then take care of the page fault and call iomap_dio_rw again for the rest of the request, passing in the number of bytes already tranferred. Signed-off-by: Andreas Gruenbacher Reviewed-by: Darrick J. Wong --- fs/btrfs/file.c | 5 +++-- fs/erofs/data.c | 2 +- fs/ext4/file.c | 5 +++-- fs/gfs2/file.c | 4 ++-- fs/iomap/direct-io.c | 19 ++++++++++++++++--- fs/xfs/xfs_file.c | 6 +++--- fs/zonefs/super.c | 4 ++-- include/linux/iomap.h | 4 ++-- 8 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f37211d3bb69..9d41b28c67ba 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1957,7 +1957,7 @@ relock: } dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0); + 0, 0); btrfs_inode_unlock(inode, ilock_flags); @@ -3658,7 +3658,8 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + 0, 0); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 9db829715652..16a41d0db55a 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!err) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0); + NULL, 0, 0); if (err < 0) return err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac0e11bbb445..b25c1f8f7c4f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ilock_shared) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); + (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, + 0); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d9126e3e6dd6..f772ee0fcae3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -822,7 +822,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, if (ret) goto out_uninit; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0); gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); @@ -856,7 +856,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, if (offset + len > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0); if (ret == -ENOTBLK) ret = 0; out: diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index a434fb7887b2..468dcbba45bc 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -31,6 +31,7 @@ struct iomap_dio { atomic_t ref; unsigned flags; int error; + size_t done_before; bool wait_for_completion; union { @@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); + if (ret > 0) + ret += dio->done_before; + kfree(dio); return ret; @@ -450,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, * may be pure data writes. In that case, we still need to do a full data sync * completion. * + * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL, + * __iomap_dio_rw can return a partial result if it encounters a non-resident + * page in @iter after preparing a transfer. In that case, the non-resident + * pages can be faulted in and the request resumed with @done_before set to the + * number of bytes previously transferred. The request will then complete with + * the correct total number of bytes transferred; this is essential for + * completing partial requests asynchronously. + * * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. */ struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags) + unsigned int dio_flags, size_t done_before) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -486,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->dops = dops; dio->error = 0; dio->flags = 0; + dio->done_before = done_before; dio->submit.iter = iter; dio->submit.waiter = current; @@ -652,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags) + unsigned int dio_flags, size_t done_before) { struct iomap_dio *dio; - dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7aa943edfc02..240eb932c014 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -259,7 +259,7 @@ xfs_file_dio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -569,7 +569,7 @@ xfs_file_dio_write_aligned( } trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0); + &xfs_dio_write_ops, 0, 0); out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -647,7 +647,7 @@ retry_exclusive: trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, flags); + &xfs_dio_write_ops, flags, 0); /* * Retry unaligned I/O with exclusive blocking semantics if the DIO diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ddc346a9df9b..6122c38ab44d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = zonefs_file_dio_append(iocb, from); else ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, - &zonefs_write_dio_ops, 0); + &zonefs_write_dio_ops, 0, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } file_accessed(iocb->ki_filp); ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, - &zonefs_read_dio_ops, 0); + &zonefs_read_dio_ops, 0, 0); } else { ret = generic_file_read_iter(iocb, to); if (ret == -EIO) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2a213b0d1e1f..829f2325ecba 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -339,10 +339,10 @@ struct iomap_dio_ops { ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags); + unsigned int dio_flags, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags); + unsigned int dio_flags, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); -- cgit v1.2.3 From b01b2d72da25c000aeb124bc78daf3fb998be2b6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 12 Jul 2021 13:40:38 +0200 Subject: gfs2: Fix mmap + page fault deadlocks for direct I/O Also disable page faults during direct I/O requests and implement a similar kind of retry logic as in the buffered I/O case. The retry logic in the direct I/O case differs from the buffered I/O case in the following way: direct I/O doesn't provide the kinds of consistency guarantees between concurrent reads and writes that buffered I/O provides, so once we lose the inode glock while faulting in user pages, we always resume the operation. We never need to return a partial read or write. This locking problem was originally reported by Jan Kara. Linus came up with the idea of disabling page faults. Many thanks to Al Viro and Matthew Wilcox for their feedback. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 12 deletions(-) (limited to 'fs/gfs2') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f772ee0fcae3..40e6501c02e5 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -811,22 +811,64 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, { struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - size_t count = iov_iter_count(to); + size_t prev_count = 0, window_size = 0; + size_t written = 0; ssize_t ret; - if (!count) + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + * + * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger + * physical as well as manual page faults, and we need to disable both + * kinds. + * + * For direct I/O, gfs2 takes the inode glock in deferred mode. This + * locking mode is compatible with other deferred holders, so multiple + * processes and nodes can do direct I/O to a file at the same time. + * There's no guarantee that reads or writes will be atomic. Any + * coordination among readers and writers needs to happen externally. + */ + + if (!iov_iter_count(to)) return 0; /* skip atime */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; +retry_under_glock: + pagefault_disable(); + to->nofault = true; + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, + IOMAP_DIO_PARTIAL, written); + to->nofault = false; + pagefault_enable(); + if (ret > 0) + written = ret; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0); - gfs2_glock_dq(gh); + if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(gh); + leftover = fault_in_iov_iter_writeable(to, window_size); + gfs2_holder_disallow_demote(gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(gh)) + goto retry; + goto retry_under_glock; + } + } + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); - return ret; + if (ret < 0) + return ret; + return written; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -835,10 +877,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - size_t len = iov_iter_count(from); - loff_t offset = iocb->ki_pos; + size_t prev_count = 0, window_size = 0; + size_t read = 0; ssize_t ret; + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + * + * For writes, iomap_dio_rw only triggers manual page faults, so we + * don't need to disable physical ones. + */ + /* * Deferred lock, even if its a write, since we do no allocation on * this path. All we need to change is the atime, and this lock mode @@ -848,22 +900,45 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, * VFS does. */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; - +retry_under_glock: /* Silently fall back to buffered I/O when writing beyond EOF */ - if (offset + len > i_size_read(&ip->i_inode)) + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0); + from->nofault = true; + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, + IOMAP_DIO_PARTIAL, read); + from->nofault = false; + if (ret == -ENOTBLK) ret = 0; + if (ret > 0) + read = ret; + + if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(gh); + leftover = fault_in_iov_iter_readable(from, window_size); + gfs2_holder_disallow_demote(gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(gh)) + goto retry; + goto retry_under_glock; + } + } out: - gfs2_glock_dq(gh); + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); - return ret; + if (ret < 0) + return ret; + return read; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -- cgit v1.2.3