summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/fsclient.c6
-rw-r--r--fs/afs/yfsclient.c2
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/transaction.c49
-rw-r--r--fs/btrfs/tree-log.c33
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/io_uring.c26
-rw-r--r--fs/lockd/host.c3
-rw-r--r--fs/locks.c5
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c5
-rw-r--r--fs/nfs/nfs4proc.c5
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c15
-rw-r--r--fs/xfs/scrub/btree.c11
-rw-r--r--fs/xfs/scrub/dabtree.c5
-rw-r--r--fs/xfs/xfs_discard.c8
-rw-r--r--fs/xfs/xfs_file.c27
20 files changed, 163 insertions, 52 deletions
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index ca08c83168f5..0b37867b5c20 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1515,8 +1515,8 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
xdr_encode_AFS_StoreStatus(&bp, attr);
- *bp++ = 0; /* position of start of write */
- *bp++ = 0;
+ *bp++ = htonl(attr->ia_size >> 32); /* position of start of write */
+ *bp++ = htonl((u32) attr->ia_size);
*bp++ = 0; /* size of write */
*bp++ = 0;
*bp++ = htonl(attr->ia_size >> 32); /* new file length */
@@ -1564,7 +1564,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
xdr_encode_AFS_StoreStatus(&bp, attr);
- *bp++ = 0; /* position of start of write */
+ *bp++ = htonl(attr->ia_size); /* position of start of write */
*bp++ = 0; /* size of write */
*bp++ = htonl(attr->ia_size); /* new file length */
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 5aa57929e8c2..6e97a42d24d1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1514,7 +1514,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vnode->fid);
bp = xdr_encode_YFS_StoreStatus(bp, attr);
- bp = xdr_encode_u64(bp, 0); /* position of start of write */
+ bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */
bp = xdr_encode_u64(bp, 0); /* size of write */
bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
yfs_check_req(call, bp);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d49694e6ae3..c5880329ae37 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6174,7 +6174,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
*
* This is overestimating in most cases.
*/
- qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
+ qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
spin_lock(&block_rsv->lock);
block_rsv->size = reserve_size;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index eb680b715dd6..e659d9d61107 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1922,8 +1922,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
root_level < cur_level) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1869ba8e5981..67a6f7d47402 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2430,8 +2430,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ kunmap(p_page);
}
__free_page(p_page);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acdad6d658f5..e4e665f422fc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
}
}
-static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
/*
* We use writeback_inodes_sb here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
*/
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ } else {
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+
+ /*
+ * Flush dellaloc for any root that is going to be snapshotted.
+ * This is done to avoid a corrupted version of files, in the
+ * snapshots, that had both buffered and direct IO writes (even
+ * if they were done sequentially) due to an unordered update of
+ * the inode's size on disk.
+ */
+ list_for_each_entry(pending, head, list) {
+ int ret;
+
+ ret = btrfs_start_delalloc_snapshot(pending->root);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
-static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
{
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ } else {
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+
+ /*
+ * Wait for any dellaloc that we started previously for the roots
+ * that are going to be snapshotted. This is to avoid a corrupted
+ * version of files in the snapshots that had both buffered and
+ * direct IO writes (even if they were done sequentially).
+ */
+ list_for_each_entry(pending, head, list)
+ btrfs_wait_ordered_extents(pending->root,
+ U64_MAX, 0, U64_MAX);
+ }
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2023,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(fs_info);
+ ret = btrfs_start_delalloc_flush(trans);
if (ret)
goto cleanup_transaction;
@@ -2039,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- btrfs_wait_delalloc_flush(fs_info);
+ btrfs_wait_delalloc_flush(trans);
btrfs_scrub_pause(fs_info);
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f06454a55e00..561884f60d35 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3578,9 +3578,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again. See the note for
+ * log_new_dir_dentries, if we're logging a directory recursively we
+ * won't be holding its i_mutex, which means we can modify the directory
+ * while we're logging it. If we remove an entry between our first
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (WARN_ON(ret != 0))
+ if (ret != 0)
goto done;
/*
@@ -4544,6 +4551,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
*size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller then the inode
+ * size stored in the btree, return the inode's i_size, so
+ * that we get a correct inode size after replaying the log
+ * when before a power failure we had a shrinking truncate
+ * followed by addition of a new name (rename / new hard link).
+ * Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a
+ * write that expands the inode's size and logging a new name
+ * immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
}
btrfs_release_path(path);
@@ -4705,15 +4725,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- ASSERT(len == i_size ||
- (len == fs_info->sectorsize &&
- btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE) ||
- (len < i_size && i_size < fs_info->sectorsize));
+ BTRFS_FILE_EXTENT_INLINE)
return 0;
- }
len = btrfs_file_extent_num_bytes(leaf, extent);
/* Last extent goes beyond i_size, no need to log a hole. */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9024eee889b9..db934ceae9c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6407,7 +6407,7 @@ static void btrfs_end_bio(struct bio *bio)
if (bio_op(bio) == REQ_OP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- else
+ else if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e3346628efe2..2d61ddda9bf5 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ceph_inode_info *ci = ceph_inode(inode);
+ kfree(ci->i_symlink);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -566,7 +567,6 @@ void ceph_destroy_inode(struct inode *inode)
}
}
- kfree(ci->i_symlink);
while ((n = rb_first(&ci->i_fragtree)) != NULL) {
frag = rb_entry(n, struct ceph_inode_frag, node);
rb_erase(n, &ci->i_fragtree);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6aaa30580a2b..bbdbd56cf2ac 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1022,6 +1022,8 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) {
+ ssize_t ret2;
+
/*
* Open-code file_start_write here to grab freeze protection,
* which will be released by another thread in
@@ -1036,7 +1038,19 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
SB_FREEZE_WRITE);
}
kiocb->ki_flags |= IOCB_WRITE;
- io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+
+ ret2 = call_write_iter(file, kiocb, &iter);
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ io_rw_done(kiocb, ret2);
+ } else {
+ /*
+ * If ->needs_lock is true, we're already in async
+ * context.
+ */
+ if (!s->needs_lock)
+ io_async_list_note(WRITE, req, iov_count);
+ ret = -EAGAIN;
+ }
}
out_free:
kfree(iovec);
@@ -1968,7 +1982,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return 0;
if (sig) {
- ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ &ksigmask, &sigsaved, sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, &ksigmask,
+ &sigsaved, sigsz);
+
if (ret)
return ret;
}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 93fb7cf0b92b..f0b5c987d6ae 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host)
WARN_ON_ONCE(host->h_server);
- if (refcount_dec_and_test(&host->h_count)) {
+ if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
WARN_ON_ONCE(!list_empty(&host->h_lockowners));
WARN_ON_ONCE(!list_empty(&host->h_granted));
WARN_ON_ONCE(!list_empty(&host->h_reclaim));
- mutex_lock(&nlm_host_mutex);
nlm_destroy_host_locked(host);
mutex_unlock(&nlm_host_mutex);
}
diff --git a/fs/locks.c b/fs/locks.c
index eaa1cfaf73b0..71d0c6c2aac5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1160,6 +1160,11 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
*/
error = -EDEADLK;
spin_lock(&blocked_lock_lock);
+ /*
+ * Ensure that we don't find any locks blocked on this
+ * request during deadlock detection.
+ */
+ __locks_wake_up_blocks(request);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
__locks_insert_block(fl, request,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index fb1cf1a4bda2..90d71fda65ce 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -453,7 +453,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
case XPRT_TRANSPORT_RDMA:
if (retrans == NFS_UNSPEC_RETRANS)
to->to_retries = NFS_DEF_TCP_RETRANS;
- if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index f9264e1922a2..6673d4ff5a2a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1289,6 +1289,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ int new_idx = hdr->pgio_mirror_idx;
int err;
trace_nfs4_pnfs_read(hdr, task->tk_status);
@@ -1307,7 +1308,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
case -NFS4ERR_RESET_TO_PNFS:
if (ff_layout_choose_best_ds_for_read(hdr->lseg,
hdr->pgio_mirror_idx + 1,
- &hdr->pgio_mirror_idx))
+ &new_idx))
goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
@@ -1320,7 +1321,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
return 0;
out_layouterror:
+ ff_layout_read_record_layoutstats_done(task, hdr);
ff_layout_send_layouterror(hdr->lseg);
+ hdr->pgio_mirror_idx = new_idx;
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4dbb0ee23432..741ff8c9c6ed 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2933,7 +2933,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
- nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ if (!opendata->cancelled)
+ nfs4_sequence_free_slot(&opendata->o_res.seq_res);
return ret;
}
@@ -6301,7 +6302,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->arg.seqid = seqid;
p->res.seqid = seqid;
p->lsp = lsp;
- refcount_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
p->l_ctx = nfs_get_lock_context(ctx);
@@ -6526,7 +6526,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->res.lock_seqid = p->arg.lock_seqid;
p->lsp = lsp;
p->server = server;
- refcount_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 48502cb9990f..4637ae1ae91c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1191,7 +1191,10 @@ xfs_iread_extents(
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
+ if (unlikely(level == 0)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
@@ -4249,9 +4252,13 @@ xfs_bmapi_write(
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4319,10 +4326,6 @@ xfs_bmapi_write(
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
- bma.tp = tp;
- bma.ip = ip;
- bma.total = total;
- bma.datatype = 0;
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
n = 0;
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 6f94d1f7322d..117910db51b8 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -415,8 +415,17 @@ xchk_btree_check_owner(
struct xfs_btree_cur *cur = bs->cur;
struct check_owner *co;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+ /*
+ * In theory, xfs_btree_get_block should only give us a null buffer
+ * pointer for the root of a root-in-inode btree type, but we need
+ * to check defensively here in case the cursor state is also screwed
+ * up.
+ */
+ if (bp == NULL) {
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
+ }
/*
* We want to cross-reference each btree block with the bnobt
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4bfdee..90527b094878 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -574,6 +574,11 @@ xchk_da_btree(
/* Drill another level deeper. */
blkno = be32_to_cpu(key->before);
level++;
+ if (level >= XFS_DA_NODE_MAXDEPTH) {
+ /* Too deep! */
+ xchk_da_set_corrupt(&ds, level - 1);
+ break;
+ }
ds.tree_level--;
error = xchk_da_btree_block(&ds, level, blkno);
if (error)
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07edafd81..9ee2a7d02e70 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -161,6 +161,14 @@ xfs_ioc_trim(
return -EPERM;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+
+ /*
+ * We haven't recovered the log, so we cannot use our bnobt-guided
+ * storage zapping commands.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return -EROFS;
+
if (copy_from_user(&range, urange, sizeof(range)))
return -EFAULT;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f2e2845eb76..a7ceae90110e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -529,18 +529,17 @@ xfs_file_dio_aio_write(
count = iov_iter_count(from);
/*
- * If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to take the exclusive lock
- * for other reasons in xfs_file_aio_write_checks.
+ * If we are doing unaligned IO, we can't allow any other overlapping IO
+ * in-flight at the same time or we risk data corruption. Wait for all
+ * other IO to drain before we submit. If the IO is aligned, demote the
+ * iolock if we had to take the exclusive lock in
+ * xfs_file_aio_write_checks() for other reasons.
*/
if (unaligned_io) {
- /* If we are going to wait for other DIO to finish, bail */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (atomic_read(&inode->i_dio_count))
- return -EAGAIN;
- } else {
- inode_dio_wait(inode);
- }
+ /* unaligned dio always waits, bail */
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_dio_wait(inode);
} else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
@@ -548,6 +547,14 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+
+ /*
+ * If unaligned, this is the only IO in-flight. If it has not yet
+ * completed, wait on it before we release the iolock to prevent
+ * subsequent overlapping IO.
+ */
+ if (ret == -EIOCBQUEUED && unaligned_io)
+ inode_dio_wait(inode);
out:
xfs_iunlock(ip, iolock);