summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c221
1 files changed, 144 insertions, 77 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 16fb7436043c..e372d5b9f6dc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -712,6 +712,7 @@ struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
const struct iovec *free_iovec;
struct iov_iter iter;
+ struct iov_iter_state iter_state;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -735,7 +736,6 @@ enum {
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
- REQ_F_DONT_REISSUE_BIT,
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
@@ -782,8 +782,6 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* don't attempt request reissue, see io_rw_reissue() */
- REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT),
/* supports async reads */
REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
/* supports async writes */
@@ -2444,13 +2442,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
- if (READ_ONCE(req->result) == -EAGAIN &&
- !(req->flags & REQ_F_DONT_REISSUE)) {
- req->iopoll_completed = 0;
- io_req_task_queue_reissue(req);
- continue;
- }
-
__io_cqring_fill_event(ctx, req->user_data, req->result,
io_put_rw_kbuf(req));
(*nr_events)++;
@@ -2613,8 +2604,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
if (!rw)
return !io_req_prep_async(req);
- /* may have left rw->iter inconsistent on -EIOCBQUEUED */
- iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
+ iov_iter_restore(&rw->iter, &rw->iter_state);
return true;
}
@@ -2714,10 +2704,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (unlikely(res != req->result)) {
- if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
- io_resubmit_prep(req))) {
- req_set_fail(req);
- req->flags |= REQ_F_DONT_REISSUE;
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ req->flags |= REQ_F_REISSUE;
+ return;
}
}
@@ -2843,7 +2832,8 @@ static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
return __io_file_supports_nowait(req->file, rw);
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ int rw)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2865,8 +2855,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(ret))
return ret;
- /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
- if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+ /*
+ * If the file is marked O_NONBLOCK, still allow retry for it if it
+ * supports async. Otherwise it's impossible to use O_NONBLOCK files
+ * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
+ */
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
req->flags |= REQ_F_NOWAIT;
ioprio = READ_ONCE(sqe->ioprio);
@@ -2931,7 +2926,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
- bool check_reissue = kiocb->ki_complete == io_complete_rw;
/* add previously done IO, if any */
if (io && io->bytes_done > 0) {
@@ -2943,19 +2937,27 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
- if (ret >= 0 && check_reissue)
+ if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
__io_complete_rw(req, ret, 0, issue_flags);
else
io_rw_done(kiocb, ret);
- if (check_reissue && (req->flags & REQ_F_REISSUE)) {
+ if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
io_req_task_queue_reissue(req);
} else {
+ unsigned int cflags = io_put_rw_kbuf(req);
+ struct io_ring_ctx *ctx = req->ctx;
+
req_set_fail(req);
- __io_req_complete(req, issue_flags, ret,
- io_put_rw_kbuf(req));
+ if (issue_flags & IO_URING_F_NONBLOCK) {
+ mutex_lock(&ctx->uring_lock);
+ __io_req_complete(req, issue_flags, ret, cflags);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ __io_req_complete(req, issue_flags, ret, cflags);
+ }
}
}
}
@@ -3263,12 +3265,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ if (!iov_iter_is_bvec(iter)) {
+ iov_iter_advance(iter, nr);
+ } else {
+ req->rw.len -= nr;
+ req->rw.addr += nr;
+ }
ret += nr;
if (nr != iovec.iov_len)
break;
- req->rw.len -= nr;
- req->rw.addr += nr;
- iov_iter_advance(iter, nr);
}
return ret;
@@ -3315,12 +3320,17 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
if (!req->async_data) {
+ struct io_async_rw *iorw;
+
if (io_alloc_async_data(req)) {
kfree(iovec);
return -ENOMEM;
}
io_req_map_rw(req, iovec, fast_iov, iter);
+ iorw = req->async_data;
+ /* we've copied and mapped the iter, ensure state is saved */
+ iov_iter_save_state(&iorw->iter, &iorw->iter_state);
}
return 0;
}
@@ -3339,6 +3349,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
+ iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
@@ -3346,7 +3357,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- return io_prep_rw(req, sqe);
+ return io_prep_rw(req, sqe, READ);
}
/*
@@ -3442,19 +3453,28 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- ssize_t io_size, ret, ret2;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ struct iov_iter_state __state, *state;
+ ssize_t ret, ret2;
if (rw) {
iter = &rw->iter;
+ state = &rw->iter_state;
+ /*
+ * We come here from an earlier attempt, restore our state to
+ * match in case it doesn't. It's cheap enough that we don't
+ * need to make this conditional.
+ */
+ iov_iter_restore(iter, state);
iovec = NULL;
} else {
ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ state = &__state;
+ iov_iter_save_state(iter, state);
}
- io_size = iov_iter_count(iter);
- req->result = io_size;
+ req->result = iov_iter_count(iter);
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
@@ -3468,7 +3488,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret ?: -EAGAIN;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3484,30 +3504,49 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
goto done;
- /* some cases will consume bytes even on error returns */
- iov_iter_reexpand(iter, iter->count + iter->truncated);
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == io_size || !force_nonblock ||
+ } else if (ret <= 0 || ret == req->result || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
+ /*
+ * Don't depend on the iter state matching what was consumed, or being
+ * untouched in case of error. Restore it and we'll advance it
+ * manually if we need to.
+ */
+ iov_iter_restore(iter, state);
+
ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
- /* now use our persistent iterator, if we aren't already */
- iter = &rw->iter;
+ /*
+ * Now use our persistent iterator and state, if we aren't already.
+ * We've restored and mapped the iter to match.
+ */
+ if (iter != &rw->iter) {
+ iter = &rw->iter;
+ state = &rw->iter_state;
+ }
do {
- io_size -= ret;
+ /*
+ * We end up here because of a partial read, either from
+ * above or inside this loop. Advance the iter by the bytes
+ * that were consumed.
+ */
+ iov_iter_advance(iter, ret);
+ if (!iov_iter_count(iter))
+ break;
rw->bytes_done += ret;
+ iov_iter_save_state(iter, state);
+
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
@@ -3525,7 +3564,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- } while (ret > 0 && ret < io_size);
+ iov_iter_restore(iter, state);
+ } while (ret > 0);
done:
kiocb_done(kiocb, ret, issue_flags);
out_free:
@@ -3539,7 +3579,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- return io_prep_rw(req, sqe);
+ return io_prep_rw(req, sqe, WRITE);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
@@ -3548,19 +3588,24 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- ssize_t ret, ret2, io_size;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ struct iov_iter_state __state, *state;
+ ssize_t ret, ret2;
if (rw) {
iter = &rw->iter;
+ state = &rw->iter_state;
+ iov_iter_restore(iter, state);
iovec = NULL;
} else {
ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ state = &__state;
+ iov_iter_save_state(iter, state);
}
- io_size = iov_iter_count(iter);
- req->result = io_size;
+ req->result = iov_iter_count(iter);
+ ret2 = 0;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
@@ -3577,7 +3622,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret))
goto out_free;
@@ -3624,9 +3669,9 @@ done:
kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
- /* some cases will consume bytes even on error returns */
- iov_iter_reexpand(iter, iter->count + iter->truncated);
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
+ iov_iter_restore(iter, state);
+ if (ret2 > 0)
+ iov_iter_advance(iter, ret2);
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
return ret ?: -EAGAIN;
}
@@ -7515,6 +7560,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7528,14 +7581,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -8284,11 +8329,27 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
+static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+ struct io_rsrc_node *node, void *rsrc)
+{
+ struct io_rsrc_put *prsrc;
+
+ prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+ if (!prsrc)
+ return -ENOMEM;
+
+ prsrc->tag = *io_get_tag_slot(data, idx);
+ prsrc->rsrc = rsrc;
+ list_add(&prsrc->list, &node->rsrc_list);
+ return 0;
+}
+
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
@@ -8304,9 +8365,22 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
- ret = -EBADF;
- if (file_slot->file_ptr)
- goto err;
+
+ if (file_slot->file_ptr) {
+ struct file *old_file;
+
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
+ goto err;
+
+ old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+ ctx->rsrc_node, old_file);
+ if (ret)
+ goto err;
+ file_slot->file_ptr = 0;
+ needs_switch = true;
+ }
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
@@ -8318,27 +8392,14 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
ret = 0;
err:
+ if (needs_switch)
+ io_rsrc_node_switch(ctx, ctx->file_data);
io_ring_submit_unlock(ctx, !force_nonblock);
if (ret)
fput(file);
return ret;
}
-static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
- struct io_rsrc_node *node, void *rsrc)
-{
- struct io_rsrc_put *prsrc;
-
- prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
- if (!prsrc)
- return -ENOMEM;
-
- prsrc->tag = *io_get_tag_slot(data, idx);
- prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &node->rsrc_list);
- return 0;
-}
-
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
@@ -10560,10 +10621,12 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
* ordering. Fine to drop uring_lock here, we hold
* a ref to the ctx.
*/
+ refcount_inc(&sqd->refs);
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
- tctx = sqd->thread->io_uring;
+ if (sqd->thread)
+ tctx = sqd->thread->io_uring;
}
} else {
tctx = current->io_uring;
@@ -10577,16 +10640,20 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
if (ret)
goto err;
- if (sqd)
+ if (sqd) {
mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
return 0;
err:
- if (sqd)
+ if (sqd) {
mutex_unlock(&sqd->lock);
+ io_put_sq_data(sqd);
+ }
return ret;
}