Diffstat (limited to 'drivers/block/ublk_drv.c')
-rw-r--r--  drivers/block/ublk_drv.c  208
1 file changed, 119 insertions(+), 89 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index f80e48ab7ba0..5b10612f7005 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -75,6 +75,7 @@ struct ublk_rq_data {
struct ublk_uring_cmd_pdu {
struct ublk_queue *ubq;
+ u16 tag;
};
/*
@@ -141,14 +142,13 @@ struct ublk_queue {
unsigned int max_io_sz;
bool force_abort;
bool timeout;
+ bool canceling;
unsigned short nr_io_ready; /* how many ios setup */
spinlock_t cancel_lock;
struct ublk_device *dev;
struct ublk_io ios[];
};
-#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ)
-
struct ublk_device {
struct gendisk *ub_disk;
@@ -179,11 +179,6 @@ struct ublk_device {
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
- /*
- * Our ubq->daemon may be killed without any notification, so
- * monitor each queue's daemon periodically
- */
- struct delayed_work monitor_work;
struct work_struct quiesce_work;
struct work_struct stop_work;
};
@@ -194,10 +189,11 @@ struct ublk_params_header {
__u32 types;
};
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
+
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
-
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_USER_COPY;
@@ -1123,8 +1119,6 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_requeue_request(rq, false);
else
blk_mq_end_request(rq, BLK_STS_IOERR);
-
- mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
}
static inline void __ublk_rq_task_work(struct request *req,
@@ -1245,12 +1239,12 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
io = &ubq->ios[rq->tag];
/*
* If the check pass, we know that this is a re-issued request aborted
- * previously in monitor_work because the ubq_daemon(cmd's task) is
+ * previously in cancel fn because the ubq_daemon(cmd's task) is
* PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
* because this ioucmd's io_uring context may be freed now if no inflight
* ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
*
- * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
+ * Note: cancel fn sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
* the tag). Then the request is re-started(allocating the tag) and we are here.
* Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
* guarantees that here is a re-issued request aborted previously.
@@ -1258,17 +1252,15 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
ublk_abort_io_cmds(ubq);
} else {
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->ubq = ubq;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+ io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
}
}
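A minimal user-space sketch of the branch above (flag value and helper names are assumptions, not the driver's): once the cancel path has marked an io slot UBLK_IO_FLAG_ABORTED and ended the request, a re-issue of that tag must be handled locally rather than pushed into the daemon's io_uring context, which may already be freed. Note that the normal branch no longer fills the pdu here; ublk_prep_cancel() now records ubq/tag when the uring_cmd is issued.

#include <stdbool.h>
#include <stdio.h>

#define UBLK_IO_FLAG_ABORTED 0x04u	/* assumed bit value, for illustration only */

struct io_slot {
	unsigned int flags;
};

/* Models the ublk_queue_cmd() decision: an ABORTED slot means the daemon (the
 * uring_cmd's task) died and this request was re-issued, so we must not queue
 * task work on its io_uring context. */
static void queue_cmd_model(struct io_slot *io, int tag)
{
	if (io->flags & UBLK_IO_FLAG_ABORTED)
		printf("tag %d: re-issued after abort, fail/requeue locally\n", tag);
	else
		printf("tag %d: hand the command back to the daemon task\n", tag);
}

int main(void)
{
	struct io_slot live = { .flags = 0 };
	struct io_slot dead = { .flags = UBLK_IO_FLAG_ABORTED };

	queue_cmd_model(&live, 0);
	queue_cmd_model(&dead, 1);
	return 0;
}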
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+ unsigned int nr_inflight = 0;
+ int i;
if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
if (!ubq->timeout) {
@@ -1279,6 +1271,29 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
return BLK_EH_DONE;
}
+ if (!ubq_daemon_is_dying(ubq))
+ return BLK_EH_RESET_TIMER;
+
+ for (i = 0; i < ubq->q_depth; i++) {
+ struct ublk_io *io = &ubq->ios[i];
+
+ if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+ nr_inflight++;
+ }
+
+ /* cancelable uring_cmd can't help us if all commands are in-flight */
+ if (nr_inflight == ubq->q_depth) {
+ struct ublk_device *ub = ubq->dev;
+
+ if (ublk_abort_requests(ub, ubq)) {
+ if (ublk_can_use_recovery(ub))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
+ }
+ return BLK_EH_DONE;
+ }
+
return BLK_EH_RESET_TIMER;
}
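A hedged user-space sketch of the escalation rule added to ublk_timeout() (flag value assumed): a slot without UBLK_IO_FLAG_ACTIVE is currently owned by the dying daemon, i.e. in flight, so it has no fetched uring_cmd for io_uring's cancel machinery to invoke. Only when every slot is in flight does the timeout handler have to abort the queue itself and schedule the quiesce/stop work.

#include <stdbool.h>

#define UBLK_IO_FLAG_ACTIVE 0x01u	/* assumed bit value, for illustration only */

struct io_slot {
	unsigned int flags;
};

/* Returns true when the timeout handler must take over: the daemon is dying
 * and no slot still holds a cancelable (ACTIVE) uring_cmd. */
static bool timeout_must_escalate(const struct io_slot *ios, int q_depth,
				  bool daemon_dying)
{
	int i, nr_inflight = 0;

	if (!daemon_dying)
		return false;		/* keep resetting the timer */

	for (i = 0; i < q_depth; i++)
		if (!(ios[i].flags & UBLK_IO_FLAG_ACTIVE))
			nr_inflight++;

	return nr_inflight == q_depth;	/* no cancelable uring_cmd can help */
}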
@@ -1308,7 +1323,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(bd->rq);
- if (unlikely(ubq_daemon_is_dying(ubq))) {
+ if (unlikely(ubq->canceling)) {
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
@@ -1416,9 +1431,9 @@ static void ublk_commit_completion(struct ublk_device *ub,
}
/*
- * When ->ubq_daemon is exiting, either new request is ended immediately,
- * or any queued io command is drained, so it is safe to abort queue
- * lockless
+ * Called from the ubq_daemon context via the cancel fn, while the ublk
+ * blk-mq queue is quiesced, so we run exclusively with respect to both
+ * blk-mq and the ubq_daemon context; everything is serialized.
*/
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
@@ -1441,10 +1456,17 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
}
}
-static bool ublk_abort_requests(struct ublk_device *ub)
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
{
struct gendisk *disk;
- int i;
+
+ spin_lock(&ubq->cancel_lock);
+ if (ubq->canceling) {
+ spin_unlock(&ubq->cancel_lock);
+ return false;
+ }
+ ubq->canceling = true;
+ spin_unlock(&ubq->cancel_lock);
spin_lock(&ub->lock);
disk = ub->ub_disk;
@@ -1458,53 +1480,69 @@ static bool ublk_abort_requests(struct ublk_device *ub)
/* Now we are serialized with ublk_queue_rq() */
blk_mq_quiesce_queue(disk->queue);
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
- if (ubq_daemon_is_dying(ubq)) {
- /* abort queue is for making forward progress */
- ublk_abort_queue(ub, ubq);
- }
- }
+ /* abort queue is for making forward progress */
+ ublk_abort_queue(ub, ubq);
blk_mq_unquiesce_queue(disk->queue);
put_device(disk_to_dev(disk));
return true;
}
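A sketch of the first-caller-wins claim on ubq->canceling taken above, with a POSIX mutex standing in for the kernel spinlock (names are illustrative): both the timeout path and the uring_cmd cancel fn may call ublk_abort_requests(), but only the caller that flips the flag proceeds to quiesce the queue and abort requests, and only it gets true back so the quiesce/stop work is scheduled once.

#include <pthread.h>
#include <stdbool.h>

struct queue_model {
	pthread_mutex_t cancel_lock;	/* stands in for spin_lock(&ubq->cancel_lock) */
	bool canceling;			/* also checked locklessly by ublk_queue_rq() */
};

/* Returns true only for the caller that actually claimed the cancel; later
 * callers see canceling == true and back off. */
static bool claim_cancel(struct queue_model *q)
{
	bool claimed = false;

	pthread_mutex_lock(&q->cancel_lock);
	if (!q->canceling) {
		q->canceling = true;
		claimed = true;
	}
	pthread_mutex_unlock(&q->cancel_lock);
	return claimed;
}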
-static void ublk_daemon_monitor_work(struct work_struct *work)
+static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
+ unsigned int issue_flags)
{
- struct ublk_device *ub =
- container_of(work, struct ublk_device, monitor_work.work);
- int i;
+ bool done;
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
+ if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+ return;
- if (ubq_daemon_is_dying(ubq))
- goto found;
- }
- return;
+ spin_lock(&ubq->cancel_lock);
+ done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
+ if (!done)
+ io->flags |= UBLK_IO_FLAG_CANCELED;
+ spin_unlock(&ubq->cancel_lock);
+
+ if (!done)
+ io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
+}
+
+/*
+ * The ublk char device can't be closed while the cancel fn runs, so both
+ * the ublk device and the queue are guaranteed to be live
+ */
+static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->ubq;
+ struct task_struct *task;
+ struct ublk_device *ub;
+ bool need_schedule;
+ struct ublk_io *io;
-found:
- if (!ublk_abort_requests(ub))
+ if (WARN_ON_ONCE(!ubq))
return;
- if (ublk_can_use_recovery(ub))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
+ if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
+ return;
- /*
- * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
- * after ublk_remove() or __ublk_quiesce_dev() is started.
- *
- * No need ub->mutex, monitor work are canceled after state is marked
- * as not LIVE, so new state is observed reliably.
- */
- if (ub->dev_info.state == UBLK_S_DEV_LIVE)
- schedule_delayed_work(&ub->monitor_work,
- UBLK_DAEMON_MONITOR_PERIOD);
+ task = io_uring_cmd_get_task(cmd);
+ if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
+ return;
+
+ ub = ubq->dev;
+ need_schedule = ublk_abort_requests(ub, ubq);
+
+ io = &ubq->ios[pdu->tag];
+ WARN_ON_ONCE(io->cmd != cmd);
+ ublk_cancel_cmd(ubq, io, issue_flags);
+
+ if (need_schedule) {
+ if (ublk_can_use_recovery(ub))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
+ }
}
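Along the same lines, a sketch of the per-command rule enforced by ublk_cancel_cmd() (flag values assumed): the cancel fn and ublk_cancel_queue() can both target the same io, but UBLK_IO_FLAG_CANCELED, set under cancel_lock, ensures io_uring_cmd_done(UBLK_IO_RES_ABORT) is issued at most once, and only for commands that are still ACTIVE, i.e. actually sitting in the daemon's ring.

#include <pthread.h>
#include <stdbool.h>

#define UBLK_IO_FLAG_ACTIVE   0x01u	/* assumed bit values, for illustration only */
#define UBLK_IO_FLAG_CANCELED 0x80u

struct io_model {
	unsigned int flags;
};

/* Returns true if this caller should complete the uring_cmd with
 * UBLK_IO_RES_ABORT; the losing caller finds CANCELED already set. */
static bool cancel_cmd_once(pthread_mutex_t *cancel_lock, struct io_model *io)
{
	bool do_complete = false;

	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
		return false;		/* nothing fetched, nothing to cancel */

	pthread_mutex_lock(cancel_lock);
	if (!(io->flags & UBLK_IO_FLAG_CANCELED)) {
		io->flags |= UBLK_IO_FLAG_CANCELED;
		do_complete = true;
	}
	pthread_mutex_unlock(cancel_lock);
	return do_complete;
}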
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1516,24 +1554,8 @@ static void ublk_cancel_queue(struct ublk_queue *ubq)
{
int i;
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
-
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
- bool done;
-
- spin_lock(&ubq->cancel_lock);
- done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
- if (!done)
- io->flags |= UBLK_IO_FLAG_CANCELED;
- spin_unlock(&ubq->cancel_lock);
-
- if (!done)
- io_uring_cmd_done(io->cmd,
- UBLK_IO_RES_ABORT, 0,
- IO_URING_F_UNLOCKED);
- }
- }
+ for (i = 0; i < ubq->q_depth; i++)
+ ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -1580,15 +1602,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
blk_mq_quiesce_queue(ub->ub_disk->queue);
ublk_wait_tagset_rqs_idle(ub);
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
- /* we are going to release task_struct of ubq_daemon and resets
- * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
- * Besides, monitor_work is not necessary in QUIESCED state since we have
- * already scheduled quiesce_work and quiesced all ubqs.
- *
- * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
- * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
- */
- cancel_delayed_work_sync(&ub->monitor_work);
}
static void ublk_quiesce_work_fn(struct work_struct *work)
@@ -1651,7 +1664,6 @@ static void ublk_stop_dev(struct ublk_device *ub)
unlock:
mutex_unlock(&ub->mutex);
ublk_cancel_dev(ub);
- cancel_delayed_work_sync(&ub->monitor_work);
}
/* device can only be started after all IOs are ready */
@@ -1702,6 +1714,21 @@ static inline void ublk_fill_io_cmd(struct ublk_io *io,
io->addr = buf_addr;
}
+static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct ublk_queue *ubq, unsigned int tag)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ /*
+ * Safe to refer to @ubq since the ublk_queue won't go away until its
+ * commands are completed
+ */
+ pdu->ubq = ubq;
+ pdu->tag = tag;
+ io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
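A sketch of the per-command private data set up by ublk_prep_cancel(): the pdu records which queue and tag the uring_cmd belongs to before the command is marked cancelable, so the cancel fn can recover the exact struct ublk_io from ubq->ios[pdu->tag] alone. The 32-byte bound mirrors io_uring's per-command pdu area but is stated here as an assumption; types and names below are illustrative.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct ublk_queue_model;		/* opaque stand-in for struct ublk_queue */

struct uring_cmd_pdu_model {
	struct ublk_queue_model *ubq;	/* queue the command was issued against */
	uint16_t tag;			/* index into ubq->ios[] */
};

static_assert(sizeof(struct uring_cmd_pdu_model) <= 32,
	      "pdu must fit the io_uring per-command scratch area (assumed 32 bytes)");

static void prep_cancel_model(struct uring_cmd_pdu_model *pdu,
			      struct ublk_queue_model *ubq, uint16_t tag)
{
	/* Safe to cache ubq: the queue stays alive until its commands complete. */
	pdu->ubq = ubq;
	pdu->tag = tag;
	/* the real driver then calls io_uring_cmd_mark_cancelable(cmd, issue_flags) */
}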
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1817,6 +1844,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
default:
goto out;
}
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
return -EIOCBQUEUED;
out:
@@ -1884,6 +1912,11 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
+ ublk_uring_cmd_cancel_fn(cmd, issue_flags);
+ return 0;
+ }
+
/* well-implemented server won't run into unlocked */
if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
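A sketch of the dispatch order added above, with stand-in flag values (the real IO_URING_F_* constants are not reproduced here): commands previously marked cancelable are re-presented to the same ->uring_cmd() handler with IO_URING_F_CANCEL set, so that case is peeled off and routed to the cancel fn, forwarding the caller's issue_flags, before any normal command processing runs.

#include <stdbool.h>

#define F_CANCEL 0x1u			/* stand-in for IO_URING_F_CANCEL */

struct cmd_model {
	int tag;
};

static void cancel_fn_model(struct cmd_model *cmd, unsigned int issue_flags)
{
	/* forward issue_flags so the completion side knows the calling context */
	(void)cmd; (void)issue_flags;
}

static int handle_cmd_model(struct cmd_model *cmd, unsigned int issue_flags)
{
	(void)cmd; (void)issue_flags;
	return 0;
}

/* Mirrors ublk_ch_uring_cmd(): cancellation is detected before the handler
 * even looks at the command payload. */
static int uring_cmd_model(struct cmd_model *cmd, unsigned int issue_flags)
{
	if (issue_flags & F_CANCEL) {
		cancel_fn_model(cmd, issue_flags);
		return 0;
	}
	return handle_cmd_model(cmd, issue_flags);
}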
@@ -2215,8 +2248,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
- schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
-
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
test_bit(UB_STATE_USED, &ub->state)) {
@@ -2393,7 +2424,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
spin_lock_init(&ub->lock);
INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
- INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2647,6 +2677,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
ubq->timeout = false;
+ ubq->canceling = false;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -2732,7 +2763,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
__func__, header->dev_id);
blk_mq_kick_requeue_list(ub->ub_disk->queue);
ub->dev_info.state = UBLK_S_DEV_LIVE;
- schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);