diff options
Diffstat (limited to 'drivers/vhost')
-rw-r--r-- | drivers/vhost/Kconfig | 45 | ||||
-rw-r--r-- | drivers/vhost/Kconfig.vringh | 6 | ||||
-rw-r--r-- | drivers/vhost/Makefile | 6 | ||||
-rw-r--r-- | drivers/vhost/iotlb.c | 177 | ||||
-rw-r--r-- | drivers/vhost/net.c | 5 | ||||
-rw-r--r-- | drivers/vhost/scsi.c | 2 | ||||
-rw-r--r-- | drivers/vhost/vdpa.c | 883 | ||||
-rw-r--r-- | drivers/vhost/vhost.c | 233 | ||||
-rw-r--r-- | drivers/vhost/vhost.h | 45 | ||||
-rw-r--r-- | drivers/vhost/vringh.c | 421 | ||||
-rw-r--r-- | drivers/vhost/vsock.c | 2 |
11 files changed, 1603 insertions, 222 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 3d03ccbd1adc..362b832f5338 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,4 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only +config VHOST_IOTLB + tristate + help + Generic IOTLB implementation for vhost and vringh. + +config VHOST_RING + tristate + select VHOST_IOTLB + help + This option is selected by any driver which needs to access + the host side of a virtio ring. + +config VHOST + tristate + select VHOST_IOTLB + help + This option is selected by any driver which needs to access + the core of vhost. + +menuconfig VHOST_MENU + bool "VHOST drivers" + default y + +if VHOST_MENU + config VHOST_NET tristate "Host kernel accelerator for virtio net" depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP) @@ -23,8 +48,8 @@ config VHOST_SCSI config VHOST_VSOCK tristate "vhost virtio-vsock driver" depends on VSOCKETS && EVENTFD - select VIRTIO_VSOCKETS_COMMON select VHOST + select VIRTIO_VSOCKETS_COMMON default n ---help--- This kernel module can be loaded in the host kernel to provide AF_VSOCK @@ -34,11 +59,17 @@ config VHOST_VSOCK To compile this driver as a module, choose M here: the module will be called vhost_vsock. -config VHOST - tristate - ---help--- - This option is selected by any driver which needs to access - the core of vhost. +config VHOST_VDPA + tristate "Vhost driver for vDPA-based backend" + depends on EVENTFD + select VHOST + select VDPA + help + This kernel module can be loaded in host kernel to accelerate + guest virtio devices with the vDPA-based backends. + + To compile this driver as a module, choose M here: the module + will be called vhost_vdpa. config VHOST_CROSS_ENDIAN_LEGACY bool "Cross-endian support for vhost" @@ -54,3 +85,5 @@ config VHOST_CROSS_ENDIAN_LEGACY adds some overhead, it is disabled by default. If unsure, say "N". + +endif diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh deleted file mode 100644 index c1fe36a9b8d4..000000000000 --- a/drivers/vhost/Kconfig.vringh +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config VHOST_RING - tristate - ---help--- - This option is selected by any driver which needs to access - the host side of a virtio ring. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 6c6df24f770c..f3e1897cce85 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -10,4 +10,10 @@ vhost_vsock-y := vsock.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o +vhost_vdpa-y := vdpa.o + obj-$(CONFIG_VHOST) += vhost.o + +obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o +vhost_iotlb-y := iotlb.o diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c new file mode 100644 index 000000000000..1f0ca6e44410 --- /dev/null +++ b/drivers/vhost/iotlb.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2020 Red Hat, Inc. + * Author: Jason Wang <jasowang@redhat.com> + * + * IOTLB implementation for vhost. + */ +#include <linux/slab.h> +#include <linux/vhost_iotlb.h> +#include <linux/module.h> + +#define MOD_VERSION "0.1" +#define MOD_DESC "VHOST IOTLB" +#define MOD_AUTHOR "Jason Wang <jasowang@redhat.com>" +#define MOD_LICENSE "GPL v2" + +#define START(map) ((map)->start) +#define LAST(map) ((map)->last) + +INTERVAL_TREE_DEFINE(struct vhost_iotlb_map, + rb, __u64, __subtree_last, + START, LAST, static inline, vhost_iotlb_itree); + +/** + * vhost_iotlb_map_free - remove a map node and free it + * @iotlb: the IOTLB + * @map: the map that want to be remove and freed + */ +void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, + struct vhost_iotlb_map *map) +{ + vhost_iotlb_itree_remove(map, &iotlb->root); + list_del(&map->link); + kfree(map); + iotlb->nmaps--; +} +EXPORT_SYMBOL_GPL(vhost_iotlb_map_free); + +/** + * vhost_iotlb_add_range - add a new range to vhost IOTLB + * @iotlb: the IOTLB + * @start: start of the IOVA range + * @last: last of IOVA range + * @addr: the address that is mapped to @start + * @perm: access permission of this range + * + * Returns an error last is smaller than start or memory allocation + * fails + */ +int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, + u64 start, u64 last, + u64 addr, unsigned int perm) +{ + struct vhost_iotlb_map *map; + + if (last < start) + return -EFAULT; + + if (iotlb->limit && + iotlb->nmaps == iotlb->limit && + iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) { + map = list_first_entry(&iotlb->list, typeof(*map), link); + vhost_iotlb_map_free(iotlb, map); + } + + map = kmalloc(sizeof(*map), GFP_ATOMIC); + if (!map) + return -ENOMEM; + + map->start = start; + map->size = last - start + 1; + map->last = last; + map->addr = addr; + map->perm = perm; + + iotlb->nmaps++; + vhost_iotlb_itree_insert(map, &iotlb->root); + + INIT_LIST_HEAD(&map->link); + list_add_tail(&map->link, &iotlb->list); + + return 0; +} +EXPORT_SYMBOL_GPL(vhost_iotlb_add_range); + +/** + * vring_iotlb_del_range - delete overlapped ranges from vhost IOTLB + * @iotlb: the IOTLB + * @start: start of the IOVA range + * @last: last of IOVA range + */ +void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last) +{ + struct vhost_iotlb_map *map; + + while ((map = vhost_iotlb_itree_iter_first(&iotlb->root, + start, last))) + vhost_iotlb_map_free(iotlb, map); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_del_range); + +/** + * vhost_iotlb_alloc - add a new vhost IOTLB + * @limit: maximum number of IOTLB entries + * @flags: VHOST_IOTLB_FLAG_XXX + * + * Returns an error is memory allocation fails + */ +struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags) +{ + struct vhost_iotlb *iotlb = kzalloc(sizeof(*iotlb), GFP_KERNEL); + + if (!iotlb) + return NULL; + + iotlb->root = RB_ROOT_CACHED; + iotlb->limit = limit; + iotlb->nmaps = 0; + iotlb->flags = flags; + INIT_LIST_HEAD(&iotlb->list); + + return iotlb; +} +EXPORT_SYMBOL_GPL(vhost_iotlb_alloc); + +/** + * vhost_iotlb_reset - reset vhost IOTLB (free all IOTLB entries) + * @iotlb: the IOTLB to be reset + */ +void vhost_iotlb_reset(struct vhost_iotlb *iotlb) +{ + vhost_iotlb_del_range(iotlb, 0ULL, 0ULL - 1); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_reset); + +/** + * vhost_iotlb_free - reset and free vhost IOTLB + * @iotlb: the IOTLB to be freed + */ +void vhost_iotlb_free(struct vhost_iotlb *iotlb) +{ + if (iotlb) { + vhost_iotlb_reset(iotlb); + kfree(iotlb); + } +} +EXPORT_SYMBOL_GPL(vhost_iotlb_free); + +/** + * vhost_iotlb_itree_first - return the first overlapped range + * @iotlb: the IOTLB + * @start: start of IOVA range + * @end: end of IOVA range + */ +struct vhost_iotlb_map * +vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last) +{ + return vhost_iotlb_itree_iter_first(&iotlb->root, start, last); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_itree_first); + +/** + * vhost_iotlb_itree_first - return the next overlapped range + * @iotlb: the IOTLB + * @start: start of IOVA range + * @end: end of IOVA range + */ +struct vhost_iotlb_map * +vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last) +{ + return vhost_iotlb_itree_iter_next(map, start, last); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_itree_next); + +MODULE_VERSION(MOD_VERSION); +MODULE_DESCRIPTION(MOD_DESC); +MODULE_AUTHOR(MOD_AUTHOR); +MODULE_LICENSE(MOD_LICENSE); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 18e205eeb9af..87469d67ede8 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1324,7 +1324,8 @@ static int vhost_net_open(struct inode *inode, struct file *f) } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, - VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT); + VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, + NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev); @@ -1586,7 +1587,7 @@ static long vhost_net_reset_owner(struct vhost_net *n) struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; - struct vhost_umem *umem; + struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 0b949a14bce3..7653667a8cdc 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1628,7 +1628,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; } vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV, - VHOST_SCSI_WEIGHT, 0); + VHOST_SCSI_WEIGHT, 0, NULL); vhost_scsi_init_inflight(vs, NULL); diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c new file mode 100644 index 000000000000..421f02a8530a --- /dev/null +++ b/drivers/vhost/vdpa.c @@ -0,0 +1,883 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018-2020 Intel Corporation. + * Copyright (C) 2020 Red Hat, Inc. + * + * Author: Tiwei Bie <tiwei.bie@intel.com> + * Jason Wang <jasowang@redhat.com> + * + * Thanks Michael S. Tsirkin for the valuable comments and + * suggestions. And thanks to Cunming Liang and Zhihong Wang for all + * their supports. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/uuid.h> +#include <linux/vdpa.h> +#include <linux/nospec.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> + +#include "vhost.h" + +enum { + VHOST_VDPA_FEATURES = + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1ULL << VIRTIO_F_ANY_LAYOUT) | + (1ULL << VIRTIO_F_VERSION_1) | + (1ULL << VIRTIO_F_IOMMU_PLATFORM) | + (1ULL << VIRTIO_F_RING_PACKED) | + (1ULL << VIRTIO_F_ORDER_PLATFORM) | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX), + + VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES | + (1ULL << VIRTIO_NET_F_CSUM) | + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | + (1ULL << VIRTIO_NET_F_MTU) | + (1ULL << VIRTIO_NET_F_MAC) | + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | + (1ULL << VIRTIO_NET_F_GUEST_ECN) | + (1ULL << VIRTIO_NET_F_GUEST_UFO) | + (1ULL << VIRTIO_NET_F_HOST_TSO4) | + (1ULL << VIRTIO_NET_F_HOST_TSO6) | + (1ULL << VIRTIO_NET_F_HOST_ECN) | + (1ULL << VIRTIO_NET_F_HOST_UFO) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_NET_F_STATUS) | + (1ULL << VIRTIO_NET_F_SPEED_DUPLEX), +}; + +/* Currently, only network backend w/o multiqueue is supported. */ +#define VHOST_VDPA_VQ_MAX 2 + +#define VHOST_VDPA_DEV_MAX (1U << MINORBITS) + +struct vhost_vdpa { + struct vhost_dev vdev; + struct iommu_domain *domain; + struct vhost_virtqueue *vqs; + struct completion completion; + struct vdpa_device *vdpa; + struct device dev; + struct cdev cdev; + atomic_t opened; + int nvqs; + int virtio_id; + int minor; +}; + +static DEFINE_IDA(vhost_vdpa_ida); + +static dev_t vhost_vdpa_major; + +static const u64 vhost_vdpa_features[] = { + [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES, +}; + +static void handle_vq_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev); + const struct vdpa_config_ops *ops = v->vdpa->config; + + ops->kick_vq(v->vdpa, vq - v->vqs); +} + +static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) +{ + struct vhost_virtqueue *vq = private; + struct eventfd_ctx *call_ctx = vq->call_ctx; + + if (call_ctx) + eventfd_signal(call_ctx, 1); + + return IRQ_HANDLED; +} + +static void vhost_vdpa_reset(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + ops->set_status(vdpa, 0); +} + +static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u32 device_id; + + device_id = ops->get_device_id(vdpa); + + if (copy_to_user(argp, &device_id, sizeof(device_id))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u8 status; + + status = ops->get_status(vdpa); + + if (copy_to_user(statusp, &status, sizeof(status))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u8 status; + + if (copy_from_user(&status, statusp, sizeof(status))) + return -EFAULT; + + /* + * Userspace shouldn't remove status bits unless reset the + * status to 0. + */ + if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) + return -EINVAL; + + ops->set_status(vdpa, status); + + return 0; +} + +static int vhost_vdpa_config_validate(struct vhost_vdpa *v, + struct vhost_vdpa_config *c) +{ + long size = 0; + + switch (v->virtio_id) { + case VIRTIO_ID_NET: + size = sizeof(struct virtio_net_config); + break; + } + + if (c->len == 0) + return -EINVAL; + + if (c->len > size - c->off) + return -E2BIG; + + return 0; +} + +static long vhost_vdpa_get_config(struct vhost_vdpa *v, + struct vhost_vdpa_config __user *c) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa_config config; + unsigned long size = offsetof(struct vhost_vdpa_config, buf); + u8 *buf; + + if (copy_from_user(&config, c, size)) + return -EFAULT; + if (vhost_vdpa_config_validate(v, &config)) + return -EINVAL; + buf = kvzalloc(config.len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ops->get_config(vdpa, config.off, buf, config.len); + + if (copy_to_user(c->buf, buf, config.len)) { + kvfree(buf); + return -EFAULT; + } + + kvfree(buf); + return 0; +} + +static long vhost_vdpa_set_config(struct vhost_vdpa *v, + struct vhost_vdpa_config __user *c) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa_config config; + unsigned long size = offsetof(struct vhost_vdpa_config, buf); + u8 *buf; + + if (copy_from_user(&config, c, size)) + return -EFAULT; + if (vhost_vdpa_config_validate(v, &config)) + return -EINVAL; + buf = kvzalloc(config.len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, c->buf, config.len)) { + kvfree(buf); + return -EFAULT; + } + + ops->set_config(vdpa, config.off, buf, config.len); + + kvfree(buf); + return 0; +} + +static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u64 features; + + features = ops->get_features(vdpa); + features &= vhost_vdpa_features[v->virtio_id]; + + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u64 features; + + /* + * It's not allowed to change the features after they have + * been negotiated. + */ + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK) + return -EBUSY; + + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + + if (features & ~vhost_vdpa_features[v->virtio_id]) + return -EINVAL; + + if (ops->set_features(vdpa, features)) + return -EINVAL; + + return 0; +} + +static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u16 num; + + num = ops->get_vq_num_max(vdpa); + + if (copy_to_user(argp, &num, sizeof(num))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, + void __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vdpa_callback cb; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + u8 status; + u32 idx; + long r; + + r = get_user(idx, (u32 __user *)argp); + if (r < 0) + return r; + + if (idx >= v->nvqs) + return -ENOBUFS; + + idx = array_index_nospec(idx, v->nvqs); + vq = &v->vqs[idx]; + + status = ops->get_status(vdpa); + + if (cmd == VHOST_VDPA_SET_VRING_ENABLE) { + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + ops->set_vq_ready(vdpa, idx, s.num); + return 0; + } + + if (cmd == VHOST_GET_VRING_BASE) + vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx); + + r = vhost_vring_ioctl(&v->vdev, cmd, argp); + if (r) + return r; + + switch (cmd) { + case VHOST_SET_VRING_ADDR: + if (ops->set_vq_address(vdpa, idx, + (u64)(uintptr_t)vq->desc, + (u64)(uintptr_t)vq->avail, + (u64)(uintptr_t)vq->used)) + r = -EINVAL; + break; + + case VHOST_SET_VRING_BASE: + if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx)) + r = -EINVAL; + break; + + case VHOST_SET_VRING_CALL: + if (vq->call_ctx) { + cb.callback = vhost_vdpa_virtqueue_cb; + cb.private = vq; + } else { + cb.callback = NULL; + cb.private = NULL; + } + ops->set_vq_cb(vdpa, idx, &cb); + break; + + case VHOST_SET_VRING_NUM: + ops->set_vq_num(vdpa, idx, vq->num); + break; + } + + return r; +} + +static long vhost_vdpa_unlocked_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vhost_vdpa *v = filep->private_data; + struct vhost_dev *d = &v->vdev; + void __user *argp = (void __user *)arg; + long r; + + mutex_lock(&d->mutex); + + switch (cmd) { + case VHOST_VDPA_GET_DEVICE_ID: + r = vhost_vdpa_get_device_id(v, argp); + break; + case VHOST_VDPA_GET_STATUS: + r = vhost_vdpa_get_status(v, argp); + break; + case VHOST_VDPA_SET_STATUS: + r = vhost_vdpa_set_status(v, argp); + break; + case VHOST_VDPA_GET_CONFIG: + r = vhost_vdpa_get_config(v, argp); + break; + case VHOST_VDPA_SET_CONFIG: + r = vhost_vdpa_set_config(v, argp); + break; + case VHOST_GET_FEATURES: + r = vhost_vdpa_get_features(v, argp); + break; + case VHOST_SET_FEATURES: + r = vhost_vdpa_set_features(v, argp); + break; + case VHOST_VDPA_GET_VRING_NUM: + r = vhost_vdpa_get_vring_num(v, argp); + break; + case VHOST_SET_LOG_BASE: + case VHOST_SET_LOG_FD: + r = -ENOIOCTLCMD; + break; + default: + r = vhost_dev_ioctl(&v->vdev, cmd, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vdpa_vring_ioctl(v, cmd, argp); + break; + } + + mutex_unlock(&d->mutex); + return r; +} + +static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb_map *map; + struct page *page; + unsigned long pfn, pinned; + + while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) { + pinned = map->size >> PAGE_SHIFT; + for (pfn = map->addr >> PAGE_SHIFT; + pinned > 0; pfn++, pinned--) { + page = pfn_to_page(pfn); + if (map->perm & VHOST_ACCESS_WO) + set_page_dirty_lock(page); + unpin_user_page(page); + } + atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm); + vhost_iotlb_map_free(iotlb, map); + } +} + +static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) +{ + struct vhost_dev *dev = &v->vdev; + + vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1); + kfree(dev->iotlb); + dev->iotlb = NULL; +} + +static int perm_to_iommu_flags(u32 perm) +{ + int flags = 0; + + switch (perm) { + case VHOST_ACCESS_WO: + flags |= IOMMU_WRITE; + break; + case VHOST_ACCESS_RO: + flags |= IOMMU_READ; + break; + case VHOST_ACCESS_RW: + flags |= (IOMMU_WRITE | IOMMU_READ); + break; + default: + WARN(1, "invalidate vhost IOTLB permission\n"); + break; + } + + return flags | IOMMU_CACHE; +} + +static int vhost_vdpa_map(struct vhost_vdpa *v, + u64 iova, u64 size, u64 pa, u32 perm) +{ + struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + int r = 0; + + r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1, + pa, perm); + if (r) + return r; + + if (ops->dma_map) + r = ops->dma_map(vdpa, iova, size, pa, perm); + else if (ops->set_map) + r = ops->set_map(vdpa, dev->iotlb); + else + r = iommu_map(v->domain, iova, pa, size, + perm_to_iommu_flags(perm)); + + return r; +} + +static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) +{ + struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); + + if (ops->dma_map) + ops->dma_unmap(vdpa, iova, size); + else if (ops->set_map) + ops->set_map(vdpa, dev->iotlb); + else + iommu_unmap(v->domain, iova, size); +} + +static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, + struct vhost_iotlb_msg *msg) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct page **page_list; + unsigned long list_size = PAGE_SIZE / sizeof(struct page *); + unsigned int gup_flags = FOLL_LONGTERM; + unsigned long npages, cur_base, map_pfn, last_pfn = 0; + unsigned long locked, lock_limit, pinned, i; + u64 iova = msg->iova; + int ret = 0; + + if (vhost_iotlb_itree_first(iotlb, msg->iova, + msg->iova + msg->size - 1)) + return -EEXIST; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + if (msg->perm & VHOST_ACCESS_WO) + gup_flags |= FOLL_WRITE; + + npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; + if (!npages) + return -EINVAL; + + down_read(&dev->mm->mmap_sem); + + locked = atomic64_add_return(npages, &dev->mm->pinned_vm); + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (locked > lock_limit) { + ret = -ENOMEM; + goto out; + } + + cur_base = msg->uaddr & PAGE_MASK; + iova &= PAGE_MASK; + + while (npages) { + pinned = min_t(unsigned long, npages, list_size); + ret = pin_user_pages(cur_base, pinned, + gup_flags, page_list, NULL); + if (ret != pinned) + goto out; + + if (!last_pfn) + map_pfn = page_to_pfn(page_list[0]); + + for (i = 0; i < ret; i++) { + unsigned long this_pfn = page_to_pfn(page_list[i]); + u64 csize; + + if (last_pfn && (this_pfn != last_pfn + 1)) { + /* Pin a contiguous chunk of memory */ + csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; + if (vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm)) + goto out; + map_pfn = this_pfn; + iova += csize; + } + + last_pfn = this_pfn; + } + + cur_base += ret << PAGE_SHIFT; + npages -= ret; + } + + /* Pin the rest chunk */ + ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT, + map_pfn << PAGE_SHIFT, msg->perm); +out: + if (ret) { + vhost_vdpa_unmap(v, msg->iova, msg->size); + atomic64_sub(npages, &dev->mm->pinned_vm); + } + up_read(&dev->mm->mmap_sem); + free_page((unsigned long)page_list); + return ret; +} + +static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg) +{ + struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); + int r = 0; + + r = vhost_dev_check_owner(dev); + if (r) + return r; + + switch (msg->type) { + case VHOST_IOTLB_UPDATE: + r = vhost_vdpa_process_iotlb_update(v, msg); + break; + case VHOST_IOTLB_INVALIDATE: + vhost_vdpa_unmap(v, msg->iova, msg->size); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_vdpa *v = file->private_data; + struct vhost_dev *dev = &v->vdev; + + return vhost_chr_write_iter(dev, from); +} + +static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct device *dma_dev = vdpa_get_dma_dev(vdpa); + struct bus_type *bus; + int ret; + + /* Device want to do DMA by itself */ + if (ops->set_map || ops->dma_map) + return 0; + + bus = dma_dev->bus; + if (!bus) + return -EFAULT; + + if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) + return -ENOTSUPP; + + v->domain = iommu_domain_alloc(bus); + if (!v->domain) + return -EIO; + + ret = iommu_attach_device(v->domain, dma_dev); + if (ret) + goto err_attach; + + return 0; + +err_attach: + iommu_domain_free(v->domain); + return ret; +} + +static void vhost_vdpa_free_domain(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + struct device *dma_dev = vdpa_get_dma_dev(vdpa); + + if (v->domain) { + iommu_detach_device(v->domain, dma_dev); + iommu_domain_free(v->domain); + } + + v->domain = NULL; +} + +static int vhost_vdpa_open(struct inode *inode, struct file *filep) +{ + struct vhost_vdpa *v; + struct vhost_dev *dev; + struct vhost_virtqueue **vqs; + int nvqs, i, r, opened; + + v = container_of(inode->i_cdev, struct vhost_vdpa, cdev); + if (!v) + return -ENODEV; + + opened = atomic_cmpxchg(&v->opened, 0, 1); + if (opened) + return -EBUSY; + + nvqs = v->nvqs; + vhost_vdpa_reset(v); + + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + r = -ENOMEM; + goto err; + } + + dev = &v->vdev; + for (i = 0; i < nvqs; i++) { + vqs[i] = &v->vqs[i]; + vqs[i]->handle_kick = handle_vq_kick; + } + vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, + vhost_vdpa_process_iotlb_msg); + + dev->iotlb = vhost_iotlb_alloc(0, 0); + if (!dev->iotlb) { + r = -ENOMEM; + goto err_init_iotlb; + } + + r = vhost_vdpa_alloc_domain(v); + if (r) + goto err_init_iotlb; + + filep->private_data = v; + + return 0; + +err_init_iotlb: + vhost_dev_cleanup(&v->vdev); +err: + atomic_dec(&v->opened); + return r; +} + +static int vhost_vdpa_release(struct inode *inode, struct file *filep) +{ + struct vhost_vdpa *v = filep->private_data; + struct vhost_dev *d = &v->vdev; + + mutex_lock(&d->mutex); + filep->private_data = NULL; + vhost_vdpa_reset(v); + vhost_dev_stop(&v->vdev); + vhost_vdpa_iotlb_free(v); + vhost_vdpa_free_domain(v); + vhost_dev_cleanup(&v->vdev); + kfree(v->vdev.vqs); + mutex_unlock(&d->mutex); + + atomic_dec(&v->opened); + complete(&v->completion); + + return 0; +} + +static const struct file_operations vhost_vdpa_fops = { + .owner = THIS_MODULE, + .open = vhost_vdpa_open, + .release = vhost_vdpa_release, + .write_iter = vhost_vdpa_chr_write_iter, + .unlocked_ioctl = vhost_vdpa_unlocked_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static void vhost_vdpa_release_dev(struct device *device) +{ + struct vhost_vdpa *v = + container_of(device, struct vhost_vdpa, dev); + + ida_simple_remove(&vhost_vdpa_ida, v->minor); + kfree(v->vqs); + kfree(v); +} + +static int vhost_vdpa_probe(struct vdpa_device *vdpa) +{ + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa *v; + int minor, nvqs = VHOST_VDPA_VQ_MAX; + int r; + + /* Currently, we only accept the network devices. */ + if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) + return -ENOTSUPP; + + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!v) + return -ENOMEM; + + minor = ida_simple_get(&vhost_vdpa_ida, 0, + VHOST_VDPA_DEV_MAX, GFP_KERNEL); + if (minor < 0) { + kfree(v); + return minor; + } + + atomic_set(&v->opened, 0); + v->minor = minor; + v->vdpa = vdpa; + v->nvqs = nvqs; + v->virtio_id = ops->get_device_id(vdpa); + + device_initialize(&v->dev); + v->dev.release = vhost_vdpa_release_dev; + v->dev.parent = &vdpa->dev; + v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor); + v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue), + GFP_KERNEL); + if (!v->vqs) { + r = -ENOMEM; + goto err; + } + + r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor); + if (r) + goto err; + + cdev_init(&v->cdev, &vhost_vdpa_fops); + v->cdev.owner = THIS_MODULE; + + r = cdev_device_add(&v->cdev, &v->dev); + if (r) + goto err; + + init_completion(&v->completion); + vdpa_set_drvdata(vdpa, v); + + return 0; + +err: + put_device(&v->dev); + return r; +} + +static void vhost_vdpa_remove(struct vdpa_device *vdpa) +{ + struct vhost_vdpa *v = vdpa_get_drvdata(vdpa); + int opened; + + cdev_device_del(&v->cdev, &v->dev); + + do { + opened = atomic_cmpxchg(&v->opened, 0, 1); + if (!opened) + break; + wait_for_completion(&v->completion); + } while (1); + + put_device(&v->dev); +} + +static struct vdpa_driver vhost_vdpa_driver = { + .driver = { + .name = "vhost_vdpa", + }, + .probe = vhost_vdpa_probe, + .remove = vhost_vdpa_remove, +}; + +static int __init vhost_vdpa_init(void) +{ + int r; + + r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX, + "vhost-vdpa"); + if (r) + goto err_alloc_chrdev; + + r = vdpa_register_driver(&vhost_vdpa_driver); + if (r) + goto err_vdpa_register_driver; + + return 0; + +err_vdpa_register_driver: + unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); +err_alloc_chrdev: + return r; +} +module_init(vhost_vdpa_init); + +static void __exit vhost_vdpa_exit(void) +{ + vdpa_unregister_driver(&vhost_vdpa_driver); + unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); +} +module_exit(vhost_vdpa_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("vDPA-based vhost backend for virtio"); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f44340b41494..d450e16c5c25 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -50,10 +50,6 @@ enum { #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) -INTERVAL_TREE_DEFINE(struct vhost_umem_node, - rb, __u64, __subtree_last, - START, LAST, static inline, vhost_umem_interval_tree); - #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) { @@ -457,7 +453,9 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue *vq, void vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs, - int iov_limit, int weight, int byte_weight) + int iov_limit, int weight, int byte_weight, + int (*msg_handler)(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg)) { struct vhost_virtqueue *vq; int i; @@ -473,6 +471,7 @@ void vhost_dev_init(struct vhost_dev *dev, dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; + dev->msg_handler = msg_handler; init_llist_head(&dev->work_list); init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); @@ -581,21 +580,25 @@ err_mm: } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); -struct vhost_umem *vhost_dev_reset_owner_prepare(void) +static struct vhost_iotlb *iotlb_alloc(void) +{ + return vhost_iotlb_alloc(max_iotlb_entries, + VHOST_IOTLB_FLAG_RETIRE); +} + +struct vhost_iotlb *vhost_dev_reset_owner_prepare(void) { - return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL); + return iotlb_alloc(); } EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); /* Caller should have device mutex */ -void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem) +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem) { int i; vhost_dev_cleanup(dev); - /* Restore memory to default empty mapping. */ - INIT_LIST_HEAD(&umem->umem_list); dev->umem = umem; /* We don't need VQ locks below since vhost_dev_cleanup makes sure * VQs aren't running. @@ -618,28 +621,6 @@ void vhost_dev_stop(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_stop); -static void vhost_umem_free(struct vhost_umem *umem, - struct vhost_umem_node *node) -{ - vhost_umem_interval_tree_remove(node, &umem->umem_tree); - list_del(&node->link); - kfree(node); - umem->numem--; -} - -static void vhost_umem_clean(struct vhost_umem *umem) -{ - struct vhost_umem_node *node, *tmp; - - if (!umem) - return; - - list_for_each_entry_safe(node, tmp, &umem->umem_list, link) - vhost_umem_free(umem, node); - - kvfree(umem); -} - static void vhost_clear_msg(struct vhost_dev *dev) { struct vhost_msg_node *node, *n; @@ -677,9 +658,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) eventfd_ctx_put(dev->log_ctx); dev->log_ctx = NULL; /* No one will access memory at this point */ - vhost_umem_clean(dev->umem); + vhost_iotlb_free(dev->umem); dev->umem = NULL; - vhost_umem_clean(dev->iotlb); + vhost_iotlb_free(dev->iotlb); dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); @@ -715,27 +696,26 @@ static bool vhost_overflow(u64 uaddr, u64 size) } /* Caller should have vq mutex and device mutex. */ -static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, +static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem, int log_all) { - struct vhost_umem_node *node; + struct vhost_iotlb_map *map; if (!umem) return false; - list_for_each_entry(node, &umem->umem_list, link) { - unsigned long a = node->userspace_addr; + list_for_each_entry(map, &umem->list, link) { + unsigned long a = map->addr; - if (vhost_overflow(node->userspace_addr, node->size)) + if (vhost_overflow(map->addr, map->size)) return false; - if (!access_ok((void __user *)a, - node->size)) + if (!access_ok((void __user *)a, map->size)) return false; else if (log_all && !log_access_ok(log_base, - node->start, - node->size)) + map->start, + map->size)) return false; } return true; @@ -745,17 +725,17 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, u64 addr, unsigned int size, int type) { - const struct vhost_umem_node *node = vq->meta_iotlb[type]; + const struct vhost_iotlb_map *map = vq->meta_iotlb[type]; - if (!node) + if (!map) return NULL; - return (void *)(uintptr_t)(node->userspace_addr + addr - node->start); + return (void *)(uintptr_t)(map->addr + addr - map->start); } /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ -static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, +static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem, int log_all) { int i; @@ -1020,47 +1000,6 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq, return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); } -static int vhost_new_umem_range(struct vhost_umem *umem, - u64 start, u64 size, u64 end, - u64 userspace_addr, int perm) -{ - struct vhost_umem_node *tmp, *node; - - if (!size) - return -EFAULT; - - node = kmalloc(sizeof(*node), GFP_ATOMIC); - if (!node) - return -ENOMEM; - - if (umem->numem == max_iotlb_entries) { - tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link); - vhost_umem_free(umem, tmp); - } - - node->start = start; - node->size = size; - node->last = end; - node->userspace_addr = userspace_addr; - node->perm = perm; - INIT_LIST_HEAD(&node->link); - list_add_tail(&node->link, &umem->umem_list); - vhost_umem_interval_tree_insert(node, &umem->umem_tree); - umem->numem++; - - return 0; -} - -static void vhost_del_umem_range(struct vhost_umem *umem, - u64 start, u64 end) -{ - struct vhost_umem_node *node; - - while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - start, end))) - vhost_umem_free(umem, node); -} - static void vhost_iotlb_notify_vq(struct vhost_dev *d, struct vhost_iotlb_msg *msg) { @@ -1117,9 +1056,9 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, break; } vhost_vq_meta_reset(dev); - if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size, - msg->iova + msg->size - 1, - msg->uaddr, msg->perm)) { + if (vhost_iotlb_add_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1, + msg->uaddr, msg->perm)) { ret = -ENOMEM; break; } @@ -1131,8 +1070,8 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, break; } vhost_vq_meta_reset(dev); - vhost_del_umem_range(dev->iotlb, msg->iova, - msg->iova + msg->size - 1); + vhost_iotlb_del_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1); break; default: ret = -EINVAL; @@ -1178,7 +1117,12 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, ret = -EINVAL; goto done; } - if (vhost_process_iotlb_msg(dev, &msg)) { + + if (dev->msg_handler) + ret = dev->msg_handler(dev, &msg); + else + ret = vhost_process_iotlb_msg(dev, &msg); + if (ret) { ret = -EFAULT; goto done; } @@ -1311,44 +1255,42 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, } static void vhost_vq_meta_update(struct vhost_virtqueue *vq, - const struct vhost_umem_node *node, + const struct vhost_iotlb_map *map, int type) { int access = (type == VHOST_ADDR_USED) ? VHOST_ACCESS_WO : VHOST_ACCESS_RO; - if (likely(node->perm & access)) - vq->meta_iotlb[type] = node; + if (likely(map->perm & access)) + vq->meta_iotlb[type] = map; } static bool iotlb_access_ok(struct vhost_virtqueue *vq, int access, u64 addr, u64 len, int type) { - const struct vhost_umem_node *node; - struct vhost_umem *umem = vq->iotlb; + const struct vhost_iotlb_map *map; + struct vhost_iotlb *umem = vq->iotlb; u64 s = 0, size, orig_addr = addr, last = addr + len - 1; if (vhost_vq_meta_fetch(vq, addr, len, type)) return true; while (len > s) { - node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - addr, - last); - if (node == NULL || node->start > addr) { + map = vhost_iotlb_itree_first(umem, addr, last); + if (map == NULL || map->start > addr) { vhost_iotlb_miss(vq, addr, access); return false; - } else if (!(node->perm & access)) { + } else if (!(map->perm & access)) { /* Report the possible access violation by * request another translation from userspace. */ return false; } - size = node->size - addr + node->start; + size = map->size - addr + map->start; if (orig_addr == addr && size >= len) - vhost_vq_meta_update(vq, node, type); + vhost_vq_meta_update(vq, map, type); s += size; addr += size; @@ -1364,12 +1306,12 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq) if (!vq->iotlb) return 1; - return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, + return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc, vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) && - iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, + iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail, vhost_get_avail_size(vq, num), VHOST_ADDR_AVAIL) && - iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used, + iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used, vhost_get_used_size(vq, num), VHOST_ADDR_USED); } EXPORT_SYMBOL_GPL(vq_meta_prefetch); @@ -1408,25 +1350,11 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq) } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); -static struct vhost_umem *vhost_umem_alloc(void) -{ - struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL); - - if (!umem) - return NULL; - - umem->umem_tree = RB_ROOT_CACHED; - umem->numem = 0; - INIT_LIST_HEAD(&umem->umem_list); - - return umem; -} - static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem; struct vhost_memory_region *region; - struct vhost_umem *newumem, *oldumem; + struct vhost_iotlb *newumem, *oldumem; unsigned long size = offsetof(struct vhost_memory, regions); int i; @@ -1448,7 +1376,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return -EFAULT; } - newumem = vhost_umem_alloc(); + newumem = iotlb_alloc(); if (!newumem) { kvfree(newmem); return -ENOMEM; @@ -1457,13 +1385,12 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) for (region = newmem->regions; region < newmem->regions + mem.nregions; region++) { - if (vhost_new_umem_range(newumem, - region->guest_phys_addr, - region->memory_size, - region->guest_phys_addr + - region->memory_size - 1, - region->userspace_addr, - VHOST_ACCESS_RW)) + if (vhost_iotlb_add_range(newumem, + region->guest_phys_addr, + region->guest_phys_addr + + region->memory_size - 1, + region->userspace_addr, + VHOST_MAP_RW)) goto err; } @@ -1481,11 +1408,11 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) } kvfree(newmem); - vhost_umem_clean(oldumem); + vhost_iotlb_free(oldumem); return 0; err: - vhost_umem_clean(newumem); + vhost_iotlb_free(newumem); kvfree(newmem); return -EFAULT; } @@ -1726,10 +1653,10 @@ EXPORT_SYMBOL_GPL(vhost_vring_ioctl); int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) { - struct vhost_umem *niotlb, *oiotlb; + struct vhost_iotlb *niotlb, *oiotlb; int i; - niotlb = vhost_umem_alloc(); + niotlb = iotlb_alloc(); if (!niotlb) return -ENOMEM; @@ -1745,7 +1672,7 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) mutex_unlock(&vq->mutex); } - vhost_umem_clean(oiotlb); + vhost_iotlb_free(oiotlb); return 0; } @@ -1875,8 +1802,8 @@ static int log_write(void __user *log_base, static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) { - struct vhost_umem *umem = vq->umem; - struct vhost_umem_node *u; + struct vhost_iotlb *umem = vq->umem; + struct vhost_iotlb_map *u; u64 start, end, l, min; int r; bool hit = false; @@ -1886,16 +1813,15 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) /* More than one GPAs can be mapped into a single HVA. So * iterate all possible umems here to be safe. */ - list_for_each_entry(u, &umem->umem_list, link) { - if (u->userspace_addr > hva - 1 + len || - u->userspace_addr - 1 + u->size < hva) + list_for_each_entry(u, &umem->list, link) { + if (u->addr > hva - 1 + len || + u->addr - 1 + u->size < hva) continue; - start = max(u->userspace_addr, hva); - end = min(u->userspace_addr - 1 + u->size, - hva - 1 + len); + start = max(u->addr, hva); + end = min(u->addr - 1 + u->size, hva - 1 + len); l = end - start + 1; r = log_write(vq->log_base, - u->start + start - u->userspace_addr, + u->start + start - u->addr, l); if (r < 0) return r; @@ -2046,9 +1972,9 @@ EXPORT_SYMBOL_GPL(vhost_vq_init_access); static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct iovec iov[], int iov_size, int access) { - const struct vhost_umem_node *node; + const struct vhost_iotlb_map *map; struct vhost_dev *dev = vq->dev; - struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem; + struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem; struct iovec *_iov; u64 s = 0; int ret = 0; @@ -2060,25 +1986,24 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, break; } - node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - addr, addr + len - 1); - if (node == NULL || node->start > addr) { + map = vhost_iotlb_itree_first(umem, addr, addr + len - 1); + if (map == NULL || map->start > addr) { if (umem != dev->iotlb) { ret = -EFAULT; break; } ret = -EAGAIN; break; - } else if (!(node->perm & access)) { + } else if (!(map->perm & access)) { ret = -EPERM; break; } _iov = iov + ret; - size = node->size - addr + node->start; + size = map->size - addr + map->start; _iov->iov_len = min((u64)len - s, size); _iov->iov_base = (void __user *)(unsigned long) - (node->userspace_addr + addr - node->start); + (map->addr + addr - map->start); s += size; addr += size; ++ret; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index a123fd70847e..181382185bbc 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -12,6 +12,7 @@ #include <linux/virtio_config.h> #include <linux/virtio_ring.h> #include <linux/atomic.h> +#include <linux/vhost_iotlb.h> struct vhost_work; typedef void (*vhost_work_fn_t)(struct vhost_work *work); @@ -52,27 +53,6 @@ struct vhost_log { u64 len; }; -#define START(node) ((node)->start) -#define LAST(node) ((node)->last) - -struct vhost_umem_node { - struct rb_node rb; - struct list_head link; - __u64 start; - __u64 last; - __u64 size; - __u64 userspace_addr; - __u32 perm; - __u32 flags_padding; - __u64 __subtree_last; -}; - -struct vhost_umem { - struct rb_root_cached umem_tree; - struct list_head umem_list; - int numem; -}; - enum vhost_uaddr_type { VHOST_ADDR_DESC = 0, VHOST_ADDR_AVAIL = 1, @@ -90,7 +70,7 @@ struct vhost_virtqueue { struct vring_desc __user *desc; struct vring_avail __user *avail; struct vring_used __user *used; - const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; + const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS]; struct file *kick; struct eventfd_ctx *call_ctx; struct eventfd_ctx *error_ctx; @@ -128,8 +108,8 @@ struct vhost_virtqueue { struct iovec *indirect; struct vring_used_elem *heads; /* Protected by virtqueue mutex. */ - struct vhost_umem *umem; - struct vhost_umem *iotlb; + struct vhost_iotlb *umem; + struct vhost_iotlb *iotlb; void *private_data; u64 acked_features; u64 acked_backend_features; @@ -164,8 +144,8 @@ struct vhost_dev { struct eventfd_ctx *log_ctx; struct llist_head work_list; struct task_struct *worker; - struct vhost_umem *umem; - struct vhost_umem *iotlb; + struct vhost_iotlb *umem; + struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; struct list_head read_list; struct list_head pending_list; @@ -174,16 +154,20 @@ struct vhost_dev { int weight; int byte_weight; u64 kcov_handle; + int (*msg_handler)(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg); }; bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, - int nvqs, int iov_limit, int weight, int byte_weight); + int nvqs, int iov_limit, int weight, int byte_weight, + int (*msg_handler)(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg)); long vhost_dev_set_owner(struct vhost_dev *dev); bool vhost_dev_has_owner(struct vhost_dev *dev); long vhost_dev_check_owner(struct vhost_dev *); -struct vhost_umem *vhost_dev_reset_owner_prepare(void); -void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *); +struct vhost_iotlb *vhost_dev_reset_owner_prepare(void); +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *iotlb); void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); @@ -229,6 +213,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, struct iov_iter *from); int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled); +void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, + struct vhost_iotlb_map *map); + #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \ diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index a0a2d74967ef..ee0491f579ac 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -13,6 +13,9 @@ #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/bvec.h> +#include <linux/highmem.h> +#include <linux/vhost_iotlb.h> #include <uapi/linux/virtio_config.h> static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) @@ -71,9 +74,11 @@ static inline int __vringh_get_head(const struct vringh *vrh, } /* Copy some bytes to/from the iovec. Returns num copied. */ -static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, +static inline ssize_t vringh_iov_xfer(struct vringh *vrh, + struct vringh_kiov *iov, void *ptr, size_t len, - int (*xfer)(void *addr, void *ptr, + int (*xfer)(const struct vringh *vrh, + void *addr, void *ptr, size_t len)) { int err, done = 0; @@ -82,7 +87,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, size_t partlen; partlen = min(iov->iov[iov->i].iov_len, len); - err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen); if (err) return err; done += partlen; @@ -96,6 +101,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, /* Fix up old iov element then increment. */ iov->iov[iov->i].iov_len = iov->consumed; iov->iov[iov->i].iov_base -= iov->consumed; + iov->consumed = 0; iov->i++; @@ -227,7 +233,8 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src, u64 addr, struct vringh_range *r), struct vringh_range *range, - int (*copy)(void *dst, const void *src, size_t len)) + int (*copy)(const struct vringh *vrh, + void *dst, const void *src, size_t len)) { size_t part, len = sizeof(struct vring_desc); @@ -241,7 +248,7 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src, if (!rcheck(vrh, addr, &part, range, getrange)) return -EINVAL; - err = copy(dst, src, part); + err = copy(vrh, dst, src, part); if (err) return err; @@ -262,7 +269,8 @@ __vringh_iov(struct vringh *vrh, u16 i, struct vringh_range *)), bool (*getrange)(struct vringh *, u64, struct vringh_range *), gfp_t gfp, - int (*copy)(void *dst, const void *src, size_t len)) + int (*copy)(const struct vringh *vrh, + void *dst, const void *src, size_t len)) { int err, count = 0, up_next, desc_max; struct vring_desc desc, *descs; @@ -291,7 +299,7 @@ __vringh_iov(struct vringh *vrh, u16 i, err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, &slowrange, copy); else - err = copy(&desc, &descs[i], sizeof(desc)); + err = copy(vrh, &desc, &descs[i], sizeof(desc)); if (unlikely(err)) goto fail; @@ -404,7 +412,8 @@ static inline int __vringh_complete(struct vringh *vrh, unsigned int num_used, int (*putu16)(const struct vringh *vrh, __virtio16 *p, u16 val), - int (*putused)(struct vring_used_elem *dst, + int (*putused)(const struct vringh *vrh, + struct vring_used_elem *dst, const struct vring_used_elem *src, unsigned num)) { @@ -420,12 +429,12 @@ static inline int __vringh_complete(struct vringh *vrh, /* Compiler knows num_used == 1 sometimes, hence extra check */ if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { u16 part = vrh->vring.num - off; - err = putused(&used_ring->ring[off], used, part); + err = putused(vrh, &used_ring->ring[off], used, part); if (!err) - err = putused(&used_ring->ring[0], used + part, + err = putused(vrh, &used_ring->ring[0], used + part, num_used - part); } else - err = putused(&used_ring->ring[off], used, num_used); + err = putused(vrh, &used_ring->ring[off], used, num_used); if (err) { vringh_bad("Failed to write %u used entries %u at %p", @@ -564,13 +573,15 @@ static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val) return put_user(v, (__force __virtio16 __user *)p); } -static inline int copydesc_user(void *dst, const void *src, size_t len) +static inline int copydesc_user(const struct vringh *vrh, + void *dst, const void *src, size_t len) { return copy_from_user(dst, (__force void __user *)src, len) ? -EFAULT : 0; } -static inline int putused_user(struct vring_used_elem *dst, +static inline int putused_user(const struct vringh *vrh, + struct vring_used_elem *dst, const struct vring_used_elem *src, unsigned int num) { @@ -578,13 +589,15 @@ static inline int putused_user(struct vring_used_elem *dst, sizeof(*dst) * num) ? -EFAULT : 0; } -static inline int xfer_from_user(void *src, void *dst, size_t len) +static inline int xfer_from_user(const struct vringh *vrh, void *src, + void *dst, size_t len) { return copy_from_user(dst, (__force void __user *)src, len) ? -EFAULT : 0; } -static inline int xfer_to_user(void *dst, void *src, size_t len) +static inline int xfer_to_user(const struct vringh *vrh, + void *dst, void *src, size_t len) { return copy_to_user((__force void __user *)dst, src, len) ? -EFAULT : 0; @@ -706,7 +719,7 @@ EXPORT_SYMBOL(vringh_getdesc_user); */ ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) { - return vringh_iov_xfer((struct vringh_kiov *)riov, + return vringh_iov_xfer(NULL, (struct vringh_kiov *)riov, dst, len, xfer_from_user); } EXPORT_SYMBOL(vringh_iov_pull_user); @@ -722,7 +735,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user); ssize_t vringh_iov_push_user(struct vringh_iov *wiov, const void *src, size_t len) { - return vringh_iov_xfer((struct vringh_kiov *)wiov, + return vringh_iov_xfer(NULL, (struct vringh_kiov *)wiov, (void *)src, len, xfer_to_user); } EXPORT_SYMBOL(vringh_iov_push_user); @@ -832,13 +845,15 @@ static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val) return 0; } -static inline int copydesc_kern(void *dst, const void *src, size_t len) +static inline int copydesc_kern(const struct vringh *vrh, + void *dst, const void *src, size_t len) { memcpy(dst, src, len); return 0; } -static inline int putused_kern(struct vring_used_elem *dst, +static inline int putused_kern(const struct vringh *vrh, + struct vring_used_elem *dst, const struct vring_used_elem *src, unsigned int num) { @@ -846,13 +861,15 @@ static inline int putused_kern(struct vring_used_elem *dst, return 0; } -static inline int xfer_kern(void *src, void *dst, size_t len) +static inline int xfer_kern(const struct vringh *vrh, void *src, + void *dst, size_t len) { memcpy(dst, src, len); return 0; } -static inline int kern_xfer(void *dst, void *src, size_t len) +static inline int kern_xfer(const struct vringh *vrh, void *dst, + void *src, size_t len) { memcpy(dst, src, len); return 0; @@ -949,7 +966,7 @@ EXPORT_SYMBOL(vringh_getdesc_kern); */ ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) { - return vringh_iov_xfer(riov, dst, len, xfer_kern); + return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern); } EXPORT_SYMBOL(vringh_iov_pull_kern); @@ -964,7 +981,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern); ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, const void *src, size_t len) { - return vringh_iov_xfer(wiov, (void *)src, len, kern_xfer); + return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer); } EXPORT_SYMBOL(vringh_iov_push_kern); @@ -1042,4 +1059,362 @@ int vringh_need_notify_kern(struct vringh *vrh) } EXPORT_SYMBOL(vringh_need_notify_kern); +static int iotlb_translate(const struct vringh *vrh, + u64 addr, u64 len, struct bio_vec iov[], + int iov_size, u32 perm) +{ + struct vhost_iotlb_map *map; + struct vhost_iotlb *iotlb = vrh->iotlb; + int ret = 0; + u64 s = 0; + + while (len > s) { + u64 size, pa, pfn; + + if (unlikely(ret >= iov_size)) { + ret = -ENOBUFS; + break; + } + + map = vhost_iotlb_itree_first(iotlb, addr, + addr + len - 1); + if (!map || map->start > addr) { + ret = -EINVAL; + break; + } else if (!(map->perm & perm)) { + ret = -EPERM; + break; + } + + size = map->size - addr + map->start; + pa = map->addr + addr - map->start; + pfn = pa >> PAGE_SHIFT; + iov[ret].bv_page = pfn_to_page(pfn); + iov[ret].bv_len = min(len - s, size); + iov[ret].bv_offset = pa & (PAGE_SIZE - 1); + s += size; + addr += size; + ++ret; + } + + return ret; +} + +static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, + void *src, size_t len) +{ + struct iov_iter iter; + struct bio_vec iov[16]; + int ret; + + ret = iotlb_translate(vrh, (u64)(uintptr_t)src, + len, iov, 16, VHOST_MAP_RO); + if (ret < 0) + return ret; + + iov_iter_bvec(&iter, READ, iov, ret, len); + + ret = copy_from_iter(dst, len, &iter); + + return ret; +} + +static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, + void *src, size_t len) +{ + struct iov_iter iter; + struct bio_vec iov[16]; + int ret; + + ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, + len, iov, 16, VHOST_MAP_WO); + if (ret < 0) + return ret; + + iov_iter_bvec(&iter, WRITE, iov, ret, len); + + return copy_to_iter(src, len, &iter); +} + +static inline int getu16_iotlb(const struct vringh *vrh, + u16 *val, const __virtio16 *p) +{ + struct bio_vec iov; + void *kaddr, *from; + int ret; + + /* Atomic read is needed for getu16 */ + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + &iov, 1, VHOST_MAP_RO); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(iov.bv_page); + from = kaddr + iov.bv_offset; + *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from)); + kunmap_atomic(kaddr); + + return 0; +} + +static inline int putu16_iotlb(const struct vringh *vrh, + __virtio16 *p, u16 val) +{ + struct bio_vec iov; + void *kaddr, *to; + int ret; + + /* Atomic write is needed for putu16 */ + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + &iov, 1, VHOST_MAP_WO); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(iov.bv_page); + to = kaddr + iov.bv_offset; + WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val)); + kunmap_atomic(kaddr); + + return 0; +} + +static inline int copydesc_iotlb(const struct vringh *vrh, + void *dst, const void *src, size_t len) +{ + int ret; + + ret = copy_from_iotlb(vrh, dst, (void *)src, len); + if (ret != len) + return -EFAULT; + + return 0; +} + +static inline int xfer_from_iotlb(const struct vringh *vrh, void *src, + void *dst, size_t len) +{ + int ret; + + ret = copy_from_iotlb(vrh, dst, src, len); + if (ret != len) + return -EFAULT; + + return 0; +} + +static inline int xfer_to_iotlb(const struct vringh *vrh, + void *dst, void *src, size_t len) +{ + int ret; + + ret = copy_to_iotlb(vrh, dst, src, len); + if (ret != len) + return -EFAULT; + + return 0; +} + +static inline int putused_iotlb(const struct vringh *vrh, + struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + int size = num * sizeof(*dst); + int ret; + + ret = copy_to_iotlb(vrh, dst, (void *)src, num * sizeof(*dst)); + if (ret != size) + return -EFAULT; + + return 0; +} + +/** + * vringh_init_iotlb - initialize a vringh for a ring with IOTLB. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_iotlb(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + return vringh_init_kern(vrh, features, num, weak_barriers, + desc, avail, used); +} +EXPORT_SYMBOL(vringh_init_iotlb); + +/** + * vringh_set_iotlb - initialize a vringh for a ring with IOTLB. + * @vrh: the vring + * @iotlb: iotlb associated with this vring + */ +void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb) +{ + vrh->iotlb = iotlb; +} +EXPORT_SYMBOL(vringh_set_iotlb); + +/** + * vringh_getdesc_iotlb - get next available descriptor from ring with + * IOTLB. + * @vrh: the kernelspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_iotlb(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_iotlb(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_iotlb, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_iotlb); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_iotlb); + +/** + * vringh_iov_pull_iotlb - copy bytes from vring_iov. + * @vrh: the vring. + * @riov: the riov as passed to vringh_getdesc_iotlb() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_iotlb(struct vringh *vrh, + struct vringh_kiov *riov, + void *dst, size_t len) +{ + return vringh_iov_xfer(vrh, riov, dst, len, xfer_from_iotlb); +} +EXPORT_SYMBOL(vringh_iov_pull_iotlb); + +/** + * vringh_iov_push_iotlb - copy bytes into vring_iov. + * @vrh: the vring. + * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_iotlb(struct vringh *vrh, + struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(vrh, wiov, (void *)src, len, xfer_to_iotlb); +} +EXPORT_SYMBOL(vringh_iov_push_iotlb); + +/** + * vringh_abandon_iotlb - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_iotlb() to undo). + * + * The next vringh_get_iotlb() will return the old descriptor(s) again. + */ +void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. + */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_iotlb); + +/** + * vringh_complete_iotlb - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_iotlb. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_iotlb() after one or more calls + * to this function. + */ +int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = cpu_to_vringh32(vrh, head); + used.len = cpu_to_vringh32(vrh, len); + + return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb); +} +EXPORT_SYMBOL(vringh_complete_iotlb); + +/** + * vringh_notify_enable_iotlb - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_iotlb(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb); +} +EXPORT_SYMBOL(vringh_notify_enable_iotlb); + +/** + * vringh_notify_disable_iotlb - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_iotlb(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_iotlb); +} +EXPORT_SYMBOL(vringh_notify_disable_iotlb); + +/** + * vringh_need_notify_iotlb - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_iotlb() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_iotlb(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_iotlb); +} +EXPORT_SYMBOL(vringh_need_notify_iotlb); + + MODULE_LICENSE("GPL"); diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index c2d7d57e98cf..97669484a3f6 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -621,7 +621,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT, - VHOST_VSOCK_WEIGHT); + VHOST_VSOCK_WEIGHT, NULL); file->private_data = vsock; spin_lock_init(&vsock->send_pkt_list_lock); |