summaryrefslogtreecommitdiff
path: root/virt/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm')
-rw-r--r--virt/kvm/Kconfig11
-rw-r--r--virt/kvm/async_pf.c13
-rw-r--r--virt/kvm/guest_memfd.c176
-rw-r--r--virt/kvm/irqchip.c24
-rw-r--r--virt/kvm/kvm_main.c106
-rw-r--r--virt/kvm/pfncache.c3
6 files changed, 282 insertions, 51 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 29b73eedfe74..b14e14cdbfb9 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
config KVM_GENERIC_DIRTYLOG_READ_PROTECT
bool
+config KVM_GENERIC_PRE_FAULT_MEMORY
+ bool
+
config KVM_COMPAT
def_bool y
depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
@@ -109,3 +112,11 @@ config KVM_GENERIC_PRIVATE_MEM
select KVM_GENERIC_MEMORY_ATTRIBUTES
select KVM_PRIVATE_MEM
bool
+
+config HAVE_KVM_GMEM_PREPARE
+ bool
+ depends on KVM_PRIVATE_MEM
+
+config HAVE_KVM_GMEM_INVALIDATE
+ bool
+ depends on KVM_PRIVATE_MEM
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 99a63bad0306..0ee4816b079a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,7 +80,6 @@ static void async_pf_execute(struct work_struct *work)
spin_lock(&vcpu->async_pf.lock);
first = list_empty(&vcpu->async_pf.done);
list_add_tail(&apf->link, &vcpu->async_pf.done);
- apf->vcpu = NULL;
spin_unlock(&vcpu->async_pf.lock);
/*
@@ -120,8 +119,6 @@ static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
- spin_lock(&vcpu->async_pf.lock);
-
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
@@ -129,23 +126,15 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
typeof(*work), queue);
list_del(&work->queue);
- /*
- * We know it's present in vcpu->async_pf.done, do
- * nothing here.
- */
- if (!work->vcpu)
- continue;
-
- spin_unlock(&vcpu->async_pf.lock);
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
flush_work(&work->work);
#else
if (cancel_work_sync(&work->work))
kmem_cache_free(async_pf_cache, work);
#endif
- spin_lock(&vcpu->async_pf.lock);
}
+ spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
list_first_entry(&vcpu->async_pf.done,
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 747fe251e445..1c509c351261 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -13,14 +13,50 @@ struct kvm_gmem {
struct list_head entry;
};
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+ struct list_head *gmem_list = &inode->i_mapping->i_private_list;
+ struct kvm_gmem *gmem;
+
+ list_for_each_entry(gmem, gmem_list, entry) {
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = gmem->kvm;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int rc;
+
+ if (!kvm_arch_gmem_prepare_needed(kvm))
+ continue;
+
+ slot = xa_load(&gmem->bindings, index);
+ if (!slot)
+ continue;
+
+ page = folio_file_page(folio, index);
+ pfn = page_to_pfn(page);
+ gfn = slot->base_gfn + index - slot->gmem.pgoff;
+ rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
+ if (rc) {
+ pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
+ index, gfn, pfn, rc);
+ return rc;
+ }
+ }
+
+#endif
+ return 0;
+}
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
{
struct folio *folio;
/* TODO: Support huge pages. */
folio = filemap_grab_folio(inode->i_mapping, index);
- if (IS_ERR_OR_NULL(folio))
- return NULL;
+ if (IS_ERR(folio))
+ return folio;
/*
* Use the up-to-date flag to track whether or not the memory has been
@@ -41,6 +77,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
folio_mark_uptodate(folio);
}
+ if (prepare) {
+ int r = kvm_gmem_prepare_folio(inode, index, folio);
+ if (r < 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(r);
+ }
+ }
+
/*
* Ignore accessed, referenced, and dirty flags. The memory is
* unevictable and there is no storage to write back to.
@@ -145,9 +190,9 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
break;
}
- folio = kvm_gmem_get_folio(inode, index);
- if (!folio) {
- r = -ENOMEM;
+ folio = kvm_gmem_get_folio(inode, index, true);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
break;
}
@@ -298,10 +343,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
return MF_DELAYED;
}
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+static void kvm_gmem_free_folio(struct folio *folio)
+{
+ struct page *page = folio_page(folio, 0);
+ kvm_pfn_t pfn = page_to_pfn(page);
+ int order = folio_order(folio);
+
+ kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+}
+#endif
+
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
+#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+ .free_folio = kvm_gmem_free_folio,
+#endif
};
static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -360,7 +419,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
inode->i_mode |= S_IFREG;
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_unmovable(inode->i_mapping);
+ mapping_set_inaccessible(inode->i_mapping);
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
@@ -482,38 +541,34 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
fput(file);
}
-int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
{
pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
- struct kvm_gmem *gmem;
+ struct kvm_gmem *gmem = file->private_data;
struct folio *folio;
struct page *page;
- struct file *file;
int r;
- file = kvm_gmem_get_file(slot);
- if (!file)
+ if (file != slot->gmem.file) {
+ WARN_ON_ONCE(slot->gmem.file);
return -EFAULT;
+ }
gmem = file->private_data;
-
- if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
- r = -EIO;
- goto out_fput;
+ if (xa_load(&gmem->bindings, index) != slot) {
+ WARN_ON_ONCE(xa_load(&gmem->bindings, index));
+ return -EIO;
}
- folio = kvm_gmem_get_folio(file_inode(file), index);
- if (!folio) {
- r = -ENOMEM;
- goto out_fput;
- }
+ folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (folio_test_hwpoison(folio)) {
folio_unlock(folio);
folio_put(folio);
- r = -EHWPOISON;
- goto out_fput;
+ return -EHWPOISON;
}
page = folio_file_page(folio, index);
@@ -525,9 +580,78 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
r = 0;
folio_unlock(folio);
-out_fput:
- fput(file);
return r;
}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+ struct file *file = kvm_gmem_get_file(slot);
+ int r;
+
+ if (!file)
+ return -EFAULT;
+
+ r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true);
+ fput(file);
+ return r;
+}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
+ kvm_gmem_populate_cb post_populate, void *opaque)
+{
+ struct file *file;
+ struct kvm_memory_slot *slot;
+ void __user *p;
+
+ int ret = 0, max_order;
+ long i;
+
+ lockdep_assert_held(&kvm->slots_lock);
+ if (npages < 0)
+ return -EINVAL;
+
+ slot = gfn_to_memslot(kvm, start_gfn);
+ if (!kvm_slot_can_be_private(slot))
+ return -EINVAL;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return -EFAULT;
+
+ filemap_invalidate_lock(file->f_mapping);
+
+ npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
+ for (i = 0; i < npages; i += (1 << max_order)) {
+ gfn_t gfn = start_gfn + i;
+ kvm_pfn_t pfn;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false);
+ if (ret)
+ break;
+
+ if (!IS_ALIGNED(gfn, (1 << max_order)) ||
+ (npages - i) < (1 << max_order))
+ max_order = 0;
+
+ p = src ? src + i * PAGE_SIZE : NULL;
+ ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+
+ put_page(pfn_to_page(pfn));
+ if (ret)
+ break;
+ }
+
+ filemap_invalidate_unlock(file->f_mapping);
+
+ fput(file);
+ return ret && !i ? ret : i;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_populate);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1e567d1f6d3d..162d8ed889f2 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -237,3 +237,27 @@ out:
return r;
}
+
+/*
+ * Allocate empty IRQ routing by default so that additional setup isn't needed
+ * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing
+ * is guaranteed to be non-NULL.
+ */
+int kvm_init_irq_routing(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *new;
+ int chip_size;
+
+ new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_rt_entries = 1;
+
+ chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS;
+ memset(new->chip, -1, chip_size);
+
+ RCU_INIT_POINTER(kvm->irq_routing, new);
+
+ return 0;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1192942aef91..d0788d0a72cc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1,9 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * Kernel-based Virtual Machine (KVM) Hypervisor
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
@@ -74,6 +71,7 @@
#define ITOA_MAX_LEN 12
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");
/* Architectures should define their poll value according to the halt latency */
@@ -91,8 +89,8 @@ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
-/* Default resets per-vcpu halt_poll_ns . */
-unsigned int halt_poll_ns_shrink;
+/* Default halves per-vcpu halt_poll_ns. */
+unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
@@ -110,8 +108,7 @@ static struct kmem_cache *kvm_vcpu_cache;
static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
-struct dentry *kvm_debugfs_dir;
-EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
+static struct dentry *kvm_debugfs_dir;
static const struct file_operations stat_fops_per_vm;
@@ -1145,8 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
struct kvm *kvm = kvm_arch_alloc_vm();
struct kvm_memslots *slots;
- int r = -ENOMEM;
- int i, j;
+ int r, i, j;
if (!kvm)
return ERR_PTR(-ENOMEM);
@@ -1183,12 +1179,18 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
task_pid_nr(current));
+ r = -ENOMEM;
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
goto out_err_no_irq_srcu;
+ r = kvm_init_irq_routing(kvm);
+ if (r)
+ goto out_err_no_irq_routing;
+
refcount_set(&kvm->users_count, 1);
+
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
for (j = 0; j < 2; j++) {
slots = &kvm->__memslots[i][j];
@@ -1206,6 +1208,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
}
+ r = -ENOMEM;
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
@@ -1267,6 +1270,8 @@ out_err_no_arch_destroy_vm:
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
+ kvm_free_irq_routing(kvm);
+out_err_no_irq_routing:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@@ -4202,12 +4207,21 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
int r;
struct kvm_vcpu *vcpu;
struct page *page;
+ /*
+ * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
+ * too-large values instead of silently truncating.
+ *
+ * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
+ * changing the storage type (at the very least, IDs should be tracked
+ * as unsigned ints).
+ */
+ BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
if (id >= KVM_MAX_VCPU_IDS)
return -EINVAL;
@@ -4375,6 +4389,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
return fd;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ int idx;
+ long r;
+ u64 full_size;
+
+ if (range->flags)
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(range->gpa) ||
+ !PAGE_ALIGNED(range->size) ||
+ range->gpa + range->size <= range->gpa)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ full_size = range->size;
+ do {
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
+ if (WARN_ON_ONCE(r == 0 || r == -EIO))
+ break;
+
+ if (r < 0)
+ break;
+
+ range->size -= r;
+ range->gpa += r;
+ cond_resched();
+ } while (range->size);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ vcpu_put(vcpu);
+
+ /* Return success if at least one page was mapped successfully. */
+ return full_size == range->size ? r : 0;
+}
+#endif
+
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4421,7 +4481,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
synchronize_rcu();
put_pid(oldpid);
}
+ vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
r = kvm_arch_vcpu_ioctl_run(vcpu);
+ vcpu->wants_to_run = false;
+
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
@@ -4575,6 +4638,20 @@ out_free1:
r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
break;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+ case KVM_PRE_FAULT_MEMORY: {
+ struct kvm_pre_fault_memory range;
+
+ r = -EFAULT;
+ if (copy_from_user(&range, argp, sizeof(range)))
+ break;
+ r = kvm_vcpu_pre_fault_memory(vcpu, &range);
+ /* Pass back leftover range. */
+ if (copy_to_user(argp, &range, sizeof(range)))
+ r = -EFAULT;
+ break;
+ }
+#endif
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
@@ -6287,8 +6364,9 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
WRITE_ONCE(vcpu->ready, false);
__this_cpu_write(kvm_running_vcpu, vcpu);
- kvm_arch_sched_in(vcpu, cpu);
kvm_arch_vcpu_load(vcpu, cpu);
+
+ WRITE_ONCE(vcpu->scheduled_out, false);
}
static void kvm_sched_out(struct preempt_notifier *pn,
@@ -6296,7 +6374,9 @@ static void kvm_sched_out(struct preempt_notifier *pn,
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
- if (current->on_rq) {
+ WRITE_ONCE(vcpu->scheduled_out, true);
+
+ if (current->on_rq && vcpu->wants_to_run) {
WRITE_ONCE(vcpu->preempted, true);
WRITE_ONCE(vcpu->ready, true);
}
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index e3453e869e92..f0039efb9e1e 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -430,6 +430,9 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
{
+ if (!access_ok((void __user *)uhva, len))
+ return -EINVAL;
+
return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
}