Diffstat (limited to 'drivers/hv')
-rw-r--r--  drivers/hv/Kconfig         |   1
-rw-r--r--  drivers/hv/channel.c       | 103
-rw-r--r--  drivers/hv/channel_mgmt.c  |  86
-rw-r--r--  drivers/hv/connection.c    |   7
-rw-r--r--  drivers/hv/hv.c            | 152
-rw-r--r--  drivers/hv/hv_balloon.c    |  91
-rw-r--r--  drivers/hv/hv_trace.h      |  15
-rw-r--r--  drivers/hv/ring_buffer.c   |  10
-rw-r--r--  drivers/hv/vmbus_drv.c     |  93
9 files changed, 479 insertions, 79 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 79e5356a737a..66c794d92391 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -23,6 +23,7 @@ config HYPERV_UTILS
config HYPERV_BALLOON
tristate "Microsoft Hyper-V Balloon driver"
depends on HYPERV
+ select PAGE_REPORTING
help
Select this option to enable Hyper-V Balloon driver.
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 0bd202de7960..c2635e913a92 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -209,31 +209,96 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
}
EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
+static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+ struct vmbus_channel_modifychannel msg;
+ int ret;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+ msg.child_relid = channel->offermsg.child_relid;
+ msg.target_vp = target_vp;
+
+ ret = vmbus_post_msg(&msg, sizeof(msg), true);
+ trace_vmbus_send_modifychannel(&msg, ret);
+
+ return ret;
+}
+
+static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+ struct vmbus_channel_modifychannel *msg;
+ struct vmbus_channel_msginfo *info;
+ unsigned long flags;
+ int ret;
+
+ info = kzalloc(sizeof(struct vmbus_channel_msginfo) +
+ sizeof(struct vmbus_channel_modifychannel),
+ GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ init_completion(&info->waitevent);
+ info->waiting_channel = channel;
+
+ msg = (struct vmbus_channel_modifychannel *)info->msg;
+ msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+ msg->child_relid = channel->offermsg.child_relid;
+ msg->target_vp = target_vp;
+
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+ ret = vmbus_post_msg(msg, sizeof(*msg), true);
+ trace_vmbus_send_modifychannel(msg, ret);
+ if (ret != 0) {
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_del(&info->msglistentry);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+ goto free_info;
+ }
+
+ /*
+ * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
+ * the mutex and be unable to signal the completion.
+ *
+ * See the caller target_cpu_store() for information about the usage of the
+ * mutex.
+ */
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ wait_for_completion(&info->waitevent);
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_del(&info->msglistentry);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+ if (info->response.modify_response.status)
+ ret = -EAGAIN;
+
+free_info:
+ kfree(info);
+ return ret;
+}
+
/*
* Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
*
- * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not
- * ACK such messages. IOW we can't know when the host will stop interrupting
- * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
+ * CHANNELMSG_MODIFYCHANNEL messages are asynchronous. When VMbus version 5.3
+ * or later is negotiated, Hyper-V always sends an ACK in response to such a
+ * message. For VMbus version 5.2 and earlier, it never sends an ACK. Without
+ * an ACK, we cannot know when the host will stop interrupting the "old"
+ * vCPU and start interrupting the "new" vCPU for the given channel.
*
* The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
* VERSION_WIN10_V4_1.
*/
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
{
- struct vmbus_channel_modifychannel conn_msg;
- int ret;
-
- memset(&conn_msg, 0, sizeof(conn_msg));
- conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
- conn_msg.child_relid = child_relid;
- conn_msg.target_vp = target_vp;
-
- ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
-
- trace_vmbus_send_modifychannel(&conn_msg, ret);
-
- return ret;
+ if (vmbus_proto_version >= VERSION_WIN10_V5_3)
+ return send_modifychannel_with_ack(channel, target_vp);
+ return send_modifychannel_without_ack(channel, target_vp);
}
EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
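A minimal caller sketch (hypothetical code, not part of this patch, and assuming the driver-internal declarations from "hyperv_vmbus.h"): the new interface takes the channel pointer rather than the relid and must be entered with channel_mutex held, because send_modifychannel_with_ack() drops and re-acquires that mutex around its wait:

	/*
	 * Hypothetical caller: retarget a channel's interrupts to target_cpu.
	 * Loosely mirrors what target_cpu_store() is expected to do.
	 */
	static int example_retarget_channel(struct vmbus_channel *channel,
					    u32 target_cpu)
	{
		int ret;

		mutex_lock(&vmbus_connection.channel_mutex);
		ret = vmbus_send_modifychannel(channel,
					       hv_cpu_number_to_vp_number(target_cpu));
		mutex_unlock(&vmbus_connection.channel_mutex);

		/* -EAGAIN: the host ACKed with a nonzero status (VMbus 5.3+ only). */
		return ret;
	}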
@@ -385,7 +450,7 @@ nomem:
* @kbuffer: from kmalloc or vmalloc
* @size: page-size multiple
* @send_offset: the offset (in bytes) where the send ring buffer starts,
- * should be 0 for BUFFER type gpadl
+ * should be 0 for BUFFER type gpadl
* @gpadl_handle: some funky thing
*/
static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
@@ -653,7 +718,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
if (newchannel->rescind) {
err = -ENODEV;
- goto error_free_info;
+ goto error_clean_msglist;
}
err = vmbus_post_msg(open_msg,
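The switch from error_free_info to error_clean_msglist above matters because, by the time the rescind check runs in __vmbus_open(), the open-channel msginfo has already been queued on vmbus_connection.chn_msg_list; the error path therefore has to unlink it before it is freed, otherwise a freed entry would be left on that list.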
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index f0ed730e2e4e..caf6d0c4bc1b 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -333,7 +333,6 @@ fw_error:
negop->icversion_data[1].minor = icmsg_minor;
return found_match;
}
-
EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
/*
@@ -593,10 +592,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
* CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK
*
* Forbids: CPU1's LOAD from *not* seing CPU2's STORE &&
- * CPU2's SEARCH from *not* seeing CPU1's INSERT
+ * CPU2's SEARCH from *not* seeing CPU1's INSERT
*
* Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
- * CPU2's LOAD from *not* seing CPU1's STORE
+ * CPU2's LOAD from *not* seeing CPU1's STORE
*/
cpus_read_lock();
@@ -756,6 +755,12 @@ static void init_vp_index(struct vmbus_channel *channel)
free_cpumask_var(available_mask);
}
+#define UNLOAD_DELAY_UNIT_MS 10 /* 10 milliseconds */
+#define UNLOAD_WAIT_MS (100*1000) /* 100 seconds */
+#define UNLOAD_WAIT_LOOPS (UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS)
+#define UNLOAD_MSG_MS (5*1000) /* Every 5 seconds */
+#define UNLOAD_MSG_LOOPS (UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS)
+
static void vmbus_wait_for_unload(void)
{
int cpu;
@@ -773,12 +778,17 @@ static void vmbus_wait_for_unload(void)
* vmbus_connection.unload_event. If not, the last thing we can do is
* read message pages for all CPUs directly.
*
- * Wait no more than 10 seconds so that the panic path can't get
- * hung forever in case the response message isn't seen.
+ * Wait up to 100 seconds since an Azure host must write back any dirty
+ * data in its disk cache before the VMbus UNLOAD request will
+ * complete. This flushing has been empirically observed to take up
+ * to 50 seconds in cases with a lot of dirty data, so allow additional
+ * leeway and for inaccuracies in mdelay(). But eventually time out so
+ * that the panic path can't get hung forever in case the response
+ * message isn't seen.
*/
- for (i = 0; i < 1000; i++) {
+ for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
if (completion_done(&vmbus_connection.unload_event))
- break;
+ goto completed;
for_each_online_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
@@ -801,9 +811,18 @@ static void vmbus_wait_for_unload(void)
vmbus_signal_eom(msg, message_type);
}
- mdelay(10);
+ /*
+ * Give a notice periodically so someone watching the
+ * serial output won't think it is completely hung.
+ */
+ if (!(i % UNLOAD_MSG_LOOPS))
+ pr_notice("Waiting for VMBus UNLOAD to complete\n");
+
+ mdelay(UNLOAD_DELAY_UNIT_MS);
}
+ pr_err("Continuing even though VMBus UNLOAD did not complete\n");
+completed:
/*
* We're crashing and already got the UNLOAD_RESPONSE, cleanup all
* maybe-pending messages on all CPUs to be able to receive new
@@ -827,6 +846,11 @@ static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
/*
* This is a global event; just wakeup the waiting thread.
* Once we successfully unload, we can cleanup the monitor state.
+ *
+ * NB. A malicious or compromised Hyper-V could send a spurious
+ * message of type CHANNELMSG_UNLOAD_RESPONSE, and trigger a call
+ * of the complete() below. Make sure that unload_event has been
+ * initialized by the time this complete() is executed.
*/
complete(&vmbus_connection.unload_event);
}
@@ -842,7 +866,7 @@ void vmbus_initiate_unload(bool crash)
if (vmbus_proto_version < VERSION_WIN8_1)
return;
- init_completion(&vmbus_connection.unload_event);
+ reinit_completion(&vmbus_connection.unload_event);
memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
hdr.msgtype = CHANNELMSG_UNLOAD;
vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header),
@@ -980,7 +1004,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
* UNLOCK channel_mutex
*
* Forbids: r1 == valid_relid &&
- * channels[valid_relid] == channel
+ * channels[valid_relid] == channel
*
* Note. r1 can be INVALID_RELID only for an hv_sock channel.
* None of the hv_sock channels which were present before the
@@ -1313,6 +1337,46 @@ static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr)
}
/*
+ * vmbus_onmodifychannel_response - Modify Channel response handler.
+ *
+ * This is invoked when we received a response to our channel modify request.
+ * Find the matching request, copy the response and signal the requesting thread.
+ */
+static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
+{
+ struct vmbus_channel_modifychannel_response *response;
+ struct vmbus_channel_msginfo *msginfo;
+ unsigned long flags;
+
+ response = (struct vmbus_channel_modifychannel_response *)hdr;
+
+ trace_vmbus_onmodifychannel_response(response);
+
+ /*
+ * Find the modify msg, copy the response and signal/unblock the wait event.
+ */
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+
+ list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
+ struct vmbus_channel_message_header *responseheader =
+ (struct vmbus_channel_message_header *)msginfo->msg;
+
+ if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
+ struct vmbus_channel_modifychannel *modifymsg;
+
+ modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
+ if (modifymsg->child_relid == response->child_relid) {
+ memcpy(&msginfo->response.modify_response, response,
+ sizeof(*response));
+ complete(&msginfo->waitevent);
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+}
+
+/*
* vmbus_ongpadl_torndown - GPADL torndown handler.
*
* This is invoked when we received a response to our gpadl teardown request.
@@ -1429,6 +1493,8 @@ channel_message_table[CHANNELMSG_COUNT] = {
{ CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0},
{ CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0},
{ CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0},
+ { CHANNELMSG_MODIFYCHANNEL_RESPONSE, 1, vmbus_onmodifychannel_response,
+ sizeof(struct vmbus_channel_modifychannel_response)},
};
/*
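For reference, the new vmbus_wait_for_unload() constants work out as follows: UNLOAD_WAIT_LOOPS = UNLOAD_WAIT_MS / UNLOAD_DELAY_UNIT_MS = 100000 / 10 = 10000 iterations of mdelay(10), i.e. roughly 100 seconds in total, and UNLOAD_MSG_LOOPS = UNLOAD_MSG_MS / UNLOAD_DELAY_UNIT_MS = 5000 / 10 = 500, so the "Waiting for VMBus UNLOAD to complete" notice is printed about every 5 seconds while waiting.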
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index c83612cddb99..311cd005b3be 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -26,9 +26,11 @@
struct vmbus_connection vmbus_connection = {
.conn_state = DISCONNECTED,
+ .unload_event = COMPLETION_INITIALIZER(
+ vmbus_connection.unload_event),
.next_gpadl_handle = ATOMIC_INIT(0xE1E10),
- .ready_for_suspend_event= COMPLETION_INITIALIZER(
+ .ready_for_suspend_event = COMPLETION_INITIALIZER(
vmbus_connection.ready_for_suspend_event),
.ready_for_resume_event = COMPLETION_INITIALIZER(
vmbus_connection.ready_for_resume_event),
@@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
* Table of VMBus versions listed from newest to oldest.
*/
static __u32 vmbus_versions[] = {
+ VERSION_WIN10_V5_3,
VERSION_WIN10_V5_2,
VERSION_WIN10_V5_1,
VERSION_WIN10_V5,
@@ -60,7 +63,7 @@ static __u32 vmbus_versions[] = {
* Maximal VMBus protocol version guests can negotiate. Useful to cap the
* VMBus version for testing and debugging purpose.
*/
-static uint max_version = VERSION_WIN10_V5_2;
+static uint max_version = VERSION_WIN10_V5_3;
module_param(max_version, uint, S_IRUGO);
MODULE_PARM_DESC(max_version,
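Because max_version is read-only at runtime (S_IRUGO), capping the negotiated protocol has to be done at module load time or on the kernel command line. A hypothetical example (assuming the usual ((major << 16) | minor) encoding, i.e. VERSION_WIN10_V5_2 == 0x00050002):

	hv_vmbus.max_version=0x00050002

would hold the guest at VMbus 5.2 for testing, which also exercises the no-ACK send_modifychannel_without_ack() path added in channel.c above.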
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index f202ac7f4b3d..e83507f49676 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -13,9 +13,10 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
-#include <linux/version.h>
#include <linux/random.h>
#include <linux/clockchips.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -37,6 +38,42 @@ int hv_init(void)
}
/*
+ * Functions for allocating and freeing memory with size and
+ * alignment HV_HYP_PAGE_SIZE. These functions are needed because
+ * the guest page size may not be the same as the Hyper-V page
+ * size. We depend upon kmalloc() aligning power-of-two size
+ * allocations to the allocation size boundary, so that the
+ * allocated memory appears to Hyper-V as a page of the size
+ * it expects.
+ */
+
+void *hv_alloc_hyperv_page(void)
+{
+ BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
+
+ if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+ return (void *)__get_free_page(GFP_KERNEL);
+ else
+ return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+}
+
+void *hv_alloc_hyperv_zeroed_page(void)
+{
+ if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ else
+ return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+}
+
+void hv_free_hyperv_page(unsigned long addr)
+{
+ if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+ free_page(addr);
+ else
+ kfree((void *)addr);
+}
+
+/*
* hv_post_message - Post a message using the hypervisor message IPC.
*
* This involves a hypercall.
@@ -68,7 +105,7 @@ int hv_post_message(union hv_connection_id connection_id,
*/
put_cpu_ptr(hv_cpu);
- return status & 0xFFFF;
+ return hv_result(status);
}
int hv_synic_alloc(void)
@@ -162,34 +199,48 @@ void hv_synic_enable_regs(unsigned int cpu)
union hv_synic_scontrol sctrl;
/* Setup the Synic's message page */
- hv_get_simp(simp.as_uint64);
+ simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
simp.simp_enabled = 1;
simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
>> HV_HYP_PAGE_SHIFT;
- hv_set_simp(simp.as_uint64);
+ hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
/* Setup the Synic's event page */
- hv_get_siefp(siefp.as_uint64);
+ siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
siefp.siefp_enabled = 1;
siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
>> HV_HYP_PAGE_SHIFT;
- hv_set_siefp(siefp.as_uint64);
+ hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
/* Setup the shared SINT. */
- hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+ if (vmbus_irq != -1)
+ enable_percpu_irq(vmbus_irq, 0);
+ shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
+ VMBUS_MESSAGE_SINT);
- shared_sint.vector = hv_get_vector();
+ shared_sint.vector = vmbus_interrupt;
shared_sint.masked = false;
- shared_sint.auto_eoi = hv_recommend_using_aeoi();
- hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+
+ /*
+ * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
+ * it doesn't provide a recommendation flag and AEOI must be disabled.
+ */
+#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
+ shared_sint.auto_eoi =
+ !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
+#else
+ shared_sint.auto_eoi = 0;
+#endif
+ hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
+ shared_sint.as_uint64);
/* Enable the global synic bit */
- hv_get_synic_state(sctrl.as_uint64);
+ sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
sctrl.enable = 1;
- hv_set_synic_state(sctrl.as_uint64);
+ hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
}
int hv_synic_init(unsigned int cpu)
@@ -211,30 +262,71 @@ void hv_synic_disable_regs(unsigned int cpu)
union hv_synic_siefp siefp;
union hv_synic_scontrol sctrl;
- hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+ shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
+ VMBUS_MESSAGE_SINT);
shared_sint.masked = 1;
/* Need to correctly cleanup in the case of SMP!!! */
/* Disable the interrupt */
- hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+ hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
+ shared_sint.as_uint64);
- hv_get_simp(simp.as_uint64);
+ simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
simp.simp_enabled = 0;
simp.base_simp_gpa = 0;
- hv_set_simp(simp.as_uint64);
+ hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
- hv_get_siefp(siefp.as_uint64);
+ siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
siefp.siefp_enabled = 0;
siefp.base_siefp_gpa = 0;
- hv_set_siefp(siefp.as_uint64);
+ hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
/* Disable the global synic bit */
- hv_get_synic_state(sctrl.as_uint64);
+ sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
sctrl.enable = 0;
- hv_set_synic_state(sctrl.as_uint64);
+ hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
+
+ if (vmbus_irq != -1)
+ disable_percpu_irq(vmbus_irq);
+}
+
+#define HV_MAX_TRIES 3
+/*
+ * Scan the event flags page of 'this' CPU looking for any bit that is set. If we find one
+ * bit set, then wait for a few milliseconds. Repeat these steps for a maximum of 3 times.
+ * Return 'true', if there is still any set bit after this operation; 'false', otherwise.
+ *
+ * If a bit is set, that means there is a pending channel interrupt. The expectation is
+ * that the normal interrupt handling mechanism will find and process the channel interrupt
+ * "very soon", and in the process clear the bit.
+ */
+static bool hv_synic_event_pending(void)
+{
+ struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ union hv_synic_event_flags *event =
+ (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
+ unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
+ bool pending;
+ u32 relid;
+ int tries = 0;
+
+retry:
+ pending = false;
+ for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
+ /* Special case - VMBus channel protocol messages */
+ if (relid == 0)
+ continue;
+ pending = true;
+ break;
+ }
+ if (pending && tries++ < HV_MAX_TRIES) {
+ usleep_range(10000, 20000);
+ goto retry;
+ }
+ return pending;
}
int hv_synic_cleanup(unsigned int cpu)
@@ -242,6 +334,9 @@ int hv_synic_cleanup(unsigned int cpu)
struct vmbus_channel *channel, *sc;
bool channel_found = false;
+ if (vmbus_connection.conn_state != CONNECTED)
+ goto always_cleanup;
+
/*
* Hyper-V does not provide a way to change the connect CPU once
* it is set; we must prevent the connect CPU from going offline
@@ -249,8 +344,7 @@ int hv_synic_cleanup(unsigned int cpu)
* path where the vmbus is already disconnected, the CPU must be
* allowed to shut down.
*/
- if (cpu == VMBUS_CONNECT_CPU &&
- vmbus_connection.conn_state == CONNECTED)
+ if (cpu == VMBUS_CONNECT_CPU)
return -EBUSY;
/*
@@ -277,9 +371,21 @@ int hv_synic_cleanup(unsigned int cpu)
}
mutex_unlock(&vmbus_connection.channel_mutex);
- if (channel_found && vmbus_connection.conn_state == CONNECTED)
+ if (channel_found)
+ return -EBUSY;
+
+ /*
+ * channel_found == false means that any channels that were previously
+ * assigned to the CPU have been reassigned elsewhere with a call of
+ * vmbus_send_modifychannel(). Scan the event flags page looking for
+ * bits that are set and waiting with a timeout for vmbus_chan_sched()
+ * to process such bits. If bits are still set after this operation
+ * and VMBus is connected, fail the CPU offlining operation.
+ */
+ if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
return -EBUSY;
+always_cleanup:
hv_stimer_legacy_cleanup(cpu);
hv_synic_disable_regs(cpu);
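A minimal usage sketch of the new page helpers (hypothetical caller, not part of this patch): the point of the kmalloc() fallback is that on a guest whose PAGE_SIZE is larger than HV_HYP_PAGE_SIZE (e.g. a 64K-page ARM64 kernel), a power-of-two kmalloc() of HV_HYP_PAGE_SIZE returns memory aligned to HV_HYP_PAGE_SIZE, so Hyper-V still sees one properly aligned hypervisor page:

	/*
	 * Hypothetical caller: allocate one Hyper-V-sized page, hand its GPA
	 * to the hypervisor, then free it.
	 */
	static int example_use_hyperv_page(void)
	{
		void *page = hv_alloc_hyperv_zeroed_page();

		if (!page)
			return -ENOMEM;

		/* e.g. pass virt_to_phys(page) >> HV_HYP_PAGE_SHIFT to Hyper-V */

		hv_free_hyperv_page((unsigned long)page);
		return 0;
	}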
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 8c471823a5af..58af84e30144 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -21,6 +21,7 @@
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/percpu_counter.h>
+#include <linux/page_reporting.h>
#include <linux/hyperv.h>
#include <asm/hyperv-tlfs.h>
@@ -563,6 +564,8 @@ struct hv_dynmem_device {
* The negotiated version agreed by host.
*/
__u32 version;
+
+ struct page_reporting_dev_info pr_dev_info;
};
static struct hv_dynmem_device dm_device;
@@ -726,7 +729,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
- (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
+ (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
if (ret) {
pr_err("hot_add memory failed error is %d\n", ret);
@@ -1568,6 +1571,89 @@ static void balloon_onchannelcallback(void *context)
}
+/* Hyper-V only supports reporting 2MB pages or higher */
+#define HV_MIN_PAGE_REPORTING_ORDER 9
+#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
+static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+ struct scatterlist *sgl, unsigned int nents)
+{
+ unsigned long flags;
+ struct hv_memory_hint *hint;
+ int i;
+ u64 status;
+ struct scatterlist *sg;
+
+ WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+ WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN);
+ local_irq_save(flags);
+ hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ if (!hint) {
+ local_irq_restore(flags);
+ return -ENOSPC;
+ }
+
+ hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
+ hint->reserved = 0;
+ for_each_sg(sgl, sg, nents, i) {
+ union hv_gpa_page_range *range;
+
+ range = &hint->ranges[i];
+ range->address_space = 0;
+ /* page reporting only reports 2MB pages or higher */
+ range->page.largepage = 1;
+ range->page.additional_pages =
+ (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
+ range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
+ range->base_large_pfn =
+ page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
+ }
+
+ status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
+ hint, NULL);
+ local_irq_restore(flags);
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ pr_err("Cold memory discard hypercall failed with status %llx\n",
+ status);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void enable_page_reporting(void)
+{
+ int ret;
+
+ /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */
+ if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) {
+ pr_debug("Cold memory discard is only supported on 2MB pages and above\n");
+ return;
+ }
+
+ if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
+ pr_debug("Cold memory discard hint not supported by Hyper-V\n");
+ return;
+ }
+
+ BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+ dm_device.pr_dev_info.report = hv_free_page_report;
+ ret = page_reporting_register(&dm_device.pr_dev_info);
+ if (ret < 0) {
+ dm_device.pr_dev_info.report = NULL;
+ pr_err("Failed to enable cold memory discard: %d\n", ret);
+ } else {
+ pr_info("Cold memory discard hint enabled\n");
+ }
+}
+
+static void disable_page_reporting(void)
+{
+ if (dm_device.pr_dev_info.report) {
+ page_reporting_unregister(&dm_device.pr_dev_info);
+ dm_device.pr_dev_info.report = NULL;
+ }
+}
+
static int balloon_connect_vsp(struct hv_device *dev)
{
struct dm_version_request version_req;
@@ -1713,6 +1799,7 @@ static int balloon_probe(struct hv_device *dev,
if (ret != 0)
return ret;
+ enable_page_reporting();
dm_device.state = DM_INITIALIZED;
dm_device.thread =
@@ -1727,6 +1814,7 @@ static int balloon_probe(struct hv_device *dev,
probe_error:
dm_device.state = DM_INIT_ERROR;
dm_device.thread = NULL;
+ disable_page_reporting();
vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
@@ -1749,6 +1837,7 @@ static int balloon_remove(struct hv_device *dev)
cancel_work_sync(&dm->ha_wrk.wrk);
kthread_stop(dm->thread);
+ disable_page_reporting();
vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
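As a worked example of the range encoding in hv_free_page_report() (illustrative numbers only): a scatterlist entry covering 4 MB of free memory starting at guest physical address 0x40000000 is reported with base_large_pfn = (0x40000000 >> HV_HYP_PAGE_SHIFT) >> HV_MIN_PAGE_REPORTING_ORDER = 0x200 and additional_pages = (4 MB / 2 MB) - 1 = 1, i.e. a single range describing two 2 MB large pages in the cold-discard hint.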
diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h
index 6063bb21bb13..c02a1719e92f 100644
--- a/drivers/hv/hv_trace.h
+++ b/drivers/hv/hv_trace.h
@@ -103,6 +103,21 @@ TRACE_EVENT(vmbus_ongpadl_created,
)
);
+TRACE_EVENT(vmbus_onmodifychannel_response,
+ TP_PROTO(const struct vmbus_channel_modifychannel_response *response),
+ TP_ARGS(response),
+ TP_STRUCT__entry(
+ __field(u32, child_relid)
+ __field(u32, status)
+ ),
+ TP_fast_assign(__entry->child_relid = response->child_relid;
+ __entry->status = response->status;
+ ),
+ TP_printk("child_relid 0x%x, status %d",
+ __entry->child_relid, __entry->status
+ )
+ );
+
TRACE_EVENT(vmbus_ongpadl_torndown,
TP_PROTO(const struct vmbus_channel_gpadl_torndown *gpadltorndown),
TP_ARGS(gpadltorndown),
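A short usage note for the new tracepoint (assuming this header's existing TRACE_SYSTEM of "hyperv" and the usual tracefs layout):

	echo 1 > /sys/kernel/tracing/events/hyperv/vmbus_onmodifychannel_response/enable

should then produce lines of the form "vmbus_onmodifychannel_response: child_relid 0x12, status 0" (illustrative values) per the TP_printk() format above; a nonzero status is what vmbus_send_modifychannel() maps to -EAGAIN.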
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 35833d4d1a1d..374f8afbf8a5 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -84,15 +84,6 @@ hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer->write_index = next_write_location;
}
-/* Set the next read location for the specified ring buffer. */
-static inline void
-hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
- u32 next_read_location)
-{
- ring_info->ring_buffer->read_index = next_read_location;
- ring_info->priv_read_index = next_read_location;
-}
-
/* Get the size of the ring buffer. */
static inline u32
hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
@@ -313,7 +304,6 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
rqst_id = vmbus_next_request_id(&channel->requestor, requestid);
if (rqst_id == VMBUS_RQST_ERROR) {
spin_unlock_irqrestore(&outring_info->ring_lock, flags);
- pr_err("No request id available\n");
return -EAGAIN;
}
}
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index b341b144bde8..92cb3f7d21d9 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -48,8 +48,10 @@ static int hyperv_cpuhp_online;
static void *hv_panic_page;
+static long __percpu *vmbus_evt;
+
/* Values parsed from ACPI DSDT */
-static int vmbus_irq;
+int vmbus_irq;
int vmbus_interrupt;
/*
@@ -1381,7 +1383,13 @@ static void vmbus_isr(void)
tasklet_schedule(&hv_cpu->msg_dpc);
}
- add_interrupt_randomness(hv_get_vector(), 0);
+ add_interrupt_randomness(vmbus_interrupt, 0);
+}
+
+static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
+{
+ vmbus_isr();
+ return IRQ_HANDLED;
}
/*
@@ -1393,14 +1401,11 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
{
struct kmsg_dump_iter iter;
size_t bytes_written;
- phys_addr_t panic_pa;
/* We are only interested in panics. */
if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg))
return;
- panic_pa = virt_to_phys(hv_panic_page);
-
/*
* Write dump contents to the page. No need to synchronize; panic should
* be single-threaded.
@@ -1408,8 +1413,25 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
kmsg_dump_rewind(&iter);
kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
&bytes_written);
- if (bytes_written)
- hyperv_report_panic_msg(panic_pa, bytes_written);
+ if (!bytes_written)
+ return;
+ /*
+ * P3 to contain the physical address of the panic page & P4 to
+ * contain the size of the panic data in that page. Rest of the
+ * registers are no-op when the NOTIFY_MSG flag is set.
+ */
+ hv_set_register(HV_REGISTER_CRASH_P0, 0);
+ hv_set_register(HV_REGISTER_CRASH_P1, 0);
+ hv_set_register(HV_REGISTER_CRASH_P2, 0);
+ hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
+ hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
+
+ /*
+ * Let Hyper-V know there is crash data available along with
+ * the panic message.
+ */
+ hv_set_register(HV_REGISTER_CRASH_CTL,
+ (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
}
static struct kmsg_dumper hv_kmsg_dumper = {
@@ -1484,9 +1506,28 @@ static int vmbus_bus_init(void)
if (ret)
return ret;
- ret = hv_setup_vmbus_irq(vmbus_irq, vmbus_isr);
- if (ret)
- goto err_setup;
+ /*
+ * VMbus interrupts are best modeled as per-cpu interrupts. If
+ * on an architecture with support for per-cpu IRQs (e.g. ARM64),
+ * allocate a per-cpu IRQ using standard Linux kernel functionality.
+ * If not on such an architecture (e.g., x86/x64), then rely on
+ * code in the arch-specific portion of the code tree to connect
+ * the VMbus interrupt handler.
+ */
+
+ if (vmbus_irq == -1) {
+ hv_setup_vmbus_handler(vmbus_isr);
+ } else {
+ vmbus_evt = alloc_percpu(long);
+ ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
+ "Hyper-V VMbus", vmbus_evt);
+ if (ret) {
+ pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
+ vmbus_irq, ret);
+ free_percpu(vmbus_evt);
+ goto err_setup;
+ }
+ }
ret = hv_synic_alloc();
if (ret)
@@ -1523,7 +1564,7 @@ static int vmbus_bus_init(void)
* Register for panic kmsg callback only if the right
* capability is supported by the hypervisor.
*/
- hv_get_crash_ctl(hyperv_crash_ctl);
+ hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
hv_kmsg_dump_register();
@@ -1547,7 +1588,12 @@ err_connect:
err_cpuhp:
hv_synic_free();
err_alloc:
- hv_remove_vmbus_irq();
+ if (vmbus_irq == -1) {
+ hv_remove_vmbus_handler();
+ } else {
+ free_percpu_irq(vmbus_irq, vmbus_evt);
+ free_percpu(vmbus_evt);
+ }
err_setup:
bus_unregister(&hv_bus);
unregister_sysctl_table(hv_ctl_table_hdr);
@@ -1804,13 +1850,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
if (target_cpu == origin_cpu)
goto cpu_store_unlock;
- if (vmbus_send_modifychannel(channel->offermsg.child_relid,
+ if (vmbus_send_modifychannel(channel,
hv_cpu_number_to_vp_number(target_cpu))) {
ret = -EIO;
goto cpu_store_unlock;
}
/*
+ * For versions before VERSION_WIN10_V5_3, the following warning holds:
+ *
* Warning. At this point, there is *no* guarantee that the host will
* have successfully processed the vmbus_send_modifychannel() request.
* See the header comment of vmbus_send_modifychannel() for more info.
@@ -2665,6 +2713,18 @@ static int __init hv_acpi_init(void)
ret = -ETIMEDOUT;
goto cleanup;
}
+
+ /*
+ * If we're on an architecture with a hardcoded hypervisor
+ * vector (i.e. x86/x64), override the VMbus interrupt found
+ * in the ACPI tables. Ensure vmbus_irq is not set since the
+ * normal Linux IRQ mechanism is not used in this case.
+ */
+#ifdef HYPERVISOR_CALLBACK_VECTOR
+ vmbus_interrupt = HYPERVISOR_CALLBACK_VECTOR;
+ vmbus_irq = -1;
+#endif
+
hv_debug_init();
ret = vmbus_bus_init();
@@ -2695,7 +2755,12 @@ static void __exit vmbus_exit(void)
vmbus_connection.conn_state = DISCONNECTED;
hv_stimer_global_cleanup();
vmbus_disconnect();
- hv_remove_vmbus_irq();
+ if (vmbus_irq == -1) {
+ hv_remove_vmbus_handler();
+ } else {
+ free_percpu_irq(vmbus_irq, vmbus_evt);
+ free_percpu(vmbus_evt);
+ }
for_each_online_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
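The err_alloc path in vmbus_bus_init() and the teardown in vmbus_exit() now share the same convention: vmbus_irq == -1 means the arch-specific VMbus handler is in use, anything else means a Linux per-cpu IRQ was requested. A hypothetical helper, purely a sketch of that design rather than code from this patch, would factor it into one place:

	static void example_vmbus_free_irq(void)
	{
		if (vmbus_irq == -1) {
			/* x86/x64: arch code owns HYPERVISOR_CALLBACK_VECTOR */
			hv_remove_vmbus_handler();
		} else {
			/* e.g. ARM64: a per-cpu IRQ was requested at init */
			free_percpu_irq(vmbus_irq, vmbus_evt);
			free_percpu(vmbus_evt);
		}
	}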