From a8c3209998afb5c4941b49e35b513cea9050cb4a Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Tue, 8 Dec 2020 05:53:11 +0100 Subject: Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer Pointers to ring-buffer packets sent by Hyper-V are used within the guest VM. Hyper-V can send packets with erroneous values or modify packet fields after they are processed by the guest. To defend against these scenarios, return a copy of the incoming VMBus packet after validating its length and offset fields in hv_pkt_iter_first(). In this way, the packet can no longer be modified by the host. Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Cc: "David S. Miller" Cc: Jakub Kicinski Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: netdev@vger.kernel.org Cc: linux-scsi@vger.kernel.org Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201208045311.10244-1-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 9 +++-- drivers/hv/hv_fcopy.c | 1 + drivers/hv/hv_kvp.c | 1 + drivers/hv/hyperv_vmbus.h | 2 +- drivers/hv/ring_buffer.c | 82 +++++++++++++++++++++++++++++++++------ drivers/net/hyperv/hyperv_net.h | 3 ++ drivers/net/hyperv/netvsc.c | 2 + drivers/net/hyperv/rndis_filter.c | 2 + drivers/scsi/storvsc_drv.c | 10 +++++ include/linux/hyperv.h | 48 +++++++++++++++++++---- net/vmw_vsock/hyperv_transport.c | 4 +- 11 files changed, 139 insertions(+), 25 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 6fb0c76bfbf8..0d63862d6551 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -597,12 +597,15 @@ static int __vmbus_open(struct vmbus_channel *newchannel, newchannel->onchannel_callback = onchannelcallback; newchannel->channel_callback_context = context; - err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages); + if (!newchannel->max_pkt_size) + newchannel->max_pkt_size = VMBUS_DEFAULT_MAX_PKT_SIZE; + + err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages, 0); if (err) goto error_clean_ring; - err = hv_ringbuffer_init(&newchannel->inbound, - &page[send_pages], recv_pages); + err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], + recv_pages, newchannel->max_pkt_size); if (err) goto error_clean_ring; diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index 5040d7e0cd9e..85784a9e6a36 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -329,6 +329,7 @@ int hv_fcopy_init(struct hv_util_service *srv) { recv_buffer = srv->recv_buffer; fcopy_transaction.recv_channel = srv->channel; + fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; /* * When this driver loads, the user level daemon that diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 754d35a25a1c..7d2a7b918847 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -741,6 +741,7 @@ hv_kvp_init(struct hv_util_service *srv) { recv_buffer = srv->recv_buffer; kvp_transaction.recv_channel = srv->channel; + kvp_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 4; /* * When this driver loads, the user level daemon that diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 9416e09ebd58..42f3d9d123a1 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -174,7 +174,7 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt); + struct page *pages, u32 pagecnt, u32 max_pkt_size); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 35833d4d1a1d..29e90477363a 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -190,7 +190,7 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) /* Initialize the ring buffer. */ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 page_cnt) + struct page *pages, u32 page_cnt, u32 max_pkt_size) { int i; struct page **pages_wraparound; @@ -232,6 +232,14 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, sizeof(struct hv_ring_buffer); ring_info->priv_read_index = 0; + /* Initialize buffer that holds copies of incoming packets */ + if (max_pkt_size) { + ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL); + if (!ring_info->pkt_buffer) + return -ENOMEM; + ring_info->pkt_buffer_size = max_pkt_size; + } + spin_lock_init(&ring_info->ring_lock); return 0; @@ -244,6 +252,9 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info) vunmap(ring_info->ring_buffer); ring_info->ring_buffer = NULL; mutex_unlock(&ring_info->ring_buffer_mutex); + + kfree(ring_info->pkt_buffer); + ring_info->pkt_buffer_size = 0; } /* Write to the ring buffer. */ @@ -385,7 +396,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, memcpy(buffer, (const char *)desc + offset, packetlen); /* Advance ring index to next packet descriptor */ - __hv_pkt_iter_next(channel, desc); + __hv_pkt_iter_next(channel, desc, true); /* Notify host of update */ hv_pkt_iter_close(channel); @@ -411,6 +422,22 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) return (rbi->ring_datasize - priv_read_loc) + write_loc; } +/* + * Get first vmbus packet without copying it out of the ring buffer + */ +struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *rbi = &channel->inbound; + + hv_debug_delay_test(channel, MESSAGE_DELAY); + + if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) + return NULL; + + return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index); +} +EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw); + /* * Get first vmbus packet from ring buffer after read_index * @@ -419,17 +446,49 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - struct vmpacket_descriptor *desc; + struct vmpacket_descriptor *desc, *desc_copy; + u32 bytes_avail, pkt_len, pkt_offset; - hv_debug_delay_test(channel, MESSAGE_DELAY); - if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) + desc = hv_pkt_iter_first_raw(channel); + if (!desc) return NULL; - desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index; - if (desc) - prefetch((char *)desc + (desc->len8 << 3)); + bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi)); + + /* + * Ensure the compiler does not use references to incoming Hyper-V values (which + * could change at any moment) when reading local variables later in the code + */ + pkt_len = READ_ONCE(desc->len8) << 3; + pkt_offset = READ_ONCE(desc->offset8) << 3; + + /* + * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail() and + * rbi->pkt_buffer_size + */ + if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail) + pkt_len = bytes_avail; + + /* + * If pkt_offset is invalid, arbitrarily set it to + * the size of vmpacket_descriptor + */ + if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len) + pkt_offset = sizeof(struct vmpacket_descriptor); + + /* Copy the Hyper-V packet out of the ring buffer */ + desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer; + memcpy(desc_copy, desc, pkt_len); + + /* + * Hyper-V could still change len8 and offset8 after the earlier read. + * Ensure that desc_copy has legal values for len8 and offset8 that + * are consistent with the copy we just made + */ + desc_copy->len8 = pkt_len >> 3; + desc_copy->offset8 = pkt_offset >> 3; - return desc; + return desc_copy; } EXPORT_SYMBOL_GPL(hv_pkt_iter_first); @@ -441,7 +500,8 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first); */ struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *desc) + const struct vmpacket_descriptor *desc, + bool copy) { struct hv_ring_buffer_info *rbi = &channel->inbound; u32 packetlen = desc->len8 << 3; @@ -454,7 +514,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel, rbi->priv_read_index -= dsize; /* more data? */ - return hv_pkt_iter_first(channel); + return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel); } EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 2a87cfa27ac0..7ea6936f86ef 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -860,9 +860,12 @@ static inline u32 netvsc_rqstor_size(unsigned long ringbytes) ringbytes / NETVSC_MIN_IN_MSG_SIZE; } +#define NETVSC_MAX_XFER_PAGE_RANGES 375 #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \ (offsetof(struct vmtransfer_page_packet_header, ranges) + \ (rng_cnt) * sizeof(struct vmtransfer_page_range)) +#define NETVSC_MAX_PKT_SIZE (NETVSC_XFER_HEADER_SIZE(NETVSC_MAX_XFER_PAGE_RANGES) + \ + sizeof(struct nvsp_message) + (sizeof(u32) * VRSS_SEND_TAB_SIZE)) struct multi_send_data { struct sk_buff *skb; /* skb containing the pkt */ diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 2350342b961f..1510a236aa34 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1530,6 +1530,8 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Open the channel */ device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); + device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE; + ret = vmbus_open(device->channel, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, net_device->chan_table); diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 598713c0d5a8..7e6dee2f02a4 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1174,6 +1174,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) nvchan->channel = new_sc; new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); + new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE; + ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 2e4fa77445fd..7e59284dbf5b 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -414,6 +414,14 @@ static void storvsc_on_channel_callback(void *context); #define STORVSC_IDE_MAX_TARGETS 1 #define STORVSC_IDE_MAX_CHANNELS 1 +/* + * Upper bound on the size of a storvsc packet. vmscsi_size_delta is not + * included in the calculation because it is set after STORVSC_MAX_PKT_SIZE + * is used in storvsc_connect_to_vsp + */ +#define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\ + sizeof(struct vstor_packet)) + struct storvsc_cmd_request { struct scsi_cmnd *cmd; @@ -698,6 +706,7 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) return; memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE; /* * The size of vmbus_requestor is an upper bound on the number of requests @@ -1280,6 +1289,7 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE; /* * The size of vmbus_requestor is an upper bound on the number of requests * that can be in-progress at any one time across all channels. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 5ddb479c4d4c..fbae8406d5d4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -181,6 +181,10 @@ struct hv_ring_buffer_info { * being freed while the ring buffer is being accessed. */ struct mutex ring_buffer_mutex; + + /* Buffer that holds a copy of an incoming host packet */ + void *pkt_buffer; + u32 pkt_buffer_size; }; @@ -787,6 +791,8 @@ struct vmbus_device { bool perf_device; }; +#define VMBUS_DEFAULT_MAX_PKT_SIZE 4096 + struct vmbus_channel { struct list_head listentry; @@ -1008,6 +1014,9 @@ struct vmbus_channel { /* request/transaction ids for VMBus */ struct vmbus_requestor requestor; u32 rqstor_size; + + /* The max size of a packet on this channel */ + u32 max_pkt_size; }; u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr); @@ -1642,32 +1651,55 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc) } +struct vmpacket_descriptor * +hv_pkt_iter_first_raw(struct vmbus_channel *channel); + struct vmpacket_descriptor * hv_pkt_iter_first(struct vmbus_channel *channel); struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt); + const struct vmpacket_descriptor *pkt, + bool copy); void hv_pkt_iter_close(struct vmbus_channel *channel); -/* - * Get next packet descriptor from iterator - * If at end of list, return NULL and update host. - */ static inline struct vmpacket_descriptor * -hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt) +hv_pkt_iter_next_pkt(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt, + bool copy) { struct vmpacket_descriptor *nxt; - nxt = __hv_pkt_iter_next(channel, pkt); + nxt = __hv_pkt_iter_next(channel, pkt, copy); if (!nxt) hv_pkt_iter_close(channel); return nxt; } +/* + * Get next packet descriptor without copying it out of the ring buffer + * If at end of list, return NULL and update host. + */ +static inline struct vmpacket_descriptor * +hv_pkt_iter_next_raw(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt) +{ + return hv_pkt_iter_next_pkt(channel, pkt, false); +} + +/* + * Get next packet descriptor from iterator + * If at end of list, return NULL and update host. + */ +static inline struct vmpacket_descriptor * +hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt) +{ + return hv_pkt_iter_next_pkt(channel, pkt, true); +} + #define foreach_vmbus_pkt(pkt, channel) \ for (pkt = hv_pkt_iter_first(channel); pkt; \ pkt = hv_pkt_iter_next(channel, pkt)) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 630b851f8150..cd8b7c1ca9f1 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -600,7 +600,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, return -EOPNOTSUPP; if (need_refill) { - hvs->recv_desc = hv_pkt_iter_first(hvs->chan); + hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan); ret = hvs_update_recv_data(hvs); if (ret) return ret; @@ -614,7 +614,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, hvs->recv_data_len -= to_read; if (hvs->recv_data_len == 0) { - hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc); + hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc); if (hvs->recv_desc) { ret = hvs_update_recv_data(hvs); if (ret) -- cgit v1.2.3 From 06caa778d8b2fbcb4ac3878751e39d116424ba9b Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Mon, 9 Nov 2020 11:07:04 +0100 Subject: hv_utils: Add validation for untrusted Hyper-V values For additional robustness in the face of Hyper-V errors or malicious behavior, validate all values that originate from packets that Hyper-V has sent to the guest in the host-to-guest ring buffer. Ensure that invalid values cannot cause indexing off the end of the icversion_data array in vmbus_prep_negotiate_resp(). Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201109100704.9152-1-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 24 +++-- drivers/hv/hv_fcopy.c | 36 ++++++-- drivers/hv/hv_kvp.c | 122 ++++++++++++++----------- drivers/hv/hv_snapshot.c | 89 +++++++++++-------- drivers/hv/hv_util.c | 222 ++++++++++++++++++++++++++++------------------ include/linux/hyperv.h | 9 +- 6 files changed, 314 insertions(+), 188 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 1d44bb635bb8..5bc5eef5da15 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -190,6 +190,7 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel) * vmbus_prep_negotiate_resp() - Create default response for Negotiate message * @icmsghdrp: Pointer to msg header structure * @buf: Raw buffer channel data + * @buflen: Length of the raw buffer channel data. * @fw_version: The framework versions we can support. * @fw_vercnt: The size of @fw_version. * @srv_version: The service versions we can support. @@ -202,8 +203,8 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel) * Set up and fill in default negotiate response message. * Mainly used by Hyper-V drivers. */ -bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, - u8 *buf, const int *fw_version, int fw_vercnt, +bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, + u32 buflen, const int *fw_version, int fw_vercnt, const int *srv_version, int srv_vercnt, int *nego_fw_version, int *nego_srv_version) { @@ -215,10 +216,14 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, bool found_match = false; struct icmsg_negotiate *negop; + /* Check that there's enough space for icframe_vercnt, icmsg_vercnt */ + if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) { + pr_err_ratelimited("Invalid icmsg negotiate\n"); + return false; + } + icmsghdrp->icmsgsize = 0x10; - negop = (struct icmsg_negotiate *)&buf[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR]; icframe_major = negop->icframe_vercnt; icframe_minor = 0; @@ -226,6 +231,15 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, icmsg_major = negop->icmsg_vercnt; icmsg_minor = 0; + /* Validate negop packet */ + if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT || + icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT || + ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) { + pr_err_ratelimited("Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n", + icframe_major, icmsg_major); + goto fw_error; + } + /* * Select the framework version number we will * support. diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index 85784a9e6a36..660036da7449 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -235,15 +235,27 @@ void hv_fcopy_onchannelcallback(void *context) if (fcopy_transaction.state > HVUTIL_READY) return; - vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, - &requestid); - if (recvlen <= 0) + if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { + pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n"); return; + } + + if (!recvlen) + return; + + /* Ensure recvlen is big enough to read header data */ + if (recvlen < ICMSG_HDR) { + pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n", + recvlen); + return; + } icmsghdr = (struct icmsg_hdr *)&recv_buffer[ sizeof(struct vmbuspipe_hdr)]; + if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdr, recv_buffer, + if (vmbus_prep_negotiate_resp(icmsghdr, + recv_buffer, recvlen, fw_versions, FW_VER_COUNT, fcopy_versions, FCOPY_VER_COUNT, NULL, &fcopy_srv_version)) { @@ -252,10 +264,14 @@ void hv_fcopy_onchannelcallback(void *context) fcopy_srv_version >> 16, fcopy_srv_version & 0xFFFF); } - } else { - fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { + /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ + if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { + pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n", + recvlen); + return; + } + fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR]; /* * Stash away this global state for completing the @@ -280,6 +296,10 @@ void hv_fcopy_onchannelcallback(void *context) schedule_delayed_work(&fcopy_timeout_work, HV_UTIL_TIMEOUT * HZ); return; + } else { + pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n", + icmsghdr->icmsgtype); + return; } icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 7d2a7b918847..c698592b83e4 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -662,71 +662,87 @@ void hv_kvp_onchannelcallback(void *context) if (kvp_transaction.state > HVUTIL_READY) return; - vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen, - &requestid); - - if (recvlen > 0) { - icmsghdrp = (struct icmsg_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr)]; - - if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdrp, - recv_buffer, fw_versions, FW_VER_COUNT, - kvp_versions, KVP_VER_COUNT, - NULL, &kvp_srv_version)) { - pr_info("KVP IC version %d.%d\n", - kvp_srv_version >> 16, - kvp_srv_version & 0xFFFF); - } - } else { - kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen, &requestid)) { + pr_err_ratelimited("KVP request received. Could not read into recv buf\n"); + return; + } - /* - * Stash away this global state for completing the - * transaction; note transactions are serialized. - */ + if (!recvlen) + return; - kvp_transaction.recv_len = recvlen; - kvp_transaction.recv_req_id = requestid; - kvp_transaction.kvp_msg = kvp_msg; + /* Ensure recvlen is big enough to read header data */ + if (recvlen < ICMSG_HDR) { + pr_err_ratelimited("KVP request received. Packet length too small: %d\n", + recvlen); + return; + } - if (kvp_transaction.state < HVUTIL_READY) { - /* Userspace is not registered yet */ - kvp_respond_to_host(NULL, HV_E_FAIL); - return; - } - kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED; + icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)]; + + if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { + if (vmbus_prep_negotiate_resp(icmsghdrp, + recv_buffer, recvlen, + fw_versions, FW_VER_COUNT, + kvp_versions, KVP_VER_COUNT, + NULL, &kvp_srv_version)) { + pr_info("KVP IC version %d.%d\n", + kvp_srv_version >> 16, + kvp_srv_version & 0xFFFF); + } + } else if (icmsghdrp->icmsgtype == ICMSGTYPE_KVPEXCHANGE) { + /* + * recvlen is not checked against sizeof(struct kvp_msg) because kvp_msg contains + * a union of structs and the msg type received is not known. Code using this + * struct should provide validation when accessing its fields. + */ + kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ICMSG_HDR]; - /* - * Get the information from the - * user-mode component. - * component. This transaction will be - * completed when we get the value from - * the user-mode component. - * Set a timeout to deal with - * user-mode not responding. - */ - schedule_work(&kvp_sendkey_work); - schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + /* + * Stash away this global state for completing the + * transaction; note transactions are serialized. + */ - return; + kvp_transaction.recv_len = recvlen; + kvp_transaction.recv_req_id = requestid; + kvp_transaction.kvp_msg = kvp_msg; + if (kvp_transaction.state < HVUTIL_READY) { + /* Userspace is not registered yet */ + kvp_respond_to_host(NULL, HV_E_FAIL); + return; } + kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED; - icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION - | ICMSGHDRFLAG_RESPONSE; + /* + * Get the information from the + * user-mode component. + * component. This transaction will be + * completed when we get the value from + * the user-mode component. + * Set a timeout to deal with + * user-mode not responding. + */ + schedule_work(&kvp_sendkey_work); + schedule_delayed_work(&kvp_timeout_work, + HV_UTIL_TIMEOUT * HZ); - vmbus_sendpacket(channel, recv_buffer, - recvlen, requestid, - VM_PKT_DATA_INBAND, 0); + return; - host_negotiatied = NEGO_FINISHED; - hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper); + } else { + pr_err_ratelimited("KVP request received. Invalid msg type: %d\n", + icmsghdrp->icmsgtype); + return; } + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION + | ICMSGHDRFLAG_RESPONSE; + + vmbus_sendpacket(channel, recv_buffer, + recvlen, requestid, + VM_PKT_DATA_INBAND, 0); + + host_negotiatied = NEGO_FINISHED; + hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper); } static void kvp_on_reset(void) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 783779e4cc1a..2267bd4c3472 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -298,49 +298,64 @@ void hv_vss_onchannelcallback(void *context) if (vss_transaction.state > HVUTIL_READY) return; - vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, - &requestid); - - if (recvlen > 0) { - icmsghdrp = (struct icmsg_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr)]; - - if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdrp, - recv_buffer, fw_versions, FW_VER_COUNT, - vss_versions, VSS_VER_COUNT, - NULL, &vss_srv_version)) { - - pr_info("VSS IC version %d.%d\n", - vss_srv_version >> 16, - vss_srv_version & 0xFFFF); - } - } else { - vss_msg = (struct hv_vss_msg *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; - - /* - * Stash away this global state for completing the - * transaction; note transactions are serialized. - */ - - vss_transaction.recv_len = recvlen; - vss_transaction.recv_req_id = requestid; - vss_transaction.msg = (struct hv_vss_msg *)vss_msg; - - schedule_work(&vss_handle_request_work); + if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { + pr_err_ratelimited("VSS request received. Could not read into recv buf\n"); + return; + } + + if (!recvlen) + return; + + /* Ensure recvlen is big enough to read header data */ + if (recvlen < ICMSG_HDR) { + pr_err_ratelimited("VSS request received. Packet length too small: %d\n", + recvlen); + return; + } + + icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)]; + + if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { + if (vmbus_prep_negotiate_resp(icmsghdrp, + recv_buffer, recvlen, + fw_versions, FW_VER_COUNT, + vss_versions, VSS_VER_COUNT, + NULL, &vss_srv_version)) { + + pr_info("VSS IC version %d.%d\n", + vss_srv_version >> 16, + vss_srv_version & 0xFFFF); + } + } else if (icmsghdrp->icmsgtype == ICMSGTYPE_VSS) { + /* Ensure recvlen is big enough to contain hv_vss_msg */ + if (recvlen < ICMSG_HDR + sizeof(struct hv_vss_msg)) { + pr_err_ratelimited("Invalid VSS msg. Packet length too small: %u\n", + recvlen); return; } + vss_msg = (struct hv_vss_msg *)&recv_buffer[ICMSG_HDR]; + + /* + * Stash away this global state for completing the + * transaction; note transactions are serialized. + */ - icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION - | ICMSGHDRFLAG_RESPONSE; + vss_transaction.recv_len = recvlen; + vss_transaction.recv_req_id = requestid; + vss_transaction.msg = (struct hv_vss_msg *)vss_msg; - vmbus_sendpacket(channel, recv_buffer, - recvlen, requestid, - VM_PKT_DATA_INBAND, 0); + schedule_work(&vss_handle_request_work); + return; + } else { + pr_err_ratelimited("VSS request received. Invalid msg type: %d\n", + icmsghdrp->icmsgtype); + return; } + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | + ICMSGHDRFLAG_RESPONSE; + vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, + VM_PKT_DATA_INBAND, 0); } static void vss_on_reset(void) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 05566ecdbe4b..34f3e789cc9a 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -195,73 +195,88 @@ static void shutdown_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; - vmbus_recvpacket(channel, shut_txf_buf, - HV_HYP_PAGE_SIZE, &recvlen, &requestid); + if (vmbus_recvpacket(channel, shut_txf_buf, HV_HYP_PAGE_SIZE, &recvlen, &requestid)) { + pr_err_ratelimited("Shutdown request received. Could not read into shut txf buf\n"); + return; + } - if (recvlen > 0) { - icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[ - sizeof(struct vmbuspipe_hdr)]; + if (!recvlen) + return; - if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdrp, shut_txf_buf, - fw_versions, FW_VER_COUNT, - sd_versions, SD_VER_COUNT, - NULL, &sd_srv_version)) { - pr_info("Shutdown IC version %d.%d\n", - sd_srv_version >> 16, - sd_srv_version & 0xFFFF); - } - } else { - shutdown_msg = - (struct shutdown_msg_data *)&shut_txf_buf[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + /* Ensure recvlen is big enough to read header data */ + if (recvlen < ICMSG_HDR) { + pr_err_ratelimited("Shutdown request received. Packet length too small: %d\n", + recvlen); + return; + } - /* - * shutdown_msg->flags can be 0(shut down), 2(reboot), - * or 4(hibernate). It may bitwise-OR 1, which means - * performing the request by force. Linux always tries - * to perform the request by force. - */ - switch (shutdown_msg->flags) { - case 0: - case 1: - icmsghdrp->status = HV_S_OK; - work = &shutdown_work; - pr_info("Shutdown request received -" - " graceful shutdown initiated\n"); - break; - case 2: - case 3: - icmsghdrp->status = HV_S_OK; - work = &restart_work; - pr_info("Restart request received -" - " graceful restart initiated\n"); - break; - case 4: - case 5: - pr_info("Hibernation request received\n"); - icmsghdrp->status = hibernation_supported ? - HV_S_OK : HV_E_FAIL; - if (hibernation_supported) - work = &hibernate_context.work; - break; - default: - icmsghdrp->status = HV_E_FAIL; - pr_info("Shutdown request received -" - " Invalid request\n"); - break; - } + icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[sizeof(struct vmbuspipe_hdr)]; + + if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { + if (vmbus_prep_negotiate_resp(icmsghdrp, + shut_txf_buf, recvlen, + fw_versions, FW_VER_COUNT, + sd_versions, SD_VER_COUNT, + NULL, &sd_srv_version)) { + pr_info("Shutdown IC version %d.%d\n", + sd_srv_version >> 16, + sd_srv_version & 0xFFFF); + } + } else if (icmsghdrp->icmsgtype == ICMSGTYPE_SHUTDOWN) { + /* Ensure recvlen is big enough to contain shutdown_msg_data struct */ + if (recvlen < ICMSG_HDR + sizeof(struct shutdown_msg_data)) { + pr_err_ratelimited("Invalid shutdown msg data. Packet length too small: %u\n", + recvlen); + return; } - icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION - | ICMSGHDRFLAG_RESPONSE; - - vmbus_sendpacket(channel, shut_txf_buf, - recvlen, requestid, - VM_PKT_DATA_INBAND, 0); + shutdown_msg = (struct shutdown_msg_data *)&shut_txf_buf[ICMSG_HDR]; + + /* + * shutdown_msg->flags can be 0(shut down), 2(reboot), + * or 4(hibernate). It may bitwise-OR 1, which means + * performing the request by force. Linux always tries + * to perform the request by force. + */ + switch (shutdown_msg->flags) { + case 0: + case 1: + icmsghdrp->status = HV_S_OK; + work = &shutdown_work; + pr_info("Shutdown request received - graceful shutdown initiated\n"); + break; + case 2: + case 3: + icmsghdrp->status = HV_S_OK; + work = &restart_work; + pr_info("Restart request received - graceful restart initiated\n"); + break; + case 4: + case 5: + pr_info("Hibernation request received\n"); + icmsghdrp->status = hibernation_supported ? + HV_S_OK : HV_E_FAIL; + if (hibernation_supported) + work = &hibernate_context.work; + break; + default: + icmsghdrp->status = HV_E_FAIL; + pr_info("Shutdown request received - Invalid request\n"); + break; + } + } else { + icmsghdrp->status = HV_E_FAIL; + pr_err_ratelimited("Shutdown request received. Invalid msg type: %d\n", + icmsghdrp->icmsgtype); } + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION + | ICMSGHDRFLAG_RESPONSE; + + vmbus_sendpacket(channel, shut_txf_buf, + recvlen, requestid, + VM_PKT_DATA_INBAND, 0); + if (work) schedule_work(work); } @@ -396,19 +411,27 @@ static void timesync_onchannelcallback(void *context) HV_HYP_PAGE_SIZE, &recvlen, &requestid); if (ret) { - pr_warn_once("TimeSync IC pkt recv failed (Err: %d)\n", - ret); + pr_err_ratelimited("TimeSync IC pkt recv failed (Err: %d)\n", + ret); break; } if (!recvlen) break; + /* Ensure recvlen is big enough to read header data */ + if (recvlen < ICMSG_HDR) { + pr_err_ratelimited("Timesync request received. Packet length too small: %d\n", + recvlen); + break; + } + icmsghdrp = (struct icmsg_hdr *)&time_txf_buf[ sizeof(struct vmbuspipe_hdr)]; if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdrp, time_txf_buf, + if (vmbus_prep_negotiate_resp(icmsghdrp, + time_txf_buf, recvlen, fw_versions, FW_VER_COUNT, ts_versions, TS_VER_COUNT, NULL, &ts_srv_version)) { @@ -416,33 +439,44 @@ static void timesync_onchannelcallback(void *context) ts_srv_version >> 16, ts_srv_version & 0xFFFF); } - } else { + } else if (icmsghdrp->icmsgtype == ICMSGTYPE_TIMESYNC) { if (ts_srv_version > TS_VERSION_3) { - refdata = (struct ictimesync_ref_data *) - &time_txf_buf[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + /* Ensure recvlen is big enough to read ictimesync_ref_data */ + if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_ref_data)) { + pr_err_ratelimited("Invalid ictimesync ref data. Length too small: %u\n", + recvlen); + break; + } + refdata = (struct ictimesync_ref_data *)&time_txf_buf[ICMSG_HDR]; adj_guesttime(refdata->parenttime, refdata->vmreferencetime, refdata->flags); } else { - timedatap = (struct ictimesync_data *) - &time_txf_buf[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + /* Ensure recvlen is big enough to read ictimesync_data */ + if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_data)) { + pr_err_ratelimited("Invalid ictimesync data. Length too small: %u\n", + recvlen); + break; + } + timedatap = (struct ictimesync_data *)&time_txf_buf[ICMSG_HDR]; + adj_guesttime(timedatap->parenttime, hv_read_reference_counter(), timedatap->flags); } + } else { + icmsghdrp->status = HV_E_FAIL; + pr_err_ratelimited("Timesync request received. Invalid msg type: %d\n", + icmsghdrp->icmsgtype); } icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; vmbus_sendpacket(channel, time_txf_buf, - recvlen, requestid, - VM_PKT_DATA_INBAND, 0); + recvlen, requestid, + VM_PKT_DATA_INBAND, 0); } } @@ -462,18 +496,28 @@ static void heartbeat_onchannelcallback(void *context) while (1) { - vmbus_recvpacket(channel, hbeat_txf_buf, - HV_HYP_PAGE_SIZE, &recvlen, &requestid); + if (vmbus_recvpacket(channel, hbeat_txf_buf, HV_HYP_PAGE_SIZE, + &recvlen, &requestid)) { + pr_err_ratelimited("Heartbeat request received. Could not read into hbeat txf buf\n"); + return; + } if (!recvlen) break; + /* Ensure recvlen is big enough to read header data */ + if (recvlen < ICMSG_HDR) { + pr_err_ratelimited("Hearbeat request received. Packet length too small: %d\n", + recvlen); + break; + } + icmsghdrp = (struct icmsg_hdr *)&hbeat_txf_buf[ sizeof(struct vmbuspipe_hdr)]; if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { if (vmbus_prep_negotiate_resp(icmsghdrp, - hbeat_txf_buf, + hbeat_txf_buf, recvlen, fw_versions, FW_VER_COUNT, hb_versions, HB_VER_COUNT, NULL, &hb_srv_version)) { @@ -482,21 +526,31 @@ static void heartbeat_onchannelcallback(void *context) hb_srv_version >> 16, hb_srv_version & 0xFFFF); } - } else { - heartbeat_msg = - (struct heartbeat_msg_data *)&hbeat_txf_buf[ - sizeof(struct vmbuspipe_hdr) + - sizeof(struct icmsg_hdr)]; + } else if (icmsghdrp->icmsgtype == ICMSGTYPE_HEARTBEAT) { + /* + * Ensure recvlen is big enough to read seq_num. Reserved area is not + * included in the check as the host may not fill it up entirely + */ + if (recvlen < ICMSG_HDR + sizeof(u64)) { + pr_err_ratelimited("Invalid heartbeat msg data. Length too small: %u\n", + recvlen); + break; + } + heartbeat_msg = (struct heartbeat_msg_data *)&hbeat_txf_buf[ICMSG_HDR]; heartbeat_msg->seq_num += 1; + } else { + icmsghdrp->status = HV_E_FAIL; + pr_err_ratelimited("Heartbeat request received. Invalid msg type: %d\n", + icmsghdrp->icmsgtype); } icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; vmbus_sendpacket(channel, hbeat_txf_buf, - recvlen, requestid, - VM_PKT_DATA_INBAND, 0); + recvlen, requestid, + VM_PKT_DATA_INBAND, 0); } } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index fbae8406d5d4..2ea967bc17ad 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1480,6 +1480,7 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size); #define ICMSGTYPE_SHUTDOWN 3 #define ICMSGTYPE_TIMESYNC 4 #define ICMSGTYPE_VSS 5 +#define ICMSGTYPE_FCOPY 7 #define ICMSGHDRFLAG_TRANSACTION 1 #define ICMSGHDRFLAG_REQUEST 2 @@ -1523,6 +1524,12 @@ struct icmsg_hdr { u8 reserved[2]; } __packed; +#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100 +#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr)) +#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \ + (ICMSG_HDR + offsetof(struct icmsg_negotiate, icversion_data) + \ + (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version))) + struct icmsg_negotiate { u16 icframe_vercnt; u16 icmsg_vercnt; @@ -1578,7 +1585,7 @@ struct hyperv_service_callback { }; #define MAX_SRV_VER 0x7ffffff -extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, +extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen, const int *fw_version, int fw_vercnt, const int *srv_version, int srv_vercnt, int *nego_fw_version, int *nego_srv_version); -- cgit v1.2.3 From e99c4afbee07e9323e9191a20b24d74dbf815bdf Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 9 Dec 2020 08:08:22 +0100 Subject: Drivers: hv: vmbus: Initialize memory to be sent to the host __vmbus_open() and vmbus_teardown_gpadl() do not inizialite the memory for the vmbus_channel_open_channel and the vmbus_channel_gpadl_teardown objects they allocate respectively. These objects contain padding bytes and fields that are left uninitialized and that are later sent to the host, potentially leaking guest data. Zero initialize such fields to avoid leaking sensitive information to the host. Reported-by: Juan Vazquez Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201209070827.29335-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 0d63862d6551..9aa789e5f22b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -621,7 +621,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel, goto error_clean_ring; /* Create and init the channel open message */ - open_info = kmalloc(sizeof(*open_info) + + open_info = kzalloc(sizeof(*open_info) + sizeof(struct vmbus_channel_open_channel), GFP_KERNEL); if (!open_info) { @@ -748,7 +748,7 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle) unsigned long flags; int ret; - info = kmalloc(sizeof(*info) + + info = kzalloc(sizeof(*info) + sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL); if (!info) return -ENOMEM; -- cgit v1.2.3 From 9c400d3548c39378327268fb18112b229f91b220 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 9 Dec 2020 08:08:23 +0100 Subject: Drivers: hv: vmbus: Reduce number of references to message in vmbus_on_msg_dpc() Simplify the function by removing various references to the hv_message 'msg', introduce local variables 'msgtype' and 'payload_size'. Suggested-by: Juan Vazquez Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201209070827.29335-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index d491fdcee61f..9749cbce33d4 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1057,9 +1057,11 @@ void vmbus_on_msg_dpc(unsigned long data) struct hv_message *msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; struct vmbus_channel_message_header *hdr; + enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; struct onmessage_work_context *ctx; u32 message_type = msg->header.message_type; + __u8 payload_size; /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as @@ -1073,40 +1075,38 @@ void vmbus_on_msg_dpc(unsigned long data) return; hdr = (struct vmbus_channel_message_header *)msg->u.payload; + msgtype = hdr->msgtype; trace_vmbus_on_msg_dpc(hdr); - if (hdr->msgtype >= CHANNELMSG_COUNT) { - WARN_ONCE(1, "unknown msgtype=%d\n", hdr->msgtype); + if (msgtype >= CHANNELMSG_COUNT) { + WARN_ONCE(1, "unknown msgtype=%d\n", msgtype); goto msg_handled; } - if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) { - WARN_ONCE(1, "payload size is too large (%d)\n", - msg->header.payload_size); + payload_size = msg->header.payload_size; + if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) { + WARN_ONCE(1, "payload size is too large (%d)\n", payload_size); goto msg_handled; } - entry = &channel_message_table[hdr->msgtype]; + entry = &channel_message_table[msgtype]; if (!entry->message_handler) goto msg_handled; - if (msg->header.payload_size < entry->min_payload_len) { - WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", - hdr->msgtype, msg->header.payload_size); + if (payload_size < entry->min_payload_len) { + WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", msgtype, payload_size); goto msg_handled; } if (entry->handler_type == VMHT_BLOCKING) { - ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size, - GFP_ATOMIC); + ctx = kmalloc(sizeof(*ctx) + payload_size, GFP_ATOMIC); if (ctx == NULL) return; INIT_WORK(&ctx->work, vmbus_onmessage_work); - memcpy(&ctx->msg, msg, sizeof(msg->header) + - msg->header.payload_size); + memcpy(&ctx->msg, msg, sizeof(msg->header) + payload_size); /* * The host can generate a rescind message while we @@ -1115,7 +1115,7 @@ void vmbus_on_msg_dpc(unsigned long data) * by offer_in_progress and by channel_mutex. See also the * inline comments in vmbus_onoffer_rescind(). */ - switch (hdr->msgtype) { + switch (msgtype) { case CHANNELMSG_RESCIND_CHANNELOFFER: /* * If we are handling the rescind message; -- cgit v1.2.3 From fe8c1b18a27de4d6ca5d99b3ffb3125dc69a5b76 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 9 Dec 2020 08:08:24 +0100 Subject: Drivers: hv: vmbus: Copy the hv_message in vmbus_on_msg_dpc() Since the message is in memory shared with the host, an erroneous or a malicious Hyper-V could 'corrupt' the message while vmbus_on_msg_dpc() or individual message handlers are executing. To prevent it, copy the message into private memory. Reported-by: Juan Vazquez Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201209070827.29335-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 9749cbce33d4..4b0f2065a4bd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1054,14 +1054,14 @@ void vmbus_on_msg_dpc(unsigned long data) { struct hv_per_cpu_context *hv_cpu = (void *)data; void *page_addr = hv_cpu->synic_message_page; - struct hv_message *msg = (struct hv_message *)page_addr + + struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; struct vmbus_channel_message_header *hdr; enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; struct onmessage_work_context *ctx; - u32 message_type = msg->header.message_type; __u8 payload_size; + u32 message_type; /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as @@ -1070,11 +1070,20 @@ void vmbus_on_msg_dpc(unsigned long data) */ BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32)); + /* + * Since the message is in memory shared with the host, an erroneous or + * malicious Hyper-V could modify the message while vmbus_on_msg_dpc() + * or individual message handlers are executing; to prevent this, copy + * the message into private memory. + */ + memcpy(&msg_copy, msg, sizeof(struct hv_message)); + + message_type = msg_copy.header.message_type; if (message_type == HVMSG_NONE) /* no msg */ return; - hdr = (struct vmbus_channel_message_header *)msg->u.payload; + hdr = (struct vmbus_channel_message_header *)msg_copy.u.payload; msgtype = hdr->msgtype; trace_vmbus_on_msg_dpc(hdr); @@ -1084,7 +1093,7 @@ void vmbus_on_msg_dpc(unsigned long data) goto msg_handled; } - payload_size = msg->header.payload_size; + payload_size = msg_copy.header.payload_size; if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) { WARN_ONCE(1, "payload size is too large (%d)\n", payload_size); goto msg_handled; @@ -1106,7 +1115,7 @@ void vmbus_on_msg_dpc(unsigned long data) return; INIT_WORK(&ctx->work, vmbus_onmessage_work); - memcpy(&ctx->msg, msg, sizeof(msg->header) + payload_size); + memcpy(&ctx->msg, &msg_copy, sizeof(msg->header) + payload_size); /* * The host can generate a rescind message while we -- cgit v1.2.3 From e3fa4b747f085d2cda09bba0533b86fa76038635 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 9 Dec 2020 08:08:25 +0100 Subject: Drivers: hv: vmbus: Avoid use-after-free in vmbus_onoffer_rescind() When channel->device_obj is non-NULL, vmbus_onoffer_rescind() could invoke put_device(), that will eventually release the device and free the channel object (cf. vmbus_device_release()). However, a pointer to the object is dereferenced again later to load the primary_channel. The use-after-free can be avoided by noticing that this load/check is redundant if device_obj is non-NULL: primary_channel must be NULL if device_obj is non-NULL, cf. vmbus_add_channel_work(). Fixes: 54a66265d6754b ("Drivers: hv: vmbus: Fix rescind handling") Reported-by: Juan Vazquez Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201209070827.29335-5-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 5bc5eef5da15..4072fd1f2214 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1116,8 +1116,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) vmbus_device_unregister(channel->device_obj); put_device(dev); } - } - if (channel->primary_channel != NULL) { + } else if (channel->primary_channel != NULL) { /* * Sub-channel is being rescinded. Following is the channel * close sequence when initiated from the driveri (refer to -- cgit v1.2.3 From e4d221b42354b2e2ddb9187a806afb651eee2cda Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 9 Dec 2020 08:08:26 +0100 Subject: Drivers: hv: vmbus: Resolve race condition in vmbus_onoffer_rescind() An erroneous or malicious host could send multiple rescind messages for a same channel. In vmbus_onoffer_rescind(), the guest maps the channel ID to obtain a pointer to the channel object and it eventually releases such object and associated data. The host could time rescind messages and lead to an use-after-free. Add a new flag to the channel structure to make sure that only one instance of vmbus_onoffer_rescind() can get the reference to the channel object. Reported-by: Juan Vazquez Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201209070827.29335-6-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 12 ++++++++++++ include/linux/hyperv.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 4072fd1f2214..68950a1e4b63 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1063,6 +1063,18 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) mutex_lock(&vmbus_connection.channel_mutex); channel = relid2channel(rescind->child_relid); + if (channel != NULL) { + /* + * Guarantee that no other instance of vmbus_onoffer_rescind() + * has got a reference to the channel object. Synchronize on + * &vmbus_connection.channel_mutex. + */ + if (channel->rescind_ref) { + mutex_unlock(&vmbus_connection.channel_mutex); + return; + } + channel->rescind_ref = true; + } mutex_unlock(&vmbus_connection.channel_mutex); if (channel == NULL) { diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2ea967bc17ad..f0d48a368f13 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -809,6 +809,7 @@ struct vmbus_channel { u8 monitor_bit; bool rescind; /* got rescind msg */ + bool rescind_ref; /* got rescind msg, got channel reference */ struct completion rescind_event; u32 ringbuffer_gpadlhandle; -- cgit v1.2.3 From c068e3f484268458defea3e1a19d821017f88d26 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 6 Jan 2021 17:45:52 -0800 Subject: Drivers: hv: vmbus: Add /sys/bus/vmbus/hibernation When a Linux VM runs on Hyper-V, if the host toolstack doesn't support hibernation for the VM (this happens on old Hyper-V hosts like Windows Server 2016, or new Hyper-V hosts if the admin or user doesn't declare the hibernation intent for the VM), the VM is discouraged from trying hibernation (because the host doesn't guarantee that the VM's virtual hardware configuration will remain exactly the same across hibernation), i.e. the VM should not try to set up the swap partition/file for hibernation, etc. x86 Hyper-V uses the presence of the virtual ACPI S4 state as the indication of the host toolstack support for a VM. Currently there is no easy and reliable way for the userspace to detect the presence of the state (see https://lkml.org/lkml/2020/12/11/1097). Add /sys/bus/vmbus/hibernation for this purpose. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210107014552.14234-1-decui@microsoft.com Signed-off-by: Wei Liu --- Documentation/ABI/stable/sysfs-bus-vmbus | 7 +++++++ drivers/hv/vmbus_drv.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-bus-vmbus b/Documentation/ABI/stable/sysfs-bus-vmbus index c27b7b89477c..42599d9fa161 100644 --- a/Documentation/ABI/stable/sysfs-bus-vmbus +++ b/Documentation/ABI/stable/sysfs-bus-vmbus @@ -1,3 +1,10 @@ +What: /sys/bus/vmbus/hibernation +Date: Jan 2021 +KernelVersion: 5.12 +Contact: Dexuan Cui +Description: Whether the host supports hibernation for the VM. +Users: Daemon that sets up swap partition/file for hibernation. + What: /sys/bus/vmbus/devices//id Date: Jul 2009 KernelVersion: 2.6.31 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 4b0f2065a4bd..65535c32662d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -678,6 +678,23 @@ static const struct attribute_group vmbus_dev_group = { }; __ATTRIBUTE_GROUPS(vmbus_dev); +/* Set up the attribute for /sys/bus/vmbus/hibernation */ +static ssize_t hibernation_show(struct bus_type *bus, char *buf) +{ + return sprintf(buf, "%d\n", !!hv_is_hibernation_supported()); +} + +static BUS_ATTR_RO(hibernation); + +static struct attribute *vmbus_bus_attrs[] = { + &bus_attr_hibernation.attr, + NULL, +}; +static const struct attribute_group vmbus_bus_group = { + .attrs = vmbus_bus_attrs, +}; +__ATTRIBUTE_GROUPS(vmbus_bus); + /* * vmbus_uevent - add uevent for our device * @@ -1024,6 +1041,7 @@ static struct bus_type hv_bus = { .uevent = vmbus_uevent, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, + .bus_groups = vmbus_bus_groups, .pm = &vmbus_pm, }; -- cgit v1.2.3 From bdb49526d25b076af4bd31b2fc66986ff0df1127 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 27 Jan 2021 23:31:36 +0000 Subject: hv_utils: Fix spelling mistake "Hearbeat" -> "Heartbeat" There is a spelling mistake in an error message. Fix it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20210127233136.623465-1-colin.king@canonical.com Signed-off-by: Wei Liu --- drivers/hv/hv_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 34f3e789cc9a..e4aefeb330da 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -507,7 +507,7 @@ static void heartbeat_onchannelcallback(void *context) /* Ensure recvlen is big enough to read header data */ if (recvlen < ICMSG_HDR) { - pr_err_ratelimited("Hearbeat request received. Packet length too small: %d\n", + pr_err_ratelimited("Heartbeat request received. Packet length too small: %d\n", recvlen); break; } -- cgit v1.2.3 From a6c76bb08dc7f7ff2b1c381002eb6c7211746182 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 1 Feb 2021 15:48:11 +0100 Subject: x86/hyperv: Load/save the Isolation Configuration leaf If bit 22 of Group B Features is set, the guest has access to the Isolation Configuration CPUID leaf. On x86, the first four bits of EAX in this leaf provide the isolation type of the partition; we entail three isolation types: 'SNP' (hardware-based isolation), 'VBS' (software-based isolation), and 'NONE' (no isolation). Signed-off-by: Andrea Parri (Microsoft) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Arnd Bergmann Cc: x86@kernel.org Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/r/20210201144814.2701-2-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 15 +++++++++++++++ arch/x86/include/asm/hyperv-tlfs.h | 15 +++++++++++++++ arch/x86/kernel/cpu/mshyperv.c | 9 +++++++++ include/asm-generic/hyperv-tlfs.h | 1 + include/asm-generic/mshyperv.h | 5 +++++ 5 files changed, 45 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6375967a8244..6608d50d7aaa 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -555,3 +556,17 @@ bool hv_is_hibernation_supported(void) return acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); + +enum hv_isolation_type hv_get_isolation_type(void) +{ + if (!(ms_hyperv.features_b & HV_ISOLATION)) + return HV_ISOLATION_TYPE_NONE; + return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); +} +EXPORT_SYMBOL_GPL(hv_get_isolation_type); + +bool hv_is_isolation_supported(void) +{ + return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; +} +EXPORT_SYMBOL_GPL(hv_is_isolation_supported); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 6bf42aed387e..6aed936e5e96 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -22,6 +22,7 @@ #define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 #define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C #define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081 #define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */ @@ -122,6 +123,20 @@ #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) +/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */ +#define HV_PARAVISOR_PRESENT BIT(0) + +/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */ +#define HV_ISOLATION_TYPE GENMASK(3, 0) +#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) +#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) + +enum hv_isolation_type { + HV_ISOLATION_TYPE_NONE = 0, + HV_ISOLATION_TYPE_VBS = 1, + HV_ISOLATION_TYPE_SNP = 2 +}; + /* Hyper-V specific model specific registers (MSRs) */ /* MSR used to identify the guest OS. */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 43b54bef5448..fc6cb84c9fe5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -243,6 +243,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); @@ -277,6 +278,14 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } + if (ms_hyperv.features_b & HV_ISOLATION) { + ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); + + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", + ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); + } + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { ms_hyperv.nested_features = cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index e73a11850055..20d3cd950204 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -89,6 +89,7 @@ #define HV_ACCESS_STATS BIT(8) #define HV_DEBUGGING BIT(11) #define HV_CPU_POWER_MANAGEMENT BIT(12) +#define HV_ISOLATION BIT(22) /* diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c57799684170..dff58a3db5d5 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -27,11 +27,14 @@ struct ms_hyperv_info { u32 features; + u32 features_b; u32 misc_features; u32 hints; u32 nested_features; u32 max_vp_index; u32 max_lp_index; + u32 isolation_config_a; + u32 isolation_config_b; }; extern struct ms_hyperv_info ms_hyperv; @@ -169,6 +172,8 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die); void hyperv_report_panic_msg(phys_addr_t pa, size_t size); bool hv_is_hyperv_initialized(void); bool hv_is_hibernation_supported(void); +enum hv_isolation_type hv_get_isolation_type(void); +bool hv_is_isolation_supported(void); void hyperv_cleanup(void); #else /* CONFIG_HYPERV */ static inline bool hv_is_hyperv_initialized(void) { return false; } -- cgit v1.2.3 From 21a4e356d3588806307555c149b80cec3dedb180 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 1 Feb 2021 15:48:12 +0100 Subject: Drivers: hv: vmbus: Restrict vmbus_devices on isolated guests Only the VSCs or ICs that have been hardened and that are critical for the successful adoption of Confidential VMs should be allowed if the guest is running isolated. This change reduces the footprint of the code that will be exercised by Confidential VMs and hence the exposure to bugs and vulnerabilities. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210201144814.2701-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/hyperv.h | 1 + 2 files changed, 39 insertions(+) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 68950a1e4b63..f0ed730e2e4e 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -31,101 +31,118 @@ const struct vmbus_device vmbus_devs[] = { { .dev_type = HV_IDE, HV_IDE_GUID, .perf_device = true, + .allowed_in_isolated = false, }, /* SCSI */ { .dev_type = HV_SCSI, HV_SCSI_GUID, .perf_device = true, + .allowed_in_isolated = true, }, /* Fibre Channel */ { .dev_type = HV_FC, HV_SYNTHFC_GUID, .perf_device = true, + .allowed_in_isolated = false, }, /* Synthetic NIC */ { .dev_type = HV_NIC, HV_NIC_GUID, .perf_device = true, + .allowed_in_isolated = true, }, /* Network Direct */ { .dev_type = HV_ND, HV_ND_GUID, .perf_device = true, + .allowed_in_isolated = false, }, /* PCIE */ { .dev_type = HV_PCIE, HV_PCIE_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Synthetic Frame Buffer */ { .dev_type = HV_FB, HV_SYNTHVID_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Synthetic Keyboard */ { .dev_type = HV_KBD, HV_KBD_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Synthetic MOUSE */ { .dev_type = HV_MOUSE, HV_MOUSE_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* KVP */ { .dev_type = HV_KVP, HV_KVP_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Time Synch */ { .dev_type = HV_TS, HV_TS_GUID, .perf_device = false, + .allowed_in_isolated = true, }, /* Heartbeat */ { .dev_type = HV_HB, HV_HEART_BEAT_GUID, .perf_device = false, + .allowed_in_isolated = true, }, /* Shutdown */ { .dev_type = HV_SHUTDOWN, HV_SHUTDOWN_GUID, .perf_device = false, + .allowed_in_isolated = true, }, /* File copy */ { .dev_type = HV_FCOPY, HV_FCOPY_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Backup */ { .dev_type = HV_BACKUP, HV_VSS_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Dynamic Memory */ { .dev_type = HV_DM, HV_DM_GUID, .perf_device = false, + .allowed_in_isolated = false, }, /* Unknown GUID */ { .dev_type = HV_UNKNOWN, .perf_device = false, + .allowed_in_isolated = false, }, }; @@ -903,6 +920,20 @@ find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer) return channel; } +static bool vmbus_is_valid_device(const guid_t *guid) +{ + u16 i; + + if (!hv_is_isolation_supported()) + return true; + + for (i = 0; i < ARRAY_SIZE(vmbus_devs); i++) { + if (guid_equal(guid, &vmbus_devs[i].guid)) + return vmbus_devs[i].allowed_in_isolated; + } + return false; +} + /* * vmbus_onoffer - Handler for channel offers from vmbus in parent partition. * @@ -917,6 +948,13 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) trace_vmbus_onoffer(offer); + if (!vmbus_is_valid_device(&offer->offer.if_type)) { + pr_err_ratelimited("Invalid offer %d from the host supporting isolation\n", + offer->child_relid); + atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f0d48a368f13..e3426f8c12db 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -789,6 +789,7 @@ struct vmbus_device { u16 dev_type; guid_t guid; bool perf_device; + bool allowed_in_isolated; }; #define VMBUS_DEFAULT_MAX_PKT_SIZE 4096 -- cgit v1.2.3 From 7ef4b2f0d9adb73eb66e1f87f22953169c3dc7f8 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 1 Feb 2021 15:48:13 +0100 Subject: Drivers: hv: vmbus: Enforce 'VMBus version >= 5.2' on isolated guests Restrict the protocol version(s) that will be negotiated with the host to be 5.2 or greater if the guest is running isolated. This reduces the footprint of the code that will be exercised by Confidential VMs and hence the exposure to bugs and vulnerabilities. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210201144814.2701-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/connection.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 11170d9a2e1a..c83612cddb99 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -244,6 +244,13 @@ int vmbus_connect(void) break; } + if (hv_is_isolation_supported() && version < VERSION_WIN10_V5_2) { + pr_err("Invalid VMBus version %d.%d (expected >= %d.%d) from the host supporting isolation\n", + version >> 16, version & 0xFFFF, VERSION_WIN10_V5_2 >> 16, VERSION_WIN10_V5_2 & 0xFFFF); + ret = -EINVAL; + goto cleanup; + } + vmbus_proto_version = version; pr_info("Vmbus version:%d.%d\n", version >> 16, version & 0xFFFF); -- cgit v1.2.3 From 96854bbda24febe2cc9231e1f6ffbd3059dc57fc Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 1 Feb 2021 15:48:14 +0100 Subject: hv_netvsc: Restrict configurations on isolated guests Restrict the NVSP protocol version(s) that will be negotiated with the host to be NVSP_PROTOCOL_VERSION_61 or greater if the guest is running isolated. Moreover, do not advertise the SR-IOV capability and ignore NVSP_MSG_4_TYPE_SEND_VF_ASSOCIATION messages in isolated guests, which are not supposed to support SR-IOV. This reduces the footprint of the code that will be exercised by Confidential VMs and hence the exposure to bugs and vulnerabilities. Signed-off-by: Andrea Parri (Microsoft) Acked-by: Jakub Kicinski Reviewed-by: Haiyang Zhang Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210201144814.2701-5-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/net/hyperv/netvsc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 1510a236aa34..51005f2d4a82 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -22,6 +22,7 @@ #include #include +#include #include "hyperv_net.h" #include "netvsc_trace.h" @@ -544,7 +545,10 @@ static int negotiate_nvsp_ver(struct hv_device *device, init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1; if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) { - init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1; + if (hv_is_isolation_supported()) + netdev_info(ndev, "SR-IOV not advertised by guests on the host supporting isolation\n"); + else + init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1; /* Teaming bit is needed to receive link speed updates */ init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1; @@ -591,6 +595,13 @@ static int netvsc_connect_vsp(struct hv_device *device, goto cleanup; } + if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) { + netdev_err(ndev, "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n", + net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61); + ret = -EPROTO; + goto cleanup; + } + pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version); /* Send the ndis version */ @@ -1357,7 +1368,10 @@ static void netvsc_receive_inband(struct net_device *ndev, break; case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION: - netvsc_send_vf(ndev, nvmsg, msglen); + if (hv_is_isolation_supported()) + netdev_err(ndev, "Ignore VF_ASSOCIATION msg from the host supporting isolation\n"); + else + netvsc_send_vf(ndev, nvmsg, msglen); break; } } -- cgit v1.2.3 From 78785010d428f7755bf51d1c08cb2566a73dc7f5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 1 Feb 2021 11:43:34 -0600 Subject: hv: hyperv.h: Replace one-element array with flexible-array in struct icmsg_negotiate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code according to the use of a flexible-array member in struct icmsg_negotiate, instead of a one-element array. Also, this helps the ongoing efforts to enable -Warray-bounds and fix the following warnings: drivers/hv/channel_mgmt.c:315:23: warning: array subscript 1 is above array bounds of ‘struct ic_version[1]’ [-Warray-bounds] drivers/hv/channel_mgmt.c:316:23: warning: array subscript 1 is above array bounds of ‘struct ic_version[1]’ [-Warray-bounds] [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210201174334.GA171933@embeddedor Signed-off-by: Wei Liu --- include/linux/hyperv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index e3426f8c12db..9dd22af1b7f6 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1529,14 +1529,14 @@ struct icmsg_hdr { #define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100 #define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr)) #define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \ - (ICMSG_HDR + offsetof(struct icmsg_negotiate, icversion_data) + \ + (ICMSG_HDR + sizeof(struct icmsg_negotiate) + \ (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version))) struct icmsg_negotiate { u16 icframe_vercnt; u16 icmsg_vercnt; u32 reserved; - struct ic_version icversion_data[1]; /* any size array */ + struct ic_version icversion_data[]; /* any size array */ } __packed; struct shutdown_msg_data { -- cgit v1.2.3 From 8f1d14cb835672cd27f6533f22f4c73e60a30727 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:20 +0000 Subject: asm-generic/hyperv: change HV_CPU_POWER_MANAGEMENT to HV_CPU_MANAGEMENT This makes the name match Hyper-V TLFS. Signed-off-by: Wei Liu Reviewed-by: Vitaly Kuznetsov Reviewed-by: Pavel Tatashin Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-2-wei.liu@kernel.org --- include/asm-generic/hyperv-tlfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 20d3cd950204..e232ddcb0a2d 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -88,7 +88,7 @@ #define HV_CONNECT_PORT BIT(7) #define HV_ACCESS_STATS BIT(8) #define HV_DEBUGGING BIT(11) -#define HV_CPU_POWER_MANAGEMENT BIT(12) +#define HV_CPU_MANAGEMENT BIT(12) #define HV_ISOLATION BIT(22) -- cgit v1.2.3 From e997720202b363ba8000d769f114e3c2c5822227 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:21 +0000 Subject: x86/hyperv: detect if Linux is the root partition For now we can use the privilege flag to check. Stash the value to be used later. Put in a bunch of defines for future use when we want to have more fine-grained detection. Signed-off-by: Wei Liu Reviewed-by: Pavel Tatashin Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-3-wei.liu@kernel.org --- arch/x86/include/asm/hyperv-tlfs.h | 10 ++++++++++ arch/x86/include/asm/mshyperv.h | 2 ++ arch/x86/kernel/cpu/mshyperv.c | 20 ++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 6aed936e5e96..e3a6a723d661 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -21,6 +21,7 @@ #define HYPERV_CPUID_FEATURES 0x40000003 #define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 #define HYPERV_CPUID_NESTED_FEATURES 0x4000000A #define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C @@ -111,6 +112,15 @@ /* Recommend using enlightened VMCS */ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) +/* + * CPU management features identification. + * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits. + */ +#define HV_X64_START_LOGICAL_PROCESSOR BIT(0) +#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1) +#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2) +#define HV_X64_RESERVED_IDENTITY_BIT BIT(31) + /* * Virtual processor will never share a physical core with another virtual * processor, except for virtual processors that are reported as sibling SMT diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 30f76b966857..4014a1ca2fae 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -239,6 +239,8 @@ int hyperv_fill_flush_guest_mapping_list( struct hv_guest_mapping_flush_list *flush, u64 start_gfn, u64 end_gfn); +extern bool hv_root_partition; + #ifdef CONFIG_X86_64 void hv_apic_init(void); void __init hv_init_spinlocks(void); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index fc6cb84c9fe5..39a4b4509b87 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -32,6 +32,10 @@ #include #include +/* Is Linux running as the root partition? */ +bool hv_root_partition; +EXPORT_SYMBOL_GPL(hv_root_partition); + struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -256,6 +260,22 @@ static void __init ms_hyperv_init_platform(void) pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* + * Check CPU management privilege. + * + * To mirror what Windows does we should extract CPU management + * features and use the ReservedIdentityBit to detect if Linux is the + * root partition. But that requires negotiating CPU management + * interface (a process to be finalized). + * + * For now, use the privilege flag as the indicator for running as + * root. + */ + if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + hv_root_partition = true; + pr_info("Hyper-V: running as root partition\n"); + } + /* * Extract host information. */ -- cgit v1.2.3 From 7e279d78664aa91107ebff4b03eca367967f5908 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:22 +0000 Subject: Drivers: hv: vmbus: skip VMBus initialization if Linux is root There is no VMBus and the other infrastructures initialized in hv_acpi_init when Linux is running as the root partition. Signed-off-by: Wei Liu Reviewed-by: Pavel Tatashin Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-4-wei.liu@kernel.org --- drivers/hv/vmbus_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 65535c32662d..10dce9f91216 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2645,6 +2645,9 @@ static int __init hv_acpi_init(void) if (!hv_is_hyperv_initialized()) return -ENODEV; + if (hv_root_partition) + return 0; + init_completion(&probe_event); /* -- cgit v1.2.3 From 7d4163c8315729140ad99d6e1ab10dfc7a685640 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:23 +0000 Subject: clocksource/hyperv: use MSR-based access if running as root When Linux runs as the root partition, the setup required for TSC page is different. Luckily Linux also has access to the MSR based clocksource. We can just disable the TSC page clocksource if Linux is the root partition. Signed-off-by: Wei Liu Acked-by: Daniel Lezcano Reviewed-by: Pavel Tatashin Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-5-wei.liu@kernel.org --- drivers/clocksource/hyperv_timer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index ba04cb381cd3..269a691bd2c4 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -426,6 +426,9 @@ static bool __init hv_init_tsc_clocksource(void) if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) return false; + if (hv_root_partition) + return false; + hv_read_reference_counter = read_hv_clock_tsc; phys_addr = virt_to_phys(hv_get_tsc_page()); -- cgit v1.2.3 From 5d0f077e0f413b7eca827b16ea8bfc4569e3946c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:24 +0000 Subject: x86/hyperv: allocate output arg pages if required When Linux runs as the root partition, it will need to make hypercalls which return data from the hypervisor. Allocate pages for storing results when Linux runs as the root partition. Signed-off-by: Lillian Grassin-Drake Co-Developed-by: Lillian Grassin-Drake Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-6-wei.liu@kernel.org --- arch/x86/hyperv/hv_init.c | 35 ++++++++++++++++++++++++++++++----- arch/x86/include/asm/mshyperv.h | 1 + 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6608d50d7aaa..a73f4db31870 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -45,6 +45,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page); void __percpu **hyperv_pcpu_input_arg; EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); +void __percpu **hyperv_pcpu_output_arg; +EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); + u32 hv_max_vp_index; EXPORT_SYMBOL_GPL(hv_max_vp_index); @@ -77,12 +80,19 @@ static int hv_cpu_init(unsigned int cpu) void **input_arg; struct page *pg; - input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ - pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL); + pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0); if (unlikely(!pg)) return -ENOMEM; + + input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); *input_arg = page_address(pg); + if (hv_root_partition) { + void **output_arg; + + output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *output_arg = page_address(pg + 1); + } hv_get_vp_index(msr_vp_index); @@ -209,14 +219,23 @@ static int hv_cpu_die(unsigned int cpu) unsigned int new_cpu; unsigned long flags; void **input_arg; - void *input_pg = NULL; + void *pg; local_irq_save(flags); input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - input_pg = *input_arg; + pg = *input_arg; *input_arg = NULL; + + if (hv_root_partition) { + void **output_arg; + + output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *output_arg = NULL; + } + local_irq_restore(flags); - free_page((unsigned long)input_pg); + + free_pages((unsigned long)pg, hv_root_partition ? 1 : 0); if (hv_vp_assist_page && hv_vp_assist_page[cpu]) wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); @@ -369,6 +388,12 @@ void __init hyperv_init(void) BUG_ON(hyperv_pcpu_input_arg == NULL); + /* Allocate the per-CPU state for output arg for root */ + if (hv_root_partition) { + hyperv_pcpu_output_arg = alloc_percpu(void *); + BUG_ON(hyperv_pcpu_output_arg == NULL); + } + /* Allocate percpu VP index */ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), GFP_KERNEL); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 4014a1ca2fae..ef06cdac8444 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -78,6 +78,7 @@ extern int hyperv_init_cpuhp; extern void *hv_hypercall_pg; extern void __percpu **hyperv_pcpu_input_arg; +extern void __percpu **hyperv_pcpu_output_arg; static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { -- cgit v1.2.3 From 99a0f46af6a7715147e81c558d558021aad4e207 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:25 +0000 Subject: x86/hyperv: extract partition ID from Microsoft Hypervisor if necessary We will need the partition ID for executing some hypercalls later. Signed-off-by: Lillian Grassin-Drake Co-Developed-by: Sunil Muthuswamy Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-7-wei.liu@kernel.org --- arch/x86/hyperv/hv_init.c | 26 ++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 2 ++ include/asm-generic/hyperv-tlfs.h | 6 ++++++ 3 files changed, 34 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a73f4db31870..612c555fb9aa 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -29,6 +29,8 @@ #include int hyperv_init_cpuhp; +u64 hv_current_partition_id = ~0ull; +EXPORT_SYMBOL_GPL(hv_current_partition_id); void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); @@ -354,6 +356,24 @@ static void __init hv_stimer_setup_percpu_clockev(void) old_setup_percpu_clockev(); } +static void __init hv_get_partition_id(void) +{ + struct hv_get_partition_id *output_page; + u64 status; + unsigned long flags; + + local_irq_save(flags); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + /* No point in proceeding if this failed */ + pr_err("Failed to get partition ID: %lld\n", status); + BUG(); + } + hv_current_partition_id = output_page->partition_id; + local_irq_restore(flags); +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -454,6 +474,12 @@ void __init hyperv_init(void) register_syscore_ops(&hv_syscore_ops); hyperv_init_cpuhp = cpuhp; + + if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) + hv_get_partition_id(); + + BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); + return; remove_cpuhp_state: diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ef06cdac8444..b8324202d850 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -80,6 +80,8 @@ extern void *hv_hypercall_pg; extern void __percpu **hyperv_pcpu_input_arg; extern void __percpu **hyperv_pcpu_output_arg; +extern u64 hv_current_partition_id; + static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { u64 input_address = input ? virt_to_phys(input) : 0; diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index e232ddcb0a2d..ba96bcb5c657 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -142,6 +142,7 @@ struct ms_hyperv_tsc_page { #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_GET_PARTITION_ID 0x0046 #define HVCALL_GET_VP_REGISTERS 0x0050 #define HVCALL_SET_VP_REGISTERS 0x0051 #define HVCALL_POST_MESSAGE 0x005c @@ -408,6 +409,11 @@ struct hv_tlb_flush_ex { u64 gva_list[]; } __packed; +/* HvGetPartitionId hypercall (output only) */ +struct hv_get_partition_id { + u64 partition_id; +} __packed; + /* HvRetargetDeviceInterrupt hypercall */ union hv_msi_entry { u64 as_uint64; -- cgit v1.2.3 From 80f73c9f7468b15484e3ee4a29870fc9fa0419cc Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:26 +0000 Subject: x86/hyperv: handling hypercall page setup for root When Linux is running as the root partition, the hypercall page will have already been setup by Hyper-V. Copy the content over to the allocated page. Add checks to hv_suspend & co to bail early because they are not supported in this setup yet. Signed-off-by: Lillian Grassin-Drake Signed-off-by: Sunil Muthuswamy Signed-off-by: Nuno Das Neves Co-Developed-by: Lillian Grassin-Drake Co-Developed-by: Sunil Muthuswamy Co-Developed-by: Nuno Das Neves Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-8-wei.liu@kernel.org --- arch/x86/hyperv/hv_init.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 612c555fb9aa..aeea8fbf3c23 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -27,6 +27,7 @@ #include #include #include +#include int hyperv_init_cpuhp; u64 hv_current_partition_id = ~0ull; @@ -286,6 +287,9 @@ static int hv_suspend(void) union hv_x64_msr_hypercall_contents hypercall_msr; int ret; + if (hv_root_partition) + return -EPERM; + /* * Reset the hypercall page as it is going to be invalidated * accross hibernation. Setting hv_hypercall_pg to NULL ensures @@ -454,8 +458,35 @@ void __init hyperv_init(void) rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + if (hv_root_partition) { + struct page *pg; + void *src, *dst; + + /* + * For the root partition, the hypervisor will set up its + * hypercall page. The hypervisor guarantees it will not show + * up in the root's address space. The root can't change the + * location of the hypercall page. + * + * Order is important here. We must enable the hypercall page + * so it is populated with code, then copy the code to an + * executable page. + */ + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + pg = vmalloc_to_page(hv_hypercall_pg); + dst = kmap(pg); + src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, + MEMREMAP_WB); + BUG_ON(!(src && dst)); + memcpy(dst, src, HV_HYP_PAGE_SIZE); + memunmap(src); + kunmap(pg); + } else { + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + } /* * hyperv_init() is called before LAPIC is initialized: see @@ -604,7 +635,7 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); bool hv_is_hibernation_supported(void) { - return acpi_sleep_state_supported(ACPI_STATE_S4); + return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); -- cgit v1.2.3 From 4f0455cf6f23800c78265c88922c6afd875d08a7 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:27 +0000 Subject: ACPI / NUMA: add a stub function for node_to_pxm() There is already a stub function for pxm_to_node but conversion to the other direction is missing. It will be used by Microsoft Hypervisor code later. Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20210203150435.27941-9-wei.liu@kernel.org --- include/acpi/acpi_numa.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index a4c6ef809e27..40a91ce87e04 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -30,6 +30,10 @@ static inline int pxm_to_node(int pxm) { return 0; } +static inline int node_to_pxm(int node) +{ + return 0; +} #endif /* CONFIG_ACPI_NUMA */ #ifdef CONFIG_ACPI_HMAT -- cgit v1.2.3 From 86b5ec3552f3c09694e6f7934834b0a2a3aeebbe Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:28 +0000 Subject: x86/hyperv: provide a bunch of helper functions They are used to deposit pages into Microsoft Hypervisor and bring up logical and virtual processors. Signed-off-by: Lillian Grassin-Drake Signed-off-by: Sunil Muthuswamy Signed-off-by: Nuno Das Neves Co-Developed-by: Lillian Grassin-Drake Co-Developed-by: Sunil Muthuswamy Co-Developed-by: Nuno Das Neves Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-10-wei.liu@kernel.org --- arch/x86/hyperv/Makefile | 2 +- arch/x86/hyperv/hv_proc.c | 219 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 4 + include/asm-generic/hyperv-tlfs.h | 67 ++++++++++++ 4 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 arch/x86/hyperv/hv_proc.c diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 89b1f74d3225..565358020921 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := hv_init.o mmu.o nested.o -obj-$(CONFIG_X86_64) += hv_apic.o +obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c new file mode 100644 index 000000000000..60461e598239 --- /dev/null +++ b/arch/x86/hyperv/hv_proc.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * See struct hv_deposit_memory. The first u64 is partition ID, the rest + * are GPAs. + */ +#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1) + +/* Deposits exact number of pages. Must be called with interrupts enabled. */ +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + struct page **pages, *page; + int *counts; + int num_allocations; + int i, j, page_count; + int order; + u64 status; + int ret; + u64 base_pfn; + struct hv_deposit_memory *input_page; + unsigned long flags; + + if (num_pages > HV_DEPOSIT_MAX) + return -E2BIG; + if (!num_pages) + return 0; + + /* One buffer for page pointers and counts */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + pages = page_address(page); + + counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL); + if (!counts) { + free_page((unsigned long)pages); + return -ENOMEM; + } + + /* Allocate all the pages before disabling interrupts */ + i = 0; + + while (num_pages) { + /* Find highest order we can actually allocate */ + order = 31 - __builtin_clz(num_pages); + + while (1) { + pages[i] = alloc_pages_node(node, GFP_KERNEL, order); + if (pages[i]) + break; + if (!order) { + ret = -ENOMEM; + num_allocations = i; + goto err_free_allocations; + } + --order; + } + + split_page(pages[i], order); + counts[i] = 1 << order; + num_pages -= counts[i]; + i++; + } + num_allocations = i; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + + /* Populate gpa_page_list - these will fit on the input page */ + for (i = 0, page_count = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j, ++page_count) + input_page->gpa_page_list[page_count] = base_pfn + j; + } + status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, + page_count, 0, input_page, NULL); + local_irq_restore(flags); + + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + pr_err("Failed to deposit pages: %lld\n", status); + ret = status; + goto err_free_allocations; + } + + ret = 0; + goto free_buf; + +err_free_allocations: + for (i = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j) + __free_page(pfn_to_page(base_pfn + j)); + } + +free_buf: + free_page((unsigned long)pages); + kfree(counts); + return ret; +} + +int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) +{ + struct hv_add_logical_processor_in *input; + struct hv_add_logical_processor_out *output; + u64 status; + unsigned long flags; + int ret = 0; + int pxm = node_to_pxm(node); + + /* + * When adding a logical processor, the hypervisor may return + * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more + * pages and retry. + */ + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + /* We don't do anything with the output right now */ + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->lp_index = lp_index; + input->apic_id = apic_id; + input->flags = 0; + input->proximity_domain_info.domain_id = pxm; + input->proximity_domain_info.flags.reserved = 0; + input->proximity_domain_info.flags.proximity_info_valid = 1; + input->proximity_domain_info.flags.proximity_preferred = 1; + status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, + input, output); + local_irq_restore(flags); + + status &= HV_HYPERCALL_RESULT_MASK; + + if (status != HV_STATUS_INSUFFICIENT_MEMORY) { + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, + lp_index, apic_id, status); + ret = status; + } + break; + } + ret = hv_call_deposit_pages(node, hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + struct hv_create_vp *input; + u64 status; + unsigned long irq_flags; + int ret = 0; + int pxm = node_to_pxm(node); + + /* Root VPs don't seem to need pages deposited */ + if (partition_id != hv_current_partition_id) { + /* The value 90 is empirically determined. It may change. */ + ret = hv_call_deposit_pages(node, partition_id, 90); + if (ret) + return ret; + } + + do { + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->flags = flags; + input->subnode_type = HvSubnodeAny; + if (node != NUMA_NO_NODE) { + input->proximity_domain_info.domain_id = pxm; + input->proximity_domain_info.flags.reserved = 0; + input->proximity_domain_info.flags.proximity_info_valid = 1; + input->proximity_domain_info.flags.proximity_preferred = 1; + } else { + input->proximity_domain_info.as_uint64 = 0; + } + status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); + local_irq_restore(irq_flags); + + status &= HV_HYPERCALL_RESULT_MASK; + + if (status != HV_STATUS_INSUFFICIENT_MEMORY) { + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, + vp_index, flags, status); + ret = status; + } + break; + } + ret = hv_call_deposit_pages(node, partition_id, 1); + + } while (!ret); + + return ret; +} + diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index b8324202d850..f9119781f2bb 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -82,6 +82,10 @@ extern void __percpu **hyperv_pcpu_output_arg; extern u64 hv_current_partition_id; +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); + static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { u64 input_address = input ? virt_to_phys(input) : 0; diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index ba96bcb5c657..562a29981632 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -143,6 +143,8 @@ struct ms_hyperv_tsc_page { #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_GET_PARTITION_ID 0x0046 +#define HVCALL_DEPOSIT_MEMORY 0x0048 +#define HVCALL_CREATE_VP 0x004e #define HVCALL_GET_VP_REGISTERS 0x0050 #define HVCALL_SET_VP_REGISTERS 0x0051 #define HVCALL_POST_MESSAGE 0x005c @@ -150,6 +152,7 @@ struct ms_hyperv_tsc_page { #define HVCALL_POST_DEBUG_DATA 0x0069 #define HVCALL_RETRIEVE_DEBUG_DATA 0x006a #define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 @@ -414,6 +417,70 @@ struct hv_get_partition_id { u64 partition_id; } __packed; +/* HvDepositMemory hypercall */ +struct hv_deposit_memory { + u64 partition_id; + u64 gpa_page_list[]; +} __packed; + +struct hv_proximity_domain_flags { + u32 proximity_preferred : 1; + u32 reserved : 30; + u32 proximity_info_valid : 1; +} __packed; + +/* Not a union in windows but useful for zeroing */ +union hv_proximity_domain_info { + struct { + u32 domain_id; + struct hv_proximity_domain_flags flags; + }; + u64 as_uint64; +} __packed; + +struct hv_lp_startup_status { + u64 hv_status; + u64 substatus1; + u64 substatus2; + u64 substatus3; + u64 substatus4; + u64 substatus5; + u64 substatus6; +} __packed; + +/* HvAddLogicalProcessor hypercall */ +struct hv_add_logical_processor_in { + u32 lp_index; + u32 apic_id; + union hv_proximity_domain_info proximity_domain_info; + u64 flags; +} __packed; + +struct hv_add_logical_processor_out { + struct hv_lp_startup_status startup_status; +} __packed; + +enum HV_SUBNODE_TYPE +{ + HvSubnodeAny = 0, + HvSubnodeSocket = 1, + HvSubnodeAmdNode = 2, + HvSubnodeL3 = 3, + HvSubnodeCount = 4, + HvSubnodeInvalid = -1 +}; + +/* HvCreateVp hypercall */ +struct hv_create_vp { + u64 partition_id; + u32 vp_index; + u8 padding[3]; + u8 subnode_type; + u64 subnode_id; + union hv_proximity_domain_info proximity_domain_info; + u64 flags; +} __packed; + /* HvRetargetDeviceInterrupt hypercall */ union hv_msi_entry { u64 as_uint64; -- cgit v1.2.3 From 333abaf5abb396820c4c7c26a8eecc7523c99184 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:29 +0000 Subject: x86/hyperv: implement and use hv_smp_prepare_cpus Microsoft Hypervisor requires the root partition to make a few hypercalls to setup application processors before they can be used. Signed-off-by: Lillian Grassin-Drake Signed-off-by: Sunil Muthuswamy Co-Developed-by: Lillian Grassin-Drake Co-Developed-by: Sunil Muthuswamy Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-11-wei.liu@kernel.org --- arch/x86/kernel/cpu/mshyperv.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 39a4b4509b87..e88bc296afca 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -31,6 +31,7 @@ #include #include #include +#include /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -230,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void) hv_init_spinlocks(); #endif } + +static void __init hv_smp_prepare_cpus(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_64 + int i; + int ret; +#endif + + native_smp_prepare_cpus(max_cpus); + +#ifdef CONFIG_X86_64 + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + BUG_ON(ret); + } + + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i); + BUG_ON(ret); + } +#endif +} #endif static void __init ms_hyperv_init_platform(void) @@ -395,6 +422,8 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; + if (hv_root_partition) + smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif /* -- cgit v1.2.3 From d589ae61bc27b2b9aaac0bf20a9077b6fbda32b6 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:30 +0000 Subject: asm-generic/hyperv: update hv_msi_entry We will soon need to access fields inside the MSI address and MSI data fields. Introduce hv_msi_address_register and hv_msi_data_register. Fix up one user of hv_msi_entry in mshyperv.h. No functional change expected. Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-12-wei.liu@kernel.org --- arch/x86/include/asm/mshyperv.h | 4 ++-- include/asm-generic/hyperv-tlfs.h | 28 ++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f9119781f2bb..7bd4022da061 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -259,8 +259,8 @@ static inline void hv_apic_init(void) {} static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, struct msi_desc *msi_desc) { - msi_entry->address = msi_desc->msg.address_lo; - msi_entry->data = msi_desc->msg.data; + msi_entry->address.as_uint32 = msi_desc->msg.address_lo; + msi_entry->data.as_uint32 = msi_desc->msg.data; } #else /* CONFIG_HYPERV */ diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 562a29981632..8ac797e6a91a 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -481,12 +481,36 @@ struct hv_create_vp { u64 flags; } __packed; +union hv_msi_address_register { + u32 as_uint32; + struct { + u32 reserved1:2; + u32 destination_mode:1; + u32 redirection_hint:1; + u32 reserved2:8; + u32 destination_id:8; + u32 msi_base:12; + }; +} __packed; + +union hv_msi_data_register { + u32 as_uint32; + struct { + u32 vector:8; + u32 delivery_mode:3; + u32 reserved1:3; + u32 level_assert:1; + u32 trigger_mode:1; + u32 reserved2:16; + }; +} __packed; + /* HvRetargetDeviceInterrupt hypercall */ union hv_msi_entry { u64 as_uint64; struct { - u32 address; - u32 data; + union hv_msi_address_register address; + union hv_msi_data_register data; } __packed; }; -- cgit v1.2.3 From b59fb7b60d47b2af3a114daf0ae198aa23921698 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:31 +0000 Subject: asm-generic/hyperv: update hv_interrupt_entry We will soon use the same structure to handle IO-APIC interrupts as well. Introduce an enum to identify the source and a data structure for IO-APIC RTE. While at it, update pci-hyperv.c to use the enum. No functional change. Signed-off-by: Wei Liu Acked-by: Rob Herring Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-13-wei.liu@kernel.org --- drivers/pci/controller/pci-hyperv.c | 2 +- include/asm-generic/hyperv-tlfs.h | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 6db8d96a78eb..87aa62ee0368 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1216,7 +1216,7 @@ static void hv_irq_unmask(struct irq_data *data) params = &hbus->retarget_msi_interrupt_params; memset(params, 0, sizeof(*params)); params->partition_id = HV_PARTITION_ID_SELF; - params->int_entry.source = 1; /* MSI(-X) */ + params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; hv_set_msi_entry_from_desc(¶ms->int_entry.msi_entry, msi_desc); params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | (hbus->hdev->dev_instance.b[4] << 16) | diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 8ac797e6a91a..723f487f3edc 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -481,6 +481,11 @@ struct hv_create_vp { u64 flags; } __packed; +enum hv_interrupt_source { + HV_INTERRUPT_SOURCE_MSI = 1, /* MSI and MSI-X */ + HV_INTERRUPT_SOURCE_IOAPIC, +}; + union hv_msi_address_register { u32 as_uint32; struct { @@ -514,10 +519,37 @@ union hv_msi_entry { } __packed; }; +union hv_ioapic_rte { + u64 as_uint64; + + struct { + u32 vector:8; + u32 delivery_mode:3; + u32 destination_mode:1; + u32 delivery_status:1; + u32 interrupt_polarity:1; + u32 remote_irr:1; + u32 trigger_mode:1; + u32 interrupt_mask:1; + u32 reserved1:15; + + u32 reserved2:24; + u32 destination_id:8; + }; + + struct { + u32 low_uint32; + u32 high_uint32; + }; +} __packed; + struct hv_interrupt_entry { - u32 source; /* 1 for MSI(-X) */ + u32 source; u32 reserved1; - union hv_msi_entry msi_entry; + union { + union hv_msi_entry msi_entry; + union hv_ioapic_rte ioapic_rte; + }; } __packed; /* -- cgit v1.2.3 From 12434e5fb6aed4655340ce74cd2a0dd859dff5bd Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:32 +0000 Subject: asm-generic/hyperv: introduce hv_device_id and auxiliary structures We will need to identify the device we want Microsoft Hypervisor to manipulate. Introduce the data structures for that purpose. They will be used in a later patch. Signed-off-by: Sunil Muthuswamy Co-Developed-by: Sunil Muthuswamy Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-14-wei.liu@kernel.org --- include/asm-generic/hyperv-tlfs.h | 79 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 723f487f3edc..20457a85c046 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -624,4 +624,83 @@ struct hv_set_vp_registers_input { } element[]; } __packed; +enum hv_device_type { + HV_DEVICE_TYPE_LOGICAL = 0, + HV_DEVICE_TYPE_PCI = 1, + HV_DEVICE_TYPE_IOAPIC = 2, + HV_DEVICE_TYPE_ACPI = 3, +}; + +typedef u16 hv_pci_rid; +typedef u16 hv_pci_segment; +typedef u64 hv_logical_device_id; +union hv_pci_bdf { + u16 as_uint16; + + struct { + u8 function:3; + u8 device:5; + u8 bus; + }; +} __packed; + +union hv_pci_bus_range { + u16 as_uint16; + + struct { + u8 subordinate_bus; + u8 secondary_bus; + }; +} __packed; + +union hv_device_id { + u64 as_uint64; + + struct { + u64 reserved0:62; + u64 device_type:2; + }; + + /* HV_DEVICE_TYPE_LOGICAL */ + struct { + u64 id:62; + u64 device_type:2; + } logical; + + /* HV_DEVICE_TYPE_PCI */ + struct { + union { + hv_pci_rid rid; + union hv_pci_bdf bdf; + }; + + hv_pci_segment segment; + union hv_pci_bus_range shadow_bus_range; + + u16 phantom_function_bits:2; + u16 source_shadow:1; + + u16 rsvdz0:11; + u16 device_type:2; + } pci; + + /* HV_DEVICE_TYPE_IOAPIC */ + struct { + u8 ioapic_id; + u8 rsvdz0; + u16 rsvdz1; + u16 rsvdz2; + + u16 rsvdz3:14; + u16 device_type:2; + } ioapic; + + /* HV_DEVICE_TYPE_ACPI */ + struct { + u32 input_mapping_base; + u32 input_mapping_count:30; + u32 device_type:2; + } acpi; +} __packed; + #endif -- cgit v1.2.3 From 466a9c3f88d04152ca83e840ca940c5f700402ac Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:33 +0000 Subject: asm-generic/hyperv: import data structures for mapping device interrupts Signed-off-by: Sunil Muthuswamy Co-Developed-by: Sunil Muthuswamy Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-15-wei.liu@kernel.org --- arch/x86/include/asm/hyperv-tlfs.h | 13 +++++++++++++ include/asm-generic/hyperv-tlfs.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e3a6a723d661..e6cd3fee562b 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -548,6 +548,19 @@ struct hv_partition_assist_pg { u32 tlb_lock_count; }; +enum hv_interrupt_type { + HV_X64_INTERRUPT_TYPE_FIXED = 0x0000, + HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001, + HV_X64_INTERRUPT_TYPE_SMI = 0x0002, + HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003, + HV_X64_INTERRUPT_TYPE_NMI = 0x0004, + HV_X64_INTERRUPT_TYPE_INIT = 0x0005, + HV_X64_INTERRUPT_TYPE_SIPI = 0x0006, + HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007, + HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008, + HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009, + HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A, +}; #include diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 20457a85c046..83448e837ded 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -153,6 +153,8 @@ struct ms_hyperv_tsc_page { #define HVCALL_RETRIEVE_DEBUG_DATA 0x006a #define HVCALL_RESET_DEBUG_SESSION 0x006b #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 +#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c +#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 @@ -703,4 +705,38 @@ union hv_device_id { } acpi; } __packed; +enum hv_interrupt_trigger_mode { + HV_INTERRUPT_TRIGGER_MODE_EDGE = 0, + HV_INTERRUPT_TRIGGER_MODE_LEVEL = 1, +}; + +struct hv_device_interrupt_descriptor { + u32 interrupt_type; + u32 trigger_mode; + u32 vector_count; + u32 reserved; + struct hv_device_interrupt_target target; +} __packed; + +struct hv_input_map_device_interrupt { + u64 partition_id; + u64 device_id; + u64 flags; + struct hv_interrupt_entry logical_interrupt_entry; + struct hv_device_interrupt_descriptor interrupt_descriptor; +} __packed; + +struct hv_output_map_device_interrupt { + struct hv_interrupt_entry interrupt_entry; +} __packed; + +struct hv_input_unmap_device_interrupt { + u64 partition_id; + u64 device_id; + struct hv_interrupt_entry interrupt_entry; +} __packed; + +#define HV_SOURCE_SHADOW_NONE 0x0 +#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1 + #endif -- cgit v1.2.3 From e39397d1fd6851bef4dfb63a631b8e15d1f43329 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:34 +0000 Subject: x86/hyperv: implement an MSI domain for root partition When Linux runs as the root partition on Microsoft Hypervisor, its interrupts are remapped. Linux will need to explicitly map and unmap interrupts for hardware. Implement an MSI domain to issue the correct hypercalls. And initialize this irq domain as the default MSI irq domain. Signed-off-by: Sunil Muthuswamy Co-Developed-by: Sunil Muthuswamy Signed-off-by: Wei Liu Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-16-wei.liu@kernel.org --- arch/x86/hyperv/Makefile | 2 +- arch/x86/hyperv/hv_init.c | 9 + arch/x86/hyperv/irqdomain.c | 360 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 2 + 4 files changed, 372 insertions(+), 1 deletion(-) create mode 100644 arch/x86/hyperv/irqdomain.c diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 565358020921..48e2c51464e8 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y := hv_init.o mmu.o nested.o +obj-y := hv_init.o mmu.o nested.o irqdomain.o obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o ifdef CONFIG_X86_64 diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index aeea8fbf3c23..b81047dec1da 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -511,6 +511,15 @@ void __init hyperv_init(void) BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); +#ifdef CONFIG_PCI_MSI + /* + * If we're running as root, we want to create our own PCI MSI domain. + * We can't set this in hv_pci_init because that would be too late. + */ + if (hv_root_partition) + x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; +#endif + return; remove_cpuhp_state: diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c new file mode 100644 index 000000000000..bddcf1d6860d --- /dev/null +++ b/arch/x86/hyperv/irqdomain.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor. + * + * Authors: + * Sunil Muthuswamy + * Wei Liu + */ + +#include +#include +#include + +static int hv_map_interrupt(union hv_device_id device_id, bool level, + int cpu, int vector, struct hv_interrupt_entry *entry) +{ + struct hv_input_map_device_interrupt *input; + struct hv_output_map_device_interrupt *output; + struct hv_device_interrupt_descriptor *intr_desc; + unsigned long flags; + u64 status; + int nr_bank, var_size; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + intr_desc = &input->interrupt_descriptor; + memset(input, 0, sizeof(*input)); + input->partition_id = hv_current_partition_id; + input->device_id = device_id.as_uint64; + intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED; + intr_desc->vector_count = 1; + intr_desc->target.vector = vector; + + if (level) + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL; + else + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE; + + intr_desc->target.vp_set.valid_bank_mask = 0; + intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu)); + if (nr_bank < 0) { + local_irq_restore(flags); + pr_err("%s: unable to generate VP set\n", __func__); + return EINVAL; + } + intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; + + /* + * var-sized hypercall, var-size starts after vp_mask (thus + * vp_set.format does not count, but vp_set.valid_bank_mask + * does). + */ + var_size = nr_bank + 1; + + status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size, + input, output); + *entry = output->interrupt_entry; + + local_irq_restore(flags); + + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + pr_err("%s: hypercall failed, status %lld\n", __func__, status); + + return status & HV_HYPERCALL_RESULT_MASK; +} + +static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) +{ + unsigned long flags; + struct hv_input_unmap_device_interrupt *input; + struct hv_interrupt_entry *intr_entry; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + intr_entry = &input->interrupt_entry; + input->partition_id = hv_current_partition_id; + input->device_id = id; + *intr_entry = *old_entry; + + status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); + local_irq_restore(flags); + + return status & HV_HYPERCALL_RESULT_MASK; +} + +#ifdef CONFIG_PCI_MSI +struct rid_data { + struct pci_dev *bridge; + u32 rid; +}; + +static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rid_data *rd = data; + u8 bus = PCI_BUS_NUM(rd->rid); + + if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) { + rd->bridge = pdev; + rd->rid = alias; + } + + return 0; +} + +static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) +{ + union hv_device_id dev_id; + struct rid_data data = { + .bridge = NULL, + .rid = PCI_DEVID(dev->bus->number, dev->devfn) + }; + + pci_for_each_dma_alias(dev, get_rid_cb, &data); + + dev_id.as_uint64 = 0; + dev_id.device_type = HV_DEVICE_TYPE_PCI; + dev_id.pci.segment = pci_domain_nr(dev->bus); + + dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid); + dev_id.pci.bdf.device = PCI_SLOT(data.rid); + dev_id.pci.bdf.function = PCI_FUNC(data.rid); + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE; + + if (data.bridge) { + int pos; + + /* + * Microsoft Hypervisor requires a bus range when the bridge is + * running in PCI-X mode. + * + * To distinguish conventional vs PCI-X bridge, we can check + * the bridge's PCI-X Secondary Status Register, Secondary Bus + * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge + * Specification Revision 1.0 5.2.2.1.3. + * + * Value zero means it is in conventional mode, otherwise it is + * in PCI-X mode. + */ + + pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX); + if (pos) { + u16 status; + + pci_read_config_word(data.bridge, pos + + PCI_X_BRIDGE_SSTATUS, &status); + + if (status & PCI_X_SSTATUS_FREQ) { + /* Non-zero, PCI-X mode */ + u8 sec_bus, sub_bus; + + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE; + + pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus); + dev_id.pci.shadow_bus_range.secondary_bus = sec_bus; + pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus); + dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus; + } + } + } + + return dev_id; +} + +static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector, + struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id = hv_build_pci_dev_id(dev); + + return hv_map_interrupt(device_id, false, cpu, vector, entry); +} + +static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) +{ + /* High address is always 0 */ + msg->address_hi = 0; + msg->address_lo = entry->msi_entry.address.as_uint32; + msg->data = entry->msi_entry.data.as_uint32; +} + +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); +static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *msidesc; + struct pci_dev *dev; + struct hv_interrupt_entry out_entry, *stored_entry; + struct irq_cfg *cfg = irqd_cfg(data); + cpumask_t *affinity; + int cpu; + u64 status; + + msidesc = irq_data_get_msi_desc(data); + dev = msi_desc_to_pci_dev(msidesc); + + if (!cfg) { + pr_debug("%s: cfg is NULL", __func__); + return; + } + + affinity = irq_data_get_effective_affinity_mask(data); + cpu = cpumask_first_and(affinity, cpu_online_mask); + + if (data->chip_data) { + /* + * This interrupt is already mapped. Let's unmap first. + * + * We don't use retarget interrupt hypercalls here because + * Microsoft Hypervisor doens't allow root to change the vector + * or specify VPs outside of the set that is initially used + * during mapping. + */ + stored_entry = data->chip_data; + data->chip_data = NULL; + + status = hv_unmap_msi_interrupt(dev, stored_entry); + + kfree(stored_entry); + + if (status != HV_STATUS_SUCCESS) { + pr_debug("%s: failed to unmap, status %lld", __func__, status); + return; + } + } + + stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); + if (!stored_entry) { + pr_debug("%s: failed to allocate chip data\n", __func__); + return; + } + + status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry); + if (status != HV_STATUS_SUCCESS) { + kfree(stored_entry); + return; + } + + *stored_entry = out_entry; + data->chip_data = stored_entry; + entry_to_msi_msg(&out_entry, msg); + + return; +} + +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry) +{ + return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry); +} + +static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +{ + u64 status; + struct hv_interrupt_entry old_entry; + struct irq_desc *desc; + struct irq_data *data; + struct msi_msg msg; + + desc = irq_to_desc(irq); + if (!desc) { + pr_debug("%s: no irq desc\n", __func__); + return; + } + + data = &desc->irq_data; + if (!data) { + pr_debug("%s: no irq data\n", __func__); + return; + } + + if (!data->chip_data) { + pr_debug("%s: no chip data\n!", __func__); + return; + } + + old_entry = *(struct hv_interrupt_entry *)data->chip_data; + entry_to_msi_msg(&old_entry, &msg); + + kfree(data->chip_data); + data->chip_data = NULL; + + status = hv_unmap_msi_interrupt(dev, &old_entry); + + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: hypercall failed, status %lld\n", __func__, status); + return; + } +} + +static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + int i; + struct msi_desc *entry; + struct pci_dev *pdev; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + pdev = to_pci_dev(dev); + + for_each_pci_msi_entry(entry, pdev) { + if (entry->irq) { + for (i = 0; i < entry->nvec_used; i++) { + hv_teardown_msi_irq_common(pdev, entry, entry->irq + i); + irq_domain_free_irqs(entry->irq + i, 1); + } + } + } +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip hv_pci_msi_controller = { + .name = "HV-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = hv_irq_compose_msi_msg, + .irq_set_affinity = msi_domain_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct msi_domain_ops pci_msi_domain_ops = { + .domain_free_irqs = hv_msi_domain_free_irqs, + .msi_prepare = pci_msi_prepare, +}; + +static struct msi_domain_info hv_pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &hv_pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + +struct irq_domain * __init hv_create_pci_msi_domain(void) +{ + struct irq_domain *d = NULL; + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI"); + if (fn) + d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain); + + /* No point in going further if we can't get an irq domain */ + BUG_ON(!d); + + return d; +} + +#endif /* CONFIG_PCI_MSI */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 7bd4022da061..4533773115ea 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -263,6 +263,8 @@ static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, msi_entry->data.as_uint32 = msi_desc->msg.data; } +struct irq_domain *hv_create_pci_msi_domain(void); + #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} -- cgit v1.2.3 From fb5ef35165a37ca63ef0227657eabd06f0a39cf9 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 3 Feb 2021 15:04:35 +0000 Subject: iommu/hyperv: setup an IO-APIC IRQ remapping domain for root partition Just like MSI/MSI-X, IO-APIC interrupts are remapped by Microsoft Hypervisor when Linux runs as the root partition. Implement an IRQ domain to handle mapping and unmapping of IO-APIC interrupts. Signed-off-by: Wei Liu Acked-by: Joerg Roedel Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210203150435.27941-17-wei.liu@kernel.org --- arch/x86/hyperv/irqdomain.c | 25 ++++++ arch/x86/include/asm/mshyperv.h | 4 + drivers/iommu/hyperv-iommu.c | 177 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 203 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index bddcf1d6860d..4421a8d92e23 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -358,3 +358,28 @@ struct irq_domain * __init hv_create_pci_msi_domain(void) } #endif /* CONFIG_PCI_MSI */ + +int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id; + + device_id.as_uint64 = 0; + device_id.device_type = HV_DEVICE_TYPE_IOAPIC; + device_id.ioapic.ioapic_id = (u8)ioapic_id; + + return hv_unmap_interrupt(device_id.as_uint64, entry); +} +EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt); + +int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector, + struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id; + + device_id.as_uint64 = 0; + device_id.device_type = HV_DEVICE_TYPE_IOAPIC; + device_id.ioapic.ioapic_id = (u8)ioapic_id; + + return hv_map_interrupt(device_id, level, cpu, vector, entry); +} +EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 4533773115ea..ccf60a809a17 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -265,6 +265,10 @@ static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, struct irq_domain *hv_create_pci_msi_domain(void); +int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, + struct hv_interrupt_entry *entry); +int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); + #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 1d21a0b5f724..e285a220c913 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "irq_remapping.h" @@ -115,30 +116,43 @@ static const struct irq_domain_ops hyperv_ir_domain_ops = { .free = hyperv_irq_remapping_free, }; +static const struct irq_domain_ops hyperv_root_ir_domain_ops; static int __init hyperv_prepare_irq_remapping(void) { struct fwnode_handle *fn; int i; + const char *name; + const struct irq_domain_ops *ops; if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) || x86_init.hyper.msi_ext_dest_id() || !x2apic_supported()) return -ENODEV; - fn = irq_domain_alloc_named_id_fwnode("HYPERV-IR", 0); + if (hv_root_partition) { + name = "HYPERV-ROOT-IR"; + ops = &hyperv_root_ir_domain_ops; + } else { + name = "HYPERV-IR"; + ops = &hyperv_ir_domain_ops; + } + + fn = irq_domain_alloc_named_id_fwnode(name, 0); if (!fn) return -ENOMEM; ioapic_ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), - 0, IOAPIC_REMAPPING_ENTRY, fn, - &hyperv_ir_domain_ops, NULL); + 0, IOAPIC_REMAPPING_ENTRY, fn, ops, NULL); if (!ioapic_ir_domain) { irq_domain_free_fwnode(fn); return -ENOMEM; } + if (hv_root_partition) + return 0; /* The rest is only relevant to guests */ + /* * Hyper-V doesn't provide irq remapping function for * IO-APIC and so IO-APIC only accepts 8-bit APIC ID. @@ -166,4 +180,161 @@ struct irq_remap_ops hyperv_irq_remap_ops = { .enable = hyperv_enable_irq_remapping, }; +/* IRQ remapping domain when Linux runs as the root partition */ +struct hyperv_root_ir_data { + u8 ioapic_id; + bool is_level; + struct hv_interrupt_entry entry; +}; + +static void +hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) +{ + u64 status; + u32 vector; + struct irq_cfg *cfg; + int ioapic_id; + struct cpumask *affinity; + int cpu; + struct hv_interrupt_entry entry; + struct hyperv_root_ir_data *data = irq_data->chip_data; + struct IO_APIC_route_entry e; + + cfg = irqd_cfg(irq_data); + affinity = irq_data_get_effective_affinity_mask(irq_data); + cpu = cpumask_first_and(affinity, cpu_online_mask); + + vector = cfg->vector; + ioapic_id = data->ioapic_id; + + if (data->entry.source == HV_DEVICE_TYPE_IOAPIC + && data->entry.ioapic_rte.as_uint64) { + entry = data->entry; + + status = hv_unmap_ioapic_interrupt(ioapic_id, &entry); + + if (status != HV_STATUS_SUCCESS) + pr_debug("%s: unexpected unmap status %lld\n", __func__, status); + + data->entry.ioapic_rte.as_uint64 = 0; + data->entry.source = 0; /* Invalid source */ + } + + + status = hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu, + vector, &entry); + + if (status != HV_STATUS_SUCCESS) { + pr_err("%s: map hypercall failed, status %lld\n", __func__, status); + return; + } + + data->entry = entry; + + /* Turn it into an IO_APIC_route_entry, and generate MSI MSG. */ + e.w1 = entry.ioapic_rte.low_uint32; + e.w2 = entry.ioapic_rte.high_uint32; + + memset(msg, 0, sizeof(*msg)); + msg->arch_data.vector = e.vector; + msg->arch_data.delivery_mode = e.delivery_mode; + msg->arch_addr_lo.dest_mode_logical = e.dest_mode_logical; + msg->arch_addr_lo.dmar_format = e.ir_format; + msg->arch_addr_lo.dmar_index_0_14 = e.ir_index_0_14; +} + +static int hyperv_root_ir_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + send_cleanup_vector(cfg); + + return 0; +} + +static struct irq_chip hyperv_root_ir_chip = { + .name = "HYPERV-ROOT-IR", + .irq_ack = apic_ack_irq, + .irq_set_affinity = hyperv_root_ir_set_affinity, + .irq_compose_msi_msg = hyperv_root_ir_compose_msi_msg, +}; + +static int hyperv_root_irq_remapping_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *arg) +{ + struct irq_alloc_info *info = arg; + struct irq_data *irq_data; + struct hyperv_root_ir_data *data; + int ret = 0; + + if (!info || info->type != X86_IRQ_ALLOC_TYPE_IOAPIC || nr_irqs > 1) + return -EINVAL; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + irq_domain_free_irqs_common(domain, virq, nr_irqs); + return -ENOMEM; + } + + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) { + kfree(data); + irq_domain_free_irqs_common(domain, virq, nr_irqs); + return -EINVAL; + } + + data->ioapic_id = info->devid; + data->is_level = info->ioapic.is_level; + + irq_data->chip = &hyperv_root_ir_chip; + irq_data->chip_data = data; + + return 0; +} + +static void hyperv_root_irq_remapping_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct hyperv_root_ir_data *data; + struct hv_interrupt_entry *e; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + e = &data->entry; + + if (e->source == HV_DEVICE_TYPE_IOAPIC + && e->ioapic_rte.as_uint64) + hv_unmap_ioapic_interrupt(data->ioapic_id, + &data->entry); + + kfree(data); + } + } + + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static const struct irq_domain_ops hyperv_root_ir_domain_ops = { + .select = hyperv_irq_remapping_select, + .alloc = hyperv_root_irq_remapping_alloc, + .free = hyperv_root_irq_remapping_free, +}; + #endif -- cgit v1.2.3 From 3019270282a175defc02c8331786c73e082cd2a8 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 15 Feb 2021 10:44:58 +0000 Subject: Revert "Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer" This reverts commit a8c3209998afb5c4941b49e35b513cea9050cb4a. It is reported that the said commit caused regression in netvsc. Reported-by: Andrea Parri (Microsoft) Signed-off-by: Wei Liu --- drivers/hv/channel.c | 9 ++--- drivers/hv/hv_fcopy.c | 1 - drivers/hv/hv_kvp.c | 1 - drivers/hv/hyperv_vmbus.h | 2 +- drivers/hv/ring_buffer.c | 82 ++++++--------------------------------- drivers/net/hyperv/hyperv_net.h | 3 -- drivers/net/hyperv/netvsc.c | 2 - drivers/net/hyperv/rndis_filter.c | 2 - drivers/scsi/storvsc_drv.c | 10 ----- include/linux/hyperv.h | 48 ++++------------------- net/vmw_vsock/hyperv_transport.c | 4 +- 11 files changed, 25 insertions(+), 139 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 9aa789e5f22b..0bd202de7960 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -597,15 +597,12 @@ static int __vmbus_open(struct vmbus_channel *newchannel, newchannel->onchannel_callback = onchannelcallback; newchannel->channel_callback_context = context; - if (!newchannel->max_pkt_size) - newchannel->max_pkt_size = VMBUS_DEFAULT_MAX_PKT_SIZE; - - err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages, 0); + err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages); if (err) goto error_clean_ring; - err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], - recv_pages, newchannel->max_pkt_size); + err = hv_ringbuffer_init(&newchannel->inbound, + &page[send_pages], recv_pages); if (err) goto error_clean_ring; diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index 660036da7449..59ce85e00a02 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -349,7 +349,6 @@ int hv_fcopy_init(struct hv_util_service *srv) { recv_buffer = srv->recv_buffer; fcopy_transaction.recv_channel = srv->channel; - fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; /* * When this driver loads, the user level daemon that diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index c698592b83e4..b49962d312ce 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -757,7 +757,6 @@ hv_kvp_init(struct hv_util_service *srv) { recv_buffer = srv->recv_buffer; kvp_transaction.recv_channel = srv->channel; - kvp_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 4; /* * When this driver loads, the user level daemon that diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 42f3d9d123a1..9416e09ebd58 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -174,7 +174,7 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt, u32 max_pkt_size); + struct page *pages, u32 pagecnt); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 29e90477363a..35833d4d1a1d 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -190,7 +190,7 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) /* Initialize the ring buffer. */ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 page_cnt, u32 max_pkt_size) + struct page *pages, u32 page_cnt) { int i; struct page **pages_wraparound; @@ -232,14 +232,6 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, sizeof(struct hv_ring_buffer); ring_info->priv_read_index = 0; - /* Initialize buffer that holds copies of incoming packets */ - if (max_pkt_size) { - ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL); - if (!ring_info->pkt_buffer) - return -ENOMEM; - ring_info->pkt_buffer_size = max_pkt_size; - } - spin_lock_init(&ring_info->ring_lock); return 0; @@ -252,9 +244,6 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info) vunmap(ring_info->ring_buffer); ring_info->ring_buffer = NULL; mutex_unlock(&ring_info->ring_buffer_mutex); - - kfree(ring_info->pkt_buffer); - ring_info->pkt_buffer_size = 0; } /* Write to the ring buffer. */ @@ -396,7 +385,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, memcpy(buffer, (const char *)desc + offset, packetlen); /* Advance ring index to next packet descriptor */ - __hv_pkt_iter_next(channel, desc, true); + __hv_pkt_iter_next(channel, desc); /* Notify host of update */ hv_pkt_iter_close(channel); @@ -422,22 +411,6 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) return (rbi->ring_datasize - priv_read_loc) + write_loc; } -/* - * Get first vmbus packet without copying it out of the ring buffer - */ -struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *rbi = &channel->inbound; - - hv_debug_delay_test(channel, MESSAGE_DELAY); - - if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) - return NULL; - - return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index); -} -EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw); - /* * Get first vmbus packet from ring buffer after read_index * @@ -446,49 +419,17 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw); struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - struct vmpacket_descriptor *desc, *desc_copy; - u32 bytes_avail, pkt_len, pkt_offset; + struct vmpacket_descriptor *desc; - desc = hv_pkt_iter_first_raw(channel); - if (!desc) + hv_debug_delay_test(channel, MESSAGE_DELAY); + if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) return NULL; - bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi)); - - /* - * Ensure the compiler does not use references to incoming Hyper-V values (which - * could change at any moment) when reading local variables later in the code - */ - pkt_len = READ_ONCE(desc->len8) << 3; - pkt_offset = READ_ONCE(desc->offset8) << 3; - - /* - * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail() and - * rbi->pkt_buffer_size - */ - if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail) - pkt_len = bytes_avail; - - /* - * If pkt_offset is invalid, arbitrarily set it to - * the size of vmpacket_descriptor - */ - if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len) - pkt_offset = sizeof(struct vmpacket_descriptor); - - /* Copy the Hyper-V packet out of the ring buffer */ - desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer; - memcpy(desc_copy, desc, pkt_len); - - /* - * Hyper-V could still change len8 and offset8 after the earlier read. - * Ensure that desc_copy has legal values for len8 and offset8 that - * are consistent with the copy we just made - */ - desc_copy->len8 = pkt_len >> 3; - desc_copy->offset8 = pkt_offset >> 3; + desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index; + if (desc) + prefetch((char *)desc + (desc->len8 << 3)); - return desc_copy; + return desc; } EXPORT_SYMBOL_GPL(hv_pkt_iter_first); @@ -500,8 +441,7 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first); */ struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *desc, - bool copy) + const struct vmpacket_descriptor *desc) { struct hv_ring_buffer_info *rbi = &channel->inbound; u32 packetlen = desc->len8 << 3; @@ -514,7 +454,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel, rbi->priv_read_index -= dsize; /* more data? */ - return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel); + return hv_pkt_iter_first(channel); } EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 7ea6936f86ef..2a87cfa27ac0 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -860,12 +860,9 @@ static inline u32 netvsc_rqstor_size(unsigned long ringbytes) ringbytes / NETVSC_MIN_IN_MSG_SIZE; } -#define NETVSC_MAX_XFER_PAGE_RANGES 375 #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \ (offsetof(struct vmtransfer_page_packet_header, ranges) + \ (rng_cnt) * sizeof(struct vmtransfer_page_range)) -#define NETVSC_MAX_PKT_SIZE (NETVSC_XFER_HEADER_SIZE(NETVSC_MAX_XFER_PAGE_RANGES) + \ - sizeof(struct nvsp_message) + (sizeof(u32) * VRSS_SEND_TAB_SIZE)) struct multi_send_data { struct sk_buff *skb; /* skb containing the pkt */ diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 51005f2d4a82..2353623259f3 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1544,8 +1544,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Open the channel */ device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); - device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE; - ret = vmbus_open(device->channel, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, net_device->chan_table); diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 7e6dee2f02a4..598713c0d5a8 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1174,8 +1174,6 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) nvchan->channel = new_sc; new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); - new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE; - ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 7e59284dbf5b..2e4fa77445fd 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -414,14 +414,6 @@ static void storvsc_on_channel_callback(void *context); #define STORVSC_IDE_MAX_TARGETS 1 #define STORVSC_IDE_MAX_CHANNELS 1 -/* - * Upper bound on the size of a storvsc packet. vmscsi_size_delta is not - * included in the calculation because it is set after STORVSC_MAX_PKT_SIZE - * is used in storvsc_connect_to_vsp - */ -#define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\ - sizeof(struct vstor_packet)) - struct storvsc_cmd_request { struct scsi_cmnd *cmd; @@ -706,7 +698,6 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) return; memset(&props, 0, sizeof(struct vmstorage_channel_properties)); - new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE; /* * The size of vmbus_requestor is an upper bound on the number of requests @@ -1289,7 +1280,6 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, memset(&props, 0, sizeof(struct vmstorage_channel_properties)); - device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE; /* * The size of vmbus_requestor is an upper bound on the number of requests * that can be in-progress at any one time across all channels. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 9dd22af1b7f6..f1d74dcf0353 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -181,10 +181,6 @@ struct hv_ring_buffer_info { * being freed while the ring buffer is being accessed. */ struct mutex ring_buffer_mutex; - - /* Buffer that holds a copy of an incoming host packet */ - void *pkt_buffer; - u32 pkt_buffer_size; }; @@ -792,8 +788,6 @@ struct vmbus_device { bool allowed_in_isolated; }; -#define VMBUS_DEFAULT_MAX_PKT_SIZE 4096 - struct vmbus_channel { struct list_head listentry; @@ -1016,9 +1010,6 @@ struct vmbus_channel { /* request/transaction ids for VMBus */ struct vmbus_requestor requestor; u32 rqstor_size; - - /* The max size of a packet on this channel */ - u32 max_pkt_size; }; u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr); @@ -1660,44 +1651,15 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc) } -struct vmpacket_descriptor * -hv_pkt_iter_first_raw(struct vmbus_channel *channel); - struct vmpacket_descriptor * hv_pkt_iter_first(struct vmbus_channel *channel); struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt, - bool copy); + const struct vmpacket_descriptor *pkt); void hv_pkt_iter_close(struct vmbus_channel *channel); -static inline struct vmpacket_descriptor * -hv_pkt_iter_next_pkt(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt, - bool copy) -{ - struct vmpacket_descriptor *nxt; - - nxt = __hv_pkt_iter_next(channel, pkt, copy); - if (!nxt) - hv_pkt_iter_close(channel); - - return nxt; -} - -/* - * Get next packet descriptor without copying it out of the ring buffer - * If at end of list, return NULL and update host. - */ -static inline struct vmpacket_descriptor * -hv_pkt_iter_next_raw(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt) -{ - return hv_pkt_iter_next_pkt(channel, pkt, false); -} - /* * Get next packet descriptor from iterator * If at end of list, return NULL and update host. @@ -1706,7 +1668,13 @@ static inline struct vmpacket_descriptor * hv_pkt_iter_next(struct vmbus_channel *channel, const struct vmpacket_descriptor *pkt) { - return hv_pkt_iter_next_pkt(channel, pkt, true); + struct vmpacket_descriptor *nxt; + + nxt = __hv_pkt_iter_next(channel, pkt); + if (!nxt) + hv_pkt_iter_close(channel); + + return nxt; } #define foreach_vmbus_pkt(pkt, channel) \ diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index cd8b7c1ca9f1..630b851f8150 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -600,7 +600,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, return -EOPNOTSUPP; if (need_refill) { - hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan); + hvs->recv_desc = hv_pkt_iter_first(hvs->chan); ret = hvs_update_recv_data(hvs); if (ret) return ret; @@ -614,7 +614,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, hvs->recv_data_len -= to_read; if (hvs->recv_data_len == 0) { - hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc); + hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc); if (hvs->recv_desc) { ret = hvs_update_recv_data(hvs); if (ret) -- cgit v1.2.3