From 29fee6eed2811ff1089b30fc579a2d19d78016ab Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2018 17:42:33 +0000 Subject: xenbus: track caller request id Commit fd8aa9095a95 ("xen: optimize xenbus driver for multiple concurrent xenstore accesses") optimized xenbus concurrent accesses but in doing so broke the UABI of /dev/xen/xenbus. Through /dev/xen/xenbus, applications are in charge of the xenbus message exchange, supplying the correct header and body. Now, after the mentioned commit, the replies received by the application no longer have the header req_id echoed back as it was in the request (see specification below for reference), because that particular field is being overwritten by the kernel. struct xsd_sockmsg { uint32_t type; /* XS_??? */ uint32_t req_id;/* Request identifier, echoed in daemon's response. */ uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */ uint32_t len; /* Length of data following this. */ /* Generally followed by nul-terminated string(s). */ }; Before, there was only one request at a time, so req_id could simply be forwarded back and forth. To allow simultaneous requests we need a different req_id for each message, thus the kernel keeps a monotonically increasing counter for this field, which is written on every request irrespective of the userspace value. Forwarding the userspace req_id again is not a solution because we would open the possibility of a userspace-generated req_id colliding with kernel ones. So this patch instead takes another route, which is to artificially keep the user req_id while keeping the xenbus logic as is. We do that by saving the original req_id before xs_send(), using the private kernel counter as req_id and then, once the reply arrives and has been validated, restoring the original req_id. 
Cc: <stable@vger.kernel.org> # 4.11 Fixes: fd8aa9095a95 ("xen: optimize xenbus driver for multiple concurrent xenstore accesses") Reported-by: Bhavesh Davda Signed-off-by: Joao Martins Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus.h | 1 + drivers/xen/xenbus/xenbus_comms.c | 1 + drivers/xen/xenbus/xenbus_xs.c | 3 +++ 3 files changed, 5 insertions(+) diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 149c5e7efc89..092981171df1 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -76,6 +76,7 @@ struct xb_req_data { struct list_head list; wait_queue_head_t wq; struct xsd_sockmsg msg; + uint32_t caller_req_id; enum xsd_sockmsg_type type; char *body; const struct kvec *vec; diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 5b081a01779d..d239fc3c5e3d 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -309,6 +309,7 @@ static int process_msg(void) goto out; if (req->state == xb_req_state_wait_reply) { + req->msg.req_id = req->caller_req_id; req->msg.type = state.msg.type; req->msg.len = state.msg.len; req->body = state.body; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 3e59590c7254..3f3b29398ab8 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -227,6 +227,8 @@ static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) req->state = xb_req_state_queued; init_waitqueue_head(&req->wq); + /* Save the caller req_id and restore it later in the reply */ + req->caller_req_id = req->msg.req_id; req->msg.req_id = xs_request_enter(req); mutex_lock(&xb_write_mutex); @@ -310,6 +312,7 @@ static void *xs_talkv(struct xenbus_transaction t, req->num_vecs = num_vecs; req->cb = xs_wake_up; + msg.req_id = 0; msg.tx_id = t.id; msg.type = type; msg.len = 0; -- cgit v1.2.3 From 63e708f826bb21470155d37b103a75d8a9e25b18 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 
7 Feb 2018 18:49:23 -0500 Subject: x86/xen: Calculate __max_logical_packages on PV domains The kernel panics on PV domains because native_smp_cpus_done() is only called for HVM domains. Calculate __max_logical_packages for PV domains. Fixes: b4c0a7326f5d ("x86/smpboot: Fix __max_logical_packages estimate") Signed-off-by: Prarit Bhargava Tested-and-reported-by: Simon Gaiser Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Dou Liyang Cc: Prarit Bhargava Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Andy Lutomirski Cc: Andi Kleen Cc: Vitaly Kuznetsov Cc: xen-devel@lists.xenproject.org Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 10 ++++++++-- arch/x86/xen/smp.c | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 461f53d27708..a4189762b266 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -129,6 +129,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); +void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index cfc61e1d45e2..9eee25d07586 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1281,11 +1281,10 @@ void __init native_smp_prepare_boot_cpu(void) cpu_set_state_online(me); } -void __init native_smp_cpus_done(unsigned int max_cpus) +void __init calculate_max_logical_packages(void) { int ncpus; - pr_debug("Boot done\n"); /* * Today neither Intel nor AMD support heterogenous systems so * extrapolate the 
boot cpu's data to all packages. @@ -1293,6 +1292,13 @@ void __init native_smp_cpus_done(unsigned int max_cpus) ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); +} + +void __init native_smp_cpus_done(unsigned int max_cpus) +{ + pr_debug("Boot done\n"); + + calculate_max_logical_packages(); if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 77c959cf81e7..7a43b2ae19f1 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -122,6 +122,8 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); + else + calculate_max_logical_packages(); if (xen_have_vcpu_info_placement) return; -- cgit v1.2.3 From 64d6871827b1e2ac8c9daf49f2c883378c7d50cd Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 14 Feb 2018 10:28:23 -0800 Subject: pvcalls-front: introduce a per sock_mapping refcount Introduce a per sock_mapping refcount, in addition to the existing global refcount. Thanks to the sock_mapping refcount, we can safely wait for it to be 1 in pvcalls_front_release before freeing an active socket, instead of waiting for the global refcount to be 1. 
Signed-off-by: Stefano Stabellini Acked-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/pvcalls-front.c | 191 ++++++++++++++++++-------------------------- 1 file changed, 79 insertions(+), 112 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 753d9cb437d0..11ce470b41a5 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -60,6 +60,7 @@ struct sock_mapping { bool active_socket; struct list_head list; struct socket *sock; + atomic_t refcount; union { struct { int irq; @@ -93,6 +94,32 @@ struct sock_mapping { }; }; +static inline struct sock_mapping *pvcalls_enter_sock(struct socket *sock) +{ + struct sock_mapping *map; + + if (!pvcalls_front_dev || + dev_get_drvdata(&pvcalls_front_dev->dev) == NULL) + return ERR_PTR(-ENOTCONN); + + map = (struct sock_mapping *)sock->sk->sk_send_head; + if (map == NULL) + return ERR_PTR(-ENOTSOCK); + + pvcalls_enter(); + atomic_inc(&map->refcount); + return map; +} + +static inline void pvcalls_exit_sock(struct socket *sock) +{ + struct sock_mapping *map; + + map = (struct sock_mapping *)sock->sk->sk_send_head; + atomic_dec(&map->refcount); + pvcalls_exit(); +} + static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) { *req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); @@ -369,31 +396,23 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *)sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - spin_lock(&bedata->socket_lock); ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + 
pvcalls_exit_sock(sock); return ret; } ret = create_active(map, &evtchn); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -423,7 +442,7 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, smp_rmb(); ret = bedata->rsp[req_id].ret; bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -488,23 +507,15 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB)) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - mutex_lock(&map->active.out_mutex); if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) { mutex_unlock(&map->active.out_mutex); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EAGAIN; } if (len > INT_MAX) @@ -526,7 +537,7 @@ again: tot_sent = sent; mutex_unlock(&map->active.out_mutex); - pvcalls_exit(); + pvcalls_exit_sock(sock); return tot_sent; } @@ -591,19 +602,11 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC)) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - mutex_lock(&map->active.in_mutex); if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); @@ -623,7 +626,7 @@ int pvcalls_front_recvmsg(struct socket *sock, 
struct msghdr *msg, size_t len, ret = 0; mutex_unlock(&map->active.in_mutex); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -637,24 +640,16 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (map == NULL) { - pvcalls_exit(); - return -ENOTSOCK; - } - spin_lock(&bedata->socket_lock); ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } req = RING_GET_REQUEST(&bedata->ring, req_id); @@ -684,7 +679,7 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; map->passive.status = PVCALLS_STATUS_BIND; - pvcalls_exit(); + pvcalls_exit_sock(sock); return 0; } @@ -695,21 +690,13 @@ int pvcalls_front_listen(struct socket *sock, int backlog) struct xen_pvcalls_request *req; int notify, req_id, ret; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - if (map->passive.status != PVCALLS_STATUS_BIND) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EOPNOTSUPP; } @@ -717,7 +704,7 @@ int pvcalls_front_listen(struct socket *sock, int backlog) ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } req = RING_GET_REQUEST(&bedata->ring, req_id); @@ -741,7 
+728,7 @@ int pvcalls_front_listen(struct socket *sock, int backlog) bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; map->passive.status = PVCALLS_STATUS_LISTEN; - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -753,21 +740,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) struct xen_pvcalls_request *req; int notify, req_id, ret, evtchn, nonblock; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - if (map->passive.status != PVCALLS_STATUS_LISTEN) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EINVAL; } @@ -785,13 +764,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) goto received; } if (nonblock) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EAGAIN; } if (wait_event_interruptible(map->passive.inflight_accept_req, !test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags))) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EINTR; } } @@ -802,7 +781,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } map2 = kzalloc(sizeof(*map2), GFP_ATOMIC); @@ -810,7 +789,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -ENOMEM; } ret = create_active(map2, &evtchn); @@ -819,7 +798,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) 
clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } list_add_tail(&map2->list, &bedata->socket_mappings); @@ -841,13 +820,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) /* We could check if we have received a response before returning. */ if (nonblock) { WRITE_ONCE(map->passive.inflight_req_id, req_id); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EAGAIN; } if (wait_event_interruptible(bedata->inflight_req, READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EINTR; } /* read req_id, then the content */ @@ -862,7 +841,7 @@ received: clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); pvcalls_front_free_map(bedata, map2); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -ENOMEM; } newsock->sk->sk_send_head = (void *)map2; @@ -874,7 +853,7 @@ received: clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); wake_up(&map->passive.inflight_accept_req); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -965,23 +944,16 @@ __poll_t pvcalls_front_poll(struct file *file, struct socket *sock, struct sock_mapping *map; __poll_t ret; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) return EPOLLNVAL; - } bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return EPOLLNVAL; - } if (map->active_socket) ret = pvcalls_front_poll_active(file, bedata, map, wait); else ret = pvcalls_front_poll_passive(file, bedata, map, wait); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -995,25 +967,20 @@ int pvcalls_front_release(struct socket *sock) if (sock->sk == NULL) return 0; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -EIO; + map = 
pvcalls_enter_sock(sock); + if (IS_ERR(map)) { + if (PTR_ERR(map) == -ENOTCONN) + return -EIO; + else + return 0; } - bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (map == NULL) { - pvcalls_exit(); - return 0; - } - spin_lock(&bedata->socket_lock); ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } sock->sk->sk_send_head = NULL; @@ -1043,10 +1010,10 @@ int pvcalls_front_release(struct socket *sock) /* * We need to make sure that sendmsg/recvmsg on this socket have * not started before we've cleared sk_send_head here. The - * easiest (though not optimal) way to guarantee this is to see - * that no pvcall (other than us) is in progress. + * easiest way to guarantee this is to see that no pvcalls + * (other than us) is in progress on this socket. */ - while (atomic_read(&pvcalls_refcount) > 1) + while (atomic_read(&map->refcount) > 1) cpu_relax(); pvcalls_front_free_map(bedata, map); -- cgit v1.2.3 From d1a75e0896f5e9f5cb6a979caaea39f1f4b9feb1 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 14 Feb 2018 10:28:24 -0800 Subject: pvcalls-front: wait for other operations to return when release passive sockets Passive sockets can have ongoing operations on them; specifically, we have two wait_event_interruptible calls in pvcalls_front_accept. Add two wake_up calls in pvcalls_front_release, then wait for the potential waiters to return and release the sock_mapping refcount. 
Signed-off-by: Stefano Stabellini Acked-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/pvcalls-front.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 11ce470b41a5..aedbee3b2838 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1018,6 +1018,12 @@ int pvcalls_front_release(struct socket *sock) pvcalls_front_free_map(bedata, map); } else { + wake_up(&bedata->inflight_req); + wake_up(&map->passive.inflight_accept_req); + + while (atomic_read(&map->refcount) > 1) + cpu_relax(); + spin_lock(&bedata->socket_lock); list_del(&map->list); spin_unlock(&bedata->socket_lock); -- cgit v1.2.3