summaryrefslogtreecommitdiff
path: root/net/vmw_vsock
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-07-01 01:51:09 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-07-01 01:51:09 +0300
commitdbe69e43372212527abf48609aba7fc39a6daa27 (patch)
tree96cfafdf70f5325ceeac1054daf7deca339c9730 /net/vmw_vsock
parenta6eaf3850cb171c328a8b0db6d3c79286a1eba9d (diff)
parentb6df00789e2831fff7a2c65aa7164b2a4dcbe599 (diff)
downloadlinux-dbe69e43372212527abf48609aba7fc39a6daa27.tar.xz
Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - BPF: - add syscall program type and libbpf support for generating instructions and bindings for in-kernel BPF loaders (BPF loaders for BPF), this is a stepping stone for signed BPF programs - infrastructure to migrate TCP child sockets from one listener to another in the same reuseport group/map to improve flexibility of service hand-off/restart - add broadcast support to XDP redirect - allow bypass of the lockless qdisc to improving performance (for pktgen: +23% with one thread, +44% with 2 threads) - add a simpler version of "DO_ONCE()" which does not require jump labels, intended for slow-path usage - virtio/vsock: introduce SOCK_SEQPACKET support - add getsocketopt to retrieve netns cookie - ip: treat lowest address of a IPv4 subnet as ordinary unicast address allowing reclaiming of precious IPv4 addresses - ipv6: use prandom_u32() for ID generation - ip: add support for more flexible field selection for hashing across multi-path routes (w/ offload to mlxsw) - icmp: add support for extended RFC 8335 PROBE (ping) - seg6: add support for SRv6 End.DT46 behavior - mptcp: - DSS checksum support (RFC 8684) to detect middlebox meddling - support Connection-time 'C' flag - time stamping support - sctp: packetization Layer Path MTU Discovery (RFC 8899) - xfrm: speed up state addition with seq set - WiFi: - hidden AP discovery on 6 GHz and other HE 6 GHz improvements - aggregation handling improvements for some drivers - minstrel improvements for no-ack frames - deferred rate control for TXQs to improve reaction times - switch from round robin to virtual time-based airtime scheduler - add trace points: - tcp checksum errors - openvswitch - action execution, upcalls - socket errors via sk_error_report Device APIs: - devlink: add rate API for hierarchical control of max egress rate of virtual devices (VFs, SFs etc.) - don't require RCU read lock to be held around BPF hooks in NAPI context - page_pool: generic buffer recycling New hardware/drivers: - mobile: - iosm: PCIe Driver for Intel M.2 Modem - support for Qualcomm MSM8998 (ipa) - WiFi: Qualcomm QCN9074 and WCN6855 PCI devices - sparx5: Microchip SparX-5 family of Enterprise Ethernet switches - Mellanox BlueField Gigabit Ethernet (control NIC of the DPU) - NXP SJA1110 Automotive Ethernet 10-port switch - Qualcomm QCA8327 switch support (qca8k) - Mikrotik 10/25G NIC (atl1c) Driver changes: - ACPI support for some MDIO, MAC and PHY devices from Marvell and NXP (our first foray into MAC/PHY description via ACPI) - HW timestamping (PTP) support: bnxt_en, ice, sja1105, hns3, tja11xx - Mellanox/Nvidia NIC (mlx5) - NIC VF offload of L2 bridging - support IRQ distribution to Sub-functions - Marvell (prestera): - add flower and match all - devlink trap - link aggregation - Netronome (nfp): connection tracking offload - Intel 1GE (igc): add AF_XDP support - Marvell DPU (octeontx2): ingress ratelimit offload - Google vNIC (gve): new ring/descriptor format support - Qualcomm mobile (rmnet & ipa): inline checksum offload support - MediaTek WiFi (mt76) - mt7915 MSI support - mt7915 Tx status reporting - mt7915 thermal sensors support - mt7921 decapsulation offload - mt7921 enable runtime pm and deep sleep - Realtek WiFi (rtw88) - beacon filter support - Tx antenna path diversity support - firmware crash information via devcoredump - Qualcomm WiFi (wcn36xx) - Wake-on-WLAN support with magic packets and GTK rekeying - Micrel PHY (ksz886x/ksz8081): add cable test support" * tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2168 commits) tcp: change ICSK_CA_PRIV_SIZE definition tcp_yeah: check struct yeah size at compile time gve: DQO: Fix off by one in gve_rx_dqo() stmmac: intel: set PCI_D3hot in suspend stmmac: intel: Enable PHY WOL option in EHL net: stmmac: option to enable PHY WOL with PMT enabled net: say "local" instead of "static" addresses in ndo_dflt_fdb_{add,del} net: use netdev_info in ndo_dflt_fdb_{add,del} ptp: Set lookup cookie when creating a PTP PPS source. net: sock: add trace for socket errors net: sock: introduce sk_error_report net: dsa: replay the local bridge FDB entries pointing to the bridge dev too net: dsa: ensure during dsa_fdb_offload_notify that dev_hold and dev_put are on the same dev net: dsa: include fdb entries pointing to bridge in the host fdb list net: dsa: include bridge addresses which are local in the host fdb list net: dsa: sync static FDB entries on foreign interfaces to hardware net: dsa: install the host MDB and FDB entries in the master's RX filter net: dsa: reference count the FDB addresses at the cross-chip notifier level net: dsa: introduce a separate cross-chip notifier type for host FDBs net: dsa: reference count the MDB entries at the cross-chip notifier level ...
Diffstat (limited to 'net/vmw_vsock')
-rw-r--r--net/vmw_vsock/af_vsock.c470
-rw-r--r--net/vmw_vsock/virtio_transport.c30
-rw-r--r--net/vmw_vsock/virtio_transport_common.c178
-rw-r--r--net/vmw_vsock/vmci_transport.c6
-rw-r--r--net/vmw_vsock/vsock_loopback.c12
5 files changed, 509 insertions, 187 deletions
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 92a72f0e0d94..3e02cc3b24f8 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -415,8 +415,8 @@ static void vsock_deassign_transport(struct vsock_sock *vsk)
/* Assign a transport to a socket and call the .init transport callback.
*
- * Note: for stream socket this must be called when vsk->remote_addr is set
- * (e.g. during the connect() or when a connection request on a listener
+ * Note: for connection oriented socket this must be called when vsk->remote_addr
+ * is set (e.g. during the connect() or when a connection request on a listener
* socket is received).
* The vsk->remote_addr is used to decide which transport to use:
* - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
@@ -452,6 +452,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
new_transport = transport_dgram;
break;
case SOCK_STREAM:
+ case SOCK_SEQPACKET:
if (vsock_use_local_transport(remote_cid))
new_transport = transport_local;
else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
@@ -469,10 +470,10 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
return 0;
/* transport->release() must be called with sock lock acquired.
- * This path can only be taken during vsock_stream_connect(),
- * where we have already held the sock lock.
- * In the other cases, this function is called on a new socket
- * which is not assigned to any transport.
+ * This path can only be taken during vsock_connect(), where we
+ * have already held the sock lock. In the other cases, this
+ * function is called on a new socket which is not assigned to
+ * any transport.
*/
vsk->transport->release(vsk);
vsock_deassign_transport(vsk);
@@ -484,6 +485,14 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
if (!new_transport || !try_module_get(new_transport->module))
return -ENODEV;
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ if (!new_transport->seqpacket_allow ||
+ !new_transport->seqpacket_allow(remote_cid)) {
+ module_put(new_transport->module);
+ return -ESOCKTNOSUPPORT;
+ }
+ }
+
ret = new_transport->init(vsk, psk);
if (ret) {
module_put(new_transport->module);
@@ -604,8 +613,8 @@ out:
/**** SOCKET OPERATIONS ****/
-static int __vsock_bind_stream(struct vsock_sock *vsk,
- struct sockaddr_vm *addr)
+static int __vsock_bind_connectible(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
{
static u32 port;
struct sockaddr_vm new_addr;
@@ -649,9 +658,10 @@ static int __vsock_bind_stream(struct vsock_sock *vsk,
vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
- /* Remove stream sockets from the unbound list and add them to the hash
- * table for easy lookup by its address. The unbound list is simply an
- * extra entry at the end of the hash table, a trick used by AF_UNIX.
+ /* Remove connection oriented sockets from the unbound list and add them
+ * to the hash table for easy lookup by its address. The unbound list
+ * is simply an extra entry at the end of the hash table, a trick used
+ * by AF_UNIX.
*/
__vsock_remove_bound(vsk);
__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
@@ -684,8 +694,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
switch (sk->sk_socket->type) {
case SOCK_STREAM:
+ case SOCK_SEQPACKET:
spin_lock_bh(&vsock_table_lock);
- retval = __vsock_bind_stream(vsk, addr);
+ retval = __vsock_bind_connectible(vsk, addr);
spin_unlock_bh(&vsock_table_lock);
break;
@@ -768,6 +779,11 @@ static struct sock *__vsock_create(struct net *net,
return sk;
}
+static bool sock_type_connectible(u16 type)
+{
+ return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET);
+}
+
static void __vsock_release(struct sock *sk, int level)
{
if (sk) {
@@ -786,7 +802,7 @@ static void __vsock_release(struct sock *sk, int level)
if (vsk->transport)
vsk->transport->release(vsk);
- else if (sk->sk_type == SOCK_STREAM)
+ else if (sock_type_connectible(sk->sk_type))
vsock_remove_sock(vsk);
sock_orphan(sk);
@@ -844,6 +860,16 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);
+static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
+{
+ struct sock *sk = sk_vsock(vsk);
+
+ if (sk->sk_type == SOCK_SEQPACKET)
+ return vsk->transport->seqpacket_has_data(vsk);
+ else
+ return vsock_stream_has_data(vsk);
+}
+
s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
return vsk->transport->stream_has_space(vsk);
@@ -937,10 +963,10 @@ static int vsock_shutdown(struct socket *sock, int mode)
if ((mode & ~SHUTDOWN_MASK) || !mode)
return -EINVAL;
- /* If this is a STREAM socket and it is not connected then bail out
- * immediately. If it is a DGRAM socket then we must first kick the
- * socket so that it wakes up from any sleeping calls, for example
- * recv(), and then afterwards return the error.
+ /* If this is a connection oriented socket and it is not connected then
+ * bail out immediately. If it is a DGRAM socket then we must first
+ * kick the socket so that it wakes up from any sleeping calls, for
+ * example recv(), and then afterwards return the error.
*/
sk = sock->sk;
@@ -948,7 +974,7 @@ static int vsock_shutdown(struct socket *sock, int mode)
lock_sock(sk);
if (sock->state == SS_UNCONNECTED) {
err = -ENOTCONN;
- if (sk->sk_type == SOCK_STREAM)
+ if (sock_type_connectible(sk->sk_type))
goto out;
} else {
sock->state = SS_DISCONNECTING;
@@ -961,7 +987,7 @@ static int vsock_shutdown(struct socket *sock, int mode)
sk->sk_shutdown |= mode;
sk->sk_state_change(sk);
- if (sk->sk_type == SOCK_STREAM) {
+ if (sock_type_connectible(sk->sk_type)) {
sock_reset_flag(sk, SOCK_DONE);
vsock_send_shutdown(sk, mode);
}
@@ -1016,7 +1042,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
if (!(sk->sk_shutdown & SEND_SHUTDOWN))
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
- } else if (sock->type == SOCK_STREAM) {
+ } else if (sock_type_connectible(sk->sk_type)) {
const struct vsock_transport *transport;
lock_sock(sk);
@@ -1255,7 +1281,7 @@ static void vsock_connect_timeout(struct work_struct *work)
(sk->sk_shutdown != SHUTDOWN_MASK)) {
sk->sk_state = TCP_CLOSE;
sk->sk_err = ETIMEDOUT;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
vsock_transport_cancel_pkt(vsk);
}
release_sock(sk);
@@ -1263,8 +1289,8 @@ static void vsock_connect_timeout(struct work_struct *work)
sock_put(sk);
}
-static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
- int addr_len, int flags)
+static int vsock_connect(struct socket *sock, struct sockaddr *addr,
+ int addr_len, int flags)
{
int err;
struct sock *sk;
@@ -1369,7 +1395,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
- sk->sk_state = TCP_CLOSE;
+ sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE;
sock->state = SS_UNCONNECTED;
vsock_transport_cancel_pkt(vsk);
goto out_wait;
@@ -1414,7 +1440,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
lock_sock(listener);
- if (sock->type != SOCK_STREAM) {
+ if (!sock_type_connectible(sock->type)) {
err = -EOPNOTSUPP;
goto out;
}
@@ -1491,7 +1517,7 @@ static int vsock_listen(struct socket *sock, int backlog)
lock_sock(sk);
- if (sock->type != SOCK_STREAM) {
+ if (!sock_type_connectible(sk->sk_type)) {
err = -EOPNOTSUPP;
goto out;
}
@@ -1535,11 +1561,11 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk,
vsk->buffer_size = val;
}
-static int vsock_stream_setsockopt(struct socket *sock,
- int level,
- int optname,
- sockptr_t optval,
- unsigned int optlen)
+static int vsock_connectible_setsockopt(struct socket *sock,
+ int level,
+ int optname,
+ sockptr_t optval,
+ unsigned int optlen)
{
int err;
struct sock *sk;
@@ -1617,10 +1643,10 @@ exit:
return err;
}
-static int vsock_stream_getsockopt(struct socket *sock,
- int level, int optname,
- char __user *optval,
- int __user *optlen)
+static int vsock_connectible_getsockopt(struct socket *sock,
+ int level, int optname,
+ char __user *optval,
+ int __user *optlen)
{
int err;
int len;
@@ -1688,8 +1714,8 @@ static int vsock_stream_getsockopt(struct socket *sock,
return 0;
}
-static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
- size_t len)
+static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
{
struct sock *sk;
struct vsock_sock *vsk;
@@ -1712,7 +1738,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
transport = vsk->transport;
- /* Callers should not provide a destination with stream sockets. */
+ /* Callers should not provide a destination with connection oriented
+ * sockets.
+ */
if (msg->msg_namelen) {
err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
goto out;
@@ -1803,9 +1831,13 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
* responsibility to check how many bytes we were able to send.
*/
- written = transport->stream_enqueue(
- vsk, msg,
- len - total_written);
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ written = transport->seqpacket_enqueue(vsk,
+ msg, len - total_written);
+ } else {
+ written = transport->stream_enqueue(vsk,
+ msg, len - total_written);
+ }
if (written < 0) {
err = -ENOMEM;
goto out_err;
@@ -1821,72 +1853,98 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
}
out_err:
- if (total_written > 0)
- err = total_written;
+ if (total_written > 0) {
+ /* Return number of written bytes only if:
+ * 1) SOCK_STREAM socket.
+ * 2) SOCK_SEQPACKET socket when whole buffer is sent.
+ */
+ if (sk->sk_type == SOCK_STREAM || total_written == len)
+ err = total_written;
+ }
out:
release_sock(sk);
return err;
}
-
-static int
-vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
- int flags)
+static int vsock_connectible_wait_data(struct sock *sk,
+ struct wait_queue_entry *wait,
+ long timeout,
+ struct vsock_transport_recv_notify_data *recv_data,
+ size_t target)
{
- struct sock *sk;
- struct vsock_sock *vsk;
const struct vsock_transport *transport;
+ struct vsock_sock *vsk;
+ s64 data;
int err;
- size_t target;
- ssize_t copied;
- long timeout;
- struct vsock_transport_recv_notify_data recv_data;
- DEFINE_WAIT(wait);
-
- sk = sock->sk;
vsk = vsock_sk(sk);
err = 0;
+ transport = vsk->transport;
- lock_sock(sk);
+ while ((data = vsock_connectible_has_data(vsk)) == 0) {
+ prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
- transport = vsk->transport;
+ if (sk->sk_err != 0 ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ (vsk->peer_shutdown & SEND_SHUTDOWN)) {
+ break;
+ }
- if (!transport || sk->sk_state != TCP_ESTABLISHED) {
- /* Recvmsg is supposed to return 0 if a peer performs an
- * orderly shutdown. Differentiate between that case and when a
- * peer has not connected or a local shutdown occurred with the
- * SOCK_DONE flag.
- */
- if (sock_flag(sk, SOCK_DONE))
- err = 0;
- else
- err = -ENOTCONN;
+ /* Don't wait for non-blocking sockets. */
+ if (timeout == 0) {
+ err = -EAGAIN;
+ break;
+ }
- goto out;
- }
+ if (recv_data) {
+ err = transport->notify_recv_pre_block(vsk, target, recv_data);
+ if (err < 0)
+ break;
+ }
- if (flags & MSG_OOB) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
- /* We don't check peer_shutdown flag here since peer may actually shut
- * down, but there can be data in the queue that a local socket can
- * receive.
- */
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- err = 0;
- goto out;
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeout);
+ break;
+ } else if (timeout == 0) {
+ err = -EAGAIN;
+ break;
+ }
}
- /* It is valid on Linux to pass in a zero-length receive buffer. This
- * is not an error. We may as well bail out now.
+ finish_wait(sk_sleep(sk), wait);
+
+ if (err)
+ return err;
+
+ /* Internal transport error when checking for available
+ * data. XXX This should be changed to a connection
+ * reset in a later change.
*/
- if (!len) {
- err = 0;
- goto out;
- }
+ if (data < 0)
+ return -ENOMEM;
+
+ return data;
+}
+
+static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct vsock_transport_recv_notify_data recv_data;
+ const struct vsock_transport *transport;
+ struct vsock_sock *vsk;
+ ssize_t copied;
+ size_t target;
+ long timeout;
+ int err;
+
+ DEFINE_WAIT(wait);
+
+ vsk = vsock_sk(sk);
+ transport = vsk->transport;
/* We must not copy less than target bytes into the user's buffer
* before returning successfully, so we wait for the consume queue to
@@ -1908,94 +1966,158 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
while (1) {
- s64 ready;
+ ssize_t read;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- ready = vsock_stream_has_data(vsk);
+ err = vsock_connectible_wait_data(sk, &wait, timeout,
+ &recv_data, target);
+ if (err <= 0)
+ break;
- if (ready == 0) {
- if (sk->sk_err != 0 ||
- (sk->sk_shutdown & RCV_SHUTDOWN) ||
- (vsk->peer_shutdown & SEND_SHUTDOWN)) {
- finish_wait(sk_sleep(sk), &wait);
- break;
- }
- /* Don't wait for non-blocking sockets. */
- if (timeout == 0) {
- err = -EAGAIN;
- finish_wait(sk_sleep(sk), &wait);
- break;
- }
+ err = transport->notify_recv_pre_dequeue(vsk, target,
+ &recv_data);
+ if (err < 0)
+ break;
- err = transport->notify_recv_pre_block(
- vsk, target, &recv_data);
- if (err < 0) {
- finish_wait(sk_sleep(sk), &wait);
- break;
- }
- release_sock(sk);
- timeout = schedule_timeout(timeout);
- lock_sock(sk);
+ read = transport->stream_dequeue(vsk, msg, len - copied, flags);
+ if (read < 0) {
+ err = -ENOMEM;
+ break;
+ }
- if (signal_pending(current)) {
- err = sock_intr_errno(timeout);
- finish_wait(sk_sleep(sk), &wait);
- break;
- } else if (timeout == 0) {
- err = -EAGAIN;
- finish_wait(sk_sleep(sk), &wait);
- break;
- }
- } else {
- ssize_t read;
+ copied += read;
- finish_wait(sk_sleep(sk), &wait);
+ err = transport->notify_recv_post_dequeue(vsk, target, read,
+ !(flags & MSG_PEEK), &recv_data);
+ if (err < 0)
+ goto out;
- if (ready < 0) {
- /* Invalid queue pair content. XXX This should
- * be changed to a connection reset in a later
- * change.
- */
+ if (read >= target || flags & MSG_PEEK)
+ break;
- err = -ENOMEM;
- goto out;
- }
+ target -= read;
+ }
- err = transport->notify_recv_pre_dequeue(
- vsk, target, &recv_data);
- if (err < 0)
- break;
+ if (sk->sk_err)
+ err = -sk->sk_err;
+ else if (sk->sk_shutdown & RCV_SHUTDOWN)
+ err = 0;
- read = transport->stream_dequeue(
- vsk, msg,
- len - copied, flags);
- if (read < 0) {
- err = -ENOMEM;
- break;
- }
+ if (copied > 0)
+ err = copied;
+
+out:
+ return err;
+}
- copied += read;
+static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags)
+{
+ const struct vsock_transport *transport;
+ struct vsock_sock *vsk;
+ ssize_t record_len;
+ long timeout;
+ int err = 0;
+ DEFINE_WAIT(wait);
- err = transport->notify_recv_post_dequeue(
- vsk, target, read,
- !(flags & MSG_PEEK), &recv_data);
- if (err < 0)
- goto out;
+ vsk = vsock_sk(sk);
+ transport = vsk->transport;
- if (read >= target || flags & MSG_PEEK)
- break;
+ timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
- target -= read;
- }
+ err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0);
+ if (err <= 0)
+ goto out;
+
+ record_len = transport->seqpacket_dequeue(vsk, msg, flags);
+
+ if (record_len < 0) {
+ err = -ENOMEM;
+ goto out;
}
- if (sk->sk_err)
+ if (sk->sk_err) {
err = -sk->sk_err;
- else if (sk->sk_shutdown & RCV_SHUTDOWN)
+ } else if (sk->sk_shutdown & RCV_SHUTDOWN) {
err = 0;
+ } else {
+ /* User sets MSG_TRUNC, so return real length of
+ * packet.
+ */
+ if (flags & MSG_TRUNC)
+ err = record_len;
+ else
+ err = len - msg_data_left(msg);
- if (copied > 0)
- err = copied;
+ /* Always set MSG_TRUNC if real length of packet is
+ * bigger than user's buffer.
+ */
+ if (record_len > len)
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+out:
+ return err;
+}
+
+static int
+vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
+ int err;
+
+ DEFINE_WAIT(wait);
+
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+ err = 0;
+
+ lock_sock(sk);
+
+ transport = vsk->transport;
+
+ if (!transport || sk->sk_state != TCP_ESTABLISHED) {
+ /* Recvmsg is supposed to return 0 if a peer performs an
+ * orderly shutdown. Differentiate between that case and when a
+ * peer has not connected or a local shutdown occurred with the
+ * SOCK_DONE flag.
+ */
+ if (sock_flag(sk, SOCK_DONE))
+ err = 0;
+ else
+ err = -ENOTCONN;
+
+ goto out;
+ }
+
+ if (flags & MSG_OOB) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* We don't check peer_shutdown flag here since peer may actually shut
+ * down, but there can be data in the queue that a local socket can
+ * receive.
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ err = 0;
+ goto out;
+ }
+
+ /* It is valid on Linux to pass in a zero-length receive buffer. This
+ * is not an error. We may as well bail out now.
+ */
+ if (!len) {
+ err = 0;
+ goto out;
+ }
+
+ if (sk->sk_type == SOCK_STREAM)
+ err = __vsock_stream_recvmsg(sk, msg, len, flags);
+ else
+ err = __vsock_seqpacket_recvmsg(sk, msg, len, flags);
out:
release_sock(sk);
@@ -2007,7 +2129,7 @@ static const struct proto_ops vsock_stream_ops = {
.owner = THIS_MODULE,
.release = vsock_release,
.bind = vsock_bind,
- .connect = vsock_stream_connect,
+ .connect = vsock_connect,
.socketpair = sock_no_socketpair,
.accept = vsock_accept,
.getname = vsock_getname,
@@ -2015,10 +2137,31 @@ static const struct proto_ops vsock_stream_ops = {
.ioctl = sock_no_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
- .setsockopt = vsock_stream_setsockopt,
- .getsockopt = vsock_stream_getsockopt,
- .sendmsg = vsock_stream_sendmsg,
- .recvmsg = vsock_stream_recvmsg,
+ .setsockopt = vsock_connectible_setsockopt,
+ .getsockopt = vsock_connectible_getsockopt,
+ .sendmsg = vsock_connectible_sendmsg,
+ .recvmsg = vsock_connectible_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct proto_ops vsock_seqpacket_ops = {
+ .family = PF_VSOCK,
+ .owner = THIS_MODULE,
+ .release = vsock_release,
+ .bind = vsock_bind,
+ .connect = vsock_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = vsock_accept,
+ .getname = vsock_getname,
+ .poll = vsock_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = vsock_listen,
+ .shutdown = vsock_shutdown,
+ .setsockopt = vsock_connectible_setsockopt,
+ .getsockopt = vsock_connectible_getsockopt,
+ .sendmsg = vsock_connectible_sendmsg,
+ .recvmsg = vsock_connectible_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
@@ -2043,6 +2186,9 @@ static int vsock_create(struct net *net, struct socket *sock,
case SOCK_STREAM:
sock->ops = &vsock_stream_ops;
break;
+ case SOCK_SEQPACKET:
+ sock->ops = &vsock_seqpacket_ops;
+ break;
default:
return -ESOCKTNOSUPPORT;
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 2700a63ab095..e0c2c992ad9c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -62,6 +62,7 @@ struct virtio_vsock {
struct virtio_vsock_event event_list[8];
u32 guest_cid;
+ bool seqpacket_allow;
};
static u32 virtio_transport_get_local_cid(void)
@@ -359,7 +360,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
lock_sock(sk);
sk->sk_state = TCP_CLOSE;
sk->sk_err = ECONNRESET;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
release_sock(sk);
}
@@ -443,6 +444,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}
+static bool virtio_transport_seqpacket_allow(u32 remote_cid);
+
static struct virtio_transport virtio_transport = {
.transport = {
.module = THIS_MODULE,
@@ -469,6 +472,11 @@ static struct virtio_transport virtio_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,
+ .seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
+ .seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
+ .seqpacket_allow = virtio_transport_seqpacket_allow,
+ .seqpacket_has_data = virtio_transport_seqpacket_has_data,
+
.notify_poll_in = virtio_transport_notify_poll_in,
.notify_poll_out = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
@@ -485,6 +493,21 @@ static struct virtio_transport virtio_transport = {
.send_pkt = virtio_transport_send_pkt,
};
+static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+{
+ struct virtio_vsock *vsock;
+ bool seqpacket_allow;
+
+ seqpacket_allow = false;
+ rcu_read_lock();
+ vsock = rcu_dereference(the_virtio_vsock);
+ if (vsock)
+ seqpacket_allow = vsock->seqpacket_allow;
+ rcu_read_unlock();
+
+ return seqpacket_allow;
+}
+
static void virtio_transport_rx_work(struct work_struct *work)
{
struct virtio_vsock *vsock =
@@ -608,10 +631,14 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->event_run = true;
mutex_unlock(&vsock->event_lock);
+ if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
+ vsock->seqpacket_allow = true;
+
vdev->priv = vsock;
rcu_assign_pointer(the_virtio_vsock, vsock);
mutex_unlock(&the_virtio_vsock_mutex);
+
return 0;
out:
@@ -695,6 +722,7 @@ static struct virtio_device_id id_table[] = {
};
static unsigned int features[] = {
+ VIRTIO_VSOCK_F_SEQPACKET
};
static struct virtio_driver virtio_vsock_driver = {
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 902cb6dd710b..169ba8b72a63 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -74,6 +74,10 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
err = memcpy_from_msg(pkt->buf, info->msg, len);
if (err)
goto out;
+
+ if (msg_data_left(info->msg) == 0 &&
+ info->type == VIRTIO_VSOCK_TYPE_SEQPACKET)
+ pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
}
trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -165,6 +169,14 @@ void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt)
}
EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt);
+static u16 virtio_transport_get_type(struct sock *sk)
+{
+ if (sk->sk_type == SOCK_STREAM)
+ return VIRTIO_VSOCK_TYPE_STREAM;
+ else
+ return VIRTIO_VSOCK_TYPE_SEQPACKET;
+}
+
/* This function can only be used on connecting/connected sockets,
* since a socket assigned to a transport is required.
*
@@ -179,6 +191,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
struct virtio_vsock_pkt *pkt;
u32 pkt_len = info->pkt_len;
+ info->type = virtio_transport_get_type(sk_vsock(vsk));
+
t_ops = virtio_transport_get_ops(vsk);
if (unlikely(!t_ops))
return -EFAULT;
@@ -269,13 +283,10 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
}
EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
-static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
- int type,
- struct virtio_vsock_hdr *hdr)
+static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
{
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
- .type = type,
.vsk = vsk,
};
@@ -383,11 +394,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
* messages, we set the limit to a high value. TODO: experiment
* with different values.
*/
- if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
- virtio_transport_send_credit_update(vsk,
- VIRTIO_VSOCK_TYPE_STREAM,
- NULL);
- }
+ if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+ virtio_transport_send_credit_update(vsk);
return total;
@@ -397,6 +405,75 @@ out:
return err;
}
+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ int flags)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ struct virtio_vsock_pkt *pkt;
+ int dequeued_len = 0;
+ size_t user_buf_len = msg_data_left(msg);
+ bool msg_ready = false;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ if (vvs->msg_count == 0) {
+ spin_unlock_bh(&vvs->rx_lock);
+ return 0;
+ }
+
+ while (!msg_ready) {
+ pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
+
+ if (dequeued_len >= 0) {
+ size_t pkt_len;
+ size_t bytes_to_copy;
+
+ pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
+ bytes_to_copy = min(user_buf_len, pkt_len);
+
+ if (bytes_to_copy) {
+ int err;
+
+ /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
+ if (err) {
+ /* Copy of message failed. Rest of
+ * fragments will be freed without copy.
+ */
+ dequeued_len = err;
+ } else {
+ user_buf_len -= bytes_to_copy;
+ }
+
+ spin_lock_bh(&vvs->rx_lock);
+ }
+
+ if (dequeued_len >= 0)
+ dequeued_len += pkt_len;
+ }
+
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+ msg_ready = true;
+ vvs->msg_count--;
+ }
+
+ virtio_transport_dec_rx_pkt(vvs, pkt);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+
+ spin_unlock_bh(&vvs->rx_lock);
+
+ virtio_transport_send_credit_update(vsk);
+
+ return dequeued_len;
+}
+
ssize_t
virtio_transport_stream_dequeue(struct vsock_sock *vsk,
struct msghdr *msg,
@@ -409,6 +486,38 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk,
}
EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ int flags)
+{
+ if (flags & MSG_PEEK)
+ return -EOPNOTSUPP;
+
+ return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);
+
+int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ spin_lock_bh(&vvs->tx_lock);
+
+ if (len > vvs->peer_buf_alloc) {
+ spin_unlock_bh(&vvs->tx_lock);
+ return -EMSGSIZE;
+ }
+
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return virtio_transport_stream_enqueue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue);
+
int
virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
struct msghdr *msg,
@@ -431,6 +540,19 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
}
EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
+u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ u32 msg_count;
+
+ spin_lock_bh(&vvs->rx_lock);
+ msg_count = vvs->msg_count;
+ spin_unlock_bh(&vvs->rx_lock);
+
+ return msg_count;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data);
+
static s64 virtio_transport_has_space(struct vsock_sock *vsk)
{
struct virtio_vsock_sock *vvs = vsk->trans;
@@ -496,8 +618,7 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val)
vvs->buf_alloc = *val;
- virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
- NULL);
+ virtio_transport_send_credit_update(vsk);
}
EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size);
@@ -624,7 +745,6 @@ int virtio_transport_connect(struct vsock_sock *vsk)
{
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_REQUEST,
- .type = VIRTIO_VSOCK_TYPE_STREAM,
.vsk = vsk,
};
@@ -636,7 +756,6 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
{
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_SHUTDOWN,
- .type = VIRTIO_VSOCK_TYPE_STREAM,
.flags = (mode & RCV_SHUTDOWN ?
VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
(mode & SEND_SHUTDOWN ?
@@ -665,7 +784,6 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
{
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RW,
- .type = VIRTIO_VSOCK_TYPE_STREAM,
.msg = msg,
.pkt_len = len,
.vsk = vsk,
@@ -688,7 +806,6 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
{
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RST,
- .type = VIRTIO_VSOCK_TYPE_STREAM,
.reply = !!pkt,
.vsk = vsk,
};
@@ -848,7 +965,7 @@ void virtio_transport_release(struct vsock_sock *vsk)
struct sock *sk = &vsk->sk;
bool remove_sock = true;
- if (sk->sk_type == SOCK_STREAM)
+ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
remove_sock = virtio_transport_close(vsk);
if (remove_sock) {
@@ -890,7 +1007,7 @@ destroy:
virtio_transport_reset(vsk, pkt);
sk->sk_state = TCP_CLOSE;
sk->sk_err = skerr;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return err;
}
@@ -912,6 +1029,9 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
goto out;
}
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
+ vvs->msg_count++;
+
/* Try to copy small packets into the buffer of last packet queued,
* to avoid wasting memory queueing the entire buffer with a small
* payload.
@@ -923,13 +1043,18 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
struct virtio_vsock_pkt, list);
/* If there is space in the last packet queued, we copy the
- * new packet in its buffer.
+ * new packet in its buffer. We avoid this if the last packet
+ * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is
+ * delimiter of SEQPACKET record, so 'pkt' is the first packet
+ * of a new record.
*/
- if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
+ if ((pkt->len <= last_pkt->buf_len - last_pkt->len) &&
+ !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) {
memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
pkt->len);
last_pkt->len += pkt->len;
free_pkt = true;
+ last_pkt->hdr.flags |= pkt->hdr.flags;
goto out;
}
}
@@ -1000,7 +1125,6 @@ virtio_transport_send_response(struct vsock_sock *vsk,
{
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RESPONSE,
- .type = VIRTIO_VSOCK_TYPE_STREAM,
.remote_cid = le64_to_cpu(pkt->hdr.src_cid),
.remote_port = le32_to_cpu(pkt->hdr.src_port),
.reply = true,
@@ -1096,6 +1220,12 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
return 0;
}
+static bool virtio_transport_valid_type(u16 type)
+{
+ return (type == VIRTIO_VSOCK_TYPE_STREAM) ||
+ (type == VIRTIO_VSOCK_TYPE_SEQPACKET);
+}
+
/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
* lock.
*/
@@ -1121,7 +1251,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
le32_to_cpu(pkt->hdr.buf_alloc),
le32_to_cpu(pkt->hdr.fwd_cnt));
- if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+ if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) {
(void)virtio_transport_reset_no_sock(t, pkt);
goto free_pkt;
}
@@ -1138,6 +1268,12 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
}
}
+ if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) {
+ (void)virtio_transport_reset_no_sock(t, pkt);
+ sock_put(sk);
+ goto free_pkt;
+ }
+
vsk = vsock_sk(sk);
lock_sock(sk);
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index c99bc4ce78e2..7aef34e32bdf 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -831,7 +831,7 @@ static void vmci_transport_handle_detach(struct sock *sk)
sk->sk_state = TCP_CLOSE;
sk->sk_err = ECONNRESET;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return;
}
sk->sk_state = TCP_CLOSE;
@@ -1248,7 +1248,7 @@ vmci_transport_recv_connecting_server(struct sock *listener,
vsock_remove_pending(listener, pending);
vsock_enqueue_accept(listener, pending);
- /* Callers of accept() will be be waiting on the listening socket, not
+ /* Callers of accept() will be waiting on the listening socket, not
* the pending socket.
*/
listener->sk_data_ready(listener);
@@ -1365,7 +1365,7 @@ destroy:
sk->sk_state = TCP_CLOSE;
sk->sk_err = skerr;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return err;
}
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index a45f7ffca8c5..169a8cf65b39 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -63,6 +63,8 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
return 0;
}
+static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+
static struct virtio_transport loopback_transport = {
.transport = {
.module = THIS_MODULE,
@@ -89,6 +91,11 @@ static struct virtio_transport loopback_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,
+ .seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
+ .seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
+ .seqpacket_allow = vsock_loopback_seqpacket_allow,
+ .seqpacket_has_data = virtio_transport_seqpacket_has_data,
+
.notify_poll_in = virtio_transport_notify_poll_in,
.notify_poll_out = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
@@ -105,6 +112,11 @@ static struct virtio_transport loopback_transport = {
.send_pkt = vsock_loopback_send_pkt,
};
+static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
+{
+ return true;
+}
+
static void vsock_loopback_work(struct work_struct *work)
{
struct vsock_loopback *vsock =