From 5e7b30205cef80f6bb922e61834437ca7bff5837 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 4 Aug 2020 22:50:56 -0700 Subject: bpf: Change uapi for bpf iterator map elements Commit a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") added bpf iterator support for map elements. The map element bpf iterator requires info to identify a particular map. In the above commit, the attr->link_create.target_fd is used to carry map_fd and an enum bpf_iter_link_info is added to the uapi to specify that the target_fd actually represents a map_fd: enum bpf_iter_link_info { BPF_ITER_LINK_UNSPEC = 0, BPF_ITER_LINK_MAP_FD = 1, MAX_BPF_ITER_LINK_INFO, }; This is an extensible approach, as we can add enumerators for pid, cgroup_id, etc. and unionize target_fd for them. But in the future, more complex customization may be needed; e.g., for tasks, iteration could be filtered based on both cgroup_id and user_id. This patch changes the uapi to carry additional iter_info for link_create through two fields: __aligned_u64 iter_info; __u32 iter_info_len; The iter_info is defined as union bpf_iter_link_info { struct { __u32 map_fd; } map; }; so future extension for additional customization will be easier. The bpf_iter_link_info will be passed to the target callback to validate, and the generic bpf_iter framework does not need to deal with it any more. Note that map_fd = 0 will be considered invalid and -EBADF will be returned to user space. Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com --- net/core/bpf_sk_storage.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d3377c90a291..b988f48153a4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -1384,18 +1384,39 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, return 0; } -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { - struct bpf_map *map = aux->map; + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) - return -EINVAL; + goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) - return -EACCES; + if (prog->aux->max_rdonly_access > map->value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { @@ -1414,8 +1435,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), -- cgit v1.2.3
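For illustration, a minimal user-space sketch of creating a map-element iterator link with the new fields. This is not part of the patch; the raw bpf(2) syscall usage and error handling are assumptions of this sketch, with prog_fd assumed to be a loaded BPF_TRACE_ITER program and map_fd a BPF_MAP_TYPE_SK_STORAGE map fd:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int iter_link_create(int prog_fd, int map_fd)
{
	union bpf_iter_link_info linfo = {};	/* new uapi union */
	union bpf_attr attr = {};

	linfo.map.map_fd = map_fd;	/* map_fd == 0 is rejected with -EBADF */

	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_ITER;
	attr.link_create.iter_info = (__u64)(unsigned long)&linfo;
	attr.link_create.iter_info_len = sizeof(linfo);

	/* returns a link fd on success, a negative errno on failure */
	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}

From 7ee2492635d862fac39bc5ef234ffd3d8e9f833b Mon Sep 17 00:00:00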
2001 From: Paolo Abeni Date: Fri, 7 Aug 2020 19:03:53 +0200 Subject: mptcp: fix warn at shutdown time for unaccepted msk sockets With commit b93df08ccda3 ("mptcp: explicitly track the fully established status"), the status of unaccepted mptcp sockets closed in mptcp_sock_destruct() changes from TCP_SYN_RECV to TCP_ESTABLISHED. As a result, mptcp_sock_destruct() does not perform the proper cleanup and inet_sock_destruct() will later emit a warning. Address the issue by updating the condition tested in mptcp_sock_destruct(). Also update the related comment. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/66 Reported-and-tested-by: Christoph Paasch Fixes: b93df08ccda3 ("mptcp: explicitly track the fully established status") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 96f4f2fe50ad..e8cac2655c82 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -423,12 +423,12 @@ static void mptcp_sock_destruct(struct sock *sk) * also remove the mptcp socket, via * sock_put(ctx->conn). * - * Problem is that the mptcp socket will not be in - * SYN_RECV state and doesn't have SOCK_DEAD flag. + * Problem is that the mptcp socket will be in + * ESTABLISHED state and will not have the SOCK_DEAD flag. * Both result in warnings from inet_sock_destruct. */ - if (sk->sk_state == TCP_SYN_RECV) { + if (sk->sk_state == TCP_ESTABLISHED) { sk->sk_state = TCP_CLOSE; WARN_ON_ONCE(sk->sk_socket); sock_orphan(sk); -- cgit v1.2.3 From 1c3b63f155f637594268cd1add8335461691b314 Mon Sep 17 00:00:00 2001 From: Rouven Czerwinski Date: Thu, 6 Aug 2020 08:49:06 +0200 Subject: net/tls: allow MSG_CMSG_COMPAT in sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to use ktls on a system with a 32-bit userspace and a 64-bit kernel results in an EOPNOTSUPP error during sendmsg: setsockopt(3, SOL_TLS, TLS_TX, …, 40) = 0 sendmsg(3, …, msg_flags=0}, 0) = -1 EOPNOTSUPP (Operation not supported) The tls_sw implementation does strict flag checking and does not allow the MSG_CMSG_COMPAT flag, which is set if the message comes in through the compat syscall. This patch adds MSG_CMSG_COMPAT to the flag check to allow the usage of the TLS SW implementation on systems using the compat syscall path. Note that the same check is present in the sendmsg path for the TLS device implementation; however, the flag hasn't been added there for lack of testing hardware. Signed-off-by: Rouven Czerwinski Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 710bd44eaa49..9a3d9fedd7aa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -935,7 +935,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret = 0; int pending; - if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_CMSG_COMPAT)) return -EOPNOTSUPP; mutex_lock(&tls_ctx->tx_lock); -- cgit v1.2.3
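As background for where the flag comes from, a paraphrased sketch (not the verbatim kernel source; the exact entry-point name here is an assumption) of the 32-bit compat sendmsg path that tags the message before it ever reaches tls_sw_sendmsg():

/* The compat syscall entry ORs in MSG_CMSG_COMPAT so lower layers
 * know the msghdr and control messages use the 32-bit layout.
 */
static long compat_sendmsg_sketch(int fd, struct compat_msghdr __user *msg,
				  unsigned int flags)
{
	return __sys_sendmsg(fd, (struct user_msghdr __user *)msg,
			     flags | MSG_CMSG_COMPAT, false);
}

From 6b07edebe6d31529346e553a7b309bc5251da712 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Aug 2020 19:52:24 +0800 Subject: net: Use helper function fdput() Use helper function fdput() to fput() the file iff FDPUT_FPUT is set. Signed-off-by: Miaohe Lin Signed-off-by: David S.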
Miller --- net/socket.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index aff52e81653c..3c3d6abe4c1e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1804,8 +1804,7 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, upeer_addrlen, flags, rlimit(RLIMIT_NOFILE)); - if (f.flags) - fput(f.file); + fdput(f); } return ret; @@ -1868,8 +1867,7 @@ int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) ret = move_addr_to_kernel(uservaddr, addrlen, &address); if (!ret) ret = __sys_connect_file(f.file, &address, addrlen, 0); - if (f.flags) - fput(f.file); + fdput(f); } return ret; -- cgit v1.2.3 From ce787a5a074a86f76f5d3fd804fa78e01bfb9e89 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Aug 2020 19:53:16 +0800 Subject: net: Set fput_needed iff FDPUT_FPUT is set We should fput() the file iff FDPUT_FPUT is set, so we should set fput_needed accordingly. Fixes: 00e188ef6a7e ("sockfd_lookup_light(): switch to fdget^W^Waway from fget_light") Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 3c3d6abe4c1e..e08415b4f939 100644 --- a/net/socket.c +++ b/net/socket.c @@ -500,7 +500,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) if (f.file) { sock = sock_from_file(f.file, err); if (likely(sock)) { - *fput_needed = f.flags; + *fput_needed = f.flags & FDPUT_FPUT; return sock; } fdput(f); -- cgit v1.2.3 From 47260ba937822cd1a57ee8b520b8789bdda5964e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Aug 2020 19:54:19 +0800 Subject: net: Remove meaningless jump label out_fs The out_fs jump label does nothing but goto out. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- net/socket.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index e08415b4f939..c61f036d24f5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3063,7 +3063,7 @@ static int __init sock_init(void) err = register_filesystem(&sock_fs_type); if (err) - goto out_fs; + goto out; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); @@ -3086,7 +3086,6 @@ out: out_mount: unregister_filesystem(&sock_fs_type); -out_fs: goto out; } -- cgit v1.2.3 From 11f920d2aa9d4c2c2d53ccd79f4b5512f2169623 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Aug 2020 19:57:18 +0800 Subject: net: Use helper function ip_is_fragment() Use the helper function ip_is_fragment() to check whether an skb carries an IP fragment. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2828f6d5ba89..7e2e502ef519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4853,7 +4853,7 @@ static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) if (err < 0) goto out; - if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + if (ip_is_fragment(ip_hdr(skb))) fragment = true; off = ip_hdrlen(skb); -- cgit v1.2.3
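For reference, the helper adopted above is a one-liner from include/net/ip.h; it is quoted here from memory, so treat the exact form as an approximation:

static inline bool ip_is_fragment(const struct iphdr *iph)
{
	/* MF set, or a nonzero fragment offset, means this is a fragment */
	return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}

From 7c7ab580db49cc7befe5f4b91bb1920cd6b07575 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 8 Aug 2020 16:23:10 +0800 Subject: net: Convert to use the fallthrough macro Convert the uses of fall-through comments to the fallthrough macro. Signed-off-by: Miaohe Lin Signed-off-by: David S.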
Miller --- net/socket.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index c61f036d24f5..f4d5998bdcba 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1325,7 +1325,7 @@ int sock_wake_async(struct socket_wq *wq, int how, int band) case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; - /* fall through */ + fallthrough; case SOCK_WAKE_IO: call_kill: kill_fasync(&wq->fasync_list, SIGIO, band); @@ -3158,13 +3158,13 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) return -ENOMEM; buf_size += rule_cnt * sizeof(u32); - /* fall through */ + fallthrough; case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_SRXCLSRLINS: convert_out = true; - /* fall through */ + fallthrough; case ETHTOOL_SRXCLSRLDEL: buf_size += sizeof(struct ethtool_rxnfc); convert_in = true; -- cgit v1.2.3 From 519a8a6cf91dda095be2d36216fc4ebc525270a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2020 18:42:14 +0200 Subject: net: Revert "net: optimize the sockptr_t for unified kernel/user address spaces" This reverts commits 6d04fe15f78acdf8e32329e208552e226f7a8ae6 and a31edb2059ed4e498f9aa8230c734b59d0ad797a. It turns out that the idea of sharing a single pointer for both kernel and user space addresses causes various kinds of problems. So use the slightly less optimal version that uses an extra bit, but which is guaranteed to be safe everywhere. Fixes: 6d04fe15f78a ("net: optimize the sockptr_t for unified kernel/user address spaces") Reported-by: Eric Dumazet Reported-by: John Stultz Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/sockptr.h | 26 ++------------------------ net/ipv4/bpfilter/sockopt.c | 14 ++++++-------- net/socket.c | 6 +----- 3 files changed, 9 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 96840def9d69..ea193414298b 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -8,26 +8,9 @@ #ifndef _LINUX_SOCKPTR_H #define _LINUX_SOCKPTR_H -#include #include #include -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE -typedef union { - void *kernel; - void __user *user; -} sockptr_t; - -static inline bool sockptr_is_kernel(sockptr_t sockptr) -{ - return (unsigned long)sockptr.kernel >= TASK_SIZE; -} - -static inline sockptr_t KERNEL_SOCKPTR(void *p) -{ - return (sockptr_t) { .kernel = p }; -} -#else /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ typedef struct { union { void *kernel; @@ -45,15 +28,10 @@ static inline sockptr_t KERNEL_SOCKPTR(void *p) { return (sockptr_t) { .kernel = p, .is_kernel = true }; } -#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ -static inline int __must_check init_user_sockptr(sockptr_t *sp, void __user *p, - size_t size) +static inline sockptr_t USER_SOCKPTR(void __user *p) { - if (!access_ok(p, size)) - return -EFAULT; - *sp = (sockptr_t) { .user = p }; - return 0; + return (sockptr_t) { .user = p }; } static inline bool sockptr_is_null(sockptr_t sockptr) diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 545b2640f019..1b34cb9a7708 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -57,18 +57,16 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, return bpfilter_mbox_request(sk, optname, optval, optlen, true); } -int 
bpfilter_ip_get_sockopt(struct sock *sk, int optname, - char __user *user_optval, int __user *optlen) +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) { - sockptr_t optval; - int err, len; + int len; if (get_user(len, optlen)) return -EFAULT; - err = init_user_sockptr(&optval, user_optval, len); - if (err) - return err; - return bpfilter_mbox_request(sk, optname, optval, len, false); + + return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len, + false); } static int __init bpfilter_sockopt_init(void) diff --git a/net/socket.c b/net/socket.c index f4d5998bdcba..dbbe8ea7d395 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2095,7 +2095,7 @@ static bool sock_use_custom_sol_socket(const struct socket *sock) int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { - sockptr_t optval; + sockptr_t optval = USER_SOCKPTR(user_optval); char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2103,10 +2103,6 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, if (optlen < 0) return -EINVAL; - err = init_user_sockptr(&optval, user_optval, optlen); - if (err) - return err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; -- cgit v1.2.3 From f19008e676366c44e9241af57f331b6c6edf9552 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2020 13:38:39 -0400 Subject: tcp: correct read of TFO keys on big endian systems When TFO keys are read back on big endian systems either via the global sysctl interface or via getsockopt() using TCP_FASTOPEN_KEY, the values don't match what was written. For example, on s390x: # echo "1-2-3-4" > /proc/sys/net/ipv4/tcp_fastopen_key # cat /proc/sys/net/ipv4/tcp_fastopen_key 02000000-01000000-04000000-03000000 Instead of: # cat /proc/sys/net/ipv4/tcp_fastopen_key 00000001-00000002-00000003-00000004 Fix this by converting to the correct endianness on read. This was reported by Colin Ian King when running the 'tcp_fastopen_backup_key' net selftest on s390x, which depends on the read value matching what was written. I've confirmed that the test now passes on big and little endian systems. Signed-off-by: Jason Baron Fixes: 438ac88009bc ("net: fastopen: robustness and endianness fixes for SipHash") Cc: Ard Biesheuvel Cc: Eric Dumazet Reported-and-tested-by: Colin Ian King Signed-off-by: David S. 
Miller --- include/net/tcp.h | 2 ++ net/ipv4/sysctl_net_ipv4.c | 16 ++++------------ net/ipv4/tcp.c | 16 ++++------------ net/ipv4/tcp_fastopen.c | 23 +++++++++++++++++++++++ 4 files changed, 33 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index dbf5c791a6eb..eab6c7510b5b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1672,6 +1672,8 @@ void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); +int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, + u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5653e3b011bf..54023a46db04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -301,24 +301,16 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)) }; - struct tcp_fastopen_context *ctx; - u32 user_key[TCP_FASTOPEN_KEY_MAX * 4]; - __le32 key[TCP_FASTOPEN_KEY_MAX * 4]; + u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)]; + __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)]; char *backup_data; - int ret, i = 0, off = 0, n_keys = 0; + int ret, i = 0, off = 0, n_keys; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; - rcu_read_lock(); - ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); - if (ctx) { - n_keys = tcp_fastopen_context_len(ctx); - memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys); - } - rcu_read_unlock(); - + n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key); if (!n_keys) { memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); n_keys = 1; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c06d2bfd2ec4..31f3b858db81 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3685,22 +3685,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; - struct tcp_fastopen_context *ctx; - unsigned int key_len = 0; + u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; + unsigned int key_len; if (get_user(len, optlen)) return -EFAULT; - rcu_read_lock(); - ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); - if (ctx) { - key_len = tcp_fastopen_context_len(ctx) * - TCP_FASTOPEN_KEY_LENGTH; - memcpy(&key[0], &ctx->key[0], key_len); - } - rcu_read_unlock(); - + key_len = tcp_fastopen_get_cipher(net, icsk, key) * + TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (put_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 19ad9586c720..1bb85821f1e6 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -108,6 +108,29 @@ out: return err; } +int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, + u64 *key) +{ + struct tcp_fastopen_context *ctx; + int n_keys = 0, i; + + rcu_read_lock(); + if (icsk) + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + else + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctx) { + n_keys = tcp_fastopen_context_len(ctx); + for (i = 0; i < n_keys; i++) { + put_unaligned_le64(ctx->key[i].key[0], key + (i * 2)); + 
put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1); + } + } + rcu_read_unlock(); + + return n_keys; +} + static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, struct sk_buff *syn, const siphash_key_t *key, -- cgit v1.2.3 From b06c19d9f827f6743122795570bfc0c72db482b0 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 10 Aug 2020 17:02:58 -0700 Subject: net/tls: Fix kmap usage When MSG_OOB is specified to tls_device_sendpage(), the mapped page is never unmapped. Hold off mapping the page until after the flags are checked and the page is actually needed. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Ira Weiny Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 18fa6067bb7f..b74e2741f74f 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -561,7 +561,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter msg_iter; - char *kaddr = kmap(page); + char *kaddr; struct kvec iov; int rc; @@ -576,6 +576,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, goto out; } + kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); -- cgit v1.2.3 From 26896f01467a28651f7a536143fe5ac8449d4041 Mon Sep 17 00:00:00 2001 From: Qingyu Li Date: Mon, 10 Aug 2020 09:51:00 +0800 Subject: net/nfc/rawsock.c: add CAP_NET_RAW check. When creating a raw AF_NFC socket, CAP_NET_RAW needs to be checked first. Signed-off-by: Qingyu Li Signed-off-by: David S. Miller --- net/nfc/rawsock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index b2061b6746ea..955c195ae14b 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -328,10 +328,13 @@ static int rawsock_create(struct net *net, struct socket *sock, if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW)) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!capable(CAP_NET_RAW)) + return -EPERM; sock->ops = &rawsock_raw_ops; - else + } else { sock->ops = &rawsock_ops; + } sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) -- cgit v1.2.3 From 0f5907af39137f8183ed536aaa00f322d7365130 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 10 Aug 2020 08:16:58 -0400 Subject: net: Fix potential memory leak in proto_register() If we fail to assign a proto idx, we free the twsk_slab_name but forget to free the twsk_slab. Add a helper function tw_prot_cleanup() to free these together and also use this helper function in proto_unregister(). Fixes: b45ce32135d1 ("sock: fix potential memory leak in proto_register()") Signed-off-by: Miaohe Lin Signed-off-by: David S.
Miller --- net/core/sock.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 49cd5ffe673e..c9083ad44ea1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3406,6 +3406,16 @@ static void sock_inuse_add(struct net *net, int val) } #endif +static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) +{ + if (!twsk_prot) + return; + kfree(twsk_prot->twsk_slab_name); + twsk_prot->twsk_slab_name = NULL; + kmem_cache_destroy(twsk_prot->twsk_slab); + twsk_prot->twsk_slab = NULL; +} + static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) @@ -3476,7 +3486,7 @@ int proto_register(struct proto *prot, int alloc_slab) prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) - goto out_free_timewait_sock_slab_name; + goto out_free_timewait_sock_slab; } } @@ -3484,15 +3494,15 @@ int proto_register(struct proto *prot, int alloc_slab) ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); - goto out_free_timewait_sock_slab_name; + goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; -out_free_timewait_sock_slab_name: +out_free_timewait_sock_slab: if (alloc_slab && prot->twsk_prot) - kfree(prot->twsk_prot->twsk_slab_name); + tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); @@ -3516,12 +3526,7 @@ void proto_unregister(struct proto *prot) prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); - - if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { - kmem_cache_destroy(prot->twsk_prot->twsk_slab); - kfree(prot->twsk_prot->twsk_slab_name); - prot->twsk_prot->twsk_slab = NULL; - } + tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); -- cgit v1.2.3 From 62ffc589abb176821662efc4525ee4ac0b9c3894 Mon Sep 17 00:00:00 2001 From: Tim Froidcoeur Date: Tue, 11 Aug 2020 20:33:23 +0200 Subject: net: refactor bind_bucket fastreuse into helper Refactor the fastreuse update code in inet_csk_get_port into a small helper function that can be called from other places. Acked-by: Matthieu Baerts Signed-off-by: Tim Froidcoeur Signed-off-by: David S. 
Miller --- include/net/inet_connection_sock.h | 4 ++ net/ipv4/inet_connection_sock.c | 97 +++++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 1e209ce7d1bd..aa8893c68c50 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -304,6 +304,10 @@ void inet_csk_listen_stop(struct sock *sk); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); +/* update the fast reuse flag when adding a socket */ +void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, + struct sock *sk); + struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); #define TCP_PINGPONG_THRESH 3 diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d1a3913eebe0..b457dd2d6c75 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -296,6 +296,57 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } +void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, + struct sock *sk) +{ + kuid_t uid = sock_i_uid(sk); + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + + if (hlist_empty(&tb->owners)) { + tb->fastreuse = reuse; + if (sk->sk_reuseport) { + tb->fastreuseport = FASTREUSEPORT_ANY; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } else { + tb->fastreuseport = 0; + } + } else { + if (!reuse) + tb->fastreuse = 0; + if (sk->sk_reuseport) { + /* We didn't match or we don't have fastreuseport set on + * the tb, but we have sk_reuseport set on this socket + * and we know that there are no bind conflicts with + * this socket in this tb, so reset our tb's reuseport + * settings so that any subsequent sockets that match + * our current socket will be put on the fast path. + * + * If we reset we need to set FASTREUSEPORT_STRICT so we + * do extra checking for all subsequent sk_reuseport + * socks. + */ + if (!sk_reuseport_match(tb, sk)) { + tb->fastreuseport = FASTREUSEPORT_STRICT; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } + } else { + tb->fastreuseport = 0; + } + } +} + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. 
* We try to allocate an odd port (and leave even ports for connect()) @@ -308,7 +359,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); struct inet_bind_bucket *tb = NULL; - kuid_t uid = sock_i_uid(sk); int l3mdev; l3mdev = inet_sk_bound_l3mdev(sk); @@ -345,49 +395,8 @@ tb_found: goto fail_unlock; } success: - if (hlist_empty(&tb->owners)) { - tb->fastreuse = reuse; - if (sk->sk_reuseport) { - tb->fastreuseport = FASTREUSEPORT_ANY; - tb->fastuid = uid; - tb->fast_rcv_saddr = sk->sk_rcv_saddr; - tb->fast_ipv6_only = ipv6_only_sock(sk); - tb->fast_sk_family = sk->sk_family; -#if IS_ENABLED(CONFIG_IPV6) - tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; -#endif - } else { - tb->fastreuseport = 0; - } - } else { - if (!reuse) - tb->fastreuse = 0; - if (sk->sk_reuseport) { - /* We didn't match or we don't have fastreuseport set on - * the tb, but we have sk_reuseport set on this socket - * and we know that there are no bind conflicts with - * this socket in this tb, so reset our tb's reuseport - * settings so that any subsequent sockets that match - * our current socket will be put on the fast path. - * - * If we reset we need to set FASTREUSEPORT_STRICT so we - * do extra checking for all subsequent sk_reuseport - * socks. - */ - if (!sk_reuseport_match(tb, sk)) { - tb->fastreuseport = FASTREUSEPORT_STRICT; - tb->fastuid = uid; - tb->fast_rcv_saddr = sk->sk_rcv_saddr; - tb->fast_ipv6_only = ipv6_only_sock(sk); - tb->fast_sk_family = sk->sk_family; -#if IS_ENABLED(CONFIG_IPV6) - tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; -#endif - } - } else { - tb->fastreuseport = 0; - } - } + inet_csk_update_fastreuse(tb, sk); + if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); -- cgit v1.2.3 From d76f3351cea2d927fdf70dd7c06898235035e84e Mon Sep 17 00:00:00 2001 From: Tim Froidcoeur Date: Tue, 11 Aug 2020 20:33:24 +0200 Subject: net: initialize fastreuse on inet_inherit_port In the case of TPROXY, bind_conflict optimizations for SO_REUSEADDR or SO_REUSEPORT are broken, possibly resulting in O(n) instead of O(1) bind behaviour or in the incorrect reuse of a bind. The kernel keeps track, for each bind_bucket, of whether all sockets in the bind_bucket support SO_REUSEADDR or SO_REUSEPORT, in two fastreuse flags. These flags allow skipping the costly bind_conflict check when possible (meaning when all sockets have the proper SO_REUSE option). For every socket added to a bind_bucket, these flags need to be updated. As soon as a socket that does not support reuse is added, the flag is set to false and will never go back to true, unless the bind_bucket is deleted. Note that there is no mechanism to re-evaluate these flags when a socket is removed (this might make sense when removing a socket that would not allow reuse; this leaves room for a future patch). For this optimization to work, it is mandatory that these flags are properly initialized and updated. When a child socket is created from a listen socket in __inet_inherit_port, the TPROXY case could create a new bind bucket without properly initializing these flags, thus preventing the optimization from working. Alternatively, a socket not allowing reuse could be added to an existing bind bucket without updating the flags, causing bind_conflict to never be called as it should. Call inet_csk_update_fastreuse when __inet_inherit_port decides to create a new bind_bucket or use a different bind_bucket than the one of the listen socket.
Fixes: 093d282321da ("tproxy: fix hash locking issue when using port redirection in __inet_inherit_port()") Acked-by: Matthieu Baerts Signed-off-by: Tim Froidcoeur Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4eb4cd8d20dd..239e54474b65 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -163,6 +163,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOMEM; } } + inet_csk_update_fastreuse(tb, child); } inet_bind_hash(child, tb, port); spin_unlock(&head->lock); -- cgit v1.2.3 From 1980c05844830a44708c98c96d600833aa3fae08 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 12 Aug 2020 14:56:02 +0200 Subject: vsock: fix potential null pointer dereference in vsock_poll() syzbot reported this issue where in vsock_poll() we find the socket state at TCP_ESTABLISHED, but 'transport' is null: general protection fault, probably for non-canonical address 0xdffffc0000000012: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000090-0x0000000000000097] CPU: 0 PID: 8227 Comm: syz-executor.2 Not tainted 5.8.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:vsock_poll+0x75a/0x8e0 net/vmw_vsock/af_vsock.c:1038 Call Trace: sock_poll+0x159/0x460 net/socket.c:1266 vfs_poll include/linux/poll.h:90 [inline] do_pollfd fs/select.c:869 [inline] do_poll fs/select.c:917 [inline] do_sys_poll+0x607/0xd40 fs/select.c:1011 __do_sys_poll fs/select.c:1069 [inline] __se_sys_poll fs/select.c:1057 [inline] __x64_sys_poll+0x18c/0x440 fs/select.c:1057 do_syscall_64+0x60/0xe0 arch/x86/entry/common.c:384 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This issue can happen if the TCP_ESTABLISHED state is set after we read vsk->transport in vsock_poll(). We could put barriers in to synchronize, but this can only happen during connection setup, so we can simply check that 'transport' is valid. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Reported-and-tested-by: syzbot+a61bac2fcc1a7c6623fe@syzkaller.appspotmail.com Signed-off-by: Stefano Garzarella Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 27bbcfad9c17..9e93bc201cc0 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1032,7 +1032,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. */ - if (sk->sk_state == TCP_ESTABLISHED) { + if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( -- cgit v1.2.3 From 06a7a37be55e29961c9ba2abec4d07c8e0e21861 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Aug 2020 21:08:53 +0200 Subject: ipv4: tunnel: fix compilation on ARCH=um With certain configurations, a 64-bit ARCH=um build errors out here with an unknown csum_ipv6_magic() function. Include the right header file to always have it. Signed-off-by: Johannes Berg Signed-off-by: David S.
Miller --- net/ipv4/ip_tunnel_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 75c6013ff9a4..5cb30b7aa752 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -38,6 +38,7 @@ #include #include #include +#include <net/ip6_checksum.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; -- cgit v1.2.3 From 2e0d8fef5f76bce0887c73b824d9e625a08e7406 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Aug 2020 18:34:40 -0700 Subject: net: accept an empty mask in /sys/class/net/*/queues/rx-*/rps_cpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We must accept an empty mask in store_rps_map(), or we are not able to disable RPS on a queue. Fixes: 07bbecb34106 ("net: Restrict receive packets queuing to housekeeping CPUs") Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Cc: Alex Belits Cc: Nitesh Narayan Lal Cc: Peter Zijlstra (Intel) Reviewed-by: Maciej Żenczykowski Acked-by: Peter Zijlstra (Intel) Acked-by: Nitesh Narayan Lal Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9de33b594ff2..efec66fa78b7 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -757,11 +757,13 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, return err; } - hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; - cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); - if (cpumask_empty(mask)) { - free_cpumask_var(mask); - return -EINVAL; + if (!cpumask_empty(mask)) { + hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; + cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); + if (cpumask_empty(mask)) { + free_cpumask_var(mask); + return -EINVAL; + } } map = kzalloc(max_t(unsigned int, -- cgit v1.2.3
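To see what the rps_cpus change above permits from user space, a hedged sketch in C (the eth0/rx-0 sysfs path is illustrative and error handling is trimmed) of disabling RPS on a queue by writing an all-zero mask, which was rejected with -EINVAL before this fix:

#include <fcntl.h>
#include <unistd.h>

static void rps_disable_sketch(void)
{
	/* an all-zero mask now means "RPS off" instead of -EINVAL */
	int fd = open("/sys/class/net/eth0/queues/rx-0/rps_cpus", O_WRONLY);

	if (fd >= 0) {
		write(fd, "0\n", 2);
		close(fd);
	}
}

From 9643609423c7667fb748cc3ccff023d761d0ac90 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 12 Aug 2020 13:26:37 -0700 Subject: Revert "ipv4: tunnel: fix compilation on ARCH=um" This reverts commit 06a7a37be55e29961c9ba2abec4d07c8e0e21861. The bug was already fixed; this added a duplicate include. Reported-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 5cb30b7aa752..75c6013ff9a4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -38,7 +38,6 @@ #include #include #include -#include <net/ip6_checksum.h> const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; -- cgit v1.2.3 From 88fd1cb80daa20af063bce81e1fad14e945a8dc4 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 13 Aug 2020 21:45:25 +0206 Subject: af_packet: TPACKET_V3: fix fill status rwlock imbalance After @blk_fill_in_prog_lock is acquired, an early-out vnet situation can occur. In that case, the rwlock needs to be released. Also, since @blk_fill_in_prog_lock is only acquired when @tp_version is exactly TPACKET_V3, only release it on that exact condition as well. And finally, add sparse annotation so that it is clearer that prb_fill_curr_block() and prb_clear_blk_fill_status() are acquiring and releasing @blk_fill_in_prog_lock, respectively. sparse is still unable to understand the balance, but the warnings are now at a higher level and make more sense.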
Fixes: 632ca50f2cbd ("af_packet: TPACKET_V3: replace busy-wait loop") Signed-off-by: John Ogness Reported-by: kernel test robot Signed-off-by: David S. Miller --- net/packet/af_packet.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0b8160d1a6e0..479c257ded73 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -941,6 +941,7 @@ static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) + __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); @@ -989,6 +990,7 @@ static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) + __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; @@ -2286,8 +2288,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (do_vnet && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le(), true, 0)) + vio_le(), true, 0)) { + if (po->tp_version == TPACKET_V3) + prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; + } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); @@ -2393,7 +2398,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); - } else { + } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } -- cgit v1.2.3 From 1f3a090b9033f69de380c03db3ea1a1015c850cf Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Wed, 12 Aug 2020 17:56:39 +0800 Subject: net: openvswitch: introduce common code for flushing flows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid some issues, for example an RCU usage warning and a double free, we should flush the flows under ovs_lock. This patch refactors table_instance_destroy and introduces table_instance_flow_flush, which can be invoked by __dp_destroy or ovs_flow_tbl_flush. Fixes: 50b0e61b32ee ("net: openvswitch: fix possible memleak on destroy flow-table") Reported-by: Johan Knöös Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2020-August/050489.html Signed-off-by: Tonghao Zhang Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 10 +++++++++- net/openvswitch/flow_table.c | 35 +++++++++++++++-------------------- net/openvswitch/flow_table.h | 3 +++ 3 files changed, 27 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 42f8cc70bb2c..6e47ef7ef036 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1756,6 +1756,7 @@ err: /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { + struct flow_table *table = &dp->table; int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { @@ -1774,7 +1775,14 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); - /* RCU destroy the flow table */ + /* Flush sw_flow in the tables. RCU cb only releases resource + * such as dp, ports and tables. That may avoid some issues + * such as RCU usage warning. + */ + table_instance_flow_flush(table, ovsl_dereference(table->ti), + ovsl_dereference(table->ufid_ti)); + + /* RCU destroy the ports, meters and flow tables.
*/ call_rcu(&dp->rcu, destroy_dp_rcu); } diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 8c12675cbb67..e2235849a57e 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -473,19 +473,15 @@ static void table_instance_flow_free(struct flow_table *table, flow_mask_remove(table, flow->mask); } -static void table_instance_destroy(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - bool deferred) +/* Must be called with OVS mutex held. */ +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti) { int i; - if (!ti) - return; - - BUG_ON(!ufid_ti); if (ti->keep_flows) - goto skip_flows; + return; for (i = 0; i < ti->n_buckets; i++) { struct sw_flow *flow; @@ -497,18 +493,16 @@ static void table_instance_destroy(struct flow_table *table, table_instance_flow_free(table, ti, ufid_ti, flow, false); - ovs_flow_free(flow, deferred); + ovs_flow_free(flow, true); } } +} -skip_flows: - if (deferred) { - call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); - } else { - __table_instance_destroy(ti); - __table_instance_destroy(ufid_ti); - } +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti) +{ + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); } /* No need for locking this function is called from RCU callback or @@ -523,7 +517,7 @@ void ovs_flow_tbl_destroy(struct flow_table *table) call_rcu(&mc->rcu, mask_cache_rcu_cb); call_rcu(&ma->rcu, mask_array_rcu_cb); - table_instance_destroy(table, ti, ufid_ti, false); + table_instance_destroy(ti, ufid_ti); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -641,7 +635,8 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) flow_table->count = 0; flow_table->ufid_count = 0; - table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); + table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); + table_instance_destroy(old_ti, old_ufid_ti); return 0; err_free_ti: diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 74ce48fecba9..6e7d4ac59353 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -105,5 +105,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask); void ovs_flow_masks_rebalance(struct flow_table *table); +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti); #endif /* flow_table.h */ -- cgit v1.2.3
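As a closing illustration of the deferred-destruction pattern the patch above relies on, a self-contained sketch; the struct and function names here are invented for the example, not taken from the kernel sources:

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct example_table {
	struct rcu_head rcu;	/* embedded so call_rcu() can find us */
	/* ... buckets, counters, ... */
};

static void example_table_rcu_cb(struct rcu_head *rcu)
{
	/* Runs after a grace period: no reader can still hold a pointer
	 * obtained under rcu_read_lock(), so freeing is now safe.
	 */
	kfree(container_of(rcu, struct example_table, rcu));
}

static void example_table_destroy(struct example_table *t)
{
	/* Flush/unlink entries under the writer lock first (as the ovs
	 * patch does under ovs_lock), then defer only the final free.
	 */
	call_rcu(&t->rcu, example_table_rcu_cb);
}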