diff options
Diffstat (limited to 'net/ipv4/tcp_bpf.c')
-rw-r--r-- | net/ipv4/tcp_bpf.c | 152 |
1 files changed, 92 insertions, 60 deletions
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 2e9547467edb..81f0dff69e0b 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -11,6 +11,24 @@ #include <net/inet_common.h> #include <net/tls.h> +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tcp; + int copied; + + if (!skb || !skb->len || !sk_is_tcp(sk)) + return; + + if (skb_bpf_strparser(skb)) + return; + + tcp = tcp_sk(sk); + copied = tcp->copied_seq + skb->len; + WRITE_ONCE(tcp->copied_seq, copied); + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, skb->len); +} + static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -70,6 +88,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { + struct msghdr msghdr = {}; bool apply = apply_bytes; struct scatterlist *sge; struct page *page; @@ -77,6 +96,7 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, u32 off; while (1) { + struct bio_vec bvec; bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); @@ -87,17 +107,20 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, tcp_rate_check_app_limited(sk); retry: + msghdr.msg_flags = flags | MSG_SPLICE_PAGES; has_tx_ulp = tls_sw_has_ctx_tx(sk); - if (has_tx_ulp) { - flags |= MSG_SENDPAGE_NOPOLICY; - ret = kernel_sendpage_locked(sk, - page, off, size, flags); - } else { - ret = do_tcp_sendpages(sk, page, off, size, flags); - } + if (has_tx_ulp) + msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY; + + if (size < sge->length && msg->sg.start != msg->sg.end) + msghdr.msg_flags |= MSG_MORE; + bvec_set_page(&bvec, page, size, off); + iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); + ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret <= 0) return ret; + if (apply) apply_bytes -= ret; msg->sg.size -= ret; @@ -174,14 +197,34 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static bool is_next_msg_fin(struct sk_psock *psock) +{ + struct scatterlist *sge; + struct sk_msg *msg_rx; + int i; + + msg_rx = sk_psock_peek_msg(psock); + i = msg_rx->sg.start; + sge = sk_msg_elem(msg_rx, i); + if (!sge->length) { + struct sk_buff *skb = msg_rx->skb; + + if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + return true; + } + return false; +} + static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { + struct tcp_sock *tcp = tcp_sk(sk); + u32 seq = tcp->copied_seq; struct sk_psock *psock; - int copied; + int copied = 0; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -194,8 +237,43 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, return tcp_recvmsg(sk, msg, len, flags, addr_len); lock_sock(sk); + + /* We may have received data on the sk_receive_queue pre-accept and + * then we can not use read_skb in this context because we haven't + * assigned a sk_socket yet so have no link to the ops. The work-around + * is to check the sk_receive_queue and in these cases read skbs off + * queue again. The read_skb hook is not running at this point because + * of lock_sock so we avoid having multiple runners in read_skb. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + tcp_data_ready(sk); + /* This handles the ENOMEM errors if we both receive data + * pre accept and are already under memory pressure. At least + * let user know to retry. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + copied = -EAGAIN; + goto out; + } + } + msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + /* The typical case for EFAULT is the socket was gracefully + * shutdown with a FIN pkt. So check here the other case is + * some error on copy_page_to_iter which would be unexpected. + * On fin return correct return code to zero. + */ + if (copied == -EFAULT) { + bool is_fin = is_next_msg_fin(psock); + + if (is_fin) { + copied = 0; + seq++; + goto out; + } + } + seq += copied; if (!copied) { long timeo; int data; @@ -233,6 +311,10 @@ msg_bytes_ready: copied = -EAGAIN; } out: + WRITE_ONCE(tcp->copied_seq, seq); + tcp_rcv_space_adjust(sk); + if (copied > 0) + __tcp_cleanup_rbuf(sk, copied); release_sock(sk); sk_psock_put(sk, psock); return copied; @@ -404,7 +486,7 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) long timeo; int flags; - /* Don't let internal do_tcp_sendpages() flags through */ + /* Don't let internal flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; @@ -484,54 +566,6 @@ out_err: return copied ? copied : err; } -static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct sk_msg tmp, *msg = NULL; - int err = 0, copied = 0; - struct sk_psock *psock; - bool enospc = false; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_sendpage(sk, page, offset, size, flags); - - lock_sock(sk); - if (psock->cork) { - msg = psock->cork; - } else { - msg = &tmp; - sk_msg_init(msg); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(sk_msg_full(msg))) - goto out_err; - - sk_msg_page_add(msg, page, size, offset); - sk_mem_charge(sk, size); - copied = size; - if (sk_msg_full(msg)) - enospc = true; - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - if (psock->cork_bytes && !enospc) - goto out_err; - /* All cork bytes are accounted, rerun the prog. */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); -out_err: - release_sock(sk); - sk_psock_put(sk, psock); - return copied ? copied : err; -} - enum { TCP_BPF_IPV4, TCP_BPF_IPV6, @@ -561,7 +595,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; - prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; @@ -596,8 +629,7 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && - ops->sendmsg == tcp_sendmsg && - ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; + ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) |