summaryrefslogtreecommitdiff
path: root/net/ipv4/udp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r--net/ipv4/udp.c382
1 files changed, 263 insertions, 119 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index aa32afd871ee..42a96b3547c9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -103,6 +103,7 @@
#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
+#include <net/gso.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
#include <linux/static_key.h>
@@ -1062,8 +1063,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int free = 0;
int connected = 0;
__be32 daddr, faddr, saddr;
+ u8 tos, scope;
__be16 dport;
- u8 tos;
int err, is_udplite = IS_UDPLITE(sk);
int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
@@ -1183,12 +1184,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
connected = 0;
}
tos = get_rttos(&ipc, inet);
- if (sock_flag(sk, SOCK_LOCALROUTE) ||
- (msg->msg_flags & MSG_DONTROUTE) ||
- (ipc.opt && ipc.opt->opt.is_strictroute)) {
- tos |= RTO_ONLINK;
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
+ if (scope == RT_SCOPE_LINK)
connected = 0;
- }
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
@@ -1221,11 +1219,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
- flow_flags,
- faddr, saddr, dport, inet->inet_sport,
- sk->sk_uid);
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+ sk->sk_protocol, flow_flags, faddr, saddr,
+ dport, inet->inet_sport, sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1329,58 +1325,20 @@ do_confirm:
}
EXPORT_SYMBOL(udp_sendmsg);
-int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+void udp_splice_eof(struct socket *sock) /* push pending frames unless the socket is corked; presumably invoked when a splice source reaches EOF - confirm against the caller */
{
- struct inet_sock *inet = inet_sk(sk);
+ struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
- int ret;
-
- if (flags & MSG_SENDPAGE_NOTLAST)
- flags |= MSG_MORE;
- if (!up->pending) {
- struct msghdr msg = { .msg_flags = flags|MSG_MORE };
-
- /* Call udp_sendmsg to specify destination address which
- * sendpage interface can't pass.
- * This will succeed only when the socket is connected.
- */
- ret = udp_sendmsg(sk, &msg, 0);
- if (ret < 0)
- return ret;
- }
+ if (!up->pending || READ_ONCE(up->corkflag)) /* early exit without taking the socket lock: nothing pending, or corked */
+ return;
lock_sock(sk);
-
- if (unlikely(!up->pending)) {
- release_sock(sk);
-
- net_dbg_ratelimited("cork failed\n");
- return -EINVAL;
- }
-
- ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
- page, offset, size, flags);
- if (ret == -EOPNOTSUPP) {
- release_sock(sk);
- return sock_no_sendpage(sk->sk_socket, page, offset,
- size, flags);
- }
- if (ret < 0) {
- udp_flush_pending_frames(sk);
- goto out;
- }
-
- up->len += size;
- if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE)))
- ret = udp_push_pending_frames(sk);
- if (!ret)
- ret = size;
-out:
+ if (up->pending && !READ_ONCE(up->corkflag)) /* same condition re-checked now that the socket lock is held */
+ udp_push_pending_frames(sk); /* return value is intentionally not propagated: the function is void */
release_sock(sk);
- return ret;
}
+EXPORT_SYMBOL_GPL(udp_splice_eof);
#define UDP_SKB_IS_STATELESS 0x80000000
@@ -1720,21 +1678,19 @@ static int first_packet_length(struct sock *sk)
* IOCTL requests applicable to the UDP protocol
*/
-int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+int udp_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ:
{
- int amount = sk_wmem_alloc_get(sk);
-
- return put_user(amount, (int __user *)arg);
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
}
case SIOCINQ:
{
- int amount = max_t(int, 0, first_packet_length(sk));
-
- return put_user(amount, (int __user *)arg);
+ *karg = max_t(int, 0, first_packet_length(sk));
+ return 0;
}
default:
@@ -1818,7 +1774,7 @@ EXPORT_SYMBOL(__skb_recv_udp);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct sk_buff *skb;
- int err, copied;
+ int err;
try_again:
skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
@@ -1837,10 +1793,7 @@ try_again:
}
WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
- copied = recv_actor(sk, skb);
- kfree_skb(skb);
-
- return copied;
+ return recv_actor(sk, skb);
}
EXPORT_SYMBOL(udp_read_skb);
@@ -2930,7 +2883,8 @@ EXPORT_SYMBOL(udp_poll);
int udp_abort(struct sock *sk, int err)
{
- lock_sock(sk);
+ if (!has_current_bpf_ctx())
+ lock_sock(sk);
/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
* with close()
@@ -2943,7 +2897,8 @@ int udp_abort(struct sock *sk, int err)
__udp_disconnect(sk, 0);
out:
- release_sock(sk);
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
return 0;
}
@@ -2963,7 +2918,7 @@ struct proto udp_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
+ .splice_eof = udp_splice_eof,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
@@ -2988,9 +2943,30 @@ EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
-static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo,
- struct net *net)
+static unsigned short seq_file_family(const struct seq_file *seq);
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) /* true if sk is in the seq_file's netns and has the address family the seq_file asks for */
{
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static const struct seq_operations bpf_iter_udp_seq_ops;
+#endif
+static struct udp_table *udp_get_table_seq(struct seq_file *seq, /* pick the UDP hash table this seq_file iterates over */
+ struct net *net)
+{
+ const struct udp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ if (seq->op == &bpf_iter_udp_seq_ops) /* BPF iterator: recognised by its seq_operations, has no afinfo */
+ return net->ipv4.udp_table;
+#endif
+
+ afinfo = pde_data(file_inode(seq->file)); /* otherwise the afinfo is read from the file's inode via pde_data() */
return afinfo->udp_table ? : net->ipv4.udp_table; /* afinfo's own table if set, else the per-netns table */
}
@@ -2998,16 +2974,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
{
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
- struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
struct sock *sk;
- if (state->bpf_seq_afinfo)
- afinfo = state->bpf_seq_afinfo;
- else
- afinfo = pde_data(file_inode(seq->file));
-
- udptable = udp_get_table_afinfo(afinfo, net);
+ udptable = udp_get_table_seq(seq, net);
for (state->bucket = start; state->bucket <= udptable->mask;
++state->bucket) {
@@ -3018,10 +2988,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
spin_lock_bh(&hslot->lock);
sk_for_each(sk, &hslot->head) {
- if (!net_eq(sock_net(sk), net))
- continue;
- if (afinfo->family == AF_UNSPEC ||
- sk->sk_family == afinfo->family)
+ if (seq_sk_match(seq, sk))
goto found;
}
spin_unlock_bh(&hslot->lock);
@@ -3035,22 +3002,14 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
- struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
- if (state->bpf_seq_afinfo)
- afinfo = state->bpf_seq_afinfo;
- else
- afinfo = pde_data(file_inode(seq->file));
-
do {
sk = sk_next(sk);
- } while (sk && (!net_eq(sock_net(sk), net) ||
- (afinfo->family != AF_UNSPEC &&
- sk->sk_family != afinfo->family)));
+ } while (sk && !seq_sk_match(seq, sk));
if (!sk) {
- udptable = udp_get_table_afinfo(afinfo, net);
+ udptable = udp_get_table_seq(seq, net);
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
@@ -3096,15 +3055,9 @@ EXPORT_SYMBOL(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v)
{
struct udp_iter_state *state = seq->private;
- struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
- if (state->bpf_seq_afinfo)
- afinfo = state->bpf_seq_afinfo;
- else
- afinfo = pde_data(file_inode(seq->file));
-
- udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq));
+ udptable = udp_get_table_seq(seq, seq_file_net(seq));
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
@@ -3157,6 +3110,143 @@ struct bpf_iter__udp {
int bucket __aligned(8);
};
+struct bpf_udp_iter_state { /* private seq_file state of the BPF UDP socket iterator */
+ struct udp_iter_state state; /* state.bucket is the hash2 bucket being walked */
+ unsigned int cur_sk; /* index in batch[] of the socket currently handed to seq_show() */
+ unsigned int end_sk; /* number of sockets stored in batch[] (one past the last valid index) */
+ unsigned int max_sk; /* capacity of batch[], in entries */
+ int offset; /* matching sockets of the current bucket already consumed; skipped when the bucket is resumed */
+ struct sock **batch; /* sockets of the current bucket, each pinned with sock_hold() */
+ bool st_bucket_done; /* true when batch[] captured every matching socket of the bucket */
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+ unsigned int new_batch_sz);
+/*
+ * bpf_iter_udp_batch - collect the next batch of sockets for the BPF iterator
+ * @seq: seq_file whose private data is a struct bpf_udp_iter_state
+ *
+ * Walks udptable->hash2 from state->bucket onwards and stops at the first
+ * bucket that contains sockets accepted by seq_sk_match(). While the bucket
+ * lock is held, a reference is taken on each matching socket that still fits
+ * in iter->batch; the lock is released before returning.
+ *
+ * If the previous batch covered its whole bucket (iter->st_bucket_done), the
+ * walk starts at the following bucket. Otherwise the same bucket is resumed
+ * and its first iter->offset matching sockets, already consumed through
+ * bpf_iter_udp_seq_next(), are skipped.
+ *
+ * If the bucket has more matching sockets than iter->batch can hold, the
+ * array is grown once to 1.5x the number of matches seen and the same bucket
+ * is scanned again. When growing fails, or the second pass still does not
+ * fit, the partial batch is returned and st_bucket_done stays false.
+ *
+ * Return: the first batched socket, or NULL when no bucket is left.
+ */
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	/* Where the iterator was stopped. These are kept in locals so that
+	 * neither the skipping below nor a resize retry loses the resume
+	 * point.
+	 */
+	int resume_bucket = state->bucket;
+	int resume_offset = iter->offset;
+	struct udp_table *udptable;
+	unsigned int batch_sks = 0;
+	bool resized = false;
+	struct sock *sk;
+
+	/* The current batch is done, so advance the bucket. The new bucket
+	 * differs from resume_bucket, hence nothing gets skipped in it.
+	 */
+	if (iter->st_bucket_done)
+		state->bucket++;
+
+	udptable = udp_get_table_seq(seq, net);
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	batch_sks = 0;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+
+		if (hlist_empty(&hslot2->head))
+			continue;
+
+		/* iter->offset counts up from zero in every bucket, so that it
+		 * always equals the number of matching sockets in front of
+		 * batch[cur_sk] and never moves backwards across a resume.
+		 */
+		iter->offset = 0;
+		spin_lock_bh(&hslot2->lock);
+		udp_portaddr_for_each_entry(sk, &hslot2->head) {
+			if (seq_sk_match(seq, sk)) {
+				/* Resume from the last iterated socket at the
+				 * offset in the bucket before iterator was stopped.
+				 */
+				if (state->bucket == resume_bucket &&
+				    iter->offset < resume_offset) {
+					++iter->offset;
+					continue;
+				}
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				/* Count every match, even those that did not
+				 * fit, to size a possible resize.
+				 */
+				batch_sks++;
+			}
+		}
+		spin_unlock_bh(&hslot2->lock);
+
+		if (iter->end_sk)
+			break;
+	}
+
+	/* All done: no batch made. */
+	if (!iter->end_sk)
+		return NULL;
+
+	if (iter->end_sk == batch_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		goto done;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
+		resized = true;
+		/* After allocating a larger batch, retry one more time to grab
+		 * the whole bucket. The loop above left via "break", so
+		 * state->bucket still names the bucket to rescan; stepping
+		 * back would revisit an already iterated bucket.
+		 */
+		goto again;
+	}
+done:
+	return iter->batch[0];
+}
+
+/*
+ * seq_file ->next() of the BPF UDP iterator: release the socket that was
+ * just shown and return the following one, building a new batch when the
+ * current one is exhausted. Returns NULL once there is nothing left.
+ */
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *it = seq->private;
+	struct sock *next_sk;
+
+	/* The socket at cur_sk has been through seq_show(): drop its
+	 * reference and account for it in the bucket offset.
+	 */
+	if (it->cur_sk < it->end_sk) {
+		sock_put(it->batch[it->cur_sk]);
+		it->cur_sk++;
+		it->offset++;
+	}
+
+	/* Serve the next entry of the current batch if one remains,
+	 * otherwise prepare a new batch.
+	 */
+	next_sk = it->cur_sk < it->end_sk ? it->batch[it->cur_sk]
+					  : bpf_iter_udp_batch(seq);
+
+	++*pos;
+	return next_sk;
+}
+
+/*
+ * seq_file ->start() of the BPF UDP iterator.
+ *
+ * The bpf iter does not support lseek, so a non-zero position always means
+ * "continue from where the iterator was stop()-ped": build a batch from the
+ * saved bucket/offset. Position zero yields the start token.
+ */
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? bpf_iter_udp_batch(seq) : SEQ_START_TOKEN;
+}
+
static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
{
@@ -3177,18 +3267,37 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
struct bpf_prog *prog;
struct sock *sk = v;
uid_t uid;
+ int ret;
if (v == SEQ_START_TOKEN)
return 0;
+ lock_sock(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
- return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+ ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+ release_sock(sk);
+ return ret;
+}
+
+/* Drop the references held on the not yet consumed entries
+ * batch[cur_sk..end_sk), leaving cur_sk equal to end_sk.
+ */
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+	for (; iter->cur_sk < iter->end_sk; iter->cur_sk++)
+		sock_put(iter->batch[iter->cur_sk]);
}
static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
{
+ struct bpf_udp_iter_state *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
@@ -3199,17 +3308,35 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
}
- udp_seq_stop(seq, v);
+ if (iter->cur_sk < iter->end_sk) {
+ bpf_iter_udp_put_batch(iter);
+ iter->st_bucket_done = false;
+ }
}
static const struct seq_operations bpf_iter_udp_seq_ops = {
- .start = udp_seq_start,
- .next = udp_seq_next,
+ .start = bpf_iter_udp_seq_start,
+ .next = bpf_iter_udp_seq_next,
.stop = bpf_iter_udp_seq_stop,
.show = bpf_iter_udp_seq_show,
};
#endif
+/* Address family a seq_file wants to see; AF_UNSPEC means "all families". */
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+#ifdef CONFIG_BPF_SYSCALL
+	/* BPF iterator: no family filter here, the bpf program filters. */
+	if (seq->op == &bpf_iter_udp_seq_ops)
+		return AF_UNSPEC;
+#endif
+
+	/* Proc fs iterator: the family is stored in the entry's afinfo. */
+	return ((const struct udp_seq_afinfo *)
+		pde_data(file_inode(seq->file)))->family;
+}
+
const struct seq_operations udp_seq_ops = {
.start = udp_seq_start,
.next = udp_seq_next,
@@ -3418,38 +3545,55 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
-static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, /* replace iter->batch with an empty array of new_batch_sz entries; 0 or -ENOMEM */
+ unsigned int new_batch_sz)
{
- struct udp_iter_state *st = priv_data;
- struct udp_seq_afinfo *afinfo;
- int ret;
+ struct sock **new_batch;
- afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
- if (!afinfo)
+ new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+ GFP_USER | __GFP_NOWARN);
+ if (!new_batch)
return -ENOMEM; /* on failure the old array and its socket references are left as they were */
- afinfo->family = AF_UNSPEC;
- afinfo->udp_table = NULL;
- st->bpf_seq_afinfo = afinfo;
+ bpf_iter_udp_put_batch(iter); /* old entries are not copied over: release their references first */
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) /* init_seq_private hook: set up the netns state, then the initial batch array */
+{
+ struct bpf_udp_iter_state *iter = priv_data;
+ int ret;
+
ret = bpf_iter_init_seq_net(priv_data, aux);
if (ret)
- kfree(afinfo);
+ return ret;
+
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ); /* first allocation of iter->batch */
+ if (ret)
+ bpf_iter_fini_seq_net(priv_data); /* undo bpf_iter_init_seq_net() when the allocation fails */
+
return ret;
}
static void bpf_iter_fini_udp(void *priv_data) /* fini_seq_private hook: tear down what bpf_iter_init_udp() set up */
{
- struct udp_iter_state *st = priv_data;
+ struct bpf_udp_iter_state *iter = priv_data;
- kfree(st->bpf_seq_afinfo);
bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch); /* frees only the array; no sock_put() is done here */
}
static const struct bpf_iter_seq_info udp_seq_info = {
.seq_ops = &bpf_iter_udp_seq_ops,
.init_seq_private = bpf_iter_init_udp,
.fini_seq_private = bpf_iter_fini_udp,
- .seq_priv_size = sizeof(struct udp_iter_state),
+ .seq_priv_size = sizeof(struct bpf_udp_iter_state),
};
static struct bpf_iter_reg udp_reg_info = {
@@ -3457,7 +3601,7 @@ static struct bpf_iter_reg udp_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__udp, udp_sk),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &udp_seq_info,
};