From 2585cd62f0986a6e6d9c83363ed6dbcc66bc9f32 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:05 -0800 Subject: bpf: Only reply field should be writeable Currently, a sock_ops BPF program can write the op field and all the reply fields (reply and replylong). This is a bug. The op field should not have been writeable and there is currently no way to use replylong field for indices >= 1. This patch enforces that only the reply field (which equals replylong[0]) is writeable. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Signed-off-by: Lawrence Brakmo Acked-by: Yuchung Cheng Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 18da42a81d0c..bf9bb755e369 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3845,8 +3845,7 @@ static bool sock_ops_is_valid_access(int off, int size, { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): break; default: return false; -- cgit v1.2.3 From a33de3973488680def6a900e74ec67ba2bef226c Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:06 -0800 Subject: bpf: Make SOCK_OPS_GET_TCP size independent Make SOCK_OPS_GET_TCP helper macro size independent (before only worked with 4-byte fields. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index bf9bb755e369..62e78748d95f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4470,9 +4470,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; /* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ +#define SOCK_OPS_GET_TCP(FIELD_NAME) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \ + FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4484,16 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \ + FIELD_NAME), si->dst_reg, \ + si->dst_reg, \ offsetof(struct tcp_sock, FIELD_NAME)); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP32(snd_cwnd); + SOCK_OPS_GET_TCP(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP32(srtt_us); + SOCK_OPS_GET_TCP(srtt_us); break; } return insn - insn_buf; -- cgit v1.2.3 From 34d367c59233464dbd1f07445c4665099a7128ec Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:07 -0800 Subject: bpf: Make SOCK_OPS_GET_TCP struct independent Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added 2 arguments so now it can also work with struct sock fields. The first argument is the name of the field in the bpf_sock_ops struct, the 2nd argument is the name of the field in the OBJ struct. Previous: SOCK_OPS_GET_TCP(FIELD_NAME) New: SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) Where OBJ is either "struct tcp_sock" or "struct sock" (without quotation). BPF_FIELD is the name of the field in the bpf_sock_ops struct and OBJ_FIELD is the name of the field in the OBJ struct. Although the field names are currently the same, the kernel struct names could change in the future and this change makes it easier to support that. Note that adding access to tcp_sock fields in sock_ops programs does not preclude the tcp_sock fields from being removed as long as we are willing to do one of the following: 1) Return a fixed value (e.x. 0 or 0xffffffff), or 2) Make the verifier fail if that field is accessed (i.e. program fails to load) so the user will know that field is no longer supported. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 62e78748d95f..dbb6d2f60680 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4469,11 +4469,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; -/* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP(FIELD_NAME) \ +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \ - FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4485,18 +4485,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \ - FIELD_NAME), si->dst_reg, \ - si->dst_reg, \ - offsetof(struct tcp_sock, FIELD_NAME)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP(snd_cwnd); + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP(srtt_us); + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; } return insn - insn_buf; -- cgit v1.2.3 From b73042b8a28e2603ac178295ab96c876ba5a97a1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:08 -0800 Subject: bpf: Add write access to tcp_sock and sock fields This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to struct tcp_sock or struct sock fields. This required adding a new field "temp" to struct bpf_sock_ops_kern for temporary storage that is used by sock_ops_convert_ctx_access. It is used to store and recover the contents of a register, so the register can be used to store the address of the sk. Since we cannot overwrite the dst_reg because it contains the pointer to ctx, nor the src_reg since it contains the value we want to store, we need an extra register to contain the address of the sk. Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the GET or SET macros depending on the value of the TYPE field. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 9 +++++++++ include/net/tcp.h | 2 +- net/core/filter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 425056c7f96c..daa5a676335f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1007,6 +1007,15 @@ struct bpf_sock_ops_kern { u32 replylong[4]; }; u32 is_fullsock; + u64 temp; /* temp and everything after is not + * initialized to 0 before calling + * the BPF program. New fields that + * should be initialized to 0 should + * be inserted before temp. + * temp is scratch storage used by + * sock_ops_convert_ctx_access + * as temporary storage of a register. + */ }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a1d26a18599..6092eaff61cf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op) struct bpf_sock_ops_kern sock_ops; int ret; - memset(&sock_ops, 0, sizeof(sock_ops)); + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); diff --git a/net/core/filter.c b/net/core/filter.c index dbb6d2f60680..c356ec02b1a5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4491,6 +4491,54 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, offsetof(OBJ, OBJ_FIELD)); \ } while (0) +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; -- cgit v1.2.3 From b13d880721729384757f235166068c315326f4a1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:10 -0800 Subject: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary use is to determine if there should be calls to sock_ops bpf program at various points in the TCP code. The field is initialized to zero, disabling the calls. A sock_ops BPF program can set it, per connection and as necessary, when the connection is established. It also adds support for reading and writting the field within a sock_ops BPF program. Reading is done by accessing the field directly. However, writing is done through the helper function bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program is trying to set a callback that is not supported in the current kernel (i.e. running an older kernel). The helper function returns 0 if it was able to set all of the bits set in the argument, a positive number containing the bits that could not be set, or -EINVAL if the socket is not a full TCP socket. Examples of where one could call the bpf program: 1) When RTO fires 2) When a packet is retransmitted 3) When the connection terminates 4) When a packet is sent 5) When a packet is received Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 11 +++++++++++ include/uapi/linux/bpf.h | 17 ++++++++++++++++- net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4f93f0953c41..8f4c54986f97 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -335,6 +335,17 @@ struct tcp_sock { int linger2; + +/* Sock_ops bpf program related variables */ +#ifdef CONFIG_BPF + u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs + * values defined in uapi/linux/tcp.h + */ +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) +#else +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 +#endif + /* Receiver side RTT estimation */ struct { u32 rtt_us; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8d5874c2c4ff..aa128407c44d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) + * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. * @skb: pointer to skb @@ -748,7 +756,8 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -969,8 +978,14 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ diff --git a/net/core/filter.c b/net/core/filter.c index c356ec02b1a5..6936d19ac736 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3328,6 +3328,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3510,6 +3537,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -4546,6 +4575,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; } return insn - insn_buf; } -- cgit v1.2.3 From 44f0e43037d3a17b043843ba67610ac7c7e37db6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:12 -0800 Subject: bpf: Add support for reading sk_state and more Add support for reading many more tcp_sock fields state, same as sk->sk_state rtt_min same as sk->rtt_min.s[0].v (current rtt_min) snd_ssthresh rcv_nxt snd_nxt snd_una mss_cache ecn_flags rate_delivered rate_interval_us packets_out retrans_out total_retrans segs_in data_segs_in segs_out data_segs_out lost_out sacked_out sk_txhash bytes_received (__u64) bytes_acked (__u64) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 ++++++++ net/core/filter.c | 143 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 154 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8cecf9cf5bd..46520eae37fa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -979,6 +979,28 @@ struct bpf_sock_ops { __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 6936d19ac736..a858ebc4ece4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3855,33 +3855,43 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - - return true; -} -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4498,6 +4508,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ @@ -4580,6 +4616,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; } return insn - insn_buf; } -- cgit v1.2.3 From 6f9bd3d731aac0d2ac21dd78a642af5df38fb5c5 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:13 -0800 Subject: bpf: Add sock_ops R/W access to tclass Adds direct write access to sk_txhash and access to tclass for ipv6 flows through getsockopt and setsockopt. Sample usage for tclass: bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v)) where skops is a pointer to the ctx (struct bpf_sock_ops). Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index a858ebc4ece4..fe2c7937351f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3232,6 +3232,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3241,7 +3264,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3307,6 +3331,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3871,6 +3911,7 @@ static bool sock_ops_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; @@ -4690,7 +4731,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, sk_txhash): - SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); break; case offsetof(struct bpf_sock_ops, bytes_received): @@ -4701,6 +4743,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); break; + } return insn - insn_buf; } -- cgit v1.2.3 From 1d621674d923790d09cab9e2c7da7da6446a6257 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:36 +0100 Subject: bpf: xor of a/x in cbpf can be done in 32 bit alu Very minor optimization; saves 1 byte per program in x86_64 JIT in cBPF prologue. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index fe2c7937351f..60d8c8712652 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to -- cgit v1.2.3 From f6b1b3bf0d5f681631a293cfe1ca934b81716f1e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:39 +0100 Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception One of the ugly leftovers from the early eBPF days is that div/mod operations based on registers have a hard-coded src_reg == 0 test in the interpreter as well as in JIT code generators that would return from the BPF program with exit code 0. This was basically adopted from cBPF interpreter for historical reasons. There are multiple reasons why this is very suboptimal and prone to bugs. To name one: the return code mapping for such abnormal program exit of 0 does not always match with a suitable program type's exit code mapping. For example, '0' in tc means action 'ok' where the packet gets passed further up the stack, which is just undesirable for such cases (e.g. when implementing policy) and also does not match with other program types. While trying to work out an exception handling scheme, I also noticed that programs crafted like the following will currently pass the verifier: 0: (bf) r6 = r1 1: (85) call pc+8 caller: R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 callee: frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1 10: (b4) (u32) r2 = (u32) 0 11: (b4) (u32) r3 = (u32) 1 12: (3c) (u32) r3 /= (u32) r2 13: (61) r0 = *(u32 *)(r1 +76) 14: (95) exit returning from callee: frame1: R0_w=pkt(id=0,off=0,r=0,imm=0) R1=ctx(id=0,off=0,imm=0) R2_w=inv0 R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_1 to caller at 2: R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 2: (bf) r1 = r6 3: (61) r1 = *(u32 *)(r1 +80) 4: (bf) r2 = r0 5: (07) r2 += 8 6: (2d) if r2 > r1 goto pc+1 R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0) R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 7: (71) r0 = *(u8 *)(r0 +0) 8: (b7) r0 = 1 9: (95) exit from 6 to 8: safe processed 16 insns (limit 131072), stack depth 0+0 Basically what happens is that in the subprog we make use of a div/mod by 0 exception and in the 'normal' subprog's exit path we just return skb->data back to the main prog. This has the implication that the verifier thinks we always get a pkt pointer in R0 while we still have the implicit 'return 0' from the div as an alternative unconditional return path earlier. Thus, R0 then contains 0, meaning back in the parent prog we get the address range of [0x0, skb->data_end] as read and writeable. Similar can be crafted with other pointer register types. Since i) BPF_ABS/IND is not allowed in programs that contain BPF to BPF calls (and generally it's also disadvised to use in native eBPF context), ii) unknown opcodes don't return zero anymore, iii) we don't return an exception code in dead branches, the only last missing case affected and to fix is the div/mod handling. What we would really need is some infrastructure to propagate exceptions all the way to the original prog unwinding the current stack and returning that code to the caller of the BPF program. In user space such exception handling for similar runtimes is typically implemented with setjmp(3) and longjmp(3) as one possibility which is not available in the kernel, though (kgdb used to implement it in kernel long time ago). I implemented a PoC exception handling mechanism into the BPF interpreter with porting setjmp()/longjmp() into x86_64 and adding a new internal BPF_ABRT opcode that can use a program specific exception code for all exception cases we have (e.g. div/mod by 0, unknown opcodes, etc). While this seems to work in the constrained BPF environment (meaning, here, we don't need to deal with state e.g. from memory allocations that we would need to undo before going into exception state), it still has various drawbacks: i) we would need to implement the setjmp()/longjmp() for every arch supported in the kernel and for x86_64, arm64, sparc64 JITs currently supporting calls, ii) it has unconditional additional cost on main program entry to store CPU register state in initial setjmp() call, and we would need some way to pass the jmp_buf down into ___bpf_prog_run() for main prog and all subprogs, but also storing on stack is not really nice (other option would be per-cpu storage for this, but it also has the drawback that we need to disable preemption for every BPF program types). All in all this approach would add a lot of complexity. Another poor-man's solution would be to have some sort of additional shared register or scratch buffer to hold state for exceptions, and test that after every call return to chain returns and pass R0 all the way down to BPF prog caller. This is also problematic in various ways: i) an additional register doesn't map well into JITs, and some other scratch space could only be on per-cpu storage, which, again has the side-effect that this only works when we disable preemption, or somewhere in the input context which is not available everywhere either, and ii) this adds significant runtime overhead by putting conditionals after each and every call, as well as implementation complexity. Yet another option is to teach verifier that div/mod can return an integer, which however is also complex to implement as verifier would need to walk such fake 'mov r0,; exit;' sequeuence and there would still be no guarantee for having propagation of this further down to the BPF caller as proper exception code. For parent prog, it is also is not distinguishable from a normal return of a constant scalar value. The approach taken here is a completely different one with little complexity and no additional overhead involved in that we make use of the fact that a div/mod by 0 is undefined behavior. Instead of bailing out, we adapt the same behavior as on some major archs like ARMv8 [0] into eBPF as well: X div 0 results in 0, and X mod 0 results in X. aarch64 and aarch32 ISA do not generate any traps or otherwise aborts of program execution for unsigned divides. I verified this also with a test program compiled by gcc and clang, and the behavior matches with the spec. Going forward we adapt the eBPF verifier to emit such rewrites once div/mod by register was seen. cBPF is not touched and will keep existing 'return 0' semantics. Given the options, it seems the most suitable from all of them, also since major archs have similar schemes in place. Given this is all in the realm of undefined behavior, we still have the option to adapt if deemed necessary and this way we would also have the option of more flexibility from LLVM code generation side (which is then fully visible to verifier). Thus, this patch i) fixes the panic seen in above program and ii) doesn't bypass the verifier observations. [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b] http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf 1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV) "A division by zero results in a zero being written to the destination register, without any indication that the division by zero occurred." 2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV) "For the SDIV and UDIV instructions, division by zero always returns a zero result." Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 8 -------- kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++-------- net/core/filter.c | 9 ++++++++- 3 files changed, 38 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8301de2d1f96..5f35f93dcab2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -999,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -1019,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c5269415090..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5400,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - /* due to JIT bugs clear upper 32-bits of src register - * before div/mod operation - */ - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); - insn_buf[1] = *insn; - cnt = 2; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) return -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index 60d8c8712652..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; -- cgit v1.2.3